chore: seed Peregrine from personal job-seeker (pre-generalization)

App: Peregrine · Company: Circuit Forge LLC · Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
2026-02-24 18:25:39 -08:00 · 2026-02-24 18:25:39 -08:00 · 1dc1ca89d7
commit 1dc1ca89d7
61 changed files with 11370 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,20 @@
 .env
 config/notion.yaml
 config/tokens.yaml
 config/email.yaml
 config/adzuna.yaml
 config/craigslist.yaml
 __pycache__/
 *.pyc
 .pytest_cache/
 output/
 aihawk/
 resume_matcher/
 staging.db
 .streamlit.log
 .streamlit.pid
 .coverage
 log/
 unsloth_compiled_cache/
 data/survey_screenshots/*
 !data/survey_screenshots/.gitkeep
--- a/app/.streamlit/config.toml
+++ b/app/.streamlit/config.toml
@ -0,0 +1,7 @@
 [theme]
 base = "dark"
 primaryColor = "#2DD4BF"
 backgroundColor = "#0F172A"
 secondaryBackgroundColor = "#1E293B"
 textColor = "#F1F5F9"
 font = "sans serif"
--- a/app/Home.py
+++ b/app/Home.py
@ -0,0 +1,475 @@
 # app/Home.py
 """
 Job Seeker Dashboard — Home page.
 Shows counts, Run Discovery button, and Sync to Notion button.
 """
 import subprocess
 import sys
 from pathlib import Path
 import streamlit as st
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.db import DEFAULT_DB, init_db, get_job_counts, purge_jobs, purge_email_data, \
    purge_non_remote, archive_jobs, kill_stuck_tasks, get_task_for_job, get_active_tasks, \
    insert_job, get_existing_urls
 from scripts.task_runner import submit_task
 init_db(DEFAULT_DB)
 def _dismissible(key: str, status: str, msg: str) -> None:
    """Show a task-result banner next to a small "✕" button that hides it.

    The banner is a success box when ``status`` is ``"completed"`` and an
    error box for any other status. Once dismissed, the flag
    ``dismissed_<key>`` is stored in ``st.session_state`` and the banner is
    not drawn again, so ``key`` must be unique per task result.
    """
    flag = f"dismissed_{key}"
    if not st.session_state.get(flag):
        banner_col, close_col = st.columns([10, 1])
        with banner_col:
            show = st.success if status == "completed" else st.error
            show(msg)
        with close_col:
            st.write("")  # empty write before the button — presumably a vertical spacer
            clicked = st.button("✕", key=f"dismiss_{key}", help="Dismiss")
            if clicked:
                st.session_state[flag] = True
                st.rerun()
 def _queue_url_imports(db_path: Path, urls: list) -> int:
    """Insert each new URL as a placeholder job and queue a scrape_url task for it.

    Every URL is stripped and canonicalized first. A URL is skipped when it
    does not start with ``http``, when it is already known to the database,
    or when it repeats an earlier URL of the same batch.

    Args:
        db_path: Database handed through to the ``scripts.db`` helpers.
        urls: Raw URL strings; surrounding whitespace is tolerated.

    Returns:
        Count of newly queued jobs.
    """
    from datetime import date
    from scripts.scrape_url import canonicalize_url
    existing = get_existing_urls(db_path)
    today = date.today().isoformat()
    # `existing` is a snapshot taken before the loop, so duplicates inside the
    # batch itself have to be tracked separately.
    seen = set()
    queued = 0
    for raw in urls:
        url = canonicalize_url(raw.strip())
        if not url.startswith("http"):
            continue
        if url in existing or url in seen:
            continue
        seen.add(url)
        job_id = insert_job(db_path, {
            "title": "Importing…",
            "company": "",
            "url": url,
            "source": "manual",
            "location": "",
            "description": "",
            "date_found": today,
        })
        # A falsy id means nothing was inserted, so there is nothing to scrape.
        if job_id:
            submit_task(db_path, "scrape_url", job_id)
            queued += 1
    return queued
 st.title("🔍 Alex's Job Search")
 st.caption("Discover → Review → Sync to Notion")
 st.divider()
@st.fragment(run_every=10)
 def _live_counts():
    counts = get_job_counts(DEFAULT_DB)
    col1, col2, col3, col4, col5 = st.columns(5)
    col1.metric("Pending Review", counts.get("pending", 0))
    col2.metric("Approved", counts.get("approved", 0))
    col3.metric("Applied", counts.get("applied", 0))
    col4.metric("Synced to Notion", counts.get("synced", 0))
    col5.metric("Rejected", counts.get("rejected", 0))
 _live_counts()
 st.divider()
 left, enrich_col, mid, right = st.columns(4)
 with left:
    st.subheader("Find New Jobs")
    st.caption("Scrapes all configured boards and adds new listings to your review queue.")
    _disc_task = get_task_for_job(DEFAULT_DB, "discovery", 0)
    _disc_running = _disc_task and _disc_task["status"] in ("queued", "running")
    if st.button("🚀 Run Discovery", use_container_width=True, type="primary",
                 disabled=bool(_disc_running)):
        submit_task(DEFAULT_DB, "discovery", 0)
        st.rerun()
    if _disc_running:
        @st.fragment(run_every=4)
        def _disc_status():
            t = get_task_for_job(DEFAULT_DB, "discovery", 0)
            if t and t["status"] in ("queued", "running"):
                lbl = "Queued…" if t["status"] == "queued" else "Scraping job boards… this may take a minute"
                st.info(f"⏳ {lbl}")
            else:
                st.rerun()
        _disc_status()
    elif _disc_task and _disc_task["status"] == "completed":
        _dismissible(f"disc_{_disc_task['id']}", "completed",
                     f"✅ Discovery complete — {_disc_task.get('error', '')}. Head to Job Review.")
    elif _disc_task and _disc_task["status"] == "failed":
        _dismissible(f"disc_{_disc_task['id']}", "failed",
                     f"Discovery failed: {_disc_task.get('error', '')}")
 with enrich_col:
    st.subheader("Enrich Descriptions")
    st.caption("Re-fetch missing descriptions for any listing (LinkedIn, Indeed, Glassdoor, Adzuna, The Ladders, generic).")
    _enrich_task = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0)
    _enrich_running = _enrich_task and _enrich_task["status"] in ("queued", "running")
    if st.button("🔍 Fill Missing Descriptions", use_container_width=True, type="primary",
                 disabled=bool(_enrich_running)):
        submit_task(DEFAULT_DB, "enrich_descriptions", 0)
        st.rerun()
    if _enrich_running:
        @st.fragment(run_every=4)
        def _enrich_status():
            t = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0)
            if t and t["status"] in ("queued", "running"):
                st.info("⏳ Fetching descriptions…")
            else:
                st.rerun()
        _enrich_status()
    elif _enrich_task and _enrich_task["status"] == "completed":
        _dismissible(f"enrich_{_enrich_task['id']}", "completed",
                     f"✅ {_enrich_task.get('error', 'Done')}")
    elif _enrich_task and _enrich_task["status"] == "failed":
        _dismissible(f"enrich_{_enrich_task['id']}", "failed",
                     f"Enrich failed: {_enrich_task.get('error', '')}")
 with mid:
    # Plain import instead of an inline __import__() call; only this section needs it.
    from scripts.db import get_jobs_by_status
    # A pending job is "unscored" when it has a description but no match_score yet.
    unscored = sum(
        1 for j in get_jobs_by_status(DEFAULT_DB, "pending")
        if j.get("match_score") is None and j.get("description")
    )
    st.subheader("Score Listings")
    st.caption(f"Run TF-IDF match scoring against Alex's resume. {unscored} pending job{'s' if unscored != 1 else ''} unscored.")
    if st.button("📊 Score All Unscored Jobs", use_container_width=True, type="primary",
                 disabled=unscored == 0):
        try:
            with st.spinner("Scoring…"):
                # Runs scripts/match.py in the "job-seeker" conda env from the repo root.
                result = subprocess.run(
                    ["conda", "run", "-n", "job-seeker", "python", "scripts/match.py"],
                    capture_output=True, text=True,
                    cwd=str(Path(__file__).parent.parent),
                )
        except OSError as exc:
            # The command could not be started at all (e.g. conda is not on PATH).
            st.error("Scoring failed.")
            st.code(str(exc))
        else:
            if result.returncode == 0:
                st.success("Scoring complete!")
                st.code(result.stdout)
                # Re-run the script so the unscored count above is recomputed.
                st.rerun()
            else:
                # No rerun here: it would discard the error and stderr before
                # they can be read.
                st.error("Scoring failed.")
                st.code(result.stderr)
 with right:
    approved_count = get_job_counts(DEFAULT_DB).get("approved", 0)
    st.subheader("Send to Notion")
    st.caption("Push all approved jobs to your Notion tracking database.")
    if approved_count == 0:
        st.info("No approved jobs yet. Review and approve some listings first.")
    else:
        if st.button(
            f"📤 Sync {approved_count} approved job{'s' if approved_count != 1 else ''} → Notion",
            use_container_width=True, type="primary",
        ):
            with st.spinner("Syncing to Notion…"):
                from scripts.sync import sync_to_notion
                count = sync_to_notion(DEFAULT_DB)
            st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!")
            st.rerun()
 st.divider()
 # ── Email Sync ────────────────────────────────────────────────────────────────
 email_left, email_right = st.columns([3, 1])
 with email_left:
    st.subheader("Sync Emails")
    st.caption("Pull inbound recruiter emails and match them to active applications. "
               "New recruiter outreach is added to your Job Review queue.")
 with email_right:
    _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0)
    _email_running = _email_task and _email_task["status"] in ("queued", "running")
    if st.button("📧 Sync Emails", use_container_width=True, type="primary",
                 disabled=bool(_email_running)):
        submit_task(DEFAULT_DB, "email_sync", 0)
        st.rerun()
    if _email_running:
        @st.fragment(run_every=4)
        def _email_status():
            t = get_task_for_job(DEFAULT_DB, "email_sync", 0)
            if t and t["status"] in ("queued", "running"):
                st.info("⏳ Syncing emails…")
            else:
                st.rerun()
        _email_status()
    elif _email_task and _email_task["status"] == "completed":
        _dismissible(f"email_{_email_task['id']}", "completed",
                     f"✅ {_email_task.get('error', 'Done')}")
    elif _email_task and _email_task["status"] == "failed":
        _dismissible(f"email_{_email_task['id']}", "failed",
                     f"Sync failed: {_email_task.get('error', '')}")
 st.divider()
 # ── Add Jobs by URL ───────────────────────────────────────────────────────────
 add_left, _add_right = st.columns([3, 1])
 with add_left:
    st.subheader("Add Jobs by URL")
    st.caption("Paste job listing URLs to import and scrape in the background. "
               "Supports LinkedIn, Indeed, Glassdoor, and most job boards.")
 url_tab, csv_tab = st.tabs(["Paste URLs", "Upload CSV"])
 with url_tab:
    url_text = st.text_area(
        "urls",
        placeholder="https://www.linkedin.com/jobs/view/1234567/\nhttps://www.indeed.com/viewjob?jk=abc",
        height=100,
        label_visibility="collapsed",
    )
    if st.button("📥 Add Jobs", key="add_urls_btn", use_container_width=True,
                 disabled=not (url_text or "").strip()):
        _urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")]
        if _urls:
            _n = _queue_url_imports(DEFAULT_DB, _urls)
            if _n:
                st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. Check Job Review shortly.")
            else:
                st.info("All URLs already in the database.")
            st.rerun()
 with csv_tab:
    csv_file = st.file_uploader("CSV with a URL column", type=["csv"],
                                label_visibility="collapsed")
    if csv_file:
        import csv as _csv
        import io as _io
        # getvalue() returns the whole upload regardless of the buffer's read
        # position; utf-8-sig drops a leading BOM so the first cell still
        # matches the http prefix check.
        _csv_text = csv_file.getvalue().decode("utf-8-sig", errors="replace")
        # csv.reader rather than DictReader: no row is consumed as a header
        # (headerless URL lists keep their first entry) and ragged rows yield
        # plain strings only.
        reader = _csv.reader(_io.StringIO(_csv_text))
        _csv_urls = []
        for row in reader:
            # Take the first URL-looking cell of each row.
            for val in row:
                if val.strip().startswith(("http://", "https://")):
                    _csv_urls.append(val.strip())
                    break
        if _csv_urls:
            st.caption(f"Found {len(_csv_urls)} URL(s) in CSV.")
            if st.button("📥 Import CSV Jobs", key="add_csv_btn", use_container_width=True):
                _n = _queue_url_imports(DEFAULT_DB, _csv_urls)
                st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import.")
                st.rerun()
        else:
            st.warning("No URLs found — CSV must have a column whose values start with http.")
@st.fragment(run_every=3)
 def _scrape_status():
    import sqlite3 as _sq
    conn = _sq.connect(DEFAULT_DB)
    conn.row_factory = _sq.Row
    rows = conn.execute(
        """SELECT bt.status, bt.error, j.title, j.company, j.url
           FROM background_tasks bt
           JOIN jobs j ON j.id = bt.job_id
           WHERE bt.task_type = 'scrape_url'
             AND bt.updated_at >= datetime('now', '-5 minutes')
           ORDER BY bt.updated_at DESC LIMIT 20"""
    ).fetchall()
    conn.close()
    if not rows:
        return
    st.caption("Recent URL imports:")
    for r in rows:
        if r["status"] == "running":
            st.info(f"⏳ Scraping {r['url']}")
        elif r["status"] == "completed":
            label = r["title"] + (f" @ {r['company']}" if r["company"] else "")
            st.success(f"✅ {label}")
        elif r["status"] == "failed":
            st.error(f"❌ {r['url']} — {r['error'] or 'scrape failed'}")
 _scrape_status()
 st.divider()
 # ── Danger zone: purge + re-scrape ────────────────────────────────────────────
 with st.expander("⚠️ Danger Zone", expanded=False):
    st.caption(
        "**Purge** permanently deletes jobs from the local database. "
        "Applied and synced jobs are never touched."
    )
    purge_col, rescrape_col, email_col, tasks_col = st.columns(4)
    with purge_col:
        st.markdown("**Purge pending & rejected**")
        st.caption("Removes all _pending_ and _rejected_ listings so the next discovery starts fresh.")
        if st.button("🗑 Purge Pending + Rejected", use_container_width=True):
            st.session_state["confirm_purge"] = "partial"
        if st.session_state.get("confirm_purge") == "partial":
            st.warning("Are you sure? This cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge", type="primary", use_container_width=True):
                deleted = purge_jobs(DEFAULT_DB, statuses=["pending", "rejected"])
                st.success(f"Purged {deleted} jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    with email_col:
        st.markdown("**Purge email data**")
        st.caption("Clears all email thread logs and email-sourced pending jobs so the next sync starts fresh.")
        if st.button("📧 Purge Email Data", use_container_width=True):
            st.session_state["confirm_purge"] = "email"
        if st.session_state.get("confirm_purge") == "email":
            st.warning("This deletes all email contacts and email-sourced jobs. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge emails", type="primary", use_container_width=True):
                contacts, jobs = purge_email_data(DEFAULT_DB)
                st.success(f"Purged {contacts} email contacts, {jobs} email jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel  ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    with tasks_col:
        _active = get_active_tasks(DEFAULT_DB)
        st.markdown("**Kill stuck tasks**")
        st.caption(f"Force-fail all queued/running background tasks. Currently **{len(_active)}** active.")
        if st.button("⏹ Kill All Tasks", use_container_width=True, disabled=len(_active) == 0):
            killed = kill_stuck_tasks(DEFAULT_DB)
            st.success(f"Killed {killed} task(s).")
            st.rerun()
    with rescrape_col:
        st.markdown("**Purge all & re-scrape**")
        st.caption("Wipes _all_ non-applied, non-synced jobs then immediately runs a fresh discovery.")
        if st.button("🔄 Purge All + Re-scrape", use_container_width=True):
            st.session_state["confirm_purge"] = "full"
        if st.session_state.get("confirm_purge") == "full":
            st.warning("This will delete ALL pending, approved, and rejected jobs, then re-scrape. Applied and synced records are kept.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, wipe + scrape", type="primary", use_container_width=True):
                purge_jobs(DEFAULT_DB, statuses=["pending", "approved", "rejected"])
                submit_task(DEFAULT_DB, "discovery", 0)
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    st.divider()
    pending_col, nonremote_col, approved_col, _ = st.columns(4)
    with pending_col:
        st.markdown("**Purge pending review**")
        st.caption("Removes only _pending_ listings, keeping your rejected history intact.")
        if st.button("🗑 Purge Pending Only", use_container_width=True):
            st.session_state["confirm_purge"] = "pending_only"
        if st.session_state.get("confirm_purge") == "pending_only":
            st.warning("Deletes all pending jobs. Rejected jobs are kept. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge pending", type="primary", use_container_width=True):
                deleted = purge_jobs(DEFAULT_DB, statuses=["pending"])
                st.success(f"Purged {deleted} pending jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel   ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    with nonremote_col:
        st.markdown("**Purge non-remote**")
        st.caption("Removes pending/approved/rejected jobs where remote is not set. Keeps anything already in the pipeline.")
        if st.button("🏢 Purge On-site Jobs", use_container_width=True):
            st.session_state["confirm_purge"] = "non_remote"
        if st.session_state.get("confirm_purge") == "non_remote":
            st.warning("Deletes all non-remote jobs not yet applied to. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge on-site", type="primary", use_container_width=True):
                deleted = purge_non_remote(DEFAULT_DB)
                st.success(f"Purged {deleted} non-remote jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel    ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    with approved_col:
        st.markdown("**Purge approved (unapplied)**")
        st.caption("Removes _approved_ jobs you haven't applied to yet — e.g. to reset after a review pass.")
        if st.button("🗑 Purge Approved", use_container_width=True):
            st.session_state["confirm_purge"] = "approved_only"
        if st.session_state.get("confirm_purge") == "approved_only":
            st.warning("Deletes all approved-but-not-applied jobs. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge approved", type="primary", use_container_width=True):
                deleted = purge_jobs(DEFAULT_DB, statuses=["approved"])
                st.success(f"Purged {deleted} approved jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel     ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    st.divider()
    archive_col1, archive_col2, _, _ = st.columns(4)
    with archive_col1:
        st.markdown("**Archive remaining**")
        st.caption(
            "Move all _pending_ and _rejected_ jobs to archived status. "
            "Archived jobs stay in the DB for dedup — they just won't appear in Job Review."
        )
        if st.button("📦 Archive Pending + Rejected", use_container_width=True):
            st.session_state["confirm_purge"] = "archive_remaining"
        if st.session_state.get("confirm_purge") == "archive_remaining":
            st.info("Jobs will be archived (not deleted) — URLs are kept for dedup.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, archive", type="primary", use_container_width=True):
                archived = archive_jobs(DEFAULT_DB, statuses=["pending", "rejected"])
                st.success(f"Archived {archived} jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel      ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
    with archive_col2:
        st.markdown("**Archive approved (unapplied)**")
        st.caption("Archive _approved_ listings you decided to skip — keeps history without cluttering the apply queue.")
        if st.button("📦 Archive Approved", use_container_width=True):
            st.session_state["confirm_purge"] = "archive_approved"
        if st.session_state.get("confirm_purge") == "archive_approved":
            st.info("Approved jobs will be archived (not deleted).")
            c1, c2 = st.columns(2)
            if c1.button("Yes, archive approved", type="primary", use_container_width=True):
                archived = archive_jobs(DEFAULT_DB, statuses=["approved"])
                st.success(f"Archived {archived} approved jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel       ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
--- a/app/app.py
+++ b/app/app.py
@ -0,0 +1,119 @@
 # app/app.py
 """
 Streamlit entry point — uses st.navigation() to control the sidebar.
 Main workflow pages are listed at the top; Settings is separated into
 a "System" section so it doesn't crowd the navigation.
 Run: streamlit run app/app.py
     bash scripts/manage-ui.sh start
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import streamlit as st
 from scripts.db import DEFAULT_DB, init_db, get_active_tasks
 import sqlite3
 st.set_page_config(
    page_title="Job Seeker",
    page_icon="💼",
    layout="wide",
 )
 init_db(DEFAULT_DB)
 # ── Startup cleanup — runs once per server process via cache_resource ──────────
@st.cache_resource
 def _startup() -> None:
    """Runs exactly once per server lifetime (st.cache_resource).
    1. Marks zombie tasks as failed.
    2. Auto-queues re-runs for any research generated without SearXNG data,
       if SearXNG is now reachable.
    """
    conn = sqlite3.connect(DEFAULT_DB)
    conn.execute(
        "UPDATE background_tasks SET status='failed', error='Interrupted by server restart',"
        " finished_at=datetime('now') WHERE status IN ('queued','running')"
    )
    conn.commit()
    # Auto-recovery: re-run LLM-only research when SearXNG is available
    try:
        import requests as _req
        if _req.get("http://localhost:8888/", timeout=3).status_code == 200:
            from scripts.task_runner import submit_task
            _ACTIVE_STAGES = ("phone_screen", "interviewing", "offer", "hired")
            rows = conn.execute(
                """SELECT cr.job_id FROM company_research cr
                   JOIN jobs j ON j.id = cr.job_id
                   WHERE (cr.scrape_used IS NULL OR cr.scrape_used = 0)
                   AND j.status IN ({})""".format(",".join("?" * len(_ACTIVE_STAGES))),
                _ACTIVE_STAGES,
            ).fetchall()
            for (job_id,) in rows:
                submit_task(str(DEFAULT_DB), "company_research", job_id)
    except Exception:
        pass  # never block startup
    conn.close()
 _startup()
 # ── Navigation ─────────────────────────────────────────────────────────────────
 # st.navigation() must be called before any sidebar writes so it can establish
 # the navigation structure first; sidebar additions come after.
 pages = {
    "": [
        st.Page("Home.py",                   title="Home",            icon="🏠"),
        st.Page("pages/1_Job_Review.py",     title="Job Review",      icon="📋"),
        st.Page("pages/4_Apply.py",          title="Apply Workspace", icon="🚀"),
        st.Page("pages/5_Interviews.py",     title="Interviews",      icon="🎯"),
        st.Page("pages/6_Interview_Prep.py", title="Interview Prep",  icon="📞"),
        st.Page("pages/7_Survey.py",         title="Survey Assistant", icon="📋"),
    ],
    "System": [
        st.Page("pages/2_Settings.py",       title="Settings",        icon="⚙️"),
    ],
 }
 pg = st.navigation(pages)
 # ── Background task sidebar indicator ─────────────────────────────────────────
 # Fragment polls every 3s so stage labels update live without a full page reload.
 # The sidebar context WRAPS the fragment call — do not write to st.sidebar inside it.
@st.fragment(run_every=3)
 def _task_indicator():
    tasks = get_active_tasks(DEFAULT_DB)
    if not tasks:
        return
    st.divider()
    st.markdown(f"**⏳ {len(tasks)} task(s) running**")
    for t in tasks:
        icon = "⏳" if t["status"] == "running" else "🕐"
        task_type = t["task_type"]
        if task_type == "cover_letter":
            label = "Cover letter"
        elif task_type == "company_research":
            label = "Research"
        elif task_type == "email_sync":
            label = "Email sync"
        elif task_type == "discovery":
            label = "Discovery"
        elif task_type == "enrich_descriptions":
            label = "Enriching"
        elif task_type == "scrape_url":
            label = "Scraping URL"
        elif task_type == "enrich_craigslist":
            label = "Enriching listing"
        else:
            label = task_type.replace("_", " ").title()
        stage = t.get("stage") or ""
        detail = f" · {stage}" if stage else (f" — {t.get('company')}" if t.get("company") else "")
        st.caption(f"{icon} {label}{detail}")
 with st.sidebar:
    _task_indicator()
 pg.run()
--- a/app/pages/1_Job_Review.py
+++ b/app/pages/1_Job_Review.py
@ -0,0 +1,203 @@
 # app/pages/1_Job_Review.py
 """
 Job Review — browse listings, approve/reject inline, generate cover letters,
 and mark approved jobs as applied.
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import streamlit as st
 from scripts.db import (
    DEFAULT_DB, init_db, get_jobs_by_status, update_job_status,
    update_cover_letter, mark_applied, get_email_leads,
 )
 st.title("📋 Job Review")
 init_db(DEFAULT_DB)
 _email_leads = get_email_leads(DEFAULT_DB)
 # ── Sidebar filters ────────────────────────────────────────────────────────────
 with st.sidebar:
    st.header("Filters")
    show_status = st.selectbox(
        "Show",
        ["pending", "approved", "applied", "rejected", "synced"],
        index=0,
    )
    remote_only = st.checkbox("Remote only", value=False)
    min_score = st.slider("Min match score", 0, 100, 0)
    st.header("Sort")
    sort_by = st.selectbox(
        "Sort by",
        ["Date Found (newest)", "Date Found (oldest)", "Match Score (high→low)", "Match Score (low→high)", "Company A–Z", "Title A–Z"],
        index=0,
    )
 jobs = get_jobs_by_status(DEFAULT_DB, show_status)
 if remote_only:
    jobs = [j for j in jobs if j.get("is_remote")]
 if min_score > 0:
    jobs = [j for j in jobs if (j.get("match_score") or 0) >= min_score]
 # Apply sort: each sidebar option maps to a (key function, descending?) pair.
 # Missing values sort as "" for text/date fields and as 0 for match scores.
 _sort_specs = {
    "Date Found (newest)": (lambda j: j.get("date_found") or "", True),
    "Date Found (oldest)": (lambda j: j.get("date_found") or "", False),
    "Match Score (high→low)": (lambda j: j.get("match_score") or 0, True),
    "Match Score (low→high)": (lambda j: j.get("match_score") or 0, False),
    "Company A–Z": (lambda j: (j.get("company") or "").lower(), False),
    "Title A–Z": (lambda j: (j.get("title") or "").lower(), False),
 }
 if sort_by in _sort_specs:
    _sort_key, _descending = _sort_specs[sort_by]
    jobs = sorted(jobs, key=_sort_key, reverse=_descending)
 if not jobs:
    st.info(f"No {show_status} jobs matching your filters.")
    st.stop()
 st.caption(f"Showing {len(jobs)} {show_status} job{'s' if len(jobs) != 1 else ''}")
 st.divider()
 if show_status == "pending" and _email_leads:
    st.subheader(f"📧 Email Leads ({len(_email_leads)})")
    st.caption(
        "Inbound recruiter emails not yet matched to a scraped listing. "
        "Approve to add to Job Review; Reject to dismiss."
    )
    for lead in _email_leads:
        lead_id = lead["id"]
        with st.container(border=True):
            left_l, right_l = st.columns([7, 3])
            with left_l:
                st.markdown(f"**{lead['title']}** — {lead['company']}")
                badge_cols = st.columns(4)
                badge_cols[0].caption("📧 Email Lead")
                badge_cols[1].caption(f"📅 {lead.get('date_found', '')}")
                if lead.get("description"):
                    with st.expander("📄 Email excerpt", expanded=False):
                        st.text(lead["description"][:500])
            with right_l:
                if st.button("✅ Approve", key=f"el_approve_{lead_id}",
                             type="primary", use_container_width=True):
                    update_job_status(DEFAULT_DB, [lead_id], "approved")
                    st.rerun()
                if st.button("❌ Reject", key=f"el_reject_{lead_id}",
                             use_container_width=True):
                    update_job_status(DEFAULT_DB, [lead_id], "rejected")
                    st.rerun()
    st.divider()
 # Filter email leads out of the main pending list (already shown above)
 if show_status == "pending":
    jobs = [j for j in jobs if j.get("source") != "email"]
 # ── Job cards ──────────────────────────────────────────────────────────────────
 for job in jobs:
    job_id = job["id"]
    score = job.get("match_score")
    if score is None:
        score_badge = "⬜ No score"
    elif score >= 70:
        score_badge = f"🟢 {score:.0f}%"
    elif score >= 40:
        score_badge = f"🟡 {score:.0f}%"
    else:
        score_badge = f"🔴 {score:.0f}%"
    remote_badge = "🌐 Remote" if job.get("is_remote") else "🏢 On-site"
    src = (job.get("source") or "").lower()
    source_badge = f"🤖 {src.title()}" if src == "linkedin" else f"👤 {src.title() or 'Manual'}"
    with st.container(border=True):
        left, right = st.columns([7, 3])
        # ── Left: job info ─────────────────────────────────────────────────────
        with left:
            st.markdown(f"**{job['title']}** — {job['company']}")
            badge_cols = st.columns(4)
            badge_cols[0].caption(remote_badge)
            badge_cols[1].caption(source_badge)
            badge_cols[2].caption(score_badge)
            badge_cols[3].caption(f"📅 {job.get('date_found', '')}")
            if job.get("keyword_gaps"):
                st.caption(f"**Keyword gaps:** {job['keyword_gaps']}")
            # Cover letter expander (approved view)
            if show_status == "approved":
                _cl_key = f"cl_{job_id}"
                if _cl_key not in st.session_state:
                    st.session_state[_cl_key] = job.get("cover_letter") or ""
                cl_exists = bool(st.session_state[_cl_key])
                with st.expander("📝 Cover Letter", expanded=cl_exists):
                    gen_label = "Regenerate" if cl_exists else "Generate Cover Letter"
                    if st.button(gen_label, key=f"gen_{job_id}"):
                        with st.spinner("Generating via LLM…"):
                            try:
                                from scripts.generate_cover_letter import generate as _gen
                                st.session_state[_cl_key] = _gen(
                                    job.get("title", ""),
                                    job.get("company", ""),
                                    job.get("description", ""),
                                )
                                st.rerun()
                            except Exception as e:
                                st.error(f"Generation failed: {e}")
                    st.text_area(
                        "cover_letter_edit",
                        key=_cl_key,
                        height=300,
                        label_visibility="collapsed",
                    )
                    save_col, _ = st.columns([2, 5])
                    if save_col.button("💾 Save draft", key=f"save_cl_{job_id}"):
                        update_cover_letter(DEFAULT_DB, job_id, st.session_state[_cl_key])
                        st.success("Saved!")
            # Applied date + cover letter preview (applied/synced)
            if show_status in ("applied", "synced") and job.get("applied_at"):
                st.caption(f"✅ Applied: {job['applied_at']}")
            if show_status in ("applied", "synced") and job.get("cover_letter"):
                with st.expander("📝 Cover Letter (sent)"):
                    st.text(job["cover_letter"])
        # ── Right: actions ─────────────────────────────────────────────────────
        with right:
            if job.get("url"):
                st.link_button("View listing →", job["url"], use_container_width=True)
            if job.get("salary"):
                st.caption(f"💰 {job['salary']}")
            if show_status == "pending":
                if st.button("✅ Approve", key=f"approve_{job_id}",
                             type="primary", use_container_width=True):
                    update_job_status(DEFAULT_DB, [job_id], "approved")
                    st.rerun()
                if st.button("❌ Reject", key=f"reject_{job_id}",
                             use_container_width=True):
                    update_job_status(DEFAULT_DB, [job_id], "rejected")
                    st.rerun()
            elif show_status == "approved":
                if st.button("🚀 Apply →", key=f"apply_page_{job_id}",
                             type="primary", use_container_width=True):
                    st.session_state["apply_job_id"] = job_id
                    st.switch_page("pages/4_Apply.py")
                if st.button("✅ Mark Applied", key=f"applied_{job_id}",
                             use_container_width=True):
                    cl_text = st.session_state.get(f"cl_{job_id}", "")
                    if cl_text:
                        update_cover_letter(DEFAULT_DB, job_id, cl_text)
                    mark_applied(DEFAULT_DB, [job_id])
                    st.rerun()
--- a/app/pages/2_Settings.py
+++ b/app/pages/2_Settings.py
@ -0,0 +1,842 @@
 # app/pages/2_Settings.py
 """
 Settings — edit search profiles, LLM backends, Notion connection, services,
 and resume profile (paste-able bullets used in Apply Workspace).
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import streamlit as st
 import yaml
 # Page heading, rendered above the tab bar.
 st.title("⚙️ Settings")
 # This file is app/pages/2_Settings.py, so three .parent hops reach the repo root.
 CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
 # YAML files edited by the tabs below.
 SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
 BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
 LLM_CFG = CONFIG_DIR / "llm.yaml"
 NOTION_CFG = CONFIG_DIR / "notion.yaml"
 # Resume lives inside the (git-ignored) aihawk checkout rather than in config/.
 RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
 # Not referenced in the tabs visible here — presumably read by the Skills tab further down.
 KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
 def load_yaml(path: Path) -> dict:
    """Return the parsed YAML content of *path*, or ``{}`` when there is nothing to load.

    A missing file and a file that parses to a falsy value (empty document,
    ``null``) both yield an empty dict.
    """
    if not path.exists():
        return {}
    parsed = yaml.safe_load(path.read_text())
    return parsed if parsed else {}
 def save_yaml(path: Path, data: dict) -> None:
    """Serialize *data* as block-style YAML (non-ASCII kept as-is) and overwrite *path*."""
    rendered = yaml.dump(data, default_flow_style=False, allow_unicode=True)
    path.write_text(rendered)
 def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
    """Call LLM to suggest additional job titles and exclude keywords."""
    import json
    import re
    from scripts.llm_router import LLMRouter
    resume_context = ""
    if resume_path.exists():
        resume = load_yaml(resume_path)
        lines = []
        for exp in (resume.get("experience_details") or [])[:3]:
            pos = exp.get("position", "")
            co = exp.get("company", "")
            skills = ", ".join((exp.get("skills_acquired") or [])[:5])
            lines.append(f"- {pos} at {co}: {skills}")
        resume_context = "\n".join(lines)
    titles_str = "\n".join(f"- {t}" for t in current_titles)
    prompt = f"""You are helping a job seeker optimize their search criteria.
 Their background (from resume):
 {resume_context or "Customer success and technical account management leader"}
 Current job titles being searched:
 {titles_str}
 Suggest:
 1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
 2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)
 Return ONLY valid JSON in this exact format:
 {{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""
    result = LLMRouter().complete(prompt).strip()
    m = re.search(r"\{.*\}", result, re.DOTALL)
    if m:
        try:
            return json.loads(m.group())
        except Exception:
            pass
    return {"suggested_titles": [], "suggested_excludes": []}
 # One tab per settings area; the tab objects are used as context managers below.
 tab_search, tab_llm, tab_notion, tab_services, tab_resume, tab_email, tab_skills = st.tabs(
    ["🔎 Search", "🤖 LLM Backends", "📚 Notion", "🔌 Services", "📝 Resume Profile", "📧 Email", "🏷️ Skills"]
 )
 # ── Search tab ───────────────────────────────────────────────────────────────
 with tab_search:
    cfg = load_yaml(SEARCH_CFG)
    profiles = cfg.get("profiles", [{}])
    # Only the first profile is edited on this page.
    p = profiles[0] if profiles else {}
    # Seed session state from config on first load (or when config changes after save).
    # The titles/excludes text areas are keyed widgets without a `value=`, so they read
    # their content from these session-state entries; seeding must happen before they render.
    _sp_hash = str(p.get("titles", [])) + str(p.get("exclude_keywords", []))
    if st.session_state.get("_sp_hash") != _sp_hash:
        st.session_state["_sp_titles"] = "\n".join(p.get("titles", []))
        st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
        st.session_state["_sp_hash"] = _sp_hash
    # ── Titles ────────────────────────────────────────────────────────────────
    # Heading on the left, "Suggest" button on the right of the same row.
    title_row, suggest_btn_col = st.columns([4, 1])
    with title_row:
        st.subheader("Job Titles to Search")
    with suggest_btn_col:
        st.write("")  # vertical align
        # True only on the rerun triggered by clicking the button.
        _run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
                                  help="Ask the LLM to suggest additional titles and exclude keywords based on your resume")
    titles_text = st.text_area(
        "One title per line",
        key="_sp_titles",
        height=150,
        help="JobSpy will search for any of these titles across all configured boards.",
        label_visibility="visible",
    )
    # ── LLM suggestions panel ────────────────────────────────────────────────
    def _sp_append_line(state_key: str, line: str) -> None:
        """Append *line* to the newline-separated text stored under *state_key*.

        Used as a button ``on_click`` callback. Streamlit runs callbacks before
        the script reruns, which is the only point where the value of an
        already-rendered keyed widget (the titles text area is created above
        this panel) may be changed; assigning to its session-state key later
        in the same run raises ``StreamlitAPIException``.
        """
        current_text = (st.session_state.get(state_key) or "").rstrip("\n")
        st.session_state[state_key] = f"{current_text}\n{line}" if current_text else line
    if _run_suggest:
        current = [t.strip() for t in titles_text.splitlines() if t.strip()]
        try:
            with st.spinner("Asking LLM for suggestions…"):
                suggestions = _suggest_search_terms(current, RESUME_PATH)
        except Exception as e:  # UI boundary: report backend failures instead of crashing the page
            st.error(f"Error: {e}")
        else:
            st.session_state["_sp_suggestions"] = suggestions
    if st.session_state.get("_sp_suggestions"):
        sugg = st.session_state["_sp_suggestions"]
        s_titles = sugg.get("suggested_titles", [])
        s_excl = sugg.get("suggested_excludes", [])
        # Lower-cased, stripped sets so already-present entries show as ✓ instead of a button.
        existing_titles = {t.strip().lower() for t in titles_text.splitlines() if t.strip()}
        existing_excl = {
            e.strip().lower()
            for e in (st.session_state.get("_sp_excludes") or "").splitlines()
            if e.strip()
        }
        if s_titles:
            st.caption("**Suggested titles** — click to add:")
            cols = st.columns(min(len(s_titles), 4))
            for i, title in enumerate(s_titles):
                with cols[i % 4]:
                    if title.lower() not in existing_titles:
                        # Clicking a button already triggers a rerun, so no st.rerun() is needed.
                        st.button(f"+ {title}", key=f"sp_add_title_{i}",
                                  on_click=_sp_append_line, args=("_sp_titles", title))
                    else:
                        st.caption(f"✓ {title}")
        if s_excl:
            st.caption("**Suggested exclusions** — click to add:")
            cols2 = st.columns(min(len(s_excl), 4))
            for i, kw in enumerate(s_excl):
                with cols2[i % 4]:
                    if kw.lower() not in existing_excl:
                        st.button(f"+ {kw}", key=f"sp_add_excl_{i}",
                                  on_click=_sp_append_line, args=("_sp_excludes", kw))
                    else:
                        st.caption(f"✓ {kw}")
        if st.button("✕ Clear suggestions", key="sp_clear_sugg"):
            st.session_state.pop("_sp_suggestions", None)
            st.rerun()
    # Locations are seeded straight from the profile (unkeyed widget, so `value=` applies).
    st.subheader("Locations")
    locations_text = st.text_area(
        "One location per line",
        value="\n".join(p.get("locations", [])),
        height=100,
    )
    st.subheader("Exclude Keywords")
    st.caption("Jobs whose **title or description** contain any of these words are silently dropped before entering the queue. Case-insensitive.")
    # Keyed widget: its content comes from st.session_state["_sp_excludes"], which is seeded
    # at the top of the tab and extended by the suggestion buttons.
    exclude_text = st.text_area(
        "One keyword or phrase per line",
        key="_sp_excludes",
        height=150,
        help="e.g. 'sales', 'account executive', 'SDR'",
    )
    st.subheader("Job Boards")
    board_options = ["linkedin", "indeed", "glassdoor", "zip_recruiter", "google"]
    # With no "boards" key in the profile every board is pre-selected; saved names
    # that are not in board_options are filtered out of the default.
    selected_boards = st.multiselect(
        "Standard boards (via JobSpy)", board_options,
        default=[b for b in p.get("boards", board_options) if b in board_options],
        help="Google Jobs aggregates listings from many sources and often finds roles the other boards miss.",
    )
    _custom_board_options = ["adzuna", "theladders"]
    # Human-readable labels shown in the multiselect; the stored values stay the short ids.
    _custom_board_labels = {
        "adzuna":     "Adzuna (free API — requires app_id + app_key in config/adzuna.yaml)",
        "theladders": "The Ladders (curl_cffi scraper — $100K+ roles, requires curl_cffi)",
    }
    st.caption("**Custom boards** — scrapers built into this app, not part of JobSpy.")
    # Custom boards default to none selected when the profile has no "custom_boards" key.
    selected_custom = st.multiselect(
        "Custom boards",
        options=_custom_board_options,
        default=[b for b in p.get("custom_boards", []) if b in _custom_board_options],
        format_func=lambda b: _custom_board_labels.get(b, b),
    )
    col1, col2 = st.columns(2)
    # Slider arguments are (label, min, max, initial value).
    results_per = col1.slider("Results per board", 5, 100, p.get("results_per_board", 25))
    hours_old = col2.slider("How far back to look (hours)", 24, 720, p.get("hours_old", 72))
    if st.button("💾 Save search settings", type="primary"):
        # Start from the loaded profile so keys this form does not edit are preserved.
        _new_profile = {
            **p,
            "titles": [t.strip() for t in titles_text.splitlines() if t.strip()],
            "locations": [loc.strip() for loc in locations_text.splitlines() if loc.strip()],
            "boards": selected_boards,
            "custom_boards": selected_custom,
            "results_per_board": results_per,
            "hours_old": hours_old,
            "exclude_keywords": [k.strip() for k in exclude_text.splitlines() if k.strip()],
        }
        # The config may hold `profiles: []` or `profiles: null`; assigning to index 0
        # would then raise, so create the list in that case.
        if isinstance(profiles, list) and profiles:
            profiles[0] = _new_profile
        else:
            profiles = [_new_profile]
        # Keep any other top-level keys of the file, as the LLM settings save does.
        save_yaml(SEARCH_CFG, {**cfg, "profiles": profiles})
        st.session_state["_sp_hash"] = ""  # force re-seed on next load
        st.session_state.pop("_sp_suggestions", None)
        st.success("Search settings saved!")
    st.divider()
    # ── Blocklist ──────────────────────────────────────────────────────────────
    # Three newline-separated lists stored in config/blocklist.yaml under the keys
    # "companies", "industries" and "locations". This page only edits the file.
    with st.expander("🚫 Blocklist — companies, industries, and locations I will never work at", expanded=False):
        st.caption(
            "Listings matching any rule below are **silently dropped before entering the review queue**, "
            "across all search profiles and custom boards. Changes take effect on the next discovery run."
        )
        bl = load_yaml(BLOCKLIST_CFG)
        bl_companies = st.text_area(
            "Company names (partial match, one per line)",
            value="\n".join(bl.get("companies", [])),
            height=120,
            help="e.g. 'Amazon' blocks any listing where the company name contains 'amazon' (case-insensitive).",
            key="bl_companies",
        )
        bl_industries = st.text_area(
            "Industry / content keywords (one per line)",
            value="\n".join(bl.get("industries", [])),
            height=100,
            help="Blocked if the keyword appears in the company name OR job description. "
                 "e.g. 'gambling', 'crypto', 'tobacco', 'defense contractor'.",
            key="bl_industries",
        )
        bl_locations = st.text_area(
            "Location strings to exclude (one per line)",
            value="\n".join(bl.get("locations", [])),
            height=80,
            help="e.g. 'Dallas' blocks any listing whose location contains 'dallas'.",
            key="bl_locations",
        )
        if st.button("💾 Save blocklist", type="primary", key="save_blocklist"):
            # The file is rewritten with exactly these three keys; blank lines are dropped.
            save_yaml(BLOCKLIST_CFG, {
                "companies":  [c.strip() for c in bl_companies.splitlines() if c.strip()],
                "industries": [i.strip() for i in bl_industries.splitlines() if i.strip()],
                "locations":  [loc.strip() for loc in bl_locations.splitlines() if loc.strip()],
            })
            st.success("Blocklist saved — takes effect on next discovery run.")
 # ── LLM Backends tab ─────────────────────────────────────────────────────────
 with tab_llm:
    import requests as _req
    def _ollama_models(base_url: str) -> list[str]:
        """Fetch installed model names from the Ollama /api/tags endpoint."""
        try:
            r = _req.get(base_url.rstrip("/v1").rstrip("/") + "/api/tags", timeout=2)
            if r.ok:
                return [m["name"] for m in r.json().get("models", [])]
        except Exception:
            pass
        return []
    cfg = load_yaml(LLM_CFG)
    backends = cfg.get("backends", {})
    fallback_order = cfg.get("fallback_order", list(backends.keys()))
    # Persist reordering across reruns triggered by ↑↓ buttons.
    # Reset to the config order whenever it differs from the order the session was seeded from.
    _cfg_key = str(fallback_order)
    if st.session_state.get("_llm_order_cfg_key") != _cfg_key:
        st.session_state["_llm_order"] = list(fallback_order)
        st.session_state["_llm_order_cfg_key"] = _cfg_key
    new_order: list[str] = st.session_state["_llm_order"]
    # All known backends (in current order first, then any extras)
    all_names = list(new_order) + [n for n in backends if n not in new_order]
    st.caption("Enable/disable backends and drag their priority with the ↑ ↓ buttons. "
               "First enabled + reachable backend wins on each call.")
    # Collects the edited definition of every backend; written to disk by the Save button.
    updated_backends = {}
    for name in all_names:
        b = backends.get(name, {})
        enabled = b.get("enabled", True)
        label = name.replace("_", " ").title()
        pos = new_order.index(name) + 1 if name in new_order else "—"
        header = f"{'🟢' if enabled else '⚫'} **{pos}. {label}**"
        with st.expander(header, expanded=False):
            col_tog, col_up, col_dn, col_spacer = st.columns([2, 1, 1, 4])
            new_enabled = col_tog.checkbox("Enabled", value=enabled, key=f"{name}_enabled")
            # Up / Down only apply to backends currently in the order
            if name in new_order:
                idx = new_order.index(name)
                if col_up.button("↑", key=f"{name}_up", disabled=idx == 0):
                    new_order[idx], new_order[idx - 1] = new_order[idx - 1], new_order[idx]
                    st.session_state["_llm_order"] = new_order
                    st.rerun()
                if col_dn.button("↓", key=f"{name}_dn", disabled=idx == len(new_order) - 1):
                    new_order[idx], new_order[idx + 1] = new_order[idx + 1], new_order[idx]
                    st.session_state["_llm_order"] = new_order
                    st.rerun()
            if b.get("type") == "openai_compat":
                url = st.text_input("URL", value=b.get("base_url", ""), key=f"{name}_url")
                # Ollama gets a live model picker; other backends get a text input
                if name == "ollama":
                    ollama_models = _ollama_models(b.get("base_url", "http://localhost:11434"))
                    current_model = b.get("model", "")
                    if ollama_models:
                        options = ollama_models
                        idx_default = options.index(current_model) if current_model in options else 0
                        model = st.selectbox(
                            "Model",
                            options,
                            index=idx_default,
                            key=f"{name}_model",
                            help="Lists models currently installed in Ollama. Pull new ones with `ollama pull <name>`.",
                        )
                    else:
                        st.caption("_Ollama not reachable — enter model name manually_")
                        model = st.text_input("Model", value=current_model, key=f"{name}_model")
                else:
                    model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model")
                updated_backends[name] = {**b, "base_url": url, "model": model, "enabled": new_enabled}
            elif b.get("type") == "anthropic":
                model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model")
                updated_backends[name] = {**b, "model": model, "enabled": new_enabled}
            else:
                # Unknown backend type: only the enabled flag is editable.
                updated_backends[name] = {**b, "enabled": new_enabled}
            if b.get("type") == "openai_compat":
                if st.button("Test connection", key=f"test_{name}"):
                    with st.spinner("Testing…"):
                        try:
                            from scripts.llm_router import LLMRouter
                            r = LLMRouter()
                            # Probe the URL currently in the text box, so an edited
                            # address can be checked before it is saved.
                            reachable = r._is_reachable(url)
                            if reachable:
                                st.success("Reachable ✓")
                            else:
                                st.warning("Not reachable ✗")
                        except Exception as e:
                            st.error(f"Error: {e}")
    st.divider()
    # Summary line reflects the enabled flags as stored on disk, in the session's order.
    st.caption("Current priority: " + " → ".join(
        f"{'✓' if backends.get(n, {}).get('enabled', True) else '✗'} {n}"
        for n in new_order
    ))
    if st.button("💾 Save LLM settings", type="primary"):
        save_yaml(LLM_CFG, {**cfg, "backends": updated_backends, "fallback_order": new_order})
        # Drop the session copy so the next run re-seeds the order from the saved file.
        st.session_state.pop("_llm_order", None)
        st.session_state.pop("_llm_order_cfg_key", None)
        st.success("LLM settings saved!")
 # ── Notion tab ────────────────────────────────────────────────────────────────
 with tab_notion:
    # load_yaml already returns {} for a missing file.
    cfg = load_yaml(NOTION_CFG)
    st.subheader("Notion Connection")
    token = st.text_input(
        "Integration Token",
        value=cfg.get("token", ""),
        type="password",
        help="Find this at notion.so/my-integrations → your integration → Internal Integration Token",
    )
    db_id = st.text_input(
        "Database ID",
        value=cfg.get("database_id", ""),
        help="The 32-character ID from your Notion database URL",
    )
    col_save, col_test = st.columns(2)
    if col_save.button("💾 Save Notion settings", type="primary"):
        save_yaml(NOTION_CFG, {**cfg, "token": token, "database_id": db_id})
        # The file holds an API secret: make it owner-only, as is done for config/tokens.yaml.
        NOTION_CFG.chmod(0o600)
        st.success("Notion settings saved!")
    if col_test.button("🔌 Test connection"):
        with st.spinner("Connecting…"):
            try:
                from notion_client import Client
                n = Client(auth=token)
                db = n.databases.retrieve(db_id)
                # The title is a list of rich-text segments and is empty for an
                # untitled database; join all segments rather than indexing [0],
                # which would report a working connection as a failure.
                _db_title = "".join(
                    seg.get("plain_text", "") for seg in (db.get("title") or [])
                ) or "(untitled)"
                st.success(f"Connected to: **{_db_title}**")
            except Exception as e:
                st.error(f"Connection failed: {e}")
 # ── Services tab ───────────────────────────────────────────────────────────────
 with tab_services:
    import socket
    import subprocess as _sp
    TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
    PFP_DIR = Path("/Library/Documents/Post Fight Processing")
    _repo_root = Path(__file__).parent.parent.parent
    def _bash_svc(name, port, script, cwd, note, **extra):
        """Build the entry for a service driven by a ``bash <script> start|stop`` wrapper."""
        return {
            "name": name,
            "port": port,
            "start": ["bash", str(script), "start"],
            "stop": ["bash", str(script), "stop"],
            "cwd": str(cwd),
            "note": note,
            **extra,
        }
    # Service definitions: dicts with name, port, start/stop argv lists, cwd, note
    # and, for services that load a model from disk, model_dir.
    SERVICES = [
        _bash_svc("Streamlit UI", 8501, _repo_root / "scripts/manage-ui.sh", _repo_root,
                  "Job Seeker web interface"),
        {
            "name": "Ollama (local LLM)",
            "port": 11434,
            "start": ["sudo", "systemctl", "start", "ollama"],
            "stop": ["sudo", "systemctl", "stop", "ollama"],
            "cwd": "/",
            "note": "Local inference engine — systemd service",
        },
        _bash_svc("Claude Code Wrapper", 3009, PFP_DIR / "manage-services.sh", PFP_DIR,
                  "OpenAI-compat proxy → Claude Code (port 3009)"),
        _bash_svc("GitHub Copilot Wrapper", 3010, PFP_DIR / "manage-copilot.sh", PFP_DIR,
                  "OpenAI-compat proxy → GitHub Copilot (port 3010)"),
        _bash_svc("vLLM Server", 8000, _repo_root / "scripts/manage-vllm.sh", _repo_root,
                  "Local vLLM inference — Ouro model family (port 8000, GPU 1)",
                  model_dir="/Library/Assets/LLM/vllm/models"),
        _bash_svc("Vision Service (moondream2)", 8002, _repo_root / "scripts/manage-vision.sh", _repo_root,
                  "Survey screenshot analysis — moondream2 (port 8002, optional)"),
        {
            "name": "SearXNG (company scraper)",
            "port": 8888,
            "start": ["docker", "compose", "up", "-d"],
            "stop": ["docker", "compose", "down"],
            "cwd": str(Path("/Library/Development/scrapers/SearXNG")),
            "note": "Privacy-respecting meta-search used for company research (port 8888)",
        },
    ]
    def _port_open(port: int) -> bool:
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return True
        except OSError:
            return False
    def _run_service_cmd(cmd: list[str], cwd: str) -> tuple[bool, str]:
        """Run a service start/stop command and return ``(succeeded, error_text)``.

        ``OSError`` (missing executable, missing working directory, no
        permission) is reported as a failure instead of crashing the page.
        """
        try:
            proc = _sp.run(cmd, capture_output=True, text=True, cwd=cwd)
        except OSError as exc:
            return False, str(exc)
        return proc.returncode == 0, proc.stderr or proc.stdout
    st.caption("Monitor and control the LLM backend services. Status is checked live on each page load.")
    for svc in SERVICES:
        # A service counts as running when its port accepts a local TCP connection.
        up = _port_open(svc["port"])
        badge = "🟢 Running" if up else "🔴 Stopped"
        header = f"**{svc['name']}** — {badge}"
        with st.container(border=True):
            left_col, right_col = st.columns([3, 1])
            with left_col:
                st.markdown(header)
                st.caption(f"Port {svc['port']} · {svc['note']}")
                # Model selector for services backed by a local model directory (e.g. vLLM)
                if "model_dir" in svc:
                    _mdir = Path(svc["model_dir"])
                    _models = (
                        sorted(d.name for d in _mdir.iterdir() if d.is_dir())
                        if _mdir.exists() else []
                    )
                    _mk = f"svc_model_{svc['port']}"
                    # Marker file read to preselect the model that was loaded last.
                    _loaded_file = Path("/tmp/vllm-server.model")
                    _loaded = _loaded_file.read_text().strip() if (_loaded_file.exists()) else ""
                    if _models:
                        _default = _models.index(_loaded) if _loaded in _models else 0
                        st.selectbox(
                            "Model",
                            _models,
                            index=_default,
                            key=_mk,
                            disabled=up,
                            help="Model to load on start. Stop then Start to swap models.",
                        )
                    else:
                        st.caption(f"_No models found in {svc['model_dir']}_")
            with right_col:
                if svc["start"] is None:
                    st.caption("_Manual start only_")
                elif up:
                    if st.button("⏹ Stop", key=f"svc_stop_{svc['port']}", use_container_width=True):
                        with st.spinner(f"Stopping {svc['name']}…"):
                            _ok, _err = _run_service_cmd(svc["stop"], svc["cwd"])
                        if _ok:
                            # Rerun so the status badge is re-probed.
                            st.rerun()
                        # No rerun on failure: it would wipe the message before it can be read.
                        st.error(f"Error: {_err}")
                else:
                    # Build start command, appending selected model for services with model_dir
                    _start_cmd = list(svc["start"])
                    if "model_dir" in svc:
                        _sel = st.session_state.get(f"svc_model_{svc['port']}")
                        if _sel:
                            _start_cmd.append(_sel)
                    if st.button("▶ Start", key=f"svc_start_{svc['port']}", use_container_width=True, type="primary"):
                        with st.spinner(f"Starting {svc['name']}…"):
                            _ok, _err = _run_service_cmd(_start_cmd, svc["cwd"])
                        if _ok:
                            st.rerun()
                        st.error(f"Error: {_err}")
    st.divider()
    # Hugging Face token: stored under "hf_token" in config/tokens.yaml.
    st.subheader("🤗 Hugging Face")
    st.caption(
        "Used for uploading training data and running fine-tune jobs on HF infrastructure. "
        "Token is stored in `config/tokens.yaml` (git-ignored). "
        "Create a **write-permission** token at huggingface.co/settings/tokens."
    )
    tok_cfg = load_yaml(TOKENS_CFG) if TOKENS_CFG.exists() else {}
    hf_token = st.text_input(
        "HF Token",
        value=tok_cfg.get("hf_token", ""),
        type="password",
        placeholder="hf_…",
    )
    col_save_hf, col_test_hf = st.columns(2)
    if col_save_hf.button("💾 Save HF token", type="primary"):
        # Merge so other tokens already in the file are kept.
        save_yaml(TOKENS_CFG, {**tok_cfg, "hf_token": hf_token})
        # Owner read/write only — the file contains secrets.
        TOKENS_CFG.chmod(0o600)
        st.success("Saved!")
    if col_test_hf.button("🔌 Test HF token"):
        with st.spinner("Checking…"):
            try:
                import requests as _r
                # Tests the token currently in the text box, saved or not.
                # NOTE(review): confirm this endpoint still returns `auth.accessToken.role`;
                # a missing field is reported below as "read".
                resp = _r.get(
                    "https://huggingface.co/api/whoami",
                    headers={"Authorization": f"Bearer {hf_token}"},
                    timeout=5,
                )
                if resp.ok:
                    info = resp.json()
                    name = info.get("name") or info.get("fullname") or "unknown"
                    auth = info.get("auth", {})
                    perm = auth.get("accessToken", {}).get("role", "read")
                    st.success(f"Logged in as **{name}** · permission: `{perm}`")
                    if perm == "read":
                        st.warning("Token is read-only — create a **write** token to upload datasets and run Jobs.")
                else:
                    st.error(f"Invalid token ({resp.status_code})")
            except Exception as e:
                st.error(f"Error: {e}")
 # ── Resume Profile tab ────────────────────────────────────────────────────────
 with tab_resume:
    st.caption(
        "Edit Alex's application profile. "
        "Bullets are used as paste-able shortcuts in the Apply Workspace."
    )
    if not RESUME_PATH.exists():
        st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?")
        # NOTE(review): st.stop() ends the whole script run, not just this tab, so the
        # tabs populated after this one (Email, Skills) stay empty while the resume
        # file is missing — confirm that is intended.
        st.stop()
    # Whole resume document; sections are edited below and written back on save.
    _data = yaml.safe_load(RESUME_PATH.read_text()) or {}
    def _field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str:
        """Render a text input, preceded by a warning when the value still needs filling in.

        Args:
            label: Widget label.
            value: Current value from the resume YAML. Empty, ``None`` (a key
                present with a null value) and ``FILL_IN…`` template
                placeholders are all flagged as needing attention.
            key: Streamlit widget key.
            help: Optional tooltip text.
            password: Mask the input when true.

        Returns:
            The text currently in the input.
        """
        needs_attention = not value or str(value).startswith("FILL_IN")
        if needs_attention:
            st.markdown(
                '<p style="color:#F59E0B;font-size:0.8em;margin-bottom:2px">⚠️ Needs attention</p>',
                unsafe_allow_html=True,
            )
        return st.text_input(label, value=value or "", key=key, help=help,
                             type="password" if password else "default")
    # ── Personal Info ─────────────────────────────────────────────────────────
    # Two-column form over the "personal_information" mapping; each input goes through
    # _field so empty / FILL_IN values are highlighted. Values are written back on save.
    with st.expander("👤 Personal Information", expanded=True):
        _info = _data.get("personal_information", {})
        _c1, _c2 = st.columns(2)
        with _c1:
            # The YAML key for the first name is "name"; the last name is "surname".
            _name     = _field("First Name", _info.get("name", ""),    "rp_name")
            _email    = _field("Email",      _info.get("email", ""),   "rp_email")
            _phone    = _field("Phone",      _info.get("phone", ""),   "rp_phone")
            _city     = _field("City",       _info.get("city", ""),    "rp_city")
        with _c2:
            _surname  = _field("Last Name",  _info.get("surname", ""), "rp_surname")
            _linkedin = _field("LinkedIn URL", _info.get("linkedin", ""), "rp_linkedin")
            _zip_code = _field("Zip Code",   _info.get("zip_code", ""), "rp_zip")
            _dob      = _field("Date of Birth", _info.get("date_of_birth", ""), "rp_dob",
                               help="MM/DD/YYYY")
    # ── Experience ────────────────────────────────────────────────────────────
    # One sub-form per entry of "experience_details"; the edited entries are
    # collected in _updated_exp for the Save button.
    with st.expander("💼 Work Experience"):
        # `or [{}]` also covers an explicit `experience_details: null`.
        _exp_list = _data.get("experience_details") or [{}]
        # The entry count lives in session state so "Add" survives reruns.
        if "rp_exp_count" not in st.session_state:
            st.session_state.rp_exp_count = len(_exp_list)
        if st.button("+ Add Experience Entry", key="rp_add_exp"):
            st.session_state.rp_exp_count += 1
            _exp_list.append({})
        _updated_exp = []
        for _i in range(st.session_state.rp_exp_count):
            _exp = (_exp_list[_i] if _i < len(_exp_list) else {}) or {}
            st.markdown(f"**Position {_i + 1}**")
            _ec1, _ec2 = st.columns(2)
            with _ec1:
                _pos    = _field("Job Title",    _exp.get("position", ""),          f"rp_pos_{_i}")
                _co     = _field("Company",      _exp.get("company", ""),           f"rp_co_{_i}")
                _period = _field("Period",        _exp.get("employment_period", ""), f"rp_period_{_i}",
                                 help="e.g. 01/2022 - Present")
            with _ec2:
                _loc = st.text_input("Location", _exp.get("location", ""), key=f"rp_loc_{_i}")
                _ind = st.text_input("Industry", _exp.get("industry", ""), key=f"rp_ind_{_i}")
            # Responsibilities are stored as a list of one-key dicts
            # ({"responsibility_1": …}, {"responsibility_2": …}, …) or plain strings.
            _resp_lines = []
            for _j, _item in enumerate(_exp.get("key_responsibilities") or [], start=1):
                if isinstance(_item, dict):
                    _text = _item.get(f"responsibility_{_j}")
                    if _text is None:
                        # Entries saved under a different key (e.g. every item as
                        # "responsibility_1") still carry exactly one value — use it.
                        _text = next(iter(_item.values()), "")
                    _resp_lines.append(str(_text or ""))
                else:
                    _resp_lines.append(str(_item))
            _resp_raw = st.text_area(
                "Key Responsibilities (one per line)",
                value="\n".join(_resp_lines),
                key=f"rp_resp_{_i}", height=100,
            )
            _skills_raw = st.text_input(
                "Skills (comma-separated)",
                value=", ".join(str(s) for s in (_exp.get("skills_acquired") or [])),
                key=f"rp_skills_{_i}",
            )
            _resp_clean = [ln.strip() for ln in _resp_raw.splitlines() if ln.strip()]
            _updated_exp.append({
                # Keep keys of the stored entry that this form does not edit.
                **_exp,
                "position": _pos, "company": _co, "employment_period": _period,
                "location": _loc, "industry": _ind,
                # Number the keys sequentially so they match what the reader above looks up;
                # writing every item as "responsibility_1" lost all but the first on reload.
                "key_responsibilities": [
                    {f"responsibility_{_n}": _line} for _n, _line in enumerate(_resp_clean, start=1)
                ],
                "skills_acquired": [s.strip() for s in _skills_raw.split(",") if s.strip()],
            })
            st.divider()
    # ── Preferences ───────────────────────────────────────────────────────────
    # Reads three sections of the resume: work_preferences, salary_expectations, availability.
    with st.expander("⚙️ Preferences & Availability"):
        _wp   = _data.get("work_preferences", {})
        _sal  = _data.get("salary_expectations", {})
        _avail = _data.get("availability", {})
        _pc1, _pc2 = st.columns(2)
        with _pc1:
            _salary_range = st.text_input("Salary Range (USD)", _sal.get("salary_range_usd", ""),
                                          key="rp_salary", help="e.g. 120000 - 180000")
            _notice = st.text_input("Notice Period", _avail.get("notice_period", "2 weeks"), key="rp_notice")
        with _pc2:
            # The YAML stores these flags as the strings "Yes"/"No"; a checkbox is ticked only
            # for an exact "Yes". Missing keys default to "Yes", except relocation ("No").
            _remote      = st.checkbox("Open to Remote",     value=_wp.get("remote_work", "Yes") == "Yes",         key="rp_remote")
            _reloc       = st.checkbox("Open to Relocation", value=_wp.get("open_to_relocation", "No") == "Yes",   key="rp_reloc")
            _assessments = st.checkbox("Willing to complete assessments",
                                       value=_wp.get("willing_to_complete_assessments", "Yes") == "Yes",           key="rp_assess")
            _bg          = st.checkbox("Willing to undergo background checks",
                                       value=_wp.get("willing_to_undergo_background_checks", "Yes") == "Yes",      key="rp_bg")
    # ── Self-ID ───────────────────────────────────────────────────────────────
    # Optional self-identification answers from the "self_identification" section.
    with st.expander("🏳️‍🌈 Self-Identification (optional)"):
        _sid = _data.get("self_identification") or {}
        _sc1, _sc2 = st.columns(2)
        with _sc1:
            _gender    = st.text_input("Gender identity", _sid.get("gender", "Non-binary"),   key="rp_gender")
            _pronouns  = st.text_input("Pronouns",        _sid.get("pronouns", "Any"),         key="rp_pronouns")
            _ethnicity = _field("Ethnicity", _sid.get("ethnicity", ""), "rp_ethnicity")
        with _sc2:
            # A stored value that is not one of the options (template placeholder,
            # free text) would make list.index raise ValueError and break the tab;
            # fall back to the first option instead.
            _vet_opts = ["No", "Yes", "Prefer not to say"]
            _vet_saved = _sid.get("veteran", "No")
            _veteran  = st.selectbox("Veteran status", _vet_opts,
                                     index=_vet_opts.index(_vet_saved) if _vet_saved in _vet_opts else 0,
                                     key="rp_vet")
            _dis_opts = ["Prefer not to say", "No", "Yes"]
            _dis_saved = _sid.get("disability", "Prefer not to say")
            _disability = st.selectbox("Disability disclosure", _dis_opts,
                                       index=_dis_opts.index(_dis_saved) if _dis_saved in _dis_opts else 0,
                                       key="rp_dis")
    st.divider()
    if st.button("💾 Save Resume Profile", type="primary", use_container_width=True, key="rp_save"):
        # Every section is merged over what is already in the file so that keys this
        # form does not expose survive a save. `or {}` tolerates a section that is
        # present in the YAML but empty (None), which `**` cannot unpack.
        _data["personal_information"] = {
            **(_data.get("personal_information") or {}),
            "name": _name, "surname": _surname, "email": _email, "phone": _phone,
            "city": _city, "zip_code": _zip_code, "linkedin": _linkedin, "date_of_birth": _dob,
        }
        _data["experience_details"] = _updated_exp
        _data["salary_expectations"] = {
            **(_data.get("salary_expectations") or {}),
            "salary_range_usd": _salary_range,
        }
        _data["availability"] = {
            **(_data.get("availability") or {}),
            "notice_period": _notice,
        }
        # Checkbox booleans go back to the "Yes"/"No" strings the profile uses.
        _data["work_preferences"] = {
            **(_data.get("work_preferences") or {}),
            "remote_work": "Yes" if _remote else "No",
            "open_to_relocation": "Yes" if _reloc else "No",
            "willing_to_complete_assessments": "Yes" if _assessments else "No",
            "willing_to_undergo_background_checks": "Yes" if _bg else "No",
        }
        _data["self_identification"] = {
            **(_data.get("self_identification") or {}),
            "gender": _gender, "pronouns": _pronouns, "veteran": _veteran,
            "disability": _disability, "ethnicity": _ethnicity,
        }
        RESUME_PATH.write_text(yaml.dump(_data, default_flow_style=False, allow_unicode=True))
        st.success("✅ Resume profile saved!")
        st.balloons()
 # ── Email tab ─────────────────────────────────────────────────────────────────
 with tab_email:
    EMAIL_CFG = CONFIG_DIR / "email.yaml"
    EMAIL_EXAMPLE = CONFIG_DIR / "email.yaml.example"
    st.caption(
        "Connect Alex's email via IMAP to automatically associate recruitment "
        "emails with job applications. Only emails that mention the company name "
        "AND contain a recruitment keyword are ever imported — no personal emails "
        "are touched."
    )
    if not EMAIL_CFG.exists():
        st.info("No email config found — fill in your credentials below and click **Save** to create it.")
    # `or {}`: an existing but empty YAML file may load as None (depends on
    # load_yaml, which is defined outside this section), and every lookup below
    # needs a dict.
    em_cfg = (load_yaml(EMAIL_CFG) or {}) if EMAIL_CFG.exists() else {}
    col_a, col_b = st.columns(2)
    with col_a:
        em_host = st.text_input("IMAP Host", em_cfg.get("host", "imap.gmail.com"), key="em_host")
        em_port = st.number_input("Port", value=int(em_cfg.get("port", 993)),
                                  min_value=1, max_value=65535, key="em_port")
        em_ssl  = st.checkbox("Use SSL", value=em_cfg.get("use_ssl", True), key="em_ssl")
    with col_b:
        em_user = st.text_input("Username (email address)", em_cfg.get("username", ""), key="em_user")
        em_pass = st.text_input("Password / App Password", em_cfg.get("password", ""),
                                type="password", key="em_pass")
        em_sent = st.text_input("Sent folder (blank = auto-detect)",
                                em_cfg.get("sent_folder", ""), key="em_sent",
                                placeholder='e.g. "[Gmail]/Sent Mail"')
    em_days = st.slider("Look-back window (days)", 14, 365,
                        int(em_cfg.get("lookback_days", 90)), key="em_days")
    st.caption(
        "**Gmail users:** create an App Password at "
        "myaccount.google.com/apppasswords (requires 2-Step Verification). "
        "Enable IMAP at Gmail Settings → Forwarding and POP/IMAP."
    )
    col_save, col_test = st.columns(2)
    if col_save.button("💾 Save email settings", type="primary", key="em_save"):
        save_yaml(EMAIL_CFG, {
            "host": em_host, "port": int(em_port), "use_ssl": em_ssl,
            "username": em_user, "password": em_pass,
            "sent_folder": em_sent, "lookback_days": int(em_days),
        })
        # The file holds the mailbox password in plain text; restrict it to the owner.
        EMAIL_CFG.chmod(0o600)
        st.success("Saved!")
    if col_test.button("🔌 Test connection", key="em_test"):
        with st.spinner("Connecting…"):
            try:
                import imaplib as _imap
                _imap_cls = _imap.IMAP4_SSL if em_ssl else _imap.IMAP4
                # The context manager logs out and closes the socket even when
                # login() raises, so a failed test no longer leaves a connection
                # open. The timeout stops the spinner hanging forever on an
                # unreachable host.
                with _imap_cls(em_host, int(em_port), timeout=15) as _conn:
                    _conn.login(em_user, em_pass)
                    # Result unused; kept as a post-login round-trip to the server.
                    _conn.capability()
                st.success(f"Connected successfully to {em_host}")
            except Exception as e:
                # UI boundary: report any failure (DNS, TLS, auth, timeout) to the user.
                st.error(f"Connection failed: {e}")
 # ── Skills & Keywords tab ─────────────────────────────────────────────────────
 with tab_skills:
    st.subheader("🏷️ Skills & Keywords")
    st.caption(
        "These are matched against job descriptions to select Alex's most relevant "
        "experience and highlight keyword overlap in the research brief."
    )
    if not KEYWORDS_CFG.exists():
        st.warning("resume_keywords.yaml not found — create it at config/resume_keywords.yaml")
    else:
        # `or {}` / `or []` below: a YAML file or key with no content parses to None,
        # which would otherwise crash on .get() / len().
        kw_data = load_yaml(KEYWORDS_CFG) or {}
        changed = False
        for category in ["skills", "domains", "keywords"]:
            st.markdown(f"**{category.title()}**")
            tags: list[str] = kw_data.get(category) or []
            if not tags:
                st.caption("No tags yet — add one below.")
            # Render existing tags as removable chips (value-based keys for stability)
            n_cols = min(max(len(tags), 1), 6)
            cols = st.columns(n_cols)
            to_remove = None
            for i, tag in enumerate(tags):
                with cols[i % n_cols]:
                    if st.button(f"× {tag}", key=f"rm_{category}_{tag}", use_container_width=True):
                        to_remove = tag
            # Identity check rather than truthiness so an empty-string tag that
            # slipped into the file can still be removed.
            if to_remove is not None:
                tags.remove(to_remove)
                kw_data[category] = tags
                changed = True
            # Add new tag
            new_col, btn_col = st.columns([4, 1])
            new_tag = new_col.text_input(
                "Add",
                key=f"new_{category}",
                label_visibility="collapsed",
                placeholder=f"Add {category[:-1] if category.endswith('s') else category}…",
            )
            if btn_col.button("＋ Add", key=f"add_{category}"):
                tag = new_tag.strip()
                # Ignore blanks and exact duplicates.
                if tag and tag not in tags:
                    tags.append(tag)
                    kw_data[category] = tags
                    changed = True
            st.markdown("---")
        # Persist once after all categories are processed, then rerun so the chips
        # reflect the saved state.
        if changed:
            save_yaml(KEYWORDS_CFG, kw_data)
            st.success("Saved.")
            st.rerun()
--- a/app/pages/3_Resume_Editor.py
+++ b/app/pages/3_Resume_Editor.py
@ -0,0 +1,191 @@
 # app/pages/3_Resume_Editor.py
 """
 Resume Editor — form-based editor for Alex's AIHawk profile YAML.
 FILL_IN fields highlighted in amber.
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import streamlit as st
 import yaml
 # Page chrome.
 st.set_page_config(page_title="Resume Editor", page_icon="📝", layout="wide")
 st.title("📝 Resume Editor")
 st.caption("Edit Alex's application profile used by AIHawk for LinkedIn Easy Apply.")

 # The profile lives inside the AIHawk checkout, three directories above this file.
 RESUME_PATH = Path(__file__).parent.parent.parent.joinpath(
    "aihawk", "data_folder", "plain_text_resume.yaml"
 )
 if not RESUME_PATH.exists():
    # Nothing to edit: report where the file was expected and halt the script.
    st.error(f"Resume file not found at `{RESUME_PATH}`. Is AIHawk cloned?")
    st.stop()

 # An empty file parses to None; the rest of the page expects a mapping.
 data = yaml.safe_load(RESUME_PATH.read_text())
 if not data:
    data = {}
 def field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str:
    """Render a text input, flagged in amber when the value still needs filling in.

    Args:
        label: Widget label.
        value: Current value from the profile. ``None`` and non-string scalars
            are accepted and normalised to text.
        key: Streamlit widget key.
        help: Optional tooltip text.
        password: Mask the input when true.

    Returns:
        The text currently entered in the widget.
    """
    # YAML can hand back None (key present, no value) or a non-string scalar.
    # Normalise once so the "needs attention" check and the widget see the same
    # text; previously a None value rendered as an empty box without the warning.
    text = "" if value is None else str(value)
    needs_attention = text == "" or text.startswith("FILL_IN")
    if needs_attention:
        st.markdown(
            '<p style="color:#F59E0B;font-size:0.8em;margin-bottom:2px">⚠️ Needs your attention</p>',
            unsafe_allow_html=True,
        )
    return st.text_input(label, value=text, key=key, help=help,
                         type="password" if password else "default")
 st.divider()
 # ── Personal Info ─────────────────────────────────────────────────────────────
 with st.expander("👤 Personal Information", expanded=True):
    personal = data.get("personal_information", {})
    pi_left, pi_right = st.columns(2)

    # Left column: given name and direct contact details.
    with pi_left:
        name = field("First Name", personal.get("name", ""), "pi_name")
        email = field("Email", personal.get("email", ""), "pi_email")
        phone = field("Phone", personal.get("phone", ""), "pi_phone")
        city = field("City", personal.get("city", ""), "pi_city")

    # Right column: surname, profile link, postal code and birth date.
    with pi_right:
        surname = field("Last Name", personal.get("surname", ""), "pi_surname")
        linkedin = field("LinkedIn URL", personal.get("linkedin", ""), "pi_linkedin")
        zip_code = field("Zip Code", personal.get("zip_code", ""), "pi_zip")
        dob = field(
            "Date of Birth",
            personal.get("date_of_birth", ""),
            "pi_dob",
            help="Format: MM/DD/YYYY",
        )
 # ── Education ─────────────────────────────────────────────────────────────────
 with st.expander("🎓 Education"):
    edu_list = data.get("education_details", [{}])
    updated_edu = []
    degree_options = ["Bachelor's Degree", "Master's Degree", "Some College",
                      "Associate's Degree", "High School", "Other"]
    for i, edu in enumerate(edu_list):
        st.markdown(f"**Entry {i+1}**")
        col1, col2 = st.columns(2)
        with col1:
            inst = field("Institution", edu.get("institution", ""), f"edu_inst_{i}")
            field_study = st.text_input("Field of Study", edu.get("field_of_study", ""), key=f"edu_field_{i}")
            start = st.text_input("Start Year", edu.get("start_date", ""), key=f"edu_start_{i}")
        with col2:
            current_level = edu.get("education_level", "Some College")
            # Unknown stored levels fall back to "Some College" (index 2).
            level_idx = degree_options.index(current_level) if current_level in degree_options else 2
            level = st.selectbox("Degree Level", degree_options, index=level_idx, key=f"edu_level_{i}")
            end = st.text_input("Completion Year", edu.get("year_of_completion", ""), key=f"edu_end_{i}")
        # Start from the stored entry so fields this form does not edit
        # (final_evaluation_grade, exam, anything else in the file) survive a save
        # instead of being overwritten with blanks.
        entry = {
            **edu,
            "education_level": level, "institution": inst, "field_of_study": field_study,
            "start_date": start, "year_of_completion": end,
        }
        # Still emit the two placeholder keys for entries that never had them.
        entry.setdefault("final_evaluation_grade", "")
        entry.setdefault("exam", {})
        updated_edu.append(entry)
        st.divider()
 # ── Experience ────────────────────────────────────────────────────────────────
 with st.expander("💼 Work Experience"):
    exp_list = data.get("experience_details", [{}])
    # The number of rendered positions is kept in session state so entries added
    # with the button persist across reruns.
    if "exp_count" not in st.session_state:
        st.session_state.exp_count = len(exp_list)
    if st.button("+ Add Experience Entry"):
        st.session_state.exp_count += 1
        exp_list.append({})
    updated_exp = []
    for i in range(st.session_state.exp_count):
        # Positions beyond the stored list are new, blank entries.
        exp = exp_list[i] if i < len(exp_list) else {}
        st.markdown(f"**Position {i+1}**")
        col1, col2 = st.columns(2)
        with col1:
            pos = field("Job Title", exp.get("position", ""), f"exp_pos_{i}")
            company = field("Company", exp.get("company", ""), f"exp_co_{i}")
            period = field("Employment Period", exp.get("employment_period", ""), f"exp_period_{i}",
                           help="e.g. 01/2022 - Present")
        with col2:
            location = st.text_input("Location", exp.get("location", ""), key=f"exp_loc_{i}")
            industry = st.text_input("Industry", exp.get("industry", ""), key=f"exp_ind_{i}")
        # Flatten stored responsibilities to one line each.
        resp_lines = []
        for j, r in enumerate(exp.get("key_responsibilities") or []):
            if isinstance(r, dict):
                # Prefer the numbered key. Fall back to whatever value the entry
                # holds so files previously saved with every key set to
                # "responsibility_1" still show all of their lines.
                text = r.get(f"responsibility_{j+1}")
                if text is None:
                    text = next(iter(r.values()), "")
            else:
                text = r
            resp_lines.append("" if text is None else str(text))
        responsibilities = st.text_area(
            "Key Responsibilities (one per line)",
            value="\n".join(resp_lines),
            key=f"exp_resp_{i}", height=100,
        )
        skills = st.text_input(
            "Skills (comma-separated)",
            value=", ".join(str(s) for s in (exp.get("skills_acquired") or [])),
            key=f"exp_skills_{i}",
        )
        # Number the keys 1..n so they match what the reader above looks up.
        # Writing "responsibility_1" for every line made all but the first line
        # disappear after a save and reload.
        kept_lines = [line.strip() for line in responsibilities.splitlines() if line.strip()]
        resp_list = [{f"responsibility_{n}": line} for n, line in enumerate(kept_lines, start=1)]
        skill_list = [s.strip() for s in skills.split(",") if s.strip()]
        # Merge over the stored entry so keys this form does not edit are kept.
        updated_exp.append({
            **exp,
            "position": pos, "company": company, "employment_period": period,
            "location": location, "industry": industry,
            "key_responsibilities": resp_list, "skills_acquired": skill_list,
        })
        st.divider()
 # ── Preferences ───────────────────────────────────────────────────────────────
 with st.expander("⚙️ Preferences & Availability"):
    # Each section may be missing from the profile, hence the empty-dict defaults.
    wp = data.get("work_preferences", {})
    sal = data.get("salary_expectations", {})
    avail = data.get("availability", {})

    pref_left, pref_right = st.columns(2)

    # Left: free-text compensation / availability fields.
    with pref_left:
        salary_range = st.text_input(
            "Salary Range (USD)",
            sal.get("salary_range_usd", ""),
            key="pref_salary",
            help="e.g. 120000 - 180000",
        )
        notice = st.text_input(
            "Notice Period",
            avail.get("notice_period", "2 weeks"),
            key="pref_notice",
        )

    # Right: flags are stored as "Yes"/"No" strings; a box is ticked only when the
    # stored (or default) value is exactly "Yes".
    with pref_right:
        remote_work = st.checkbox(
            "Open to Remote",
            value=wp.get("remote_work", "Yes") == "Yes",
            key="pref_remote",
        )
        relocation = st.checkbox(
            "Open to Relocation",
            value=wp.get("open_to_relocation", "No") == "Yes",
            key="pref_reloc",
        )
        assessments = st.checkbox(
            "Willing to complete assessments",
            value=wp.get("willing_to_complete_assessments", "Yes") == "Yes",
            key="pref_assess",
        )
        bg_checks = st.checkbox(
            "Willing to undergo background checks",
            value=wp.get("willing_to_undergo_background_checks", "Yes") == "Yes",
            key="pref_bg",
        )
        drug_tests = st.checkbox(
            "Willing to undergo drug tests",
            value=wp.get("willing_to_undergo_drug_tests", "No") == "Yes",
            key="pref_drug",
        )
 # ── Self-ID ───────────────────────────────────────────────────────────────────
 with st.expander("🏳️‍🌈 Self-Identification (optional)"):
    # `or {}` covers a section that exists in the YAML but has no value (None).
    sid = data.get("self_identification") or {}
    col1, col2 = st.columns(2)
    with col1:
        gender = st.text_input("Gender identity", sid.get("gender", "Non-binary"), key="sid_gender",
                               help="Select 'Non-binary' or 'Prefer not to say' when options allow")
        pronouns = st.text_input("Pronouns", sid.get("pronouns", "Any"), key="sid_pronouns")
        ethnicity = field("Ethnicity", sid.get("ethnicity", ""), "sid_ethnicity",
                          help="'Prefer not to say' is always an option")
    with col2:
        # list.index() raises ValueError when the stored value is not one of the
        # offered options (hand-edited YAML, different wording, null). Fall back to
        # the first option, which is also the default for a missing key, the same
        # way the Education section guards its degree-level lookup.
        vet_options = ["No", "Yes", "Prefer not to say"]
        vet_saved = sid.get("veteran", "No")
        veteran = st.selectbox("Veteran status", vet_options,
                               index=vet_options.index(vet_saved) if vet_saved in vet_options else 0,
                               key="sid_vet")
        dis_options = ["Prefer not to say", "No", "Yes"]
        dis_saved = sid.get("disability", "Prefer not to say")
        disability = st.selectbox("Disability disclosure", dis_options,
                                  index=dis_options.index(dis_saved) if dis_saved in dis_options else 0,
                                  key="sid_dis")
 st.divider()
 # ── Save ──────────────────────────────────────────────────────────────────────
 if st.button("💾 Save Resume Profile", type="primary", use_container_width=True):
    # Every mapping section is merged over what is already in the file so that
    # keys this form does not expose survive a save. `or {}` tolerates a section
    # that is present in the YAML but empty (None), which `**` cannot unpack.
    data["personal_information"] = {
        **(data.get("personal_information") or {}),
        "name": name, "surname": surname, "email": email, "phone": phone,
        "city": city, "zip_code": zip_code, "linkedin": linkedin, "date_of_birth": dob,
    }
    data["education_details"] = updated_edu
    data["experience_details"] = updated_exp
    data["salary_expectations"] = {
        **(data.get("salary_expectations") or {}),
        "salary_range_usd": salary_range,
    }
    data["availability"] = {
        **(data.get("availability") or {}),
        "notice_period": notice,
    }
    # Checkbox booleans go back to the "Yes"/"No" strings the profile uses.
    data["work_preferences"] = {
        **(data.get("work_preferences") or {}),
        "remote_work": "Yes" if remote_work else "No",
        "open_to_relocation": "Yes" if relocation else "No",
        "willing_to_complete_assessments": "Yes" if assessments else "No",
        "willing_to_undergo_background_checks": "Yes" if bg_checks else "No",
        "willing_to_undergo_drug_tests": "Yes" if drug_tests else "No",
    }
    data["self_identification"] = {
        **(data.get("self_identification") or {}),
        "gender": gender, "pronouns": pronouns, "veteran": veteran,
        "disability": disability, "ethnicity": ethnicity,
    }
    RESUME_PATH.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
    st.success("✅ Profile saved!")
    st.balloons()
--- a/app/pages/4_Apply.py
+++ b/app/pages/4_Apply.py
@ -0,0 +1,388 @@
 # app/pages/4_Apply.py
 """
 Apply Workspace — side-by-side cover letter tools and job description.
 Generates a PDF cover letter saved to the JobSearch docs folder.
 """
 import re
 import sys
 from datetime import datetime
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import streamlit as st
 import streamlit.components.v1 as components
 import yaml
 from scripts.db import (
    DEFAULT_DB, init_db, get_jobs_by_status,
    update_cover_letter, mark_applied, update_job_status,
    get_task_for_job,
 )
 from scripts.task_runner import submit_task
 # Destination folder for exported cover-letter PDFs (hard-coded absolute path).
 DOCS_DIR = Path("/Library/Documents/JobSearch")
 # AIHawk profile YAML, read by the Resume Highlights panel further down.
 RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
 st.title("🚀 Apply Workspace")
 # NOTE(review): presumably creates the schema if it does not exist yet — confirm in scripts.db.
 init_db(DEFAULT_DB)
 # ── PDF generation ─────────────────────────────────────────────────────────────
 def _make_cover_letter_pdf(job: dict, cover_letter: str, output_dir: Path) -> Path:
    """Render a cover letter to a letter-size PDF and return the written path.

    The file is named ``CoverLetter_<Company>_<YYYY-MM-DD>.pdf`` inside
    *output_dir*, which is created if missing. A file with the same name is
    overwritten.

    Args:
        job: Job record; only ``company`` is read (for the file name).
        cover_letter: Plain-text letter body. Blank lines separate paragraphs;
            single newlines become line breaks. The text is treated as literal
            text, not as ReportLab markup.
        output_dir: Directory to write the PDF into.

    Returns:
        Path of the generated PDF.
    """
    from xml.sax.saxutils import escape
    from reportlab.lib.pagesizes import letter
    from reportlab.lib.units import inch
    from reportlab.lib.colors import HexColor
    from reportlab.lib.styles import ParagraphStyle
    from reportlab.lib.enums import TA_LEFT
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable
    output_dir.mkdir(parents=True, exist_ok=True)
    # Read the clock once so the file name and the printed date cannot disagree
    # around midnight.
    now = datetime.now()
    # `or` instead of a .get() default: the record can carry company=None, which
    # re.sub() rejects, and a name with no ASCII alphanumerics would otherwise
    # leave an empty slug in the file name.
    company_safe = re.sub(r"[^a-zA-Z0-9]", "", job.get("company") or "") or "Company"
    date_str = now.strftime("%Y-%m-%d")
    out_path = output_dir / f"CoverLetter_{company_safe}_{date_str}.pdf"
    doc = SimpleDocTemplate(
        str(out_path),
        pagesize=letter,
        leftMargin=inch, rightMargin=inch,
        topMargin=inch, bottomMargin=inch,
    )
    teal  = HexColor("#2DD4BF")
    dark  = HexColor("#0F172A")
    slate = HexColor("#64748B")
    name_style = ParagraphStyle(
        "Name", fontName="Helvetica-Bold", fontSize=22,
        textColor=teal, spaceAfter=6,
    )
    contact_style = ParagraphStyle(
        "Contact", fontName="Helvetica", fontSize=9,
        textColor=slate, spaceAfter=4,
    )
    date_style = ParagraphStyle(
        "Date", fontName="Helvetica", fontSize=11,
        textColor=dark, spaceBefore=16, spaceAfter=14,
    )
    body_style = ParagraphStyle(
        "Body", fontName="Helvetica", fontSize=11,
        textColor=dark, leading=16, spaceAfter=12, alignment=TA_LEFT,
    )
    # Letterhead: name, contact line, rule, date.
    story = [
        Paragraph("ALEX RIVERA", name_style),
        Paragraph(
            "alex@example.com  ·  (555) 867-5309  ·  "
            "linkedin.com/in/AlexMcCann  ·  hirealexmccann.site",
            contact_style,
        ),
        HRFlowable(width="100%", thickness=1, color=teal, spaceBefore=8, spaceAfter=0),
        Paragraph(now.strftime("%B %d, %Y"), date_style),
    ]
    for para in cover_letter.strip().split("\n\n"):
        para = para.strip()
        if para:
            # Paragraph parses its text as XML-like markup, so a literal "&", "<"
            # or ">" in the letter must be escaped first; only then are newlines
            # turned into real <br/> tags.
            story.append(Paragraph(escape(para).replace("\n", "<br/>"), body_style))
    story += [
        Spacer(1, 6),
        Paragraph("Warm regards,<br/><br/>Alex Rivera", body_style),
    ]
    doc.build(story)
    return out_path
 # ── Application Q&A helper ─────────────────────────────────────────────────────
 def _answer_question(job: dict, question: str) -> str:
    """Call the LLM to answer an application question in Alex's voice.
    Uses research_fallback_order (claude_code → vllm → ollama_research)
    rather than the default cover-letter order — the fine-tuned cover letter
    model is not suited for answering general application questions.

    Args:
        job: Job record; ``title``, ``company`` and ``description`` are read.
        question: The application question, inserted verbatim into the prompt.

    Returns:
        The router's completion with surrounding whitespace stripped.
    """
    from scripts.llm_router import LLMRouter
    router = LLMRouter()
    # Prefer the research order; fall back to the router's default order when the
    # config has no (or an empty) research_fallback_order.
    fallback = router.config.get("research_fallback_order") or router.config.get("fallback_order")
    # Only the first 1200 characters of the description go into the prompt.
    description_snippet = (job.get("description") or "")[:1200].strip()
    # The excerpt section is omitted entirely when there is no description;
    # chr(10) is a newline.
    prompt = f"""You are answering job application questions for Alex Rivera, a customer success leader.
 Background:
 - 6+ years in customer success, technical account management, and CS leadership
 - Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), NPS consistently ≥95
 - Also founder of M3 Consulting, a CS advisory practice for SaaS startups
 - Based in SF Bay Area; open to remote/hybrid; pronouns: any
 Role she's applying to: {job.get("title", "")} at {job.get("company", "")}
 {f"Job description excerpt:{chr(10)}{description_snippet}" if description_snippet else ""}
 Application Question:
 {question}
 Answer in Alex's voice — specific, warm, and confident. If the question specifies a word or character limit, respect it. Answer only the question with no preamble or sign-off."""
    return router.complete(prompt, fallback_order=fallback).strip()
 # ── Copy-to-clipboard button ───────────────────────────────────────────────────
 def _copy_btn(text: str, label: str = "📋 Copy", done: str = "✅ Copied!", height: int = 44) -> None:
    """Render a button that copies *text* to the browser clipboard.

    Args:
        text: The text placed on the clipboard when the button is clicked.
        label: Button caption (inserted into the HTML as-is).
        done: Caption shown for two seconds after a click.
        height: Height in pixels passed to the embedded component.
    """
    import json

    def _js(value: str) -> str:
        # json.dumps yields a valid JS string literal (quotes, newlines and
        # backslashes handled) but leaves "<", ">" and "&" untouched. The HTML
        # parser sees an inline <script> before the JS engine does, so text
        # containing "</script>" would end the script early and spill into the
        # page. Unicode-escaping those characters keeps the string's value
        # identical while hiding it from the HTML parser.
        return (
            json.dumps(value)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

    # Each components.html call renders in its own sandboxed iframe, so a fixed
    # element id is fine.
    components.html(
        f"""<button id="b"
            style="width:100%;background:#2DD4BF;color:#0F172A;border:none;
                   padding:6px 10px;border-radius:6px;cursor:pointer;
                   font-size:13px;font-weight:600">{label}</button>
        <script>
        document.getElementById('b').addEventListener('click', function() {{
            navigator.clipboard.writeText({_js(text)});
            this.textContent = {_js(done)};
            setTimeout(() => this.textContent = {_js(label)}, 2000);
        }});
        </script>""",
        height=height,
    )
 # ── Job selection ──────────────────────────────────────────────────────────────
 approved = get_jobs_by_status(DEFAULT_DB, "approved")
 if not approved:
    # Nothing to work on: point the user at the review page and halt the script.
    st.info("No approved jobs — head to Job Review to approve some listings first.")
    st.stop()

 # Another action may have queued a job to open; consume that request once.
 preselect_id = st.session_state.pop("apply_job_id", None)

 # id -> "Title — Company" label, in listing order.
 job_options = {}
 for j in approved:
    job_options[j["id"]] = f"{j['title']} — {j['company']}"
 ids = list(job_options)

 # Start on the requested job when it is still in the list, otherwise the first.
 try:
    default_idx = ids.index(preselect_id)
 except ValueError:
    default_idx = 0

 selected_id = st.selectbox(
    "Job",
    options=ids,
    format_func=job_options.__getitem__,
    index=default_idx,
    label_visibility="collapsed",
 )
 job = next(j for j in approved if j["id"] == selected_id)
 st.divider()

 # ── Two-column workspace ───────────────────────────────────────────────────────
 col_tools, col_jd = st.columns([2, 3])
 # ════════════════════════════════════════════════
 #  RIGHT — job description
 # ════════════════════════════════════════════════
 with col_jd:
    # Match score badge: green from 70, yellow from 40, red below; grey when unscored.
    score = job.get("match_score")
    if score is None:
        score_badge = "⬜ No score"
    elif score >= 70:
        score_badge = f"🟢 {score:.0f}%"
    elif score >= 40:
        score_badge = f"🟡 {score:.0f}%"
    else:
        score_badge = f"🔴 {score:.0f}%"

    if job.get("is_remote"):
        remote_badge = "🌐 Remote"
    else:
        remote_badge = "🏢 On-site"

    # LinkedIn listings get the robot icon; anything else (or no source) the
    # person icon, labelled "Manual" when the source is blank.
    src = (job.get("source") or "").lower()
    if src == "linkedin":
        source_badge = f"🤖 {src.title()}"
    else:
        source_badge = f"👤 {src.title() or 'Manual'}"

    st.subheader(job["title"])
    header_line = (
        f"**{job['company']}**  ·  {job.get('location', '')}  ·  "
        f"{remote_badge}  ·  {source_badge}  ·  {score_badge}"
    )
    st.caption(header_line)

    # Optional extras, shown only when present.
    salary = job.get("salary")
    if salary:
        st.caption(f"💰 {salary}")
    gaps = job.get("keyword_gaps")
    if gaps:
        st.caption(f"**Gaps to address in letter:** {gaps}")

    st.divider()
    st.markdown(job.get("description") or "_No description scraped for this listing._")
 # ════════════════════════════════════════════════
 #  LEFT — copy tools
 # ════════════════════════════════════════════════
 with col_tools:
    # ── Cover letter ──────────────────────────────
    st.subheader("📝 Cover Letter")
    # The editable letter text lives in session state under a per-job key, which
    # is also the text_area's widget key below. Seed it from the DB once.
    _cl_key = f"cl_{selected_id}"
    if _cl_key not in st.session_state:
        st.session_state[_cl_key] = job.get("cover_letter") or ""
    # Latest cover_letter task for this job (falsy when there is none).
    _cl_task = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
    _cl_running = _cl_task and _cl_task["status"] in ("queued", "running")
    # Disabled while a task is in flight so a second one cannot be queued.
    if st.button("✨ Generate / Regenerate", use_container_width=True, disabled=bool(_cl_running)):
        submit_task(DEFAULT_DB, "cover_letter", selected_id)
        st.rerun()
    if _cl_running:
        # Re-executed every 3 seconds: shows progress while the task is pending
        # and triggers a rerun as soon as it leaves queued/running.
        @st.fragment(run_every=3)
        def _cl_status_fragment():
            """Poll the task status and rerun the page once it has finished."""
            t = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
            if t and t["status"] in ("queued", "running"):
                lbl = "Queued…" if t["status"] == "queued" else "Generating via LLM…"
                st.info(f"⏳ {lbl}")
            else:
                st.rerun()  # full page rerun — reloads cover letter from DB
        _cl_status_fragment()
    elif _cl_task and _cl_task["status"] == "failed":
        st.error(f"Generation failed: {_cl_task.get('error', 'unknown error')}")
    # Refresh session state only when a NEW task has just completed — not on every rerun.
    # Without this guard, every Save Draft click would overwrite the edited text with the
    # old DB value before cl_text could be captured.
    _cl_loaded_key = f"cl_loaded_{selected_id}"
    if not _cl_running and _cl_task and _cl_task["status"] == "completed":
        # Remember which task id was loaded last; a different id means a new result.
        if st.session_state.get(_cl_loaded_key) != _cl_task["id"]:
            st.session_state[_cl_key] = job.get("cover_letter") or ""
            st.session_state[_cl_loaded_key] = _cl_task["id"]
    # No `value=` argument: the widget's content comes from session state via its key.
    cl_text = st.text_area(
        "cover_letter_body",
        key=_cl_key,
        height=280,
        label_visibility="collapsed",
    )
    # Copy + Save row
    c1, c2 = st.columns(2)
    with c1:
        # The copy button is only rendered when there is text to copy.
        if cl_text:
            _copy_btn(cl_text, label="📋 Copy Letter")
    with c2:
        if st.button("💾 Save draft", use_container_width=True):
            update_cover_letter(DEFAULT_DB, selected_id, cl_text)
            st.success("Saved!")
    # PDF generation
    if cl_text:
        if st.button("📄 Export PDF → JobSearch folder", use_container_width=True, type="primary"):
            with st.spinner("Generating PDF…"):
                try:
                    pdf_path = _make_cover_letter_pdf(job, cl_text, DOCS_DIR)
                    # The exported text is also saved as the job's cover letter.
                    update_cover_letter(DEFAULT_DB, selected_id, cl_text)
                    st.success(f"Saved: `{pdf_path.name}`")
                except Exception as e:
                    st.error(f"PDF error: {e}")
    st.divider()
    # Open listing + Mark Applied
    c3, c4 = st.columns(2)
    with c3:
        if job.get("url"):
            st.link_button("Open listing ↗", job["url"], use_container_width=True)
    with c4:
        if st.button("✅ Mark as Applied", use_container_width=True, type="primary"):
            # Persist any non-empty letter text before changing the status.
            if cl_text:
                update_cover_letter(DEFAULT_DB, selected_id, cl_text)
            mark_applied(DEFAULT_DB, [selected_id])
            st.success("Marked as applied!")
            # NOTE(review): the immediate rerun likely replaces the success message
            # before it can be read — confirm.
            st.rerun()
    if st.button("🚫 Reject listing", use_container_width=True):
        update_job_status(DEFAULT_DB, [selected_id], "rejected")
        # Advance selectbox to next job so list doesn't snap to first item
        current_idx = ids.index(selected_id) if selected_id in ids else 0
        if current_idx + 1 < len(ids):
            # Picked up via st.session_state.pop("apply_job_id") on the next run.
            st.session_state["apply_job_id"] = ids[current_idx + 1]
        st.rerun()
    st.divider()
    # ── Resume highlights ─────────────────────────
    with st.expander("📄 Resume Highlights"):
        if RESUME_YAML.exists():
            resume = yaml.safe_load(RESUME_YAML.read_text()) or {}
            # `or` defaults throughout: a YAML key with no value parses to None.
            for exp in resume.get("experience_details") or []:
                position = str(exp.get("position") or "")
                company  = str(exp.get("company") or "")
                period   = str(exp.get("employment_period") or "")
                # Parse start / end dates (handles "MM/YYYY - Present" style)
                if " - " in period:
                    date_start, date_end = [p.strip() for p in period.split(" - ", 1)]
                else:
                    date_start, date_end = period, ""
                # Flatten bullets. Entries are normally one-key dicts, but a
                # hand-edited profile may list plain strings; calling .values() on
                # those raised AttributeError. Empty values are skipped so no blank
                # bullet (with a pointless copy button) is rendered.
                bullets = []
                for resp in exp.get("key_responsibilities") or []:
                    if isinstance(resp, dict):
                        bullets.extend(str(v) for v in resp.values() if v)
                    elif resp:
                        bullets.append(str(resp))
                all_duties = "\n".join(f"• {b}" for b in bullets)
                # ── Header ────────────────────────────────────────────────────
                st.markdown(
                    f"**{position}** &nbsp;·&nbsp; "
                    f"{company} &nbsp;·&nbsp; "
                    f"*{period}*"
                )
                # ── Copy row: title | start | end | all duties ────────────────
                cp_t, cp_s, cp_e, cp_d = st.columns(4)
                with cp_t:
                    st.caption("Title")
                    _copy_btn(position, label="📋 Copy", height=34)
                with cp_s:
                    st.caption("Start")
                    _copy_btn(date_start, label="📋 Copy", height=34)
                with cp_e:
                    st.caption("End")
                    # With no " - " separator there is no end date; copy the whole
                    # period string instead.
                    _copy_btn(date_end or period, label="📋 Copy", height=34)
                with cp_d:
                    st.caption("All Duties")
                    if bullets:
                        _copy_btn(all_duties, label="📋 Copy", height=34)
                # ── Individual bullets ────────────────────────────────────────
                for bullet in bullets:
                    b_col, cp_col = st.columns([6, 1])
                    b_col.caption(f"• {bullet}")
                    with cp_col:
                        _copy_btn(bullet, label="📋", done="✅", height=32)
                st.markdown("---")
        else:
            st.warning("Resume YAML not found — check that AIHawk is cloned.")
    # ── Application Q&A ───────────────────────────────────────────────────────
    with st.expander("💬 Answer Application Questions"):
        st.caption("Paste a question from the application and get an answer in your voice.")
        # Per-job history of {"q": ..., "a": ...} pairs, oldest first.
        _qa_key = f"qa_list_{selected_id}"
        if _qa_key not in st.session_state:
            st.session_state[_qa_key] = []
        q_input = st.text_area(
            "Paste question",
            placeholder="In 200 words or less, explain why you're a strong fit for this role.",
            height=80,
            key=f"qa_input_{selected_id}",
            label_visibility="collapsed",
        )
        if st.button("✨ Generate Answer", key=f"qa_gen_{selected_id}",
                     use_container_width=True,
                     disabled=not (q_input or "").strip()):
            _answer = None
            with st.spinner("Generating answer…"):
                try:
                    _answer = _answer_question(job, q_input.strip())
                except Exception as e:
                    # UI boundary: an LLM or backend failure should surface as a
                    # message, like the PDF export does, instead of crashing the
                    # page.
                    st.error(f"Answer generation failed: {e}")
            # Only record the pair and rerun when an answer actually came back.
            if _answer is not None:
                st.session_state[_qa_key].append({"q": q_input.strip(), "a": _answer})
                st.rerun()
        # Newest first. Widget keys use the pair's position in the stored list so
        # edits stay attached to the same answer as more are added.
        for _i, _pair in enumerate(reversed(st.session_state[_qa_key])):
            _real_idx = len(st.session_state[_qa_key]) - 1 - _i
            st.markdown(f"**Q:** {_pair['q']}")
            _a_key = f"qa_ans_{selected_id}_{_real_idx}"
            if _a_key not in st.session_state:
                st.session_state[_a_key] = _pair["a"]
            _answer_text = st.text_area(
                "answer",
                key=_a_key,
                height=120,
                label_visibility="collapsed",
            )
            _copy_btn(_answer_text, label="📋 Copy Answer")
            # Separator between pairs, but not after the last one.
            if _i < len(st.session_state[_qa_key]) - 1:
                st.markdown("---")
--- a/app/pages/5_Interviews.py
+++ b/app/pages/5_Interviews.py
@ -0,0 +1,539 @@
 # app/pages/5_Interviews.py
 """
 Interviews — Kanban board for tracking post-application engagement.
 Pipeline: applied → phone_screen → interviewing → offer → hired
          (or rejected at any stage, with stage captured for analytics)
 Features:
  - Kanban columns for each interview stage
  - Company research brief auto-generated when advancing to Phone Screen
  - Contact / email log per job
  - Email reply drafter via LLM
  - Interview date tracking with calendar push hint
  - Rejection analytics
 """
 import sys
 from collections import Counter
 from datetime import date, datetime
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import streamlit as st
 from scripts.db import (
    DEFAULT_DB, init_db,
    get_interview_jobs, advance_to_stage, reject_at_stage,
    set_interview_date, add_contact, get_contacts,
    get_research, get_task_for_job, get_job_by_id,
    get_unread_stage_signals, dismiss_stage_signal,
 )
 from scripts.task_runner import submit_task
 st.title("🎯 Interviews")
 init_db(DEFAULT_DB)
 # ── Sidebar: Email sync ────────────────────────────────────────────────────────
 # The email_sync task is submitted and looked up with job id 0 throughout.
 with st.sidebar:
    st.markdown("### 📧 Email Sync")
    _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0)
    _email_status = _email_task["status"] if _email_task else None
    _email_running = _email_status in ("queued", "running")
    if st.button("🔄 Sync Emails", use_container_width=True, type="primary",
                 disabled=_email_running):
        submit_task(DEFAULT_DB, "email_sync", 0)
        st.rerun()
    if _email_running:
        # Poll every 4 seconds while the sync is in flight; once the task leaves
        # the queued/running states, rerun so the final status is rendered.
        @st.fragment(run_every=4)
        def _email_sidebar_status():
            t = get_task_for_job(DEFAULT_DB, "email_sync", 0)
            if t and t["status"] in ("queued", "running"):
                st.info("⏳ Syncing…")
            else:
                st.rerun()
        _email_sidebar_status()
    elif _email_status == "completed":
        # The task's "error" field is shown as the success message here. Use
        # `or` rather than a .get() default so a present-but-null field still
        # falls back to "Done" instead of rendering None.
        st.success(_email_task.get("error") or "Done")
    elif _email_status == "failed":
        # Same null-safety: a null error must not break the .lower() below.
        msg = _email_task.get("error") or ""
        if "not configured" in msg.lower():
            st.error("Email not configured. Go to **Settings → Email**.")
        else:
            st.error(f"Sync failed: {msg}")
 # ── Constants ─────────────────────────────────────────────────────────────────
 # Column headings for the kanban board, keyed by stage name. Only these three
 # stages get a column; the "offer" heading reads "Offer / Hired".
 STAGE_LABELS = {
    "phone_screen": "📞 Phone Screen",
    "interviewing":  "🎯 Interviewing",
    "offer":         "📜 Offer / Hired",
 }
 # Stage a job moves to when its "advance" button is pressed, keyed by the
 # current stage. Both "survey" and "applied" lead to "phone_screen".
 STAGE_NEXT = {
    "survey":       "phone_screen",
    "applied":      "phone_screen",
    "phone_screen": "interviewing",
    "interviewing": "offer",
    "offer":        "hired",
 }
 # Button caption for the advance action, keyed by the current stage (same keys
 # as STAGE_NEXT).
 STAGE_NEXT_LABEL = {
    "survey":       "📞 Phone Screen",
    "applied":      "📞 Phone Screen",
    "phone_screen": "🎯 Interviewing",
    "interviewing": "📜 Offer",
    "offer":        "🎉 Hired",
 }
 # ── Data ──────────────────────────────────────────────────────────────────────
 # Fetched once per script run; the page reads it via .get(stage, []), i.e. as
 # a mapping from stage name to a list of jobs.
 jobs_by_stage = get_interview_jobs(DEFAULT_DB)
 # ── Helpers ───────────────────────────────────────────────────────────────────
 def _days_ago(date_str: str | None) -> str:
    if not date_str:
        return "—"
    try:
        d = date.fromisoformat(date_str[:10])
        delta = (date.today() - d).days
        if delta == 0:
            return "today"
        if delta == 1:
            return "yesterday"
        return f"{delta}d ago"
    except Exception:
        return date_str[:10]
@st.dialog("🔬 Company Research", width="large")
 def _research_modal(job: dict) -> None:
    """Show the company research brief for a job, with controls to (re)generate it.

    Renders one of three states: a progress note while a company_research task
    is queued or running, the stored brief when one exists, or a prompt to
    generate one (including the last failure message, if any).

    Args:
        job: Job row as a dict; ``id`` is required, ``company`` and ``title``
            are read with ``.get`` for the caption.
    """
    job_id = job["id"]
    st.caption(f"**{job.get('company')}** — {job.get('title')}")
    research = get_research(DEFAULT_DB, job_id=job_id)
    task = get_task_for_job(DEFAULT_DB, "company_research", job_id)
    running = task and task["status"] in ("queued", "running")
    if running:
        # Prefer the task's own stage text as the progress label when it has one.
        task_stage = (task.get("stage") or "")
        lbl = "Queued…" if task["status"] == "queued" else (task_stage or "Generating…")
        st.info(f"⏳ {lbl}")
    elif research:
        scrape_used = research.get("scrape_used")
        if not scrape_used:
            # The brief was produced without web data. Probe the local SearXNG
            # port (127.0.0.1:8888, 1 s timeout) to decide whether to offer a
            # re-run or just warn.
            import socket as _sock
            _searxng_up = False
            try:
                with _sock.create_connection(("127.0.0.1", 8888), timeout=1):
                    _searxng_up = True
            except OSError:
                pass
            if _searxng_up:
                st.warning(
                    "⚠️ This brief was generated without live web data and may contain "
                    "inaccuracies. SearXNG is now available — re-run to get verified facts."
                )
                if st.button("🔄 Re-run with live data", key=f"modal_rescrape_{job_id}", type="primary"):
                    submit_task(DEFAULT_DB, "company_research", job_id)
                    st.rerun()
                st.divider()
            else:
                st.warning(
                    "⚠️ Generated without live web data (SearXNG was offline). "
                    "Key facts like CEO, investors, and founding date may be hallucinated — "
                    "verify before the call. Start SearXNG in Settings → Services to re-run."
                )
                st.divider()
        st.caption(
            f"Generated {research.get('generated_at', '')} "
            f"{'· web data used ✓' if scrape_used else '· LLM knowledge only'}"
        )
        st.markdown(research["raw_output"])
        # `running` is always falsy in this branch, so the button is never disabled here.
        if st.button("🔄 Refresh", key=f"modal_regen_{job_id}", disabled=bool(running)):
            submit_task(DEFAULT_DB, "company_research", job_id)
            st.rerun()
    else:
        st.info("No research brief yet.")
        if task and task["status"] == "failed":
            st.error(f"Last attempt failed: {task.get('error', '')}")
        if st.button("🔬 Generate now", key=f"modal_gen_{job_id}"):
            submit_task(DEFAULT_DB, "company_research", job_id)
            st.rerun()
@st.dialog("📧 Email History", width="large")
 def _email_modal(job: dict) -> None:
    """Show the logged email/contact history for a job and let the user add to it.

    Lists every contact returned by ``get_contacts``, offers an LLM-drafted
    reply to the most recent inbound message, and renders a form for logging a
    new contact.

    Args:
        job: Job row as a dict; ``id`` is required, ``company`` and ``title``
            are read with ``.get``.
    """
    job_id = job["id"]
    # draft_key keeps the generated draft; area_key is the text-area widget's
    # own state, which holds the (possibly edited) text.
    draft_key = f"modal_draft_text_{job_id}"
    area_key = f"modal_draft_area_{job_id}"
    st.caption(f"**{job.get('company')}** — {job.get('title')}")
    contacts = get_contacts(DEFAULT_DB, job_id=job_id)
    if not contacts:
        st.info("No emails logged yet. Use the form below to add one.")
    else:
        for c in contacts:
            icon = "📥" if c["direction"] == "inbound" else "📤"
            st.markdown(
                f"{icon} **{c.get('subject') or '(no subject)'}** "
                # `or ''` rather than a .get() default: a present-but-null
                # received_at must not be sliced.
                f"· _{(c.get('received_at') or '')[:10]}_"
            )
            if c.get("from_addr"):
                st.caption(f"From: {c['from_addr']}")
            if c.get("body"):
                st.text(c["body"][:500] + ("…" if len(c["body"]) > 500 else ""))
            st.divider()
        inbound = [c for c in contacts if c["direction"] == "inbound"]
        if inbound:
            last = inbound[-1]
            if st.button("✍️ Draft reply", key=f"modal_draft_{job_id}"):
                with st.spinner("Drafting…"):
                    try:
                        from scripts.llm_router import complete
                        draft = complete(
                            prompt=(
                                f"Draft a professional, warm reply to this email.\n\n"
                                f"From: {last.get('from_addr') or ''}\n"
                                f"Subject: {last.get('subject') or ''}\n\n"
                                f"{last.get('body') or ''}\n\n"
                                f"Context: Alex Rivera is a Customer Success / "
                                f"Technical Account Manager applying for "
                                f"{job.get('title')} at {job.get('company')}."
                            ),
                            system=(
                                "You are Alex Rivera's professional email assistant. "
                                "Write concise, warm, and professional replies in her voice. "
                                "Keep it to 3–5 sentences unless more is needed."
                            ),
                        )
                    except Exception as e:
                        st.error(f"Draft failed: {e}")
                    else:
                        # No st.rerun() here: a rerun issued from inside a dialog
                        # closes it, which would hide the draft just produced.
                        # Overwrite the widget state too, so a regenerated draft
                        # replaces whatever the text area showed before.
                        st.session_state[draft_key] = draft
                        st.session_state[area_key] = draft
            if draft_key in st.session_state:
                # Seed the widget from the stored draft when it has no state of
                # its own yet (e.g. the dialog was closed and reopened).
                if area_key not in st.session_state:
                    st.session_state[area_key] = st.session_state[draft_key]
                st.text_area(
                    "Draft (edit before sending)",
                    height=160,
                    key=area_key,
                )
    st.divider()
    st.markdown("**Log a contact**")
    with st.form(key=f"contact_form_modal_{job_id}", clear_on_submit=True):
        col_a, col_b = st.columns(2)
        direction = col_a.radio(
            "Direction", ["inbound", "outbound"],
            horizontal=True, key=f"dir_modal_{job_id}",
        )
        recv_at = col_b.text_input(
            "Date (YYYY-MM-DD)", value=str(date.today()), key=f"recv_modal_{job_id}"
        )
        subject = st.text_input("Subject", key=f"subj_modal_{job_id}")
        from_addr = st.text_input("From", key=f"from_modal_{job_id}")
        body_text = st.text_area("Body / notes", height=80, key=f"body_modal_{job_id}")
        if st.form_submit_button("📧 Save contact"):
            add_contact(
                DEFAULT_DB, job_id=job_id,
                direction=direction, subject=subject,
                from_addr=from_addr, body=body_text, received_at=recv_at,
            )
            # Rerun so the page picks up the new contact.
            st.rerun()
 def _render_card(job: dict, stage: str, compact: bool = False) -> None:
    """Render a single job card appropriate for the given stage.

    Args:
        job: Job row as a dict; ``id`` is required, all other fields are read
            with ``.get``.
        stage: Pipeline stage the card is rendered under; it selects which
            controls appear and is recorded as the rejection stage on reject.
        compact: When true, only the header and (for phone_screen /
            interviewing) the interview-date form are rendered; email signals
            and action buttons are skipped.
    """
    job_id = job["id"]
    contacts = get_contacts(DEFAULT_DB, job_id=job_id)
    last_contact = contacts[-1] if contacts else None
    with st.container(border=True):
        st.markdown(f"**{job.get('company', '?')}**")
        st.caption(job.get("title", ""))
        col_a, col_b = st.columns(2)
        col_a.caption(f"Applied: {_days_ago(job.get('applied_at'))}")
        if last_contact:
            col_b.caption(f"Last contact: {_days_ago(last_contact.get('received_at'))}")
        # Interview date picker (phone_screen / interviewing stages)
        if stage in ("phone_screen", "interviewing"):
            current_idate = job.get("interview_date") or ""
            # A stored value that is not an ISO date must not take the whole
            # card down; start from an empty picker instead.
            try:
                current_value = (
                    date.fromisoformat(current_idate[:10]) if current_idate else None
                )
            except (TypeError, ValueError):
                current_value = None
            with st.form(key=f"idate_form_{job_id}"):
                new_date = st.date_input(
                    "Interview date",
                    value=current_value,
                    key=f"idate_{job_id}",
                    format="YYYY-MM-DD",
                )
                if st.form_submit_button("📅 Save date"):
                    if new_date is None:
                        # An empty picker yields None; persisting str(None)
                        # would store the literal text "None" as the date.
                        st.warning("Pick a date before saving.")
                    else:
                        set_interview_date(DEFAULT_DB, job_id=job_id, date_str=str(new_date))
                        st.success("Saved!")
                        st.rerun()
        if not compact:
            if stage in ("applied", "phone_screen", "interviewing"):
                signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id)
                if signals:
                    # Only the last unread signal in the returned list is surfaced.
                    sig = signals[-1]
                    _SIGNAL_TO_STAGE = {
                        "interview_scheduled": ("phone_screen", "📞 Phone Screen"),
                        "positive_response":   ("phone_screen", "📞 Phone Screen"),
                        "offer_received":      ("offer",        "📜 Offer"),
                        "survey_received":     ("survey",       "📋 Survey"),
                    }
                    target_stage, target_label = _SIGNAL_TO_STAGE.get(
                        sig["stage_signal"], (None, None)
                    )
                    with st.container(border=True):
                        st.caption(
                            f"💡 Email suggests: **{sig['stage_signal'].replace('_', ' ')}**  \n"
                            f"_{sig.get('subject', '')}_ · {(sig.get('received_at') or '')[:10]}"
                        )
                        b1, b2 = st.columns(2)
                        if sig["stage_signal"] == "rejected":
                            if b1.button("✗ Reject", key=f"sig_rej_{sig['id']}",
                                         use_container_width=True):
                                reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                                dismiss_stage_signal(DEFAULT_DB, sig["id"])
                                st.rerun(scope="app")
                        elif target_stage and b1.button(
                            f"→ {target_label}", key=f"sig_adv_{sig['id']}",
                            use_container_width=True, type="primary",
                        ):
                            if target_stage == "phone_screen" and stage == "applied":
                                # Entering phone screen from applied also queues
                                # a company research task.
                                advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
                                submit_task(DEFAULT_DB, "company_research", job_id)
                            else:
                                advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage)
                            dismiss_stage_signal(DEFAULT_DB, sig["id"])
                            st.rerun(scope="app")
                        if b2.button("Dismiss", key=f"sig_dis_{sig['id']}",
                                     use_container_width=True):
                            dismiss_stage_signal(DEFAULT_DB, sig["id"])
                            st.rerun()
            # Advance / Reject buttons
            next_stage = STAGE_NEXT.get(stage)
            c1, c2 = st.columns(2)
            if next_stage:
                next_label = STAGE_NEXT_LABEL.get(stage, next_stage)
                if c1.button(
                    f"→ {next_label}", key=f"adv_{job_id}",
                    use_container_width=True, type="primary",
                ):
                    advance_to_stage(DEFAULT_DB, job_id=job_id, stage=next_stage)
                    if next_stage == "phone_screen":
                        submit_task(DEFAULT_DB, "company_research", job_id)
                    st.rerun(scope="app")  # full rerun — card must appear in new column
            if c2.button(
                "✗ Reject", key=f"rej_{job_id}",
                use_container_width=True,
            ):
                reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                # NOTE: st.rerun() without arguments uses scope="app", so this is
                # a full-page rerun as well, not a fragment-only one.
                st.rerun()
            if job.get("url"):
                st.link_button("Open listing ↗", job["url"], use_container_width=True)
            if stage in ("phone_screen", "interviewing", "offer"):
                if st.button(
                    "📋 Open Prep Sheet", key=f"prep_{job_id}",
                    use_container_width=True,
                    help="Open the Interview Prep page for this job",
                ):
                    # Hand the job id to the prep page through session state.
                    st.session_state["prep_job_id"] = job_id
                    st.switch_page("pages/6_Interview_Prep.py")
            # Detail modals — full-width overlays replace narrow inline expanders
            if stage in ("phone_screen", "interviewing", "offer"):
                mc1, mc2 = st.columns(2)
                if mc1.button("🔬 Research", key=f"res_btn_{job_id}", use_container_width=True):
                    _research_modal(job)
                if mc2.button("📧 Emails", key=f"email_btn_{job_id}", use_container_width=True):
                    _email_modal(job)
            else:
                if st.button("📧 Emails", key=f"email_btn_{job_id}", use_container_width=True):
                    _email_modal(job)
 # ── Fragment wrappers — keep scroll position on card actions ─────────────────
@st.fragment
 def _card_fragment(job_id: int, stage: str) -> None:
    """Load the job afresh and draw its card only while it still sits in *stage*.

    Nothing is rendered when the job no longer exists or its status differs
    from *stage* (for example after it was advanced or rejected).
    """
    current = get_job_by_id(DEFAULT_DB, job_id)
    still_in_stage = current is not None and current.get("status") == stage
    if still_in_stage:
        _render_card(current, stage)
@st.fragment
 def _pre_kanban_row_fragment(job_id: int) -> None:
    """Pre-kanban compact row for applied and survey-stage jobs.

    Re-fetches the job on every run and renders nothing once its status is no
    longer ``applied`` or ``survey``.

    Args:
        job_id: Id of the job to look up and render.
    """
    job = get_job_by_id(DEFAULT_DB, job_id)
    if job is None or job.get("status") not in ("applied", "survey"):
        return
    stage = job["status"]
    contacts = get_contacts(DEFAULT_DB, job_id=job_id)
    last_contact = contacts[-1] if contacts else None
    with st.container(border=True):
        left, mid, right = st.columns([3, 2, 2])
        badge = " 📋 **Survey**" if stage == "survey" else ""
        left.markdown(f"**{job.get('company')}** — {job.get('title', '')}{badge}")
        left.caption(f"Applied: {_days_ago(job.get('applied_at'))}")
        with mid:
            if last_contact:
                st.caption(f"Last contact: {_days_ago(last_contact.get('received_at'))}")
            if st.button("📧 Emails", key=f"email_pre_{job_id}", use_container_width=True):
                _email_modal(job)
            # Stage signal hint (email-detected next steps)
            signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id)
            if signals:
                # Only the last unread signal in the returned list is surfaced.
                sig = signals[-1]
                _SIGNAL_TO_STAGE = {
                    "interview_scheduled": ("phone_screen", "📞 Phone Screen"),
                    "positive_response":   ("phone_screen", "📞 Phone Screen"),
                    "offer_received":      ("offer",        "📜 Offer"),
                    "survey_received":     ("survey",       "📋 Survey"),
                }
                target_stage, target_label = _SIGNAL_TO_STAGE.get(
                    sig["stage_signal"], (None, None)
                )
                with st.container(border=True):
                    st.caption(
                        f"💡 **{sig['stage_signal'].replace('_', ' ')}**  \n"
                        f"_{sig.get('subject', '')}_ · {(sig.get('received_at') or '')[:10]}"
                    )
                    s1, s2 = st.columns(2)
                    if sig["stage_signal"] == "rejected":
                        # Mirror the kanban card: a rejection signal offers a
                        # one-click reject that also clears the signal.
                        if s1.button("✗ Reject", key=f"sig_rej_pre_{sig['id']}",
                                     use_container_width=True):
                            reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                            dismiss_stage_signal(DEFAULT_DB, sig["id"])
                            st.rerun(scope="app")
                    elif target_stage and s1.button(
                        f"→ {target_label}", key=f"sig_adv_pre_{sig['id']}",
                        use_container_width=True, type="primary",
                    ):
                        if target_stage == "phone_screen":
                            # Entering phone screen also queues a research task.
                            advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
                            submit_task(DEFAULT_DB, "company_research", job_id)
                        else:
                            advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage)
                        dismiss_stage_signal(DEFAULT_DB, sig["id"])
                        st.rerun(scope="app")
                    if s2.button("Dismiss", key=f"sig_dis_pre_{sig['id']}",
                                 use_container_width=True):
                        dismiss_stage_signal(DEFAULT_DB, sig["id"])
                        st.rerun()
        with right:
            if st.button(
                "→ 📞 Phone Screen", key=f"adv_pre_{job_id}",
                use_container_width=True, type="primary",
            ):
                advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
                submit_task(DEFAULT_DB, "company_research", job_id)
                st.rerun(scope="app")
            col_a, col_b = st.columns(2)
            # The Survey shortcut is only offered while the job is still "applied".
            if stage == "applied" and col_a.button(
                "📋 Survey", key=f"to_survey_{job_id}", use_container_width=True,
            ):
                advance_to_stage(DEFAULT_DB, job_id=job_id, stage="survey")
                st.rerun(scope="app")
            if col_b.button("✗ Reject", key=f"rej_pre_{job_id}", use_container_width=True):
                reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                st.rerun()
@st.fragment
 def _hired_card_fragment(job_id: int) -> None:
    """Draw a small read-only card for a hired job.

    The job is looked up again on each run; nothing is drawn unless it exists
    and its status is ``hired``.
    """
    hired = get_job_by_id(DEFAULT_DB, job_id)
    if hired is not None and hired.get("status") == "hired":
        with st.container(border=True):
            company = hired.get("company", "?")
            when = _days_ago(hired.get("hired_at"))
            st.markdown(f"✅ **{company}**")
            st.caption(hired.get("title", ""))
            st.caption(f"Hired {when}")
 # ── Stats bar ─────────────────────────────────────────────────────────────────
 # One metric per pipeline bucket; offer and hired are reported as a single figure.
 c1, c2, c3, c4, c5, c6 = st.columns(6)
 _stage_totals = {
    _name: len(jobs_by_stage.get(_name, []))
    for _name in (
        "applied", "survey", "phone_screen", "interviewing",
        "offer", "hired", "rejected",
    )
 }
 for _metric_col, _metric_label, _metric_value in (
    (c1, "Applied",      _stage_totals["applied"]),
    (c2, "Survey",       _stage_totals["survey"]),
    (c3, "Phone Screen", _stage_totals["phone_screen"]),
    (c4, "Interviewing", _stage_totals["interviewing"]),
    (c5, "Offer/Hired",  _stage_totals["offer"] + _stage_totals["hired"]),
    (c6, "Rejected",     _stage_totals["rejected"]),
 ):
    _metric_col.metric(_metric_label, _metric_value)
 st.divider()
 # ── Pre-kanban: Applied + Survey ───────────────────────────────────────────────
 # Jobs not yet on the board, listed as compact rows with survey-stage jobs first.
 survey_jobs  = jobs_by_stage.get("survey", [])
 applied_jobs = jobs_by_stage.get("applied", [])
 pre_kanban   = survey_jobs + applied_jobs
 if pre_kanban:
    _pre_hint = (
        "Move a job to **Phone Screen** once you receive an outreach. "
        "A company research brief will be auto-generated to help you prepare."
    )
    st.subheader(f"📋 Pre-pipeline ({len(pre_kanban)})")
    st.caption(_pre_hint)
    for job in pre_kanban:
        _pre_kanban_row_fragment(job["id"])
    st.divider()
 # ── Kanban columns ─────────────────────────────────────────────────────────────
 # One column per board stage; hired jobs are appended to the "offer" column.
 kanban_stages = ["phone_screen", "interviewing", "offer"]
 cols = st.columns(len(kanban_stages))
 for col, stage in zip(cols, kanban_stages):
    stage_jobs = jobs_by_stage.get(stage, [])
    hired_jobs = [] if stage != "offer" else jobs_by_stage.get("hired", [])
    all_col_jobs = stage_jobs + hired_jobs
    _col_total = len(all_col_jobs)
    _col_plural = "" if _col_total == 1 else "s"
    with col:
        st.markdown(f"### {STAGE_LABELS[stage]}")
        st.caption(f"{_col_total} job{_col_plural}")
        st.divider()
        if _col_total == 0:
            st.caption("_Empty_")
        # Both loops are no-ops for an empty column.
        for job in stage_jobs:
            _card_fragment(job["id"], stage)
        for job in hired_jobs:
            _hired_card_fragment(job["id"])
 st.divider()
 # ── Rejected log + analytics ───────────────────────────────────────────────────
 rejected_jobs = jobs_by_stage.get("rejected", [])
 if rejected_jobs:
    with st.expander(f"❌ Rejected ({len(rejected_jobs)})", expanded=False):
        # Tally of rejections per stage, most frequent first
        stage_counts = Counter(
            (j.get("rejection_stage") or "unknown") for j in rejected_jobs
        )
        _breakdown = " · ".join(
            f"**{k}**: {v}" for k, v in stage_counts.most_common()
        )
        st.caption("Rejection by stage: " + _breakdown)
        # Monthly bar chart, bucketed by the YYYY-MM prefix of applied_at;
        # drawn only when there is more than one rejected job.
        if len(rejected_jobs) > 1:
            _months = ((j.get("applied_at") or "")[:7] for j in rejected_jobs)
            by_month = Counter(mo for mo in _months if mo)
            if by_month:
                import pandas as pd
                chart_data = pd.DataFrame(
                    list(by_month.items()), columns=["Month", "Rejections"]
                )
                chart_data = chart_data.sort_values("Month")
                st.bar_chart(chart_data.set_index("Month"))
        st.divider()
        # One line per rejected job
        for job in rejected_jobs:
            company = job.get("company") or "?"
            title = job.get("title") or ""
            r_stage = job.get("rejection_stage") or "unknown"
            applied = _days_ago(job.get("applied_at"))
            _line = f"**{company}** — {title}  "
            _line += f"· rejected at _**{r_stage}**_ · applied {applied}"
            st.markdown(_line)
--- a/app/pages/6_Interview_Prep.py
+++ b/app/pages/6_Interview_Prep.py
@ -0,0 +1,371 @@
 # app/pages/6_Interview_Prep.py
 """
 Interview Prep — a clean, glanceable reference you can keep open during a call.
 Left panel  : talking points, company brief, CEO info, practice Q&A
 Right panel : job description, email / contact history, cover letter snippet
 """
 import sys
 from datetime import date
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import streamlit as st
 from scripts.db import (
    DEFAULT_DB, init_db,
    get_interview_jobs, get_contacts, get_research,
    get_task_for_job,
 )
 from scripts.task_runner import submit_task
 init_db(DEFAULT_DB)
 # ── Job selection ─────────────────────────────────────────────────────────────
 jobs_by_stage = get_interview_jobs(DEFAULT_DB)
 active_stages = ["phone_screen", "interviewing", "offer"]
 active_jobs = [
    j for stage in active_stages
    for j in jobs_by_stage.get(stage, [])
 ]
 if not active_jobs:
    st.title("📋 Interview Prep")
    st.info(
        "No active interviews found. "
        "Move a job to **Phone Screen** on the Interviews page first."
    )
    st.stop()
 # Allow pre-selecting via session state (e.g., from Interviews page)
 preselect_id = st.session_state.pop("prep_job_id", None)
 job_options = {
    j["id"]: f"{j['title']} — {j['company']} ({j['status'].replace('_', ' ').title()})"
    for j in active_jobs
 }
 ids = list(job_options.keys())
 # The selection lives in session state under a fixed widget key. Passing the
 # preselected position as `index=` would change the widget's parameters on the
 # next rerun (the one-shot "prep_job_id" is gone by then), which resets the
 # selectbox to the first job and points later actions at the wrong job.
 _job_select_key = "prep_selected_job_id"
 if preselect_id in ids:
    st.session_state[_job_select_key] = preselect_id
 elif st.session_state.get(_job_select_key) not in ids:
    # First visit, or the remembered job is no longer in an active stage.
    st.session_state[_job_select_key] = ids[0]
 # Kept for backward compatibility; the widget no longer takes `index=`.
 default_idx = ids.index(st.session_state[_job_select_key])
 selected_id = st.selectbox(
    "Job",
    options=ids,
    format_func=lambda x: job_options[x],
    key=_job_select_key,
    label_visibility="collapsed",
 )
 job = next(j for j in active_jobs if j["id"] == selected_id)
 # ── Header bar ────────────────────────────────────────────────────────────────
 # Title, a one-line status caption (stage, interview date with countdown,
 # applied date) and a link to the listing when a URL is stored.
 stage_label = job["status"].replace("_", " ").title()
 idate = job.get("interview_date")
 countdown = ""
 if idate:
    try:
        delta = (date.fromisoformat(idate) - date.today()).days
    except Exception:
        # Unparseable date: show the raw value without a countdown.
        countdown = ""
    else:
        if delta < 0:
            countdown = f"  (was {abs(delta)}d ago)"
        elif delta == 0:
            countdown = "  🔴 **TODAY**"
        elif delta == 1:
            countdown = "  🟡 **TOMORROW**"
        else:
            countdown = f"  🟢 in {delta} days"
 st.title(f"📋 {job.get('company')} — {job.get('title')}")
 _caption_parts = [f"Stage: **{stage_label}**"]
 if idate:
    _caption_parts.append(f"  ·  Interview: {idate}{countdown}")
 if job.get("applied_at"):
    _caption_parts.append(f"  ·  Applied: {job.get('applied_at', '')[:10]}")
 st.caption("".join(_caption_parts))
 if job.get("url"):
    st.link_button("Open job listing ↗", job["url"])
 st.divider()
 # ── Two-column layout ─────────────────────────────────────────────────────────
 col_prep, col_context = st.columns([2, 3])
 # ════════════════════════════════════════════════
 #  LEFT — prep materials
 # ════════════════════════════════════════════════
 with col_prep:
    research = get_research(DEFAULT_DB, job_id=selected_id)
    # Refresh / generate research
    _res_task = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
    _res_running = _res_task and _res_task["status"] in ("queued", "running")
    if not research:
        if not _res_running:
            st.warning("No research brief yet for this job.")
            if _res_task and _res_task["status"] == "failed":
                st.error(f"Last attempt failed: {_res_task.get('error', '')}")
            if st.button("🔬 Generate research brief", type="primary", use_container_width=True):
                submit_task(DEFAULT_DB, "company_research", selected_id)
                st.rerun()
        if _res_running:
            @st.fragment(run_every=3)
            def _res_status_initial():
                t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
                if t and t["status"] in ("queued", "running"):
                    stage = t.get("stage") or ""
                    lbl = "Queued…" if t["status"] == "queued" else (stage or "Generating… this may take 30–60 seconds")
                    st.info(f"⏳ {lbl}")
                else:
                    st.rerun()
            _res_status_initial()
        st.stop()
    else:
        generated_at = research.get("generated_at", "")
        col_ts, col_btn = st.columns([3, 1])
        col_ts.caption(f"Research generated: {generated_at}")
        if col_btn.button("🔄 Refresh", use_container_width=True, disabled=bool(_res_running)):
            submit_task(DEFAULT_DB, "company_research", selected_id)
            st.rerun()
        if _res_running:
            @st.fragment(run_every=3)
            def _res_status_refresh():
                t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
                if t and t["status"] in ("queued", "running"):
                    stage = t.get("stage") or ""
                    lbl = "Queued…" if t["status"] == "queued" else (stage or "Refreshing research…")
                    st.info(f"⏳ {lbl}")
                else:
                    st.rerun()
            _res_status_refresh()
        elif _res_task and _res_task["status"] == "failed":
            st.error(f"Refresh failed: {_res_task.get('error', '')}")
    st.divider()
    # ── Talking points (top — most useful during a call) ──────────────────────
    st.subheader("🎯 Talking Points")
    tp = (research.get("talking_points") or "").strip()
    if tp:
        st.markdown(tp)
    else:
        st.caption("_No talking points extracted — try regenerating._")
    st.divider()
    # ── Company brief ─────────────────────────────────────────────────────────
    st.subheader("🏢 Company Overview")
    st.markdown(research.get("company_brief", "_—_"))
    st.divider()
    # ── Leadership brief ──────────────────────────────────────────────────────
    st.subheader("👤 Leadership & Culture")
    st.markdown(research.get("ceo_brief", "_—_"))
    st.divider()
    # ── Tech Stack & Product ───────────────────────────────────────────────────
    tech = (research.get("tech_brief") or "").strip()
    if tech:
        st.subheader("⚙️ Tech Stack & Product")
        st.markdown(tech)
        st.divider()
    # ── Funding & Market Position ──────────────────────────────────────────────
    funding = (research.get("funding_brief") or "").strip()
    if funding:
        st.subheader("💰 Funding & Market Position")
        st.markdown(funding)
        st.divider()
    # ── Red Flags & Watch-outs ────────────────────────────────────────────────
    red = (research.get("red_flags") or "").strip()
    if red and "no significant red flags" not in red.lower():
        st.subheader("⚠️ Red Flags & Watch-outs")
        st.warning(red)
        st.divider()
    # ── Inclusion & Accessibility ─────────────────────────────────────────────
    access = (research.get("accessibility_brief") or "").strip()
    if access:
        st.subheader("♿ Inclusion & Accessibility")
        st.caption("For your personal evaluation — not disclosed in any application.")
        st.markdown(access)
        st.divider()
    # ── Practice Q&A (collapsible — use before the call) ─────────────────────
    # Mock phone screen: the LLM asks an opening question, then every answer
    # typed into the chat input is sent back together with the transcript so far.
    with st.expander("🎤 Practice Q&A (pre-call prep)", expanded=False):
        st.caption(
            "The LLM will play the interviewer. Type your answers below. "
            "Use this before the call to warm up."
        )
        # Transcript lives in session state, one list per job, as
        # {"role": "assistant" | "user", "content": str} dicts.
        qa_key = f"qa_{selected_id}"
        if qa_key not in st.session_state:
            st.session_state[qa_key] = []
        # Single interviewer persona shared by the opening question and every
        # follow-up turn, so the two calls cannot drift apart in tone.
        qa_system = (
            f"You are a recruiter at {job.get('company')} conducting "
            f"a phone screen for the {job.get('title')} role. "
            f"Ask one question at a time. After Alex answers, give "
            f"brief feedback (1–2 sentences), then ask your next question. "
            f"Be professional but warm."
        )
        if st.button("🔄 Start / Reset session", key=f"qa_reset_{selected_id}"):
            st.session_state[qa_key] = []
            st.rerun()
        # Display history
        for msg in st.session_state[qa_key]:
            with st.chat_message(msg["role"]):
                st.markdown(msg["content"])
        # Initial question if session is empty
        if not st.session_state[qa_key]:
            with st.spinner("Setting up your mock interview…"):
                try:
                    from scripts.llm_router import complete
                    opening = complete(
                        prompt=(
                            f"Start a mock phone screen for the {job.get('title')} "
                            f"role at {job.get('company')}. Ask your first question. "
                            f"Keep it realistic and concise."
                        ),
                        system=qa_system,
                    )
                    st.session_state[qa_key] = [{"role": "assistant", "content": opening}]
                    st.rerun()
                except Exception as e:
                    st.error(f"LLM error: {e}")
        # Answer input — ignored until the interviewer has asked something.
        answer = st.chat_input("Your answer…", key=f"qa_input_{selected_id}")
        if answer and st.session_state[qa_key]:
            history = st.session_state[qa_key]
            # Appended before the LLM call, so the typed answer stays in the
            # transcript even if the call below fails.
            history.append({"role": "user", "content": answer})
            with st.spinner("…"):
                try:
                    from scripts.llm_router import LLMRouter
                    router = LLMRouter()
                    # Build prompt from history for single-turn backends
                    convo = "\n\n".join(
                        f"{'Interviewer' if m['role'] == 'assistant' else 'Alex'}: {m['content']}"
                        for m in history
                    )
                    response = router.complete(
                        prompt=convo + "\n\nInterviewer:",
                        system=qa_system,
                    )
                    history.append({"role": "assistant", "content": response})
                    st.session_state[qa_key] = history
                    st.rerun()
                except Exception as e:
                    st.error(f"Error: {e}")
 # ════════════════════════════════════════════════
 #  RIGHT — context / reference
 # ════════════════════════════════════════════════
 with col_context:
    # Three reference tabs; the email and cover-letter tabs are filled further down.
    tab_jd, tab_emails, tab_letter = st.tabs(
        ["📄 Job Description", "📧 Email History", "📝 Cover Letter"]
    )
    with tab_jd:
        # Match badge: green at 70 and above, yellow at 40 and above, red otherwise.
        score = job.get("match_score")
        if score is not None:
            if score >= 70:
                dot = "🟢"
            elif score >= 40:
                dot = "🟡"
            else:
                dot = "🔴"
            st.caption(f"{dot} {score:.0f}% match")
        gaps = job.get("keyword_gaps")
        if gaps:
            st.caption(f"**Gaps to address:** {gaps}")
        st.markdown(job.get("description") or "_No description saved for this listing._")
    with tab_emails:
        # Logged contacts for this job, followed by an optional LLM-drafted
        # reply to the most recent inbound message.
        contacts = get_contacts(DEFAULT_DB, job_id=selected_id)
        if not contacts:
            st.info("No contacts logged yet. Use the Interviews page to log emails.")
        else:
            for c in contacts:
                icon = "📥" if c["direction"] == "inbound" else "📤"
                # Keep only the first 10 characters of the timestamp
                # (presumably the YYYY-MM-DD part — confirm the stored format).
                recv = (c.get("received_at") or "")[:10]
                st.markdown(
                    f"{icon} **{c.get('subject') or '(no subject)'}** · _{recv}_"
                )
                if c.get("from_addr"):
                    st.caption(f"From: {c['from_addr']}")
                if c.get("body"):
                    # Preview capped at 500 characters; ellipsis marks truncation.
                    st.text(c["body"][:500] + ("…" if len(c["body"]) > 500 else ""))
                st.divider()
            # Quick draft reply
            inbound = [c for c in contacts if c["direction"] == "inbound"]
            if inbound:
                # Last inbound entry in the order get_contacts returned them
                # (assumed oldest-first — verify against scripts.db).
                last = inbound[-1]
                if st.button("✍️ Draft reply to last email"):
                    with st.spinner("Drafting…"):
                        try:
                            from scripts.llm_router import complete
                            # `or ''` rather than a .get() default: a field that
                            # is present but None must not reach the prompt as
                            # the literal text "None".
                            draft = complete(
                                prompt=(
                                    f"Draft a professional, warm reply.\n\n"
                                    f"From: {last.get('from_addr') or ''}\n"
                                    f"Subject: {last.get('subject') or ''}\n\n"
                                    f"{last.get('body') or ''}\n\n"
                                    f"Context: Alex is a CS/TAM professional applying "
                                    f"for {job.get('title')} at {job.get('company')}."
                                ),
                                system=(
                                    "You are Alex Rivera's professional email assistant. "
                                    "Write concise, warm, and professional replies in her voice."
                                ),
                            )
                            st.session_state[f"draft_{selected_id}"] = draft
                        except Exception as e:
                            st.error(f"Draft failed: {e}")
                # The draft is kept in session state so it survives reruns.
                if f"draft_{selected_id}" in st.session_state:
                    st.text_area(
                        "Draft (edit before sending)",
                        value=st.session_state[f"draft_{selected_id}"],
                        height=180,
                    )
    with tab_letter:
        # Saved cover letter, or a placeholder when the field is empty or blank.
        letter_text = job.get("cover_letter") or ""
        letter_text = letter_text.strip()
        if not letter_text:
            st.info("No cover letter saved for this job.")
        else:
            st.markdown(letter_text)
    st.divider()
    # ── Notes (freeform, stored in session only — not persisted to DB) ────────
    st.subheader("📝 Call Notes")
    st.caption("Notes are per-session only — copy anything important before navigating away.")
    # Widget key includes the job id, so each job keeps its own notes box.
    notes_widget_key = f"notes_{selected_id}"
    st.text_area(
        "notes",
        key=notes_widget_key,
        height=200,
        placeholder="Type notes during or after the call…",
        label_visibility="collapsed",
    )
--- a/app/pages/7_Survey.py
+++ b/app/pages/7_Survey.py
@ -0,0 +1,274 @@
 # app/pages/7_Survey.py
 """
 Survey Assistant — real-time help with culture-fit surveys.
 Supports text paste and screenshot (via clipboard or file upload).
 Quick mode: "pick B" + one-liner. Detailed mode: option-by-option breakdown.
 """
 import base64
 import io
 import sys
 from datetime import datetime
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 import requests
 import streamlit as st
 from scripts.db import (
    DEFAULT_DB, init_db,
    get_interview_jobs, get_job_by_id,
    insert_survey_response, get_survey_responses,
 )
 from scripts.llm_router import LLMRouter
 # Page heading, then init_db on the default database before any query on
 # this page runs (presumably creates missing tables — confirm in scripts.db).
 st.title("📋 Survey Assistant")
 init_db(DEFAULT_DB)
 # ── Vision service health check ────────────────────────────────────────────────
 def _vision_available() -> bool:
    try:
        r = requests.get("http://localhost:8002/health", timeout=2)
        return r.status_code == 200
    except Exception:
        return False
 # Probed once per script run; decides whether the screenshot tab is offered.
 vision_up = _vision_available()
 # ── Job selector ───────────────────────────────────────────────────────────────
 # Jobs in the "survey" stage are listed first, followed by the other stages
 # in the fixed order below.
 jobs_by_stage = get_interview_jobs(DEFAULT_DB)
 survey_jobs = jobs_by_stage.get("survey", [])
 other_jobs = []
 for _stage in ("applied", "phone_screen", "interviewing", "offer"):
    other_jobs = other_jobs + jobs_by_stage.get(_stage, [])
 all_jobs = survey_jobs + other_jobs
 if not all_jobs:
    st.info("No active jobs found. Add jobs in Job Review first.")
    st.stop()
 # Display label per job id, shown in the dropdown as "Company — Title".
 job_labels = {}
 for _job in all_jobs:
    job_labels[_job["id"]] = f"{_job.get('company', '?')} — {_job.get('title', '')}"
 selected_job_id = st.selectbox(
    "Job",
    options=[_job["id"] for _job in all_jobs],
    format_func=job_labels.__getitem__,
    index=0,
 )
 selected_job = get_job_by_id(DEFAULT_DB, selected_job_id)
 # ── LLM prompt builders ────────────────────────────────────────────────────────
 # System prompt for the survey-advisor persona. Passed as `system=` on the
 # pasted-text analysis call; the screenshot call sends no system prompt and
 # instead repeats the candidate framing inside its own prompt text.
 _SURVEY_SYSTEM = (
    "You are a job application advisor helping a candidate answer a culture-fit survey. "
    "The candidate values collaborative teamwork, clear communication, growth, and impact. "
    "Choose answers that present them in the best professional light."
 )
 def _build_text_prompt(text: str, mode: str) -> str:
    if mode == "Quick":
        return (
            "Answer each survey question below. For each, give ONLY the letter of the best "
            "option and a single-sentence reason. Format exactly as:\n"
            "1. B — reason here\n2. A — reason here\n\n"
            f"Survey:\n{text}"
        )
    return (
        "Analyze each survey question below. For each question:\n"
        "- Briefly evaluate each option (1 sentence each)\n"
        "- State your recommendation with reasoning\n\n"
        f"Survey:\n{text}"
    )
 def _build_image_prompt(mode: str) -> str:
    if mode == "Quick":
        return (
            "This is a screenshot of a culture-fit survey. Read all questions and answer each "
            "with the letter of the best option for a collaborative, growth-oriented candidate. "
            "Format: '1. B — brief reason' on separate lines."
        )
    return (
        "This is a screenshot of a culture-fit survey. For each question, evaluate each option "
        "and recommend the best choice for a collaborative, growth-oriented candidate. "
        "Include a brief breakdown per option and a clear recommendation."
    )
 # ── Layout ─────────────────────────────────────────────────────────────────────
 # Two equal-width columns: inputs on the left, analysis and save form on the right.
 left_col, right_col = st.columns([1, 1], gap="large")
 with left_col:
    survey_name = st.text_input(
        "Survey name (optional)",
        key="survey_name",
        placeholder="e.g. Culture Fit Round 1",
    )
    mode = st.radio("Mode", ["Quick", "Detailed"], key="survey_mode", horizontal=True)
    st.caption(
        "**Quick** — best answer + one-liner per question  |  "
        "**Detailed** — option-by-option breakdown"
    )
    # Input tabs: without the vision service only text input is offered, and
    # a plain container stands in for the text tab.
    if not vision_up:
        st.info(
            "📷 Screenshot input unavailable — vision service not running.  \n"
            "Start it with: `bash scripts/manage-vision.sh start`"
        )
        tab_text = st.container()
        tab_screenshot = None
    else:
        tab_text, tab_screenshot = st.tabs(["📝 Paste Text", "🖼️ Screenshot"])
    # Defaults for this run; overwritten below when the user supplies input.
    image_b64: str | None = None
    raw_text: str = ""
    _text_placeholder = (
        "Q1: Which describes your ideal work environment?\n"
        "A. Solo focused work\nB. Collaborative team\n"
        "C. Mix of both\nD. Depends on the task"
    )
    with tab_text:
        raw_text = st.text_area(
            "Paste survey questions here",
            key="survey_text",
            height=280,
            placeholder=_text_placeholder,
        )
    # Screenshot tab exists only when the vision service answered its health check.
    if tab_screenshot is not None:
        with tab_screenshot:
            st.caption("Paste from clipboard or upload a screenshot file.")
            paste_col, upload_col = st.columns(2)
            with paste_col:
                # Clipboard paste relies on an optional package; fall back to
                # a warning when it cannot be imported.
                try:
                    from streamlit_paste_button import paste_image_button
                    pasted = paste_image_button("📋 Paste from clipboard", key="paste_btn")
                    pasted_image = pasted.image_data if pasted else None
                    if pasted_image:
                        # Re-encode the pasted image as PNG, then base64 it.
                        png_buffer = io.BytesIO()
                        pasted_image.save(png_buffer, format="PNG")
                        png_bytes = png_buffer.getvalue()
                        image_b64 = base64.b64encode(png_bytes).decode()
                        st.image(
                            pasted_image,
                            caption="Pasted image",
                            use_container_width=True,
                        )
                except ImportError:
                    st.warning("streamlit-paste-button not installed. Use file upload.")
            with upload_col:
                upload = st.file_uploader(
                    "Upload screenshot",
                    type=["png", "jpg", "jpeg"],
                    key="survey_upload",
                    label_visibility="collapsed",
                )
                # An uploaded file overrides a pasted image, since it is assigned last.
                if upload:
                    upload_bytes = upload.read()
                    image_b64 = base64.b64encode(upload_bytes).decode()
                    st.image(upload, caption="Uploaded image", use_container_width=True)
    # Analyze button — disabled until there is pasted text or an image.
    nothing_to_analyze = not raw_text.strip() and not image_b64
    if st.button(
        "🔍 Analyze",
        type="primary",
        disabled=nothing_to_analyze,
        use_container_width=True,
    ):
        with st.spinner("Analyzing…"):
            try:
                router = LLMRouter()
                # An image takes precedence over pasted text when both are present.
                if image_b64:
                    output = router.complete(
                        _build_image_prompt(mode),
                        images=[image_b64],
                        fallback_order=router.config.get("vision_fallback_order"),
                    )
                    source = "screenshot"
                else:
                    output = router.complete(
                        _build_text_prompt(raw_text, mode),
                        system=_SURVEY_SYSTEM,
                        fallback_order=router.config.get("research_fallback_order"),
                    )
                    source = "text_paste"
                # Stash the result and its inputs so the save form on the
                # right can read them on later reruns.
                for _state_key, _state_value in (
                    ("survey_output", output),
                    ("survey_source", source),
                    ("survey_image_b64", image_b64),
                    ("survey_raw_text", raw_text),
                ):
                    st.session_state[_state_key] = _state_value
            except Exception as e:
                st.error(f"Analysis failed: {e}")
 with right_col:
    # Shows the latest analysis held in session state and offers a form to
    # store it (plus the screenshot, if any) against the selected job.
    output = st.session_state.get("survey_output")
    if output:
        st.markdown("### Analysis")
        st.markdown(output)
        st.divider()
        with st.form("save_survey_form"):
            reported_score = st.text_input(
                "Reported score (optional)",
                placeholder="e.g. 82% or 4.2/5",
                key="reported_score_input",
            )
            if st.form_submit_button("💾 Save to Job"):
                # Inputs captured when the analysis was run.
                source = st.session_state.get("survey_source", "text_paste")
                image_b64_saved = st.session_state.get("survey_image_b64")
                raw_text_saved = st.session_state.get("survey_raw_text", "")
                image_path = ""
                if image_b64_saved:
                    image_bytes = base64.b64decode(image_b64_saved)
                    # The uploader accepts png/jpg/jpeg and uploads are stored
                    # unconverted, so choose the suffix from the content: JPEG
                    # data starts with FF D8 FF. Everything else keeps ".png".
                    suffix = ".jpg" if image_bytes.startswith(b"\xff\xd8\xff") else ".png"
                    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
                    # data/survey_screenshots/<job id>/ under the repository root.
                    save_dir = (
                        Path(__file__).parent.parent.parent
                        / "data"
                        / "survey_screenshots"
                        / str(selected_job_id)
                    )
                    save_dir.mkdir(parents=True, exist_ok=True)
                    img_file = save_dir / f"{ts}{suffix}"
                    img_file.write_bytes(image_bytes)
                    image_path = str(img_file)
                # NOTE(review): survey_name and mode are the widgets' current
                # values, which may differ from those used for the analysis.
                insert_survey_response(
                    DEFAULT_DB,
                    job_id=selected_job_id,
                    survey_name=survey_name,
                    source=source,
                    raw_input=raw_text_saved,
                    image_path=image_path,
                    mode=mode.lower(),
                    llm_output=output,
                    reported_score=reported_score,
                )
                st.success("Saved!")
                # Clear the shown analysis and rerun so the history list
                # below is rebuilt with the new row.
                del st.session_state["survey_output"]
                st.rerun()
    else:
        st.markdown("### Analysis")
        st.caption("Results will appear here after analysis.")
 # ── History ────────────────────────────────────────────────────────────────────
 # Saved responses for the selected job, one expander per response.
 st.divider()
 st.subheader("📂 Response History")
 history = get_survey_responses(DEFAULT_DB, job_id=selected_job_id)
 if not history:
    st.caption("No saved responses for this job yet.")
 else:
    for idx, resp in enumerate(history):
        label = resp.get("survey_name") or "Survey response"
        # First 16 characters of the timestamp (presumably
        # "YYYY-MM-DD HH:MM" — confirm the stored format).
        ts = (resp.get("created_at") or "")[:16]
        score = resp.get("reported_score")
        score_str = f" · Score: {score}" if score else ""
        with st.expander(f"{label} · {ts}{score_str}"):
            # `or` instead of a .get() default so stored NULLs show as "?".
            st.caption(
                f"Mode: {resp.get('mode') or '?'} · Source: {resp.get('source') or '?'}"
            )
            raw_input = resp.get("raw_input")
            # A checkbox replaces the former inner expander: Streamlit 1.35,
            # the minimum pinned in environment.yml, raises
            # StreamlitAPIException for an expander inside another expander.
            # The key must be unique per row.
            if raw_input and st.checkbox(
                "Show original input",
                key=f"survey_hist_raw_{selected_job_id}_{idx}",
            ):
                st.text(raw_input)
            st.markdown(resp.get("llm_output") or "")
--- a/config/adzuna.yaml.example
+++ b/config/adzuna.yaml.example
@ -0,0 +1,5 @@
 # Adzuna Jobs API credentials
 # Register at https://developer.adzuna.com/admin/applications
 # Both app_id and app_key are required.
 app_id: ""   # short alphanumeric ID from your developer dashboard
 app_key: ""  # 32-character hex key from your developer dashboard
--- a/config/blocklist.yaml
+++ b/config/blocklist.yaml
@ -0,0 +1,15 @@
 # Discovery blocklist — entries matching any rule are silently dropped before DB insert.
 # Applies globally across all search profiles and custom boards.
 # Company name blocklist — partial case-insensitive match on the company field.
 # e.g. "Amazon" blocks any listing where company contains "amazon".
 companies: []
 # Industry/content blocklist — blocked if company name OR job description contains any keyword.
 # Use this for industries you will never work in regardless of company.
 # e.g. "gambling", "crypto", "tobacco", "defense"
 industries: []
 # Location blocklist — blocked if the location field contains any of these strings.
 # e.g. "Dallas", "Austin, TX"
 locations: []
--- a/config/craigslist.yaml.example
+++ b/config/craigslist.yaml.example
@ -0,0 +1,24 @@
 # Craigslist metro subdomains to search.
 # Copy to config/craigslist.yaml and adjust for your markets.
 # Full subdomain list: https://www.craigslist.org/about/sites
 metros:
  - sfbay
  - newyork
  - chicago
  - losangeles
  - seattle
  - austin
 # Maps search profile location strings → Craigslist metro subdomain.
 # Locations not listed here are silently skipped.
 location_map:
  "San Francisco Bay Area, CA": sfbay
  "New York, NY": newyork
  "Chicago, IL": chicago
  "Los Angeles, CA": losangeles
  "Seattle, WA": seattle
  "Austin, TX": austin
 # Craigslist job category. Defaults to 'jjj' (general jobs) if omitted.
 # Other options: csr (customer service), mar (marketing), sof (software/qa/dba)
 # category: jjj
--- a/config/email.yaml.example
+++ b/config/email.yaml.example
@ -0,0 +1,38 @@
 # config/email.yaml — IMAP email sync configuration
 # Copy this to config/email.yaml and fill in your credentials.
 # config/email.yaml is gitignored — never commit real credentials.
 #
 # Gmail setup:
 #   1. Enable IMAP: Gmail Settings → See all settings → Forwarding and POP/IMAP
 #   2. Create App Password: myaccount.google.com/apppasswords
 #      (requires 2-Step Verification to be enabled)
 #   3. Use your Gmail address as username, App Password as password.
 #
 # Outlook / Office 365:
 #   host: outlook.office365.com
 #   port: 993
 #   use_ssl: true
 #   (Use your regular email + password, or an App Password if MFA is enabled)
 host: imap.gmail.com
 port: 993
 use_ssl: true
 # Your full email address
 username: your.email@gmail.com
 # Gmail: use an App Password (16-char code, no spaces)
 # Other providers: use your regular password (or App Password if MFA enabled)
 password: xxxx-xxxx-xxxx-xxxx
 # Sent folder name — leave blank to auto-detect
 # Gmail: "[Gmail]/Sent Mail"   Outlook: "Sent Items"   Generic: "Sent"
 sent_folder: ""
 # How many days back to search (90 = ~3 months)
 lookback_days: 90
 # Optional: Gmail label to scan for action-needed emails (e.g. "TO DO JOBS").
 # Emails in this label are matched to pipeline jobs by company name, then
 # filtered by action keywords in the subject. Leave blank to disable.
 todo_label: ""
--- a/config/llm.yaml
+++ b/config/llm.yaml
@ -0,0 +1,66 @@
 backends:
  anthropic:
    api_key_env: ANTHROPIC_API_KEY
    enabled: false
    model: claude-sonnet-4-6
    type: anthropic
    supports_images: true
  claude_code:
    api_key: any
    base_url: http://localhost:3009/v1
    enabled: false
    model: claude-code-terminal
    type: openai_compat
    supports_images: true
  github_copilot:
    api_key: any
    base_url: http://localhost:3010/v1
    enabled: false
    model: gpt-4o
    type: openai_compat
    supports_images: false
  ollama:
    api_key: ollama
    base_url: http://localhost:11434/v1
    enabled: true
    model: alex-cover-writer:latest
    type: openai_compat
    supports_images: false
  ollama_research:
    api_key: ollama
    base_url: http://localhost:11434/v1
    enabled: true
    model: llama3.1:8b
    type: openai_compat
    supports_images: false
  vllm:
    api_key: ''
    base_url: http://localhost:8000/v1
    enabled: true
    model: __auto__
    type: openai_compat
    supports_images: false
  vision_service:
    base_url: http://localhost:8002
    enabled: false
    type: vision_service
    supports_images: true
 fallback_order:
 - ollama
 - claude_code
 - vllm
 - github_copilot
 - anthropic
 research_fallback_order:
 - claude_code
 - vllm
 - ollama_research
 - github_copilot
 - anthropic
 vision_fallback_order:
 - vision_service
 - claude_code
 - anthropic
 # Note: 'ollama' (alex-cover-writer) is intentionally excluded from
 # research_fallback_order — research must never use the fine-tuned writer
 # model, and this also avoids evicting the writer from GPU memory while a
 # cover letter task is in flight.
--- a/config/llm.yaml.example
+++ b/config/llm.yaml.example
@ -0,0 +1,66 @@
 backends:
  anthropic:
    api_key_env: ANTHROPIC_API_KEY
    enabled: false
    model: claude-sonnet-4-6
    type: anthropic
    supports_images: true
  claude_code:
    api_key: any
    base_url: http://localhost:3009/v1
    enabled: false
    model: claude-code-terminal
    type: openai_compat
    supports_images: true
  github_copilot:
    api_key: any
    base_url: http://localhost:3010/v1
    enabled: false
    model: gpt-4o
    type: openai_compat
    supports_images: false
  ollama:
    api_key: ollama
    base_url: http://localhost:11434/v1
    enabled: true
    model: alex-cover-writer:latest
    type: openai_compat
    supports_images: false
  ollama_research:
    api_key: ollama
    base_url: http://localhost:11434/v1
    enabled: true
    model: llama3.1:8b
    type: openai_compat
    supports_images: false
  vllm:
    api_key: ''
    base_url: http://localhost:8000/v1
    enabled: true
    model: __auto__
    type: openai_compat
    supports_images: false
  vision_service:
    base_url: http://localhost:8002
    enabled: false
    type: vision_service
    supports_images: true
 fallback_order:
 - ollama
 - claude_code
 - vllm
 - github_copilot
 - anthropic
 research_fallback_order:
 - claude_code
 - vllm
 - ollama_research
 - github_copilot
 - anthropic
 vision_fallback_order:
 - vision_service
 - claude_code
 - anthropic
 # Note: 'ollama' (alex-cover-writer) is intentionally excluded from
 # research_fallback_order — research must never use the fine-tuned writer
 # model, and this also avoids evicting the writer from GPU memory while a
 # cover letter task is in flight.
--- a/config/notion.yaml.example
+++ b/config/notion.yaml.example
@ -0,0 +1,24 @@
 # Copy to config/notion.yaml and fill in your values.
 # notion.yaml is gitignored — never commit it.
 #
 # Get your integration token from: https://www.notion.so/my-integrations
 # Then share the "Tracking Job Applications" database with your integration:
 #   Open the DB in Notion → ... menu → Add connections → select your integration
 #
 token: "secret_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
 database_id: "1bd75cff-7708-8007-8c00-f1de36620a0a"
 field_map:
  title_field: "Salary"
  job_title: "Job Title"
  company: "Company Name"
  url: "Role Link"
  source: "Job Source"
  status: "Status of Application"
  status_new: "Application Submitted"
  date_found: "Date Found"
  remote: "Remote"
  match_score: "Match Score"
  keyword_gaps: "Keyword Gaps"
  notes: "Notes"
  job_description: "Job Description"
--- a/config/resume_keywords.yaml
+++ b/config/resume_keywords.yaml
@ -0,0 +1,23 @@
 domains:
 - B2B SaaS
 - enterprise software
 - security
 - compliance
 - post-sale lifecycle
 - SaaS metrics
 - web security
 keywords:
 - churn reduction
 - escalation management
 - cross-functional
 - product feedback loop
 - customer advocacy
 skills:
 - Customer Success
 - Technical Account Management
 - Revenue Operations
 - data analysis
 - stakeholder management
 - project management
 - onboarding
 - renewal management
--- a/config/resume_keywords.yaml.example
+++ b/config/resume_keywords.yaml.example
@ -0,0 +1,33 @@
 skills:
  - Customer Success
  - Technical Account Management
  - Revenue Operations
  - Salesforce
  - Gainsight
  - data analysis
  - stakeholder management
  - project management
  - onboarding
  - renewal management
 domains:
  - B2B SaaS
  - enterprise software
  - security
  - compliance
  - post-sale lifecycle
  - SaaS metrics
 keywords:
  - QBR
  - churn reduction
  - NRR
  - ARR
  - MRR
  - executive sponsorship
  - VOC
  - health score
  - escalation management
  - cross-functional
  - product feedback loop
  - customer advocacy
--- a/config/search_profiles.yaml
+++ b/config/search_profiles.yaml
@ -0,0 +1,123 @@
 profiles:
 - boards:
  - linkedin
  - indeed
  - glassdoor
  - zip_recruiter
  - google
  custom_boards:
  - adzuna
  - theladders
  - craigslist
  exclude_keywords:
  - sales
  - account executive
  - sales engineer
  - SDR
  - BDR
  - business development
  - sales development
  - sales manager
  - sales representative
  - sales rep
  hours_old: 240
  locations:
  - Remote
  - San Francisco Bay Area, CA
  name: cs_leadership
  results_per_board: 75
  titles:
  - Customer Success Manager
  - Customer Engagement Manager
  - Director of Customer Success
  - VP Customer Success
  - Head of Customer Success
  - Technical Account Manager
  - TAM
  - Customer Experience Lead
  - CSM
  - CX
  - Customer Success Consultant
 - boards:
  - linkedin
  - indeed
  custom_boards:
  - adzuna
  - craigslist
  exclude_keywords:
  - sales
  - account executive
  - SDR
  - BDR
  - sales development
  hours_old: 336
  locations:
  - Remote
  - San Francisco Bay Area, CA
  mission_tags:
  - music
  name: music_industry
  results_per_board: 50
  titles:
  - Customer Success Manager
  - Partner Success Manager
  - Artist Success Manager
  - Creator Success Manager
  - Technical Account Manager
  - Community Manager
  - Account Manager
  - Label Relations Manager
 - boards:
  - linkedin
  - indeed
  custom_boards:
  - adzuna
  - craigslist
  exclude_keywords:
  - sales
  - account executive
  - SDR
  - BDR
  hours_old: 336
  locations:
  - Remote
  - San Francisco Bay Area, CA
  mission_tags:
  - animal_welfare
  name: animal_welfare
  results_per_board: 50
  titles:
  - Customer Success Manager
  - Program Manager
  - Community Engagement Manager
  - Operations Manager
  - Partner Success Manager
  - Account Manager
  - Development Manager
 - boards:
  - linkedin
  - indeed
  custom_boards:
  - adzuna
  - craigslist
  exclude_keywords:
  - sales
  - account executive
  - SDR
  - BDR
  hours_old: 336
  locations:
  - Remote
  - San Francisco Bay Area, CA
  mission_tags:
  - education
  name: education
  results_per_board: 50
  titles:
  - Customer Success Manager
  - District Success Manager
  - Implementation Specialist
  - Partner Success Manager
  - Account Manager
  - School Success Manager
  - Customer Experience Manager
--- a/data/survey_screenshots/.gitkeep
+++ b/data/survey_screenshots/.gitkeep
--- a/environment.yml
+++ b/environment.yml
@ -0,0 +1,68 @@
 name: job-seeker
 # Recreate: conda env create -f environment.yml
 # Update pinned snapshot: conda env export --no-builds > environment.yml
 channels:
  - conda-forge
  - defaults
 dependencies:
  - python=3.12
  - pip
  - pip:
    # ── Web UI ────────────────────────────────────────────────────────────────
    - streamlit>=1.35
    - watchdog            # live reload
    - reportlab>=4.0      # PDF cover letter export
    - pandas>=2.0
    - pyarrow             # streamlit data tables
    - streamlit-paste-button>=0.1.0
    # ── Job scraping ──────────────────────────────────────────────────────────
    - python-jobspy>=1.1
    - playwright          # browser automation (run: playwright install chromium)
    - selenium
    - undetected-chromedriver
    - webdriver-manager
    - beautifulsoup4
    - requests
    - curl_cffi           # Chrome TLS fingerprint — bypasses Cloudflare on The Ladders
    - fake-useragent      # company scraper rotation
    # ── LLM / AI backends ─────────────────────────────────────────────────────
    - openai>=1.0         # used for OpenAI-compat backends (ollama, vllm, wrappers)
    - anthropic>=0.80     # direct Anthropic API fallback
    - ollama              # Python client for Ollama management
    - langchain>=0.2
    - langchain-openai
    - langchain-anthropic
    - langchain-ollama
    - langchain-community
    - langchain-google-genai
    - google-generativeai
    - tiktoken
    # ── Resume matching ───────────────────────────────────────────────────────
    - scikit-learn>=1.3
    - rapidfuzz
    - lib-resume-builder-aihawk
    # ── Notion integration ────────────────────────────────────────────────────
    - notion-client>=3.0
    # ── Document handling ─────────────────────────────────────────────────────
    - pypdf
    - pdfminer-six
    - pyyaml>=6.0
    - python-dotenv
    # ── Utilities ─────────────────────────────────────────────────────────────
    - sqlalchemy
    - tqdm
    - loguru
    - rich
    - tenacity
    - httpx
    # ── Testing ───────────────────────────────────────────────────────────────
    - pytest>=9.0
    - pytest-cov
    - pytest-mock
--- a/pytest.ini
+++ b/pytest.ini
@ -0,0 +1,2 @@
 [pytest]
 testpaths = tests
--- a/scripts/init.py
+++ b/scripts/init.py
--- a/scripts/company_research.py
+++ b/scripts/company_research.py
@ -0,0 +1,468 @@
 # scripts/company_research.py
 """
 Pre-interview company research generator.
 Three-phase approach:
  1. If SearXNG is available (port 8888), use companyScraper.py to fetch live
     data: CEO name, HQ address, LinkedIn, contact info.
  1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
      recent news snippets (funding, launches, leadership changes, etc.).
  2. Feed all real data into an LLM prompt to synthesise a structured brief
     covering company overview, leadership, recent developments, and talking
     points tailored to Alex.
 Falls back to pure LLM knowledge when SearXNG is offline.
 Usage (standalone):
    conda run -n job-seeker python scripts/company_research.py --job-id 42
    conda run -n job-seeker python scripts/company_research.py --job-id 42 --no-scrape
 """
 import re
 import sys
 from pathlib import Path
 from types import SimpleNamespace
 sys.path.insert(0, str(Path(__file__).parent.parent))
 # ── SearXNG scraper integration ───────────────────────────────────────────────
 # Optional dependency: companyScraper is imported from a directory outside this
 # repository. When that directory is missing or the import fails,
 # _SCRAPER_AVAILABLE stays False and research_company() skips the live scrape.
 # NOTE(review): the path below is hard-coded and machine-specific — consider
 # making it configurable.
 _SCRAPER_DIR = Path("/Library/Development/scrapers")
 _SCRAPER_AVAILABLE = False
 if _SCRAPER_DIR.exists():
    # Put the scraper directory first on sys.path so the bare module import resolves.
    sys.path.insert(0, str(_SCRAPER_DIR))
    try:
        from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
        _SCRAPER_AVAILABLE = True
    except (ImportError, SystemExit):
        # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed,
        # so SystemExit is treated like a failed import rather than ending the process.
        pass
 def _searxng_running() -> bool:
    """Return True when the local SearXNG instance answers HTTP 200.

    Any failure (import error, connection refused, timeout) counts as
    "not running".
    """
    try:
        import requests
        probe = requests.get("http://localhost:8888/", timeout=3)
    except Exception:
        return False
    return probe.status_code == 200
 def _scrape_company(company: str) -> dict:
    """
    Run companyScraper in minimal mode to collect live CEO / HQ / LinkedIn data.

    Returns a dict with keys ``ceo``, ``headquarters`` and ``linkedin``; each
    starts as 'Not found' and is replaced by whatever the scraper extracts.
    """
    searxng_base = "http://localhost:8888/"
    # The scraper expects an argparse-style namespace, so fake its CLI options.
    cli_options = {
        "mode": "minimal",
        "verbose": False,
        "dry_run": False,
        "debug": False,
        "use_cache": True,
        "save_raw": False,
        "target_staff": None,
        "include_types": None,
        "exclude_types": None,
        "include_contact": False,
        "include_address": False,
        "include_social": True,   # grab LinkedIn while we're at it
        "timeout": 20,
        "input_file": None,
        "output_file": "/dev/null",
        "searxng_url": searxng_base,
    }
    fake_args = SimpleNamespace(**cli_options)
    # Override the singleton Config URL before the scraper is constructed.
    _ScraperConfig.SEARXNG_URL = searxng_base
    scraper = EnhancedCompanyScraper(fake_args)
    scraper.companies = [company]
    found: dict = {"ceo": "Not found", "headquarters": "Not found", "linkedin": "Not found"}
    # Same order as the scraper's own flow: CEO, then HQ, then social links.
    ceo_html = scraper.search_company(company, "ceo")
    found["ceo"] = scraper.extract_ceo(ceo_html, company)
    hq_html = scraper.search_company(company, "hq")
    found["headquarters"] = scraper.extract_address(hq_html, company)
    social_html = scraper.search_company(company, "social")
    social_links = scraper.extract_social(social_html, company)
    # The social result is a ';'-separated string; keep only the first LinkedIn entry.
    linkedin_entry = next(
        (piece.strip() for piece in (social_links or "").split(";") if "linkedin" in piece.lower()),
        None,
    )
    if linkedin_entry is not None:
        found["linkedin"] = linkedin_entry
    return found
 # Query templates keyed by search type. The "{company}" and "{ceo}" markers are
 # filled in with str.replace() by _fetch_search_data(), not str.format().
 # NOTE(review): the "news" template hard-codes the years 2025/2026 and will go stale.
 _SEARCH_QUERIES = {
    "news":          '"{company}" news 2025 2026',
    "funding":       '"{company}" funding round investors Series valuation',
    "tech":          '"{company}" tech stack engineering technology platform',
    "competitors":   '"{company}" competitors alternatives vs market',
    "culture":       '"{company}" glassdoor culture reviews employees',
    "accessibility": '"{company}" ADA accessibility disability inclusion accommodation ERG',
    "ceo_press":     '"{ceo}" "{company}"',  # only used if ceo is known
 }
 def _run_search_query(query: str, results: dict, key: str) -> None:
    """Thread target: run one SearXNG JSON query, store up to 4 snippets in results[key]."""
    import requests
    snippets: list[str] = []
    seen: set[str] = set()
    try:
        resp = requests.get(
            "http://localhost:8888/search",
            params={"q": query, "format": "json", "language": "en-US"},
            timeout=12,
        )
        if resp.status_code != 200:
            return
        for r in resp.json().get("results", [])[:4]:
            url = r.get("url", "")
            if url in seen:
                continue
            seen.add(url)
            title = r.get("title", "").strip()
            content = r.get("content", "").strip()
            if title or content:
                snippets.append(f"- **{title}**\n  {content}\n  <{url}>")
    except Exception:
        pass
    results[key] = "\n\n".join(snippets)
 def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]:
    """
    Run all search queries in parallel threads.

    Returns a dict keyed by search type (news, funding, tech, competitors,
    culture, accessibility, and ceo_press when a CEO name is known).
    Missing/failed/timed-out queries produce empty strings.

    Args:
        company: Company name substituted into every query template.
        ceo: CEO name for the ``ceo_press`` query. An empty value or the
            scraper's "Not found" sentinel skips only that one query.
    """
    import threading
    ceo = ceo or ""
    ceo_known = bool(ceo.strip()) and ceo.strip().lower() != "not found"
    results: dict[str, str] = {}
    workers = []
    for key, pattern in _SEARCH_QUERIES.items():
        # Only the CEO query depends on the CEO name. The previous condition
        # ("A and B or C") parsed as "(A and B) or C", so a "Not found" CEO
        # skipped every query instead of just this one.
        if key == "ceo_press" and not ceo_known:
            continue
        # Use replace() not .format() — company names may contain curly braces
        query = pattern.replace("{company}", company).replace("{ceo}", ceo)
        worker = threading.Thread(
            target=_run_search_query,
            args=(query, results, key),
            daemon=True,
        )
        workers.append((key, worker))
        worker.start()
    for key, worker in workers:
        worker.join(timeout=15)
        # The thread may still be running after the timeout, or may have exited
        # without writing; pre-populate the key so the "empty string" contract holds.
        results.setdefault(key, "")
    return results
 def _parse_sections(text: str) -> dict[str, str]:
    """Split LLM markdown output on ## headers into named sections."""
    sections: dict[str, str] = {}
    pattern = re.compile(r"^##\s+(.+)$", re.MULTILINE)
    matches = list(pattern.finditer(text))
    for i, match in enumerate(matches):
        name = match.group(1).strip()
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        sections[name] = text[start:end].strip()
    return sections
 # Resume data lives under <repo>/aihawk/data_folder; keyword groups under <repo>/config.
 # Both files are optional — _load_resume_and_keywords() checks .exists() before reading.
 _RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
 _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
 # Companies where Alex has an NDA — reference as generic label unless
 # the role is security-focused (score >= 3 matching JD keywords).
 # Entries are lower-case; lookups lower-case the company name before comparing.
 _NDA_COMPANIES = {"upguard"}
 def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
    """Score each experience entry by keyword overlap with JD; return sorted descending."""
    jd_lower = jd.lower()
    scored = []
    for exp in experiences:
        text = " ".join([
            exp.get("position", ""),
            exp.get("company", ""),
            " ".join(
                v
                for resp in exp.get("key_responsibilities", [])
                for v in resp.values()
            ),
        ]).lower()
        score = sum(1 for kw in keywords if kw.lower() in text and kw.lower() in jd_lower)
        scored.append({**exp, "score": score})
    return sorted(scored, key=lambda x: x["score"], reverse=True)
 def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
    """
    Build the resume section of the LLM context block.
    Top 2 scored experiences included in full detail; rest as one-liners.
    Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)'
    unless the role is security-focused (score >= 3).
    """
    experiences = resume.get("experience_details", [])
    if not experiences:
        return ""
    scored = _score_experiences(experiences, keywords, jd)
    top2 = scored[:2]
    rest = scored[2:]
    def _company_label(exp: dict) -> str:
        company = exp.get("company", "")
        if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
            return "enterprise security vendor (NDA)"
        return company
    def _exp_header(exp: dict) -> str:
        return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})"
    def _exp_bullets(exp: dict) -> str:
        bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()]
        return "\n".join(f"  - {b}" for b in bullets)
    lines = ["## Alex's Matched Experience"]
    for exp in top2:
        lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})")
        lines.append(_exp_bullets(exp))
    if rest:
        condensed = ", ".join(_exp_header(e) for e in rest)
        lines.append(f"\nAlso in Alex's background: {condensed}")
    return "\n".join(lines)
 def _load_resume_and_keywords() -> tuple[dict, list[str]]:
    """Read the resume YAML and the keyword config.

    Returns ``(resume_dict, keywords)``. A missing or empty file contributes
    an empty dict / list. Keywords are flattened from every list-valued entry
    of the keyword config; non-list values are ignored.
    """
    import yaml as _yaml
    if _RESUME_YAML.exists():
        resume = _yaml.safe_load(_RESUME_YAML.read_text()) or {}
    else:
        resume = {}
    keywords: list[str] = []
    if _KEYWORDS_YAML.exists():
        groups = _yaml.safe_load(_KEYWORDS_YAML.read_text()) or {}
        keywords = [
            keyword
            for group in groups.values()
            if isinstance(group, list)
            for keyword in group
        ]
    return resume, keywords
 def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict:
    """
    Generate a pre-interview research brief for a job.
    Parameters
    ----------
    job : dict
        Job row from the DB (needs at least 'company', 'title', 'description').
    use_scraper : bool
        Whether to attempt live data via SearXNG before falling back to LLM.
    on_stage : callable, optional
        Called with a short progress message (str) as each phase starts.
        Exceptions raised by the callback are swallowed.
    Returns
    -------
    dict with keys: raw_output, company_brief, ceo_brief, tech_brief,
    funding_brief, competitors_brief, red_flags, accessibility_brief,
    talking_points, scrape_used (1 when the live scrape returned a result or
    any search snippet is non-empty, else 0).
    """
    from scripts.llm_router import LLMRouter
    router = LLMRouter()
    # Prefer a research-specific backend order when configured; otherwise use the default order.
    research_order = router.config.get("research_fallback_order") or router.config["fallback_order"]
    company = job.get("company") or "the company"
    title = job.get("title") or "this role"
    # Only the first 1500 characters of the description are used, both for
    # keyword matching and in the prompt.
    jd_excerpt = (job.get("description") or "")[:1500]
    resume, keywords = _load_resume_and_keywords()
    matched_keywords = [kw for kw in keywords if kw.lower() in jd_excerpt.lower()]
    resume_context = _build_resume_context(resume, keywords, jd_excerpt)
    keywords_note = (
        f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}"
        if matched_keywords else ""
    )
    def _stage(msg: str) -> None:
        if on_stage:
            try:
                on_stage(msg)
            except Exception:
                pass  # never let stage callbacks break the task
    # ── Phase 1: live scrape (optional) ──────────────────────────────────────
    live_data: dict = {}
    scrape_note = ""
    _stage("Checking for live company data…")
    if use_scraper and _SCRAPER_AVAILABLE and _searxng_running():
        _stage("Scraping CEO & HQ data…")
        try:
            live_data = _scrape_company(company)
            parts = []
            if live_data.get("ceo") not in (None, "Not found"):
                parts.append(f"CEO: {live_data['ceo']}")
            if live_data.get("headquarters") not in (None, "Not found"):
                parts.append(f"HQ: {live_data['headquarters']}")
            if live_data.get("linkedin") not in (None, "Not found"):
                parts.append(f"LinkedIn: {live_data['linkedin']}")
            if parts:
                scrape_note = (
                    "\n\n**Live data retrieved via SearXNG:**\n"
                    + "\n".join(f"- {p}" for p in parts)
                    + "\n\nIncorporate these facts where relevant."
                )
        # NOTE(review): BaseException also traps SystemExit/KeyboardInterrupt —
        # presumably deliberate because companyScraper may call sys.exit(); confirm.
        except BaseException as e:
            scrape_note = f"\n\n_(Live scrape attempted but failed: {e})_"
    # ── Phase 1b: parallel search queries ────────────────────────────────────
    # Runs whenever SearXNG is reachable, even if companyScraper itself is unavailable.
    search_data: dict[str, str] = {}
    _stage("Running web searches…")
    if use_scraper and _searxng_running():
        _stage("Running web searches (news, funding, tech, culture)…")
        try:
            ceo_name = (live_data.get("ceo") or "") if live_data else ""
            search_data = _fetch_search_data(company, ceo=ceo_name)
        except BaseException:
            pass  # best-effort; never fail the whole task
    # Track whether SearXNG actually contributed usable data to this brief.
    # NOTE(review): live_data is truthy whenever _scrape_company() returned,
    # even if every field is still "Not found".
    scrape_used = 1 if (live_data or any(v.strip() for v in search_data.values())) else 0
    def _section_note(key: str, label: str) -> str:
        # Render one search result group as a markdown section, or "" when there is no text.
        text = search_data.get(key, "").strip()
        return f"\n\n## {label} (live web search)\n\n{text}" if text else ""
    news_note          = _section_note("news",          "News & Press")
    funding_note       = _section_note("funding",       "Funding & Investors")
    tech_note          = _section_note("tech",          "Tech Stack")
    competitors_note   = _section_note("competitors",   "Competitors")
    culture_note       = _section_note("culture",       "Culture & Employee Signals")
    accessibility_note = _section_note("accessibility", "Accessibility & Disability Inclusion")
    ceo_press_note     = _section_note("ceo_press",     "CEO in the News")
    # ── Phase 2: LLM synthesis ────────────────────────────────────────────────
    _stage("Generating brief with LLM… (30–90 seconds)")
    # The section headers requested in the prompt must match the keys looked up
    # in `sections` at the end of this function.
    prompt = f"""You are preparing Alex Rivera for a job interview.
 Role: **{title}** at **{company}**
 ## Job Description
 {jd_excerpt}
 {resume_context}{keywords_note}
 ## Live Company Data
 {scrape_note.strip() or "_(scrape unavailable)_"}
 {news_note}{funding_note}{tech_note}{competitors_note}{culture_note}{accessibility_note}{ceo_press_note}
 ---
 Produce a structured research brief using **exactly** these eight markdown section headers
 (include all eight even if a section has limited data — say so honestly):
 ## Company Overview
 What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning.
 ## Leadership & Culture
 CEO background and leadership style, key execs, mission/values statements, Glassdoor themes.
 ## Tech Stack & Product
 Technologies, platforms, and product direction relevant to the {title} role.
 ## Funding & Market Position
 Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape.
 ## Recent Developments
 News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months.
 Draw on the live snippets above; if none available, note what is publicly known.
 ## Red Flags & Watch-outs
 Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call.
 If nothing notable, write "No significant red flags identified."
 ## Inclusion & Accessibility
 Assess {company}'s commitment to disability inclusion and accessibility. Cover:
 - ADA accommodation language in job postings or company policy
 - Disability Employee Resource Group (ERG) or affinity group
 - Product or service accessibility (WCAG compliance, adaptive features, AT integrations)
 - Any public disability/accessibility advocacy, partnerships, or certifications
 - Glassdoor or press signals about how employees with disabilities experience the company
 If no specific signals are found, say so clearly — absence of public commitment is itself signal.
 This section is for Alex's personal decision-making only and will not appear in any application.
 ## Talking Points for Alex
 Five specific talking points for the phone screen. Each must:
 - Reference a concrete experience from Alex's matched background by name
  (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus)
 - Connect to a specific signal from the JD or company context above
 - Be 1–2 sentences, ready to speak aloud
 - Never give generic advice
 ---
 ⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call.
 """
    raw = router.complete(prompt, fallback_order=research_order)
    # Strip <think>…</think> blocks emitted by reasoning models (e.g. DeepSeek, Qwen-R)
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    sections = _parse_sections(raw)
    # Sections the model omitted come back as empty strings.
    return {
        "raw_output":        raw,
        "company_brief":     sections.get("Company Overview", ""),
        "ceo_brief":         sections.get("Leadership & Culture", ""),
        "tech_brief":        sections.get("Tech Stack & Product", ""),
        "funding_brief":     sections.get("Funding & Market Position", ""),
        "competitors_brief": sections.get("Funding & Market Position", ""),  # competitor landscape is in the funding section
        "red_flags":         sections.get("Red Flags & Watch-outs", ""),
        "accessibility_brief": sections.get("Inclusion & Accessibility", ""),
        "talking_points":    sections.get("Talking Points for Alex", ""),
        "scrape_used":       scrape_used,
    }
 if __name__ == "__main__":
    # Standalone entry point: look up one job row, research it, and persist the brief.
    import argparse
    import sqlite3
    cli = argparse.ArgumentParser(description="Generate company research brief")
    cli.add_argument("--job-id", type=int, required=True, help="Job ID in staging.db")
    cli.add_argument("--no-scrape", action="store_true", help="Skip SearXNG live scrape")
    opts = cli.parse_args()
    # Project imports come after argument parsing so --help works without touching the DB layer.
    from scripts.db import DEFAULT_DB, init_db, save_research
    init_db(DEFAULT_DB)
    db = sqlite3.connect(DEFAULT_DB)
    db.row_factory = sqlite3.Row
    cursor = db.execute("SELECT * FROM jobs WHERE id = ?", (opts.job_id,))
    record = cursor.fetchone()
    db.close()
    if not record:
        sys.exit(f"Job {opts.job_id} not found in {DEFAULT_DB}")
    job = dict(record)
    print(f"Researching: {job['title']} @ {job['company']} …\n")
    scrape_enabled = not opts.no_scrape
    if _SCRAPER_AVAILABLE and scrape_enabled:
        print(f"SearXNG available: {_searxng_running()}")
    result = research_company(job, use_scraper=scrape_enabled)
    save_research(DEFAULT_DB, job_id=opts.job_id, **result)
    print(result["raw_output"])
    print(f"\n[Saved to company_research for job {opts.job_id}]")
--- a/scripts/custom_boards/init.py
+++ b/scripts/custom_boards/init.py
@ -0,0 +1 @@
 # Custom job board scrapers — each module exposes scrape(profile, location, results_wanted) -> list[dict]
--- a/scripts/custom_boards/adzuna.py
+++ b/scripts/custom_boards/adzuna.py
@ -0,0 +1,160 @@
 """Adzuna Jobs API scraper.
 API docs: https://developer.adzuna.com/docs/search
 Config:   config/adzuna.yaml  (gitignored — contains app_id + app_key)
 Each title in the search profile is queried as an exact phrase per location.
 Returns a list of dicts compatible with scripts.db.insert_job().
 """
 from __future__ import annotations
 import time
 from pathlib import Path
 import requests
 import yaml
 # Credentials file: <repo>/config/adzuna.yaml, three directory levels above this module.
 _CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "adzuna.yaml"
 # Search endpoint; scrape() appends "/<page number>" to it. The "us" path
 # segment is fixed here.
 _BASE_URL = "https://api.adzuna.com/v1/api/jobs/us/search"
 def _load_config() -> tuple[str, str]:
    """Read Adzuna API credentials from config/adzuna.yaml.

    Returns:
        ``(app_id, app_key)`` as stripped strings.

    Raises:
        FileNotFoundError: the config file does not exist.
        ValueError: the file is empty, is not a mapping, or lacks either
            credential.
    """
    if not _CONFIG_PATH.exists():
        raise FileNotFoundError(
            f"Adzuna config not found: {_CONFIG_PATH}\n"
            "Copy config/adzuna.yaml.example → config/adzuna.yaml and fill in credentials."
        )
    cfg = yaml.safe_load(_CONFIG_PATH.read_text())
    # An empty file loads as None and a malformed one may load as a scalar or
    # list; treat both as "no credentials" so callers see the ValueError they
    # already handle instead of an AttributeError.
    if not isinstance(cfg, dict):
        cfg = {}
    # str() guards against all-digit credentials that YAML parses as integers.
    app_id = str(cfg.get("app_id") or "").strip()
    app_key = str(cfg.get("app_key") or "").strip()
    if not app_id or not app_key:
        raise ValueError(
            "config/adzuna.yaml requires both 'app_id' and 'app_key'.\n"
            "Find your App ID at https://developer.adzuna.com/admin/applications"
        )
    return app_id, app_key
 def _salary_str(job: dict) -> str:
    lo = job.get("salary_min")
    hi = job.get("salary_max")
    try:
        if lo and hi:
            return f"${int(lo):,} – ${int(hi):,}"
        if lo:
            return f"${int(lo):,}+"
    except (TypeError, ValueError):
        pass
    return ""
 def _is_remote(location_display: str) -> bool:
    return "remote" in location_display.lower()
 def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """Fetch jobs from the Adzuna API for a single location.

    Each title in the profile is queried page by page until enough results
    are collected, the API runs out of results, or a request fails. Request
    and HTTP errors are printed and end the current title only; an auth
    failure (401) ends the whole run.

    Args:
        profile: Search profile dict from search_profiles.yaml. Reads
            ``titles`` and ``hours_old`` (default 240).
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all titles.
    Returns:
        List of at most ``results_wanted`` job dicts with keys: title, company,
        url, source, location, is_remote, salary, description. Empty when the
        Adzuna config is missing or incomplete.
    """
    try:
        app_id, app_key = _load_config()
    except (FileNotFoundError, ValueError) as exc:
        print(f"    [adzuna] Skipped — {exc}")
        return []
    titles = profile.get("titles", [])
    hours_old = profile.get("hours_old", 240)
    max_days_old = max(1, hours_old // 24)
    is_remote_search = location.lower() == "remote"
    per_page = 50
    seen_ids: set[str] = set()
    results: list[dict] = []
    # Context manager so pooled connections are released when scraping ends.
    with requests.Session() as session:
        session.headers.update({"Accept": "application/json", "User-Agent": "Mozilla/5.0"})
        for title in titles:
            if len(results) >= results_wanted:
                break
            params = {
                "app_id": app_id,
                "app_key": app_key,
                "results_per_page": per_page,
                "sort_by": "date",
                "max_days_old": max_days_old,
            }
            # Adzuna doesn't support where=remote — it treats it as a city name and
            # returns 0 results. For remote searches, append "remote" to the what param.
            if is_remote_search:
                params["what"] = f'"{title}" remote'
            else:
                params["what_phrase"] = title
                params["where"] = location
            page = 1
            fetched_for_title = 0  # raw rows the API returned for this title so far
            while len(results) < results_wanted:
                try:
                    resp = session.get(f"{_BASE_URL}/{page}", params=params, timeout=20)
                except requests.RequestException as exc:
                    print(f"    [adzuna] Request error ({title}): {exc}")
                    break
                if resp.status_code == 401:
                    print("    [adzuna] Auth failed — check app_id and app_key in config/adzuna.yaml")
                    return results[:results_wanted]
                if resp.status_code != 200:
                    print(f"    [adzuna] HTTP {resp.status_code} for '{title}' page {page}")
                    break
                try:
                    data = resp.json()
                except ValueError as exc:
                    # A 200 response with a non-JSON body (proxy/error page).
                    print(f"    [adzuna] Invalid JSON for '{title}' page {page}: {exc}")
                    break
                jobs = data.get("results", [])
                if not jobs:
                    break
                fetched_for_title += len(jobs)
                for job in jobs:
                    job_id = str(job.get("id", ""))
                    # Dedupe on the Adzuna id only when one is present, so
                    # id-less rows are not all collapsed into a single "" key.
                    if job_id:
                        if job_id in seen_ids:
                            continue
                        seen_ids.add(job_id)
                    redirect_url = job.get("redirect_url", "")
                    if not redirect_url:
                        continue
                    # Nested objects may be null in the payload.
                    loc_display = (job.get("location") or {}).get("display_name") or ""
                    results.append({
                        "title":       job.get("title", ""),
                        "company":     (job.get("company") or {}).get("display_name") or "",
                        "url":         redirect_url,
                        "source":      "adzuna",
                        "location":    loc_display,
                        "is_remote":   is_remote_search or _is_remote(loc_display),
                        "salary":      _salary_str(job),
                        "description": job.get("description", ""),
                    })
                total = data.get("count", 0)
                # Compare the per-title fetch count (not the cumulative result
                # list, which spans titles and excludes duplicates) against
                # this query's total to detect the last page.
                if fetched_for_title >= total or len(jobs) < per_page:
                    break  # last page
                page += 1
                time.sleep(0.5)  # polite pacing between pages
            time.sleep(0.5)  # between titles
    return results[:results_wanted]
--- a/scripts/custom_boards/craigslist.py
+++ b/scripts/custom_boards/craigslist.py
@ -0,0 +1,177 @@
 """Craigslist job scraper — RSS-based.
 Uses Craigslist's native RSS feed endpoint for discovery.
 Full job description is populated by the scrape_url background task.
 Company name and salary (not structured in Craigslist listings) are
 extracted from the description body by the enrich_craigslist task.
 Config: config/craigslist.yaml  (gitignored — metro list + location map)
        config/craigslist.yaml.example  (committed template)
 Returns a list of dicts compatible with scripts.db.insert_job().
 """
 from __future__ import annotations
 import time
 import xml.etree.ElementTree as ET
 from datetime import datetime, timezone
 from email.utils import parsedate_to_datetime
 from pathlib import Path
 from urllib.parse import quote_plus
 import requests
 import yaml
 # Metro list + location map: <repo>/config/craigslist.yaml, three levels above this module.
 _CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "craigslist.yaml"
 # Search-path category used when the config has no 'category' entry
 # (presumably Craigslist's all-jobs code — TODO confirm).
 _DEFAULT_CATEGORY = "jjj"
 # Browser-like User-Agent sent with every RSS request.
 _HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
 }
 _TIMEOUT = 15  # passed as the requests timeout for each feed fetch
 _SLEEP = 0.5  # seconds between requests — easy to make configurable later
 def _load_config() -> dict:
    """Load config/craigslist.yaml and return it as a dict.

    Raises:
        FileNotFoundError: the config file does not exist.
        ValueError: the file has no (or an empty) 'metros' entry.
    """
    if _CONFIG_PATH.exists():
        settings = yaml.safe_load(_CONFIG_PATH.read_text()) or {}
        if settings.get("metros"):
            return settings
        raise ValueError(
            "config/craigslist.yaml must contain at least one entry under 'metros'."
        )
    raise FileNotFoundError(
        f"Craigslist config not found: {_CONFIG_PATH}\n"
        "Copy config/craigslist.yaml.example → config/craigslist.yaml "
        "and configure your target metros."
    )
 def _rss_url(metro: str, category: str, query: str) -> str:
    return (
        f"https://{metro}.craigslist.org/search/{category}"
        f"?query={quote_plus(query)}&format=rss&sort=date"
    )
 def _parse_pubdate(pubdate_str: str) -> datetime | None:
    """Parse an RSS pubDate string to a timezone-aware datetime."""
    try:
        return parsedate_to_datetime(pubdate_str)
    except Exception:
        return None
 def _fetch_rss(url: str) -> list[dict]:
    """Download a Craigslist RSS feed and return its items as plain dicts.

    Each dict has the keys title, link, description and pubDate; a missing or
    empty element becomes "".

    Raises:
        requests.RequestException: network failure or non-2xx status.
        ValueError: the response body is not well-formed XML.
    """
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    resp.raise_for_status()
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError as exc:
        raise ValueError(f"Malformed RSS XML: {exc}") from exc
    wanted_tags = ("title", "link", "description", "pubDate")
    parsed = []
    for node in root.findall(".//item"):
        record = {}
        for tag in wanted_tags:
            child = node.find(tag)
            record[tag] = "" if child is None else (child.text or "").strip()
        parsed.append(record)
    return parsed
 def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """Fetch jobs from Craigslist RSS for a single location.

    A "Remote" location searches every configured metro; any other location
    must have an entry in the config's ``location_map`` or nothing is returned.
    Feed errors are printed and the metro/title pair is skipped.

    Args:
        profile: Search profile dict from search_profiles.yaml. Reads
            ``titles`` and ``hours_old`` (default 240).
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all metros and titles.
    Returns:
        List of job dicts with keys: title, company, url, source, location,
        is_remote, salary, description.
        company/salary/description are empty — filled later by background tasks.
        Empty list when the config is missing/invalid or the location is unmapped.
    """
    try:
        cfg = _load_config()
    except (FileNotFoundError, ValueError) as exc:
        print(f"    [craigslist] Skipped — {exc}")
        return []
    metros_all: list[str] = cfg.get("metros", [])
    location_map: dict[str, str] = cfg.get("location_map", {})
    category: str = cfg.get("category") or _DEFAULT_CATEGORY
    is_remote_search = location.lower() == "remote"
    if is_remote_search:
        # Remote searches fan out over every configured metro.
        metros = metros_all
    else:
        metro = location_map.get(location)
        if not metro:
            print(f"    [craigslist] No metro mapping for '{location}' — skipping")
            return []
        metros = [metro]
    titles: list[str] = profile.get("titles", [])
    hours_old: int = profile.get("hours_old", 240)
    # Oldest acceptable posting time, as a POSIX timestamp.
    cutoff = datetime.now(tz=timezone.utc).timestamp() - (hours_old * 3600)
    seen_urls: set[str] = set()
    results: list[dict] = []
    for metro in metros:
        if len(results) >= results_wanted:
            break
        for title in titles:
            if len(results) >= results_wanted:
                break
            url = _rss_url(metro, category, title)
            try:
                items = _fetch_rss(url)
            except requests.RequestException as exc:
                print(f"    [craigslist] HTTP error ({metro}/{title}): {exc}")
                time.sleep(_SLEEP)  # keep pacing even after a failed request
                continue
            except ValueError as exc:
                # _fetch_rss raises ValueError for malformed XML.
                print(f"    [craigslist] Parse error ({metro}/{title}): {exc}")
                time.sleep(_SLEEP)
                continue
            for item in items:
                if len(results) >= results_wanted:
                    break
                item_url = item.get("link", "")
                # Skip items without a link and links already collected in this run.
                if not item_url or item_url in seen_urls:
                    continue
                pub = _parse_pubdate(item.get("pubDate", ""))
                # Items whose date cannot be parsed are kept; only known-old ones are dropped.
                if pub and pub.timestamp() < cutoff:
                    continue
                seen_urls.add(item_url)
                results.append({
                    "title":       item.get("title", ""),
                    "company":     "",
                    "url":         item_url,
                    "source":      "craigslist",
                    "location":    f"{metro} (Craigslist)",
                    # Mirrors the search type, not the individual listing.
                    "is_remote":   is_remote_search,
                    "salary":      "",
                    "description": "",
                })
            time.sleep(_SLEEP)
    return results[:results_wanted]
--- a/scripts/custom_boards/theladders.py
+++ b/scripts/custom_boards/theladders.py
@ -0,0 +1,179 @@
 """The Ladders scraper — Playwright-based (requires chromium installed).
 The Ladders is a client-side React app (no SSR __NEXT_DATA__). We use Playwright
 to execute JS, wait for job cards to render, then extract from the DOM.
 Company names are hidden from guest (non-logged-in) users, but are encoded in
 the job URL slug: /job/{title-slug}-{company-slug}-{location-slug}_{id}
 curl_cffi is no longer needed for this scraper; plain Playwright is sufficient.
 playwright must be installed: `conda run -n job-seeker python -m playwright install chromium`
 Returns a list of dicts compatible with scripts.db.insert_job().
 """
 from __future__ import annotations
 import re
 import time
 from typing import Any
 # Site origin; prepended to the search path and to relative job hrefs.
 _BASE = "https://www.theladders.com"
 # Search results path; {slug} is filled with the kebab-cased job title.
 _SEARCH_PATH = "/jobs/searchjobs/{slug}"
 # Location slug in URLs for remote jobs
 _REMOTE_SLUG = "virtual-travel"
 def _company_from_url(href: str, title_slug: str) -> str:
    """
    Derive a display company name from a The Ladders job link.
    The link slug has the shape {title-slug}-{company-slug}-{location-slug}_{id},
    optionally followed by a query string. Everything but the company part is
    peeled away, e.g.
        /job/customer-success-manager-gainsight-virtual-travel_85434789
        → "Gainsight"
    Returns "" when nothing is left after stripping.
    """
    known_locations = (
        f"-{_REMOTE_SLUG}", "-new-york", "-los-angeles", "-san-francisco",
        "-chicago", "-austin", "-seattle", "-boston", "-atlanta", "-remote",
    )
    # Keep what follows the first "/job/" (or the whole href when absent),
    # then cut off any query string.
    before, marker, after = href.partition("/job/")
    remainder = (after if marker else before).partition("?")[0]
    # Drop the trailing numeric id, e.g. "_85434789".
    remainder = re.sub(r"_\d+$", "", remainder)
    # Drop the title prefix when the slug begins with it.
    title_prefix = title_slug + "-"
    if remainder.startswith(title_prefix):
        remainder = remainder[len(title_prefix):]
    # Only the first matching location suffix is removed.
    matched = next((s for s in known_locations if remainder.endswith(s)), None)
    if matched is not None:
        remainder = remainder[: -len(matched)]
    if not remainder:
        return ""
    # kebab-case → Title Case
    return " ".join(remainder.split("-")).title()
 def _extract_jobs_js() -> str:
    """Return the JS arrow function evaluated in the page to read job cards.
    The script maps every element whose class contains "job-card-container"
    to {title, href, salary, location, is_remote} and drops entries that lack
    a title or an href.
    """
    script = """() => {
        const cards = document.querySelectorAll('[class*=job-card-container]');
        return Array.from(cards).map(card => {
            const link = card.querySelector('p.job-link-wrapper a, a.clipped-text');
            const salary = card.querySelector('p.salary, .salary-info p');
            const locEl = card.querySelector('.remote-location-text, .location-info');
            const remoteEl = card.querySelector('.remote-flag-badge-remote');
            return {
                title: link ? link.textContent.trim() : null,
                href: link ? link.getAttribute('href') : null,
                salary: salary ? salary.textContent.replace('*','').trim() : null,
                location: locEl ? locEl.textContent.trim() : null,
                is_remote: !!remoteEl,
            };
        }).filter(j => j.title && j.href);
    }"""
    return script
 def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """
    Scrape job listings from The Ladders using Playwright.
    Each title in the profile is searched in turn; job cards are read from the
    rendered DOM and de-duplicated by absolute URL.
    Args:
        profile: Search profile dict (uses 'titles').
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all titles.
    Returns:
        List of job dicts with keys: title, company, url, source, location,
        is_remote, salary, description. Empty when playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(
            "    [theladders] playwright not installed.\n"
            "    Install: conda run -n job-seeker pip install playwright && "
            "conda run -n job-seeker python -m playwright install chromium"
        )
        return []
    from urllib.parse import quote, urlencode
    location = location or ""
    is_remote_search = location.lower() == "remote"
    results: list[dict] = []
    seen_urls: set[str] = set()
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # try/finally so the browser is shut down even if a page call raises
        # something outside the per-title handlers below.
        try:
            ctx = browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
                )
            )
            page = ctx.new_page()
            for title in profile.get("titles") or []:
                if len(results) >= results_wanted:
                    break
                # Unquoted slug is also used for company extraction from job URLs.
                title_slug = title.lower().replace(" ", "-").replace("/", "-")
                params: dict[str, str] = {}
                if is_remote_search:
                    params["remote"] = "true"
                elif location:
                    params["location"] = location
                # Percent-encode the path segment and query values so characters
                # such as '#', '&', ',' or spaces cannot corrupt the URL.
                url = _BASE + _SEARCH_PATH.format(slug=quote(title_slug, safe=""))
                if params:
                    url = f"{url}?{urlencode(params, quote_via=quote)}"
                try:
                    page.goto(url, timeout=30_000)
                    page.wait_for_load_state("networkidle", timeout=20_000)
                except Exception as exc:
                    print(f"    [theladders] Page load error for '{title}': {exc}")
                    continue
                try:
                    raw_jobs: list[dict[str, Any]] = page.evaluate(_extract_jobs_js())
                except Exception as exc:
                    print(f"    [theladders] JS extract error for '{title}': {exc}")
                    continue
                if not raw_jobs:
                    print(f"    [theladders] No cards found for '{title}' — selector may need updating")
                    continue
                for job in raw_jobs:
                    href = job.get("href") or ""
                    if not href:
                        continue
                    full_url = _BASE + href if href.startswith("/") else href
                    if full_url in seen_urls:
                        continue
                    seen_urls.add(full_url)
                    company = _company_from_url(href, title_slug)
                    loc_text = (job.get("location") or "").replace("Remote", "").strip(", ")
                    if is_remote_search or job.get("is_remote"):
                        loc_display = "Remote" + (f" — {loc_text}" if loc_text and loc_text != "US-Anywhere" else "")
                    else:
                        loc_display = loc_text or location
                    results.append({
                        "title":       job.get("title", ""),
                        "company":     company,
                        "url":         full_url,
                        "source":      "theladders",
                        "location":    loc_display,
                        "is_remote":   bool(job.get("is_remote") or is_remote_search),
                        "salary":      job.get("salary") or "",
                        "description": "",  # not available in card view; scrape_url will fill in
                    })
                    if len(results) >= results_wanted:
                        break
                time.sleep(1)  # polite pacing between titles
        finally:
            browser.close()
    return results[:results_wanted]
--- a/scripts/db.py
+++ b/scripts/db.py
@ -0,0 +1,728 @@
 """
 SQLite staging layer for job listings.
 Jobs flow: pending → approved/rejected → applied → synced
          applied → phone_screen → interviewing → offer → hired (or rejected)
 """
 import sqlite3
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
 # Default database file: staging.db, two directory levels up from this file's path.
 DEFAULT_DB = Path(__file__).parent.parent / "staging.db"
 # Base schema for staged job listings. url is UNIQUE, which insert_job relies
 # on to detect duplicates.
 CREATE_JOBS = """
 CREATE TABLE IF NOT EXISTS jobs (
    id              INTEGER PRIMARY KEY AUTOINCREMENT,
    title           TEXT,
    company         TEXT,
    url             TEXT UNIQUE,
    source          TEXT,
    location        TEXT,
    is_remote       INTEGER DEFAULT 0,
    salary          TEXT,
    description     TEXT,
    match_score     REAL,
    keyword_gaps    TEXT,
    date_found      TEXT,
    status          TEXT DEFAULT 'pending',
    notion_page_id  TEXT,
    cover_letter    TEXT,
    applied_at      TEXT
 );
 """
 # Schema for the per-job email contact log (one row per logged message).
 CREATE_JOB_CONTACTS = """
 CREATE TABLE IF NOT EXISTS job_contacts (
    id                 INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id             INTEGER NOT NULL,
    direction          TEXT DEFAULT 'inbound',
    subject            TEXT,
    from_addr          TEXT,
    to_addr            TEXT,
    body               TEXT,
    received_at        TEXT,
    is_response_needed INTEGER DEFAULT 0,
    responded_at       TEXT,
    message_id         TEXT,
    FOREIGN KEY (job_id) REFERENCES jobs(id)
 );
 """
 # (column, type) pairs that _migrate_db adds to job_contacts when absent.
 _CONTACT_MIGRATIONS = [
    ("message_id",           "TEXT"),
    ("stage_signal",         "TEXT"),
    ("suggestion_dismissed", "INTEGER DEFAULT 0"),
 ]
 # (column, type) pairs that _migrate_db adds to company_research when absent.
 _RESEARCH_MIGRATIONS = [
    ("tech_brief",          "TEXT"),
    ("funding_brief",       "TEXT"),
    ("competitors_brief",   "TEXT"),
    ("red_flags",           "TEXT"),
    ("scrape_used",         "INTEGER"),  # 1 = SearXNG contributed data, 0 = LLM-only
    ("accessibility_brief", "TEXT"),     # Inclusion & Accessibility section
 ]
 # Schema for company research; job_id is UNIQUE, so there is at most one
 # research row per job (save_research upserts on it). scrape_used and
 # accessibility_brief are not declared here; they come from _RESEARCH_MIGRATIONS.
 CREATE_COMPANY_RESEARCH = """
 CREATE TABLE IF NOT EXISTS company_research (
    id               INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id           INTEGER NOT NULL UNIQUE,
    generated_at     TEXT,
    company_brief    TEXT,
    ceo_brief        TEXT,
    talking_points   TEXT,
    raw_output       TEXT,
    tech_brief       TEXT,
    funding_brief    TEXT,
    competitors_brief TEXT,
    red_flags        TEXT,
    FOREIGN KEY (job_id) REFERENCES jobs(id)
 );
 """
 # Schema for background task tracking. New rows default to status 'queued'
 # and get created_at from SQLite's datetime('now').
 CREATE_BACKGROUND_TASKS = """
 CREATE TABLE IF NOT EXISTS background_tasks (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    task_type   TEXT NOT NULL,
    job_id      INTEGER NOT NULL,
    status      TEXT NOT NULL DEFAULT 'queued',
    error       TEXT,
    created_at  DATETIME DEFAULT (datetime('now')),
    started_at  DATETIME,
    finished_at DATETIME,
    stage       TEXT,
    updated_at  DATETIME
 )
 """
 # Schema for survey responses attached to a job; created_at defaults to
 # CURRENT_TIMESTAMP.
 CREATE_SURVEY_RESPONSES = """
 CREATE TABLE IF NOT EXISTS survey_responses (
    id             INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id         INTEGER NOT NULL REFERENCES jobs(id),
    survey_name    TEXT,
    received_at    DATETIME,
    source         TEXT,
    raw_input      TEXT,
    image_path     TEXT,
    mode           TEXT,
    llm_output     TEXT,
    reported_score TEXT,
    created_at     DATETIME DEFAULT CURRENT_TIMESTAMP
 );
 """
 # (column, type) pairs that _migrate_db adds to the jobs table when absent.
 # cover_letter and applied_at are also declared in CREATE_JOBS, so they only
 # matter for databases created before those columns were part of the schema.
 _MIGRATIONS = [
    ("cover_letter",    "TEXT"),
    ("applied_at",      "TEXT"),
    ("interview_date",  "TEXT"),
    ("rejection_stage", "TEXT"),
    ("phone_screen_at", "TEXT"),
    ("interviewing_at", "TEXT"),
    ("offer_at",        "TEXT"),
    ("hired_at",        "TEXT"),
    ("survey_at",       "TEXT"),
 ]
 def _existing_columns(conn: sqlite3.Connection, table: str) -> set[str]:
    """Return the lower-cased column names of *table* (empty set if it does not exist)."""
    return {row[1].lower() for row in conn.execute(f"PRAGMA table_info({table})")}
 def _add_missing_columns(conn: sqlite3.Connection, table: str, columns) -> None:
    """Add each (name, type) pair in *columns* that *table* does not have yet."""
    present = _existing_columns(conn, table)
    if not present:
        return  # table has not been created; nothing to migrate
    for col, coltype in columns:
        if col.lower() in present:
            continue
        try:
            conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {coltype}")
        except sqlite3.OperationalError as exc:
            # Another process may have added the column between the PRAGMA
            # and the ALTER; that is fine. Anything else is a real failure.
            if "duplicate column name" not in str(exc).lower():
                raise
 def _migrate_db(db_path: Path) -> None:
    """Add new columns to existing tables without breaking old data.
    Columns that already exist are left untouched and missing tables are
    skipped. Unlike a blanket ``except OperationalError: pass``, genuine
    failures (locked database, malformed DDL) propagate instead of leaving the
    schema silently half-migrated. The connection is always closed.
    """
    plan = (
        ("jobs", _MIGRATIONS),
        ("job_contacts", _CONTACT_MIGRATIONS),
        ("company_research", _RESEARCH_MIGRATIONS),
        ("background_tasks", [("stage", "TEXT"), ("updated_at", "TEXT")]),
    )
    conn = sqlite3.connect(db_path)
    try:
        for table, columns in plan:
            _add_missing_columns(conn, table, columns)
        conn.commit()
    finally:
        conn.close()
 def init_db(db_path: Path = DEFAULT_DB) -> None:
    """Create tables if they don't exist, then run migrations.
    The connection is closed even when a CREATE statement fails.
    """
    schema = (
        CREATE_JOBS,
        CREATE_JOB_CONTACTS,
        CREATE_COMPANY_RESEARCH,
        CREATE_BACKGROUND_TASKS,
        CREATE_SURVEY_RESPONSES,
    )
    conn = sqlite3.connect(db_path)
    try:
        for ddl in schema:
            conn.execute(ddl)
        conn.commit()
    finally:
        conn.close()
    _migrate_db(db_path)
 def insert_job(db_path: Path = DEFAULT_DB, job: dict = None) -> Optional[int]:
    """Stage a single job listing.
    Missing text fields are stored as "" and is_remote is coerced to 0/1.
    Returns the new row id, or None when no job was given or the insert
    violates a constraint (i.e. the URL is already staged).
    """
    if job is None:
        return None
    sql = """INSERT INTO jobs
               (title, company, url, source, location, is_remote, salary, description, date_found)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"""
    values = (
        *(job.get(key, "") for key in ("title", "company", "url", "source", "location")),
        int(bool(job.get("is_remote", False))),
        *(job.get(key, "") for key in ("salary", "description", "date_found")),
    )
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(sql, values)
        conn.commit()
    except sqlite3.IntegrityError:
        return None  # duplicate URL
    else:
        return cur.lastrowid
    finally:
        conn.close()
 def get_job_by_id(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None) -> Optional[dict]:
    """Return a single job by ID as a dict, or None if not found.
    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        conn.close()
    return dict(row) if row else None
 def get_jobs_by_status(db_path: Path = DEFAULT_DB, status: str = "pending") -> list[dict]:
    """Return all jobs with the given status as dicts.
    Rows are ordered by date_found, then id, both descending. The connection
    is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM jobs WHERE status = ? ORDER BY date_found DESC, id DESC",
            (status,),
        ).fetchall()
    finally:
        conn.close()
    return [dict(row) for row in rows]
 def get_email_leads(db_path: Path = DEFAULT_DB) -> list[dict]:
    """Return pending jobs with source='email', newest first.
    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM jobs WHERE source = 'email' AND status = 'pending' "
            "ORDER BY date_found DESC, id DESC"
        ).fetchall()
    finally:
        conn.close()
    return [dict(r) for r in rows]
 def get_job_counts(db_path: Path = DEFAULT_DB) -> dict:
    """Return a {status: count} mapping over all jobs.
    Statuses with no rows are absent from the result. The connection is
    closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT status, COUNT(*) as n FROM jobs GROUP BY status"
        ).fetchall()
    finally:
        conn.close()
    return {status: n for status, n in rows}
 def update_job_status(db_path: Path = DEFAULT_DB, ids: Optional[list[int]] = None,
                       status: str = "approved") -> None:
    """Batch-update status for a list of job IDs.
    Does nothing when *ids* is None or empty. The connection is closed even
    if the update raises.
    """
    if not ids:
        return
    # Only '?' markers are interpolated; the ids themselves are bound parameters.
    placeholders = ",".join("?" * len(ids))
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            f"UPDATE jobs SET status = ? WHERE id IN ({placeholders})",
            [status, *ids],
        )
        conn.commit()
    finally:
        conn.close()
 def get_existing_urls(db_path: Path = DEFAULT_DB) -> set[str]:
    """Return all URLs already in staging (any status).
    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        return {row[0] for row in conn.execute("SELECT url FROM jobs")}
    finally:
        conn.close()
 def write_match_scores(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                        score: float = 0.0, gaps: str = "") -> None:
    """Write match score and keyword gaps back to a job row.
    The connection is closed even if the update raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE jobs SET match_score = ?, keyword_gaps = ? WHERE id = ?",
            (score, gaps, job_id),
        )
        conn.commit()
    finally:
        conn.close()
 def update_cover_letter(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                         text: str = "") -> None:
    """Persist a generated/edited cover letter for a job.
    Does nothing when *job_id* is None. The connection is closed even if the
    update raises.
    """
    if job_id is None:
        return
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE jobs SET cover_letter = ? WHERE id = ?", (text, job_id))
        conn.commit()
    finally:
        conn.close()
 # Whitelist of jobs columns that update_job_fields may write. Column names are
 # interpolated into SQL, so only these known-safe identifiers are accepted.
 _UPDATABLE_JOB_COLS = {
    "title", "company", "url", "source", "location", "is_remote",
    "salary", "description", "match_score", "keyword_gaps",
 }
 def update_job_fields(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                       fields: Optional[dict] = None) -> None:
    """Update whitelisted job columns. Unknown keys are silently ignored.
    Does nothing when *job_id* is None or no whitelisted key is supplied.
    The connection is closed even if the update raises.
    """
    if job_id is None or not fields:
        return
    safe = {k: v for k, v in fields.items() if k in _UPDATABLE_JOB_COLS}
    if not safe:
        return
    assignments = ", ".join(f"{col} = ?" for col in safe)
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            f"UPDATE jobs SET {assignments} WHERE id = ?",
            (*safe.values(), job_id),
        )
        conn.commit()
    finally:
        conn.close()
 def mark_applied(db_path: Path = DEFAULT_DB, ids: Optional[list[int]] = None) -> None:
    """Set status='applied' and record today's date for a list of job IDs.
    applied_at is stored as the local date in YYYY-MM-DD form. Does nothing
    when *ids* is None or empty. The connection is closed even if the update
    raises.
    """
    if not ids:
        return
    today = datetime.now().isoformat()[:10]
    placeholders = ",".join("?" * len(ids))
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            f"UPDATE jobs SET status = 'applied', applied_at = ? WHERE id IN ({placeholders})",
            [today, *ids],
        )
        conn.commit()
    finally:
        conn.close()
 def kill_stuck_tasks(db_path: Path = DEFAULT_DB) -> int:
    """Mark all queued/running background tasks as failed. Returns count killed.
    Affected rows get error='Killed by user' and finished_at set by SQLite's
    datetime('now'). The connection is closed even if the update raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(
            "UPDATE background_tasks SET status='failed', error='Killed by user',"
            " finished_at=datetime('now') WHERE status IN ('queued','running')"
        ).rowcount
        conn.commit()
    finally:
        conn.close()
    return count
 def purge_email_data(db_path: Path = DEFAULT_DB) -> tuple[int, int]:
    """Delete all job_contacts rows and all jobs whose source is 'email'.
    Email-sourced jobs are removed regardless of status, not only pending ones.
    NOTE(review): if only pending leads were meant to be purged, the DELETE
    needs an additional status filter — confirm intent with callers.
    Both deletes are committed together; the connection is closed even if one
    of them raises.
    Returns (contacts_deleted, jobs_deleted).
    """
    conn = sqlite3.connect(db_path)
    try:
        contacts_deleted = conn.execute("DELETE FROM job_contacts").rowcount
        jobs_deleted = conn.execute("DELETE FROM jobs WHERE source='email'").rowcount
        conn.commit()
    finally:
        conn.close()
    return contacts_deleted, jobs_deleted
 def purge_jobs(db_path: Path = DEFAULT_DB, statuses: Optional[list[str]] = None) -> int:
    """Delete jobs matching given statuses. Returns number of rows deleted.
    If statuses is None or empty, deletes ALL jobs (full reset).
    The connection is closed even if the delete raises.
    """
    if statuses:
        placeholders = ",".join("?" * len(statuses))
        sql = f"DELETE FROM jobs WHERE status IN ({placeholders})"
        params = list(statuses)
    else:
        sql = "DELETE FROM jobs"
        params = []
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(sql, params).rowcount
        conn.commit()
    finally:
        conn.close()
    return count
 def purge_non_remote(db_path: Path = DEFAULT_DB) -> int:
    """Delete non-remote jobs that are not yet in the active pipeline.
    A job counts as non-remote when is_remote is 0 or NULL.
    Preserves applied, phone_screen, interviewing, offer, hired, and synced records.
    The connection is closed even if the delete raises.
    Returns number of rows deleted.
    """
    protected = ("applied", "phone_screen", "interviewing", "offer", "hired", "synced")
    placeholders = ",".join("?" * len(protected))
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(
            "DELETE FROM jobs WHERE (is_remote = 0 OR is_remote IS NULL)"
            f" AND status NOT IN ({placeholders})",
            protected,
        ).rowcount
        conn.commit()
    finally:
        conn.close()
    return count
 def archive_jobs(db_path: Path = DEFAULT_DB, statuses: Optional[list[str]] = None) -> int:
    """Set status='archived' for jobs matching given statuses.
    Archived jobs stay in the DB (preserving dedup by URL) but are invisible
    to Job Review and other pipeline views.
    Returns number of rows updated (0 when *statuses* is None or empty).
    The connection is closed even if the update raises.
    """
    if not statuses:
        return 0
    placeholders = ",".join("?" * len(statuses))
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(
            f"UPDATE jobs SET status = 'archived' WHERE status IN ({placeholders})",
            list(statuses),
        ).rowcount
        conn.commit()
    finally:
        conn.close()
    return count
 # ── Interview pipeline helpers ────────────────────────────────────────────────
 # Maps a pipeline stage to the jobs column that records when the job entered
 # it. advance_to_stage looks stages up here; a stage missing from this map
 # gets a status change without a timestamp.
 _STAGE_TS_COL = {
    "phone_screen": "phone_screen_at",
    "interviewing":  "interviewing_at",
    "offer":         "offer_at",
    "hired":         "hired_at",
    "survey":        "survey_at",
 }
 def get_interview_jobs(db_path: Path = DEFAULT_DB) -> dict[str, list[dict]]:
    """Return jobs grouped by interview/post-apply stage.
    The result has one key per stage (applied, survey, phone_screen,
    interviewing, offer, hired, rejected), each mapping to a possibly empty
    list of job dicts ordered by applied_at, then id, both descending.
    The connection is closed even if a query raises.
    """
    stages = ["applied", "survey", "phone_screen", "interviewing", "offer", "hired", "rejected"]
    result: dict[str, list[dict]] = {}
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        for stage in stages:
            rows = conn.execute(
                "SELECT * FROM jobs WHERE status = ? ORDER BY applied_at DESC, id DESC",
                (stage,),
            ).fetchall()
            result[stage] = [dict(row) for row in rows]
    finally:
        conn.close()
    return result
 def advance_to_stage(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                      stage: str = "") -> None:
    """Move a job to the given interview stage and record a timestamp.
    When the stage has an entry in _STAGE_TS_COL, that column is set to the
    current local time (YYYY-MM-DDTHH:MM); otherwise only status changes.
    The connection is closed even if the update raises.
    """
    now = datetime.now().isoformat()[:16]
    # ts_col comes from a fixed module-level map, never from caller input,
    # so interpolating it into the statement is safe.
    ts_col = _STAGE_TS_COL.get(stage)
    if ts_col:
        sql = f"UPDATE jobs SET status = ?, {ts_col} = ? WHERE id = ?"
        params = (stage, now, job_id)
    else:
        sql = "UPDATE jobs SET status = ? WHERE id = ?"
        params = (stage, job_id)
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(sql, params)
        conn.commit()
    finally:
        conn.close()
 def reject_at_stage(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                     rejection_stage: str = "") -> None:
    """Mark a job as rejected and record at which stage it was rejected.
    The connection is closed even if the update raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE jobs SET status = 'rejected', rejection_stage = ? WHERE id = ?",
            (rejection_stage, job_id),
        )
        conn.commit()
    finally:
        conn.close()
 def set_interview_date(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                        date_str: str = "") -> None:
    """Persist an interview date for a job.
    *date_str* is stored as given. The connection is closed even if the
    update raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE jobs SET interview_date = ? WHERE id = ?", (date_str, job_id))
        conn.commit()
    finally:
        conn.close()
 # ── Contact log helpers ───────────────────────────────────────────────────────
 def add_contact(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                 direction: str = "inbound", subject: str = "",
                 from_addr: str = "", to_addr: str = "",
                 body: str = "", received_at: str = "",
                 message_id: str = "",
                 stage_signal: str = "") -> int:
    """Log an email contact. Returns the new row id.
    An empty *received_at* falls back to the current local time
    (YYYY-MM-DDTHH:MM); an empty *stage_signal* is stored as NULL.
    The connection is closed even if the insert raises.
    """
    ts = received_at or datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """INSERT INTO job_contacts
               (job_id, direction, subject, from_addr, to_addr, body,
                received_at, message_id, stage_signal)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job_id, direction, subject, from_addr, to_addr, body,
             ts, message_id, stage_signal or None),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        conn.close()
 def get_contacts(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None) -> list[dict]:
    """Return all contact log entries for a job, oldest first.
    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM job_contacts WHERE job_id = ? ORDER BY received_at ASC",
            (job_id,),
        ).fetchall()
    finally:
        conn.close()
    return [dict(row) for row in rows]
 def get_unread_stage_signals(db_path: Path = DEFAULT_DB,
                              job_id: Optional[int] = None) -> list[dict]:
    """Return inbound contacts with a non-neutral, non-dismissed stage signal.
    Rows are ordered by received_at ascending. The connection is closed even
    if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            """SELECT * FROM job_contacts
               WHERE job_id = ?
                 AND direction = 'inbound'
                 AND stage_signal IS NOT NULL
                 AND stage_signal != 'neutral'
                 AND (suggestion_dismissed IS NULL OR suggestion_dismissed = 0)
               ORDER BY received_at ASC""",
            (job_id,),
        ).fetchall()
    finally:
        conn.close()
    return [dict(r) for r in rows]
 def dismiss_stage_signal(db_path: Path = DEFAULT_DB,
                          contact_id: Optional[int] = None) -> None:
    """Mark a stage signal suggestion as dismissed.
    Sets suggestion_dismissed = 1 on the job_contacts row with the given id.
    The connection is closed even if the update raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE job_contacts SET suggestion_dismissed = 1 WHERE id = ?",
            (contact_id,),
        )
        conn.commit()
    finally:
        conn.close()
 def get_all_message_ids(db_path: Path = DEFAULT_DB) -> set[str]:
    """Return all known Message-IDs across all job contacts.
    NULL and empty message ids are excluded. The connection is closed even if
    the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT message_id FROM job_contacts WHERE message_id IS NOT NULL AND message_id != ''"
        ).fetchall()
    finally:
        conn.close()
    return {r[0] for r in rows}
 # ── Company research helpers ──────────────────────────────────────────────────
 def save_research(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None,
                   company_brief: str = "", ceo_brief: str = "",
                   talking_points: str = "", raw_output: str = "",
                   tech_brief: str = "", funding_brief: str = "",
                   competitors_brief: str = "", red_flags: str = "",
                   accessibility_brief: str = "",
                   scrape_used: int = 0) -> None:
    """Insert or update (upsert) the company research record for a job.
    On a job_id conflict the existing row is updated in place with the new
    values, so the row's id is kept. generated_at is set to the current local
    time (YYYY-MM-DDTHH:MM). The connection is closed even if the statement
    raises.
    """
    now = datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            """INSERT INTO company_research
                   (job_id, generated_at, company_brief, ceo_brief, talking_points,
                    raw_output, tech_brief, funding_brief, competitors_brief, red_flags,
                    accessibility_brief, scrape_used)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
               ON CONFLICT(job_id) DO UPDATE SET
                   generated_at        = excluded.generated_at,
                   company_brief       = excluded.company_brief,
                   ceo_brief           = excluded.ceo_brief,
                   talking_points      = excluded.talking_points,
                   raw_output          = excluded.raw_output,
                   tech_brief          = excluded.tech_brief,
                   funding_brief       = excluded.funding_brief,
                   competitors_brief   = excluded.competitors_brief,
                   red_flags           = excluded.red_flags,
                   accessibility_brief = excluded.accessibility_brief,
                   scrape_used         = excluded.scrape_used""",
            (job_id, now, company_brief, ceo_brief, talking_points, raw_output,
             tech_brief, funding_brief, competitors_brief, red_flags,
             accessibility_brief, scrape_used),
        )
        conn.commit()
    finally:
        conn.close()
 def get_research(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None) -> Optional[dict]:
    """Return the company research record for a job, or None if absent.
    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT * FROM company_research WHERE job_id = ?", (job_id,)
        ).fetchone()
    finally:
        conn.close()
    return dict(row) if row else None
 # ── Survey response helpers ───────────────────────────────────────────────────
 def insert_survey_response(
    db_path: Path = DEFAULT_DB,
    job_id: Optional[int] = None,
    survey_name: str = "",
    received_at: str = "",
    source: str = "text_paste",
    raw_input: str = "",
    image_path: str = "",
    mode: str = "quick",
    llm_output: str = "",
    reported_score: str = "",
 ) -> int:
    """Insert a survey response row. Returns the new row id.
    Empty survey_name, received_at, raw_input, image_path and reported_score
    are stored as NULL; source, mode and llm_output are stored as given.
    The connection is closed even if the insert raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """INSERT INTO survey_responses
               (job_id, survey_name, received_at, source, raw_input,
                image_path, mode, llm_output, reported_score)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job_id, survey_name or None, received_at or None,
             source, raw_input or None, image_path or None,
             mode, llm_output, reported_score or None),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        conn.close()
 def get_survey_responses(db_path: Path = DEFAULT_DB,
                          job_id: Optional[int] = None) -> list[dict]:
    """Return all survey responses for a job, newest first (by created_at).
    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM survey_responses WHERE job_id = ? ORDER BY created_at DESC",
            (job_id,),
        ).fetchall()
    finally:
        conn.close()
    return [dict(r) for r in rows]
 # ── Background task helpers ───────────────────────────────────────────────────
 def insert_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                 job_id: Optional[int] = None) -> tuple[int, bool]:
    """Insert a new background task.
    Returns (task_id, True) if inserted, or (existing_id, False) if a
    queued/running task for the same (task_type, job_id) already exists.
    The connection is closed on every path, including errors.
    NOTE(review): the existence check and the insert are separate statements,
    so two concurrent callers could both insert — confirm whether callers
    serialize submissions.
    """
    conn = sqlite3.connect(db_path)
    try:
        existing = conn.execute(
            "SELECT id FROM background_tasks WHERE task_type=? AND job_id=? AND status IN ('queued','running')",
            (task_type, job_id),
        ).fetchone()
        if existing:
            return existing[0], False
        cur = conn.execute(
            "INSERT INTO background_tasks (task_type, job_id, status) VALUES (?, ?, 'queued')",
            (task_type, job_id),
        )
        conn.commit()
        return cur.lastrowid, True
    finally:
        conn.close()
 def update_task_status(db_path: Path = DEFAULT_DB, task_id: Optional[int] = None,
                        status: str = "", error: Optional[str] = None) -> None:
    """Update a task's status and set the appropriate timestamp.
    'running' sets started_at; 'completed' and 'failed' set finished_at and
    store *error*; any other status only refreshes updated_at. Timestamps are
    the current local time (YYYY-MM-DDTHH:MM). The connection is closed even
    if the update raises.
    """
    now = datetime.now().isoformat()[:16]
    if status == "running":
        sql = "UPDATE background_tasks SET status=?, started_at=?, updated_at=? WHERE id=?"
        params = (status, now, now, task_id)
    elif status in ("completed", "failed"):
        sql = "UPDATE background_tasks SET status=?, finished_at=?, updated_at=?, error=? WHERE id=?"
        params = (status, now, now, error, task_id)
    else:
        sql = "UPDATE background_tasks SET status=?, updated_at=? WHERE id=?"
        params = (status, now, task_id)
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(sql, params)
        conn.commit()
    finally:
        conn.close()
 def update_task_stage(db_path: Path = DEFAULT_DB, task_id: Optional[int] = None,
                       stage: str = "") -> None:
    """Update the stage label on a running task (for progress display).
    Only the stage column is written; updated_at is left untouched.
    The connection is closed even if the update raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE background_tasks SET stage=? WHERE id=?", (stage, task_id))
        conn.commit()
    finally:
        conn.close()
 def get_active_tasks(db_path: Path = DEFAULT_DB) -> list[dict]:
    """Return all queued/running tasks with job title and company joined in.
    The join is a LEFT JOIN, so a task whose job row is missing is still
    returned with title and company set to None. Rows are ordered by
    created_at ascending. The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute("""
            SELECT bt.*, j.title, j.company
            FROM background_tasks bt
            LEFT JOIN jobs j ON j.id = bt.job_id
            WHERE bt.status IN ('queued', 'running')
            ORDER BY bt.created_at ASC
        """).fetchall()
    finally:
        conn.close()
    return [dict(r) for r in rows]
 def get_task_for_job(db_path: Path = DEFAULT_DB, task_type: str = "",
                      job_id: Optional[int] = None) -> Optional[dict]:
    """Return the most recent task row for a (task_type, job_id) pair, or None.
    "Most recent" means the highest id. The connection is closed even if the
    query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            """SELECT * FROM background_tasks
               WHERE task_type=? AND job_id=?
               ORDER BY id DESC LIMIT 1""",
            (task_type, job_id),
        ).fetchone()
    finally:
        conn.close()
    return dict(row) if row else None
--- a/scripts/discover.py
+++ b/scripts/discover.py
@ -0,0 +1,285 @@
 # scripts/discover.py
 """
 JobSpy → SQLite staging pipeline (default) or Notion (notion_push=True).
 Usage:
    conda run -n job-seeker python scripts/discover.py
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import yaml
 from datetime import datetime
 import pandas as pd
 from jobspy import scrape_jobs
 from notion_client import Client
 from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls
 from scripts.custom_boards import adzuna as _adzuna
 from scripts.custom_boards import theladders as _theladders
 from scripts.custom_boards import craigslist as _craigslist
 # All YAML configuration lives in <parent of scripts/>/config.
 CONFIG_DIR = Path(__file__).parent.parent / "config"
 # Notion settings; run_discovery reads "field_map", "token" and "database_id" from it.
 NOTION_CFG = CONFIG_DIR / "notion.yaml"
 # Search profiles iterated by run_discovery (top-level "profiles" list).
 PROFILES_CFG = CONFIG_DIR / "search_profiles.yaml"
 # Optional global blocklist; load_blocklist() returns empty lists when the file is absent.
 BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
 # Registry of custom board scrapers keyed by name used in search_profiles.yaml
 # (a profile's "custom_boards" entries).  run_discovery calls each value as
 # scraper(profile, location, results_wanted=N) and iterates the returned job dicts.
 CUSTOM_SCRAPERS: dict[str, object] = {
    "adzuna": _adzuna.scrape,
    "theladders": _theladders.scrape,
    "craigslist": _craigslist.scrape,
 }
 def load_config() -> tuple[dict, dict]:
    """Parse the search-profile and Notion YAML files.

    Returns:
        ``(profiles, notion_cfg)`` as produced by ``yaml.safe_load``.
    """
    profiles, notion_cfg = (
        yaml.safe_load(cfg_path.read_text())
        for cfg_path in (PROFILES_CFG, NOTION_CFG)
    )
    return profiles, notion_cfg
 def load_blocklist() -> dict:
    """Load global blocklist config.

    Returns:
        Dict with ``companies``, ``industries`` and ``locations`` keys, each a
        list of lower-cased strings.  All three lists are empty when the
        blocklist file does not exist or is empty.
    """
    keys = ("companies", "industries", "locations")
    if not BLOCKLIST_CFG.exists():
        return {key: [] for key in keys}
    raw = yaml.safe_load(BLOCKLIST_CFG.read_text()) or {}
    return {
        # ``raw.get(key) or []``: a key written with no entries ("companies:")
        # parses to None, which is not iterable.  ``str(entry)``: unquoted YAML
        # scalars such as 7 parse to non-strings that have no ``.lower()``.
        key: [str(entry).lower() for entry in (raw.get(key) or []) if entry]
        for key in keys
    }
 def _is_blocklisted(job_row: dict, blocklist: dict) -> bool:
    """Return True if this job matches any global blocklist rule."""
    company_lower = (job_row.get("company") or "").lower()
    location_lower = (job_row.get("location") or "").lower()
    desc_lower = (job_row.get("description") or "").lower()
    content_lower = f"{company_lower} {desc_lower}"
    if any(bl in company_lower for bl in blocklist["companies"]):
        return True
    if any(bl in content_lower for bl in blocklist["industries"]):
        return True
    if any(bl in location_lower for bl in blocklist["locations"]):
        return True
    return False
 def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]:
    """Collect every job URL already stored in the Notion database.

    Pages through ``notion.databases.query`` 100 rows at a time, following
    ``next_cursor`` while the response reports ``has_more``.  Used only in
    notion_push mode.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        query: dict = {"database_id": db_id, "page_size": 100}
        if cursor:
            query["start_cursor"] = cursor
        resp = notion.databases.query(**query)
        page_urls = (
            page["properties"].get(url_field, {}).get("url")
            for page in resp["results"]
        )
        # Pages without the URL property, or with an empty one, are ignored.
        seen.update(link for link in page_urls if link)
        more = resp.get("has_more", False)
        cursor = resp.get("next_cursor")
        if not more:
            return seen
 def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None:
    """Create a new page in the Notion jobs database for a single listing.

    The page title is the salary range when both bounds are present, else the
    listing's ``salary_source`` value, else the job title.

    Args:
        notion: Notion client used to create the page.
        db_id: Id of the target Notion database.
        job: One scraped listing as a dict.  Values may be None or NaN because
            run_discovery passes DataFrame rows converted with ``to_dict()``.
        fm: Maps logical field names to Notion property names.  Keys read here:
            title_field, job_title, company, url, source, status, status_new,
            remote, date_found.
    """
    def _text(key: str, default: str = "") -> str:
        """Return ``job[key]`` as text, mapping None/NaN to ``default``."""
        val = job.get(key)
        if val is None:
            return default
        text = str(val)
        return default if text in ("nan", "None", "NaN") else text

    job_title = _text("title", "Unknown")

    min_amt = job.get("min_amount")
    max_amt = job.get("max_amount")
    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
        title_content = f"${int(min_amt):,} – ${int(max_amt):,}"
    elif job.get("salary_source") and str(job["salary_source"]) not in ("nan", "None", ""):
        title_content = str(job["salary_source"])
    else:
        title_content = job_title

    job_url = _text("job_url")

    # bool(float("nan")) is True, so an unknown remote flag must be mapped to
    # False explicitly or the Notion checkbox would be ticked.
    remote_raw = job.get("is_remote", False)
    if pd.api.types.is_scalar(remote_raw) and pd.isna(remote_raw):
        is_remote = False
    else:
        is_remote = bool(remote_raw)

    notion.pages.create(
        parent={"database_id": db_id},
        properties={
            fm["title_field"]: {"title": [{"text": {"content": title_content}}]},
            fm["job_title"]:   {"rich_text": [{"text": {"content": job_title}}]},
            fm["company"]:     {"rich_text": [{"text": {"content": _text("company")}}]},
            fm["url"]:         {"url": job_url or None},
            fm["source"]:      {"multi_select": [{"name": _text("site", "unknown").title()}]},
            fm["status"]:      {"select": {"name": fm["status_new"]}},
            fm["remote"]:      {"checkbox": is_remote},
            fm["date_found"]:  {"date": {"start": datetime.now().isoformat()[:10]}},
        },
    )
 def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False) -> int:
    """Scrape every configured search profile and stage unseen listings in SQLite.

    For each profile/location pair this queries the JobSpy boards and then
    every registered custom board.  A listing is skipped when its URL is
    already known, it matches the global blocklist, it contains one of the
    profile's exclude keywords, or its normalised (title, company) pair was
    already seen.  Everything else is inserted with ``insert_job``.

    Args:
        db_path: SQLite staging database.
        notion_push: When True, URLs already in Notion join the dedup set and
            each newly inserted JobSpy listing is also pushed to Notion
            (custom-board listings are only staged in SQLite).

    Returns:
        Number of listings inserted during this run.
    """
    profiles_cfg, notion_cfg = load_config()
    fm = notion_cfg["field_map"]
    blocklist = load_blocklist()
    _bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if _bl_summary:
        print(f"[discover] Blocklist active: {_bl_summary}")

    def _s(val, default="") -> str:
        """Convert a value to str, treating pandas NaN/None as default."""
        if val is None:
            return default
        s = str(val)
        return default if s in ("nan", "None", "NaN") else s

    def _flag(val) -> bool:
        """Convert a value to bool, treating None/NaN as False.

        Plain ``bool()`` is not enough: ``bool(float("nan"))`` is True, which
        would store listings with an unknown remote flag as remote.
        """
        if pd.api.types.is_scalar(val) and pd.isna(val):
            return False
        return bool(val)

    def _tc_key(title, company) -> tuple[str, str]:
        """Build the (title, company) dedup key.

        One helper serves both stored and incoming rows so the normalisation
        (lower-case, strip, title cut to 80 chars) cannot drift apart.
        """
        return (_s(title).lower().strip()[:80], _s(company).lower().strip())

    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = db_existing_urls(db_path)
    import sqlite3 as _sqlite3
    _conn = _sqlite3.connect(db_path)
    try:
        existing_tc = {
            _tc_key(r[0], r[1])
            for r in _conn.execute("SELECT title, company FROM jobs").fetchall()
        }
    finally:
        _conn.close()

    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])
    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False
        # Global blocklist — checked before the per-profile keyword filter
        if _is_blocklisted(job_row, blocklist):
            return False
        # _s() keeps a None/NaN title or description from crashing .lower()
        title_lower = _s(job_row.get("title")).lower()
        desc_lower = _s(job_row.get("description")).lower()
        exclude_kw = job_row.get("_exclude_kw", [])
        if any(kw in title_lower or kw in desc_lower for kw in exclude_kw):
            return False
        tc_key = _tc_key(job_row.get("title"), job_row.get("company"))
        if tc_key in existing_tc:
            return False
        existing_tc.add(tc_key)
        insert_job(db_path, {
            "title":       job_row.get("title", ""),
            "company":     job_row.get("company", ""),
            "url":         url,
            "source":      job_row.get("source", source_label),
            "location":    job_row.get("location", ""),
            "is_remote":   _flag(job_row.get("is_remote", False)),
            "salary":      job_row.get("salary", ""),
            "description": job_row.get("description", ""),
            "date_found":  datetime.now().isoformat()[:10],
        })
        existing_urls.add(url)
        return True

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        boards = profile.get("boards", [])
        custom_boards = profile.get("custom_boards", [])
        exclude_kw = [kw.lower() for kw in profile.get("exclude_keywords", [])]
        results_per_board = profile.get("results_per_board", 25)
        for location in profile["locations"]:
            # ── JobSpy boards ──────────────────────────────────────────────────
            if boards:
                print(f"  [jobspy] {location} — boards: {', '.join(boards)}")
                try:
                    jobs: pd.DataFrame = scrape_jobs(
                        site_name=boards,
                        search_term=" OR ".join(f'"{t}"' for t in profile["titles"]),
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    print(f"  [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    # Best effort: one failing scrape must not abort the run.
                    print(f"  [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()
                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = str(job.get("job_url", "") or "")
                    if not url or url in ("nan", "None"):
                        continue
                    job_dict = job.to_dict()
                    # Build salary string from JobSpy numeric fields
                    min_amt = job_dict.get("min_amount")
                    max_amt = job_dict.get("max_amount")
                    salary_str = ""
                    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
                        salary_str = f"${int(min_amt):,} – ${int(max_amt):,}"
                    elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""):
                        salary_str = str(job_dict["salary_source"])
                    row = {
                        "url":         url,
                        "title":       _s(job_dict.get("title")),
                        "company":     _s(job_dict.get("company")),
                        "source":      _s(job_dict.get("site")),
                        "location":    _s(job_dict.get("location")),
                        "is_remote":   _flag(job_dict.get("is_remote", False)),
                        "salary":      salary_str,
                        "description": _s(job_dict.get("description")),
                        "_exclude_kw": exclude_kw,
                    }
                    if _insert_if_new(row, _s(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f"    + {row['title']} @ {row['company']} [{row['source']}]")
                print(f"  [jobspy] {jobspy_new} new listings from {location}")
            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f"  [{board_name}] Unknown scraper — skipping (not in CUSTOM_SCRAPERS registry)")
                    continue
                print(f"  [{board_name}] {location} — fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board)
                except Exception as exc:
                    # Best effort: one failing board must not abort the run.
                    print(f"  [{board_name}] ERROR: {exc}")
                    custom_jobs = []
                print(f"  [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f"    + {job.get('title')} @ {job.get('company')} [{board_name}]")
                print(f"  [{board_name}] {board_new} new listings from {location}")
    print(f"\n[discover] Done — {new_count} new listings staged total.")
    return new_count
 # Script entry point: one discovery pass with the defaults
 # (DEFAULT_DB, notion_push=False).
 if __name__ == "__main__":
    run_discovery()
--- a/scripts/enrich_descriptions.py
+++ b/scripts/enrich_descriptions.py
@ -0,0 +1,284 @@
 # scripts/enrich_descriptions.py
 """
 Post-discovery enrichment: retry Glassdoor job description fetches that
 returned empty/null during the initial scrape (usually rate-limit 429s or
 expired listings mid-batch).
 Fetches descriptions one at a time with a configurable delay between
 requests to stay under Glassdoor's rate limit.
 Usage:
    conda run -n job-seeker python scripts/enrich_descriptions.py
    conda run -n job-seeker python scripts/enrich_descriptions.py --dry-run
    conda run -n job-seeker python scripts/enrich_descriptions.py --delay 2.0
 """
 import re
 import sqlite3
 import sys
 import time
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.db import DEFAULT_DB, init_db
 DELAY_SECS = 1.5  # seconds between description fetches
 def _extract_job_id(url: str) -> str | None:
    """Pull the Glassdoor listing ID from a job URL (…?jl=1234567890)."""
    m = re.search(r"jl=(\d+)", url or "")
    return m.group(1) if m else None
 def _setup_scraper():
    """
    Create a Glassdoor scraper instance initialised just enough to call
    _fetch_job_description() — skips the full job-search setup.

    Returns the configured ``Glassdoor`` scraper object.

    NOTE(review): this depends on jobspy internals (a private method and the
    module-level ``headers`` dict) — presumably sensitive to the installed
    jobspy version; confirm when upgrading.
    """
    # Function-scope imports: jobspy is only loaded when this helper is called.
    from jobspy.glassdoor import Glassdoor
    from jobspy.glassdoor.constant import fallback_token, headers
    from jobspy.model import ScraperInput, Site
    from jobspy.util import create_session
    scraper = Glassdoor()
    scraper.base_url = "https://www.glassdoor.com/"
    # Session is assigned before the token request — presumably
    # _get_csrf_token() uses scraper.session; verify against jobspy.
    scraper.session = create_session(has_retry=True)
    token = scraper._get_csrf_token()
    # Mutates jobspy's shared headers dict; falls back to the library's
    # bundled token when none could be fetched.
    headers["gd-csrf-token"] = token if token else fallback_token
    scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
    return scraper
 def enrich_glassdoor_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
 ) -> dict:
    """
    Find Glassdoor jobs with missing descriptions and re-fetch them.

    Args:
        db_path: SQLite staging database.
        dry_run: Fetch and report, but do not write descriptions back.
        delay: Seconds to sleep after each fetch attempt; <= 0 disables it.

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            """SELECT id, url, company, title FROM jobs
               WHERE source = 'glassdoor'
                 AND (description IS NULL OR TRIM(description) = '')
               ORDER BY id ASC"""
        ).fetchall()
    finally:
        # Close even when the query fails so the handle is not leaked.
        conn.close()
    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}
    if not rows:
        print("[enrich] No Glassdoor jobs missing descriptions.")
        return result
    print(f"[enrich] {len(rows)} Glassdoor job(s) missing descriptions — fetching…")
    try:
        scraper = _setup_scraper()
    except Exception as e:
        # Without a scraper nothing can be fetched: report every row as failed.
        msg = f"Glassdoor scraper init failed: {e}"
        result["errors"].append(msg)
        result["failed"] = len(rows)
        print(f"[enrich] ERROR — {msg}")
        return result
    for db_id, url, company, title in rows:
        job_id = _extract_job_id(url)
        if not job_id:
            msg = f"job #{db_id}: cannot extract listing ID from URL: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            # No request was made, so no rate-limit pause is needed.
            continue
        try:
            description = scraper._fetch_job_description(int(job_id))
            if description and description.strip():
                if not dry_run:
                    upd = sqlite3.connect(db_path)
                    try:
                        upd.execute(
                            "UPDATE jobs SET description = ? WHERE id = ?",
                            (description, db_id),
                        )
                        upd.commit()
                    finally:
                        # A failed write is caught below; still release the handle.
                        upd.close()
                tag = "[DRY-RUN] " if dry_run else ""
                print(f"[enrich] {tag}{company} — {title}: {len(description)} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] {company} — {title}: empty response (expired listing?)")
                result["failed"] += 1
        except Exception as e:
            # Best effort per listing: record the failure and move on.
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")
        if delay > 0:
            time.sleep(delay)
    return result
 def enrich_all_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
 ) -> dict:
    """
    Find ALL jobs with missing/empty descriptions (any source) and re-fetch them.
    Uses scrape_job_url for every source — it handles LinkedIn, Indeed, Glassdoor,
    Adzuna, The Ladders, and any generic URL via JSON-LD / og: tags.

    Args:
        db_path: SQLite staging database.
        dry_run: List what would be fetched without calling scrape_job_url;
            every listed job is counted as succeeded with 0 chars.
        delay: Seconds to sleep after each attempt; <= 0 disables it.

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    from scripts.scrape_url import scrape_job_url
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            """SELECT id, url, company, title, source FROM jobs
               WHERE (description IS NULL OR TRIM(description) = '')
                 AND url IS NOT NULL AND url != ''
               ORDER BY source, id ASC"""
        ).fetchall()
    finally:
        # Close even when the query fails so the handle is not leaked.
        conn.close()
    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}
    if not rows:
        print("[enrich] No jobs with missing descriptions.")
        return result
    print(f"[enrich] {len(rows)} job(s) missing descriptions — fetching…")
    for db_id, url, company, title, source in rows:
        if not url.startswith("http"):
            # Record why the row failed instead of only bumping the counter.
            msg = f"job #{db_id}: not an http(s) URL, skipping: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue
        tag = "[DRY-RUN] " if dry_run else ""
        try:
            fields = {} if dry_run else scrape_job_url(db_path, db_id)
            if fields or dry_run:
                desc_len = len(fields.get("description", "") or "")
                print(f"[enrich] {tag}[{source}] {company} — {title}: {desc_len} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] [{source}] {company} — {title}: no data returned")
                result["failed"] += 1
        except Exception as e:
            # Best effort per listing: record the failure and move on.
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")
        if delay > 0:
            time.sleep(delay)
    return result
 def enrich_craigslist_fields(
    db_path: Path = DEFAULT_DB,
    job_id: int = None,
 ) -> dict:
    """
    Use LLM to extract company name and salary from a Craigslist job description.
    Called after scrape_url populates the description for a craigslist job.
    Only runs when: source='craigslist', company='', description non-empty.

    Any extracted values are written back with ``update_job_fields``.

    Returns dict containing only the non-empty extracted values, under the
    keys 'company' and/or 'salary'.  Returns {} when the job does not qualify,
    the LLM call fails, or its reply is not a JSON object.
    """
    import json
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT id, description, company, source FROM jobs WHERE id=?", (job_id,)
        ).fetchone()
    finally:
        # Close even when the query fails so the handle is not leaked.
        conn.close()
    if not row:
        return {}
    if row["source"] != "craigslist":
        return {}
    if row["company"]:  # already populated
        return {}
    if not (row["description"] or "").strip():
        return {}
    from scripts.llm_router import LLMRouter
    prompt = (
        "Extract the following from this job posting. "
        "Return JSON only, no commentary.\n\n"
        '{"company": "<company name or empty string>", '
        '"salary": "<salary/compensation or empty string>"}\n\n'
        f"Posting:\n{row['description'][:3000]}"
    )
    try:
        router = LLMRouter()
        raw = router.complete(prompt)
    except Exception as exc:
        print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}")
        return {}
    try:
        # Strip Markdown code fences the model may wrap around the JSON.
        # ``raw or ""``: a None reply becomes a parse failure, not a TypeError.
        clean = re.sub(r"```(?:json)?|```", "", raw or "").strip()
        fields = json.loads(clean)
    except (json.JSONDecodeError, ValueError):
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}
    if not isinstance(fields, dict):
        # Valid JSON but not an object (list, string, number): nothing to read.
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}
    extracted = {}
    for k in ("company", "salary"):
        value = fields.get(k) or ""
        # The model may answer with a bare number (e.g. a salary) instead of a
        # string; accept scalars, ignore nested structures and booleans.
        if isinstance(value, bool) or not isinstance(value, (str, int, float)):
            continue
        text = str(value).strip()
        if text:
            extracted[k] = text
    if extracted:
        from scripts.db import update_job_fields
        update_job_fields(db_path, job_id, extracted)
        print(f"[enrich_craigslist] job {job_id}: "
              f"company={extracted.get('company', '—')} "
              f"salary={extracted.get('salary', '—')}")
    return extracted
 if __name__ == "__main__":
    # Command-line entry point: pick the enrichment mode, run it, print a summary.
    import argparse
    cli = argparse.ArgumentParser(
        description="Re-fetch missing job descriptions (all sources)"
    )
    cli.add_argument(
        "--glassdoor-only",
        action="store_true",
        help="Only re-fetch Glassdoor listings (legacy behaviour)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be fetched without saving",
    )
    cli.add_argument(
        "--delay",
        type=float,
        default=DELAY_SECS,
        help=f"Seconds between requests (default: {DELAY_SECS})",
    )
    opts = cli.parse_args()
    enrich = enrich_glassdoor_descriptions if opts.glassdoor_only else enrich_all_descriptions
    summary = enrich(dry_run=opts.dry_run, delay=opts.delay)
    # The error count is appended only when at least one error was recorded.
    error_note = f", {len(summary['errors'])} error(s)" if summary["errors"] else ""
    print(
        f"\n[enrich] Done — {summary['succeeded']} fetched, {summary['failed']} failed"
        + error_note
    )
--- a/scripts/finetune_local.py
+++ b/scripts/finetune_local.py
@ -0,0 +1,248 @@
 #!/usr/bin/env python3
 # scripts/finetune_local.py
 """
 Local LoRA fine-tune on Alex's cover letter corpus.
 No HuggingFace account or internet required after the base model is cached.
 Usage:
    conda run -n ogma python scripts/finetune_local.py
    conda run -n ogma python scripts/finetune_local.py --model unsloth/Llama-3.2-3B-Instruct
    conda run -n ogma python scripts/finetune_local.py --epochs 15 --rank 16
 After training, follow the printed instructions to load the model into Ollama.
 """
 import argparse
 import json
 import os
 import sys
 from pathlib import Path
 # Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained
 # pins every layer to GPU 0, avoiding the accelerate None-device bug that
 # occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation.
 # Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation.
 # setdefault: a CUDA_VISIBLE_DEVICES value already present in the environment wins.
 os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
 # ── Config ────────────────────────────────────────────────────────────────────
 # NOTE: the three paths below are hard-coded absolute locations.
 DEFAULT_MODEL   = "unsloth/Llama-3.2-3B-Instruct"   # safe on 8 GB VRAM
 # Training corpus: one JSON object per line, read with "instruction" and "output" keys.
 LETTERS_JSONL   = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl")
 # Trainer output directory; the adapter and the Modelfile are also written here.
 OUTPUT_DIR      = Path("/Library/Documents/JobSearch/training_data/finetune_output")
 # Destination directory for the GGUF export.
 GGUF_DIR        = Path("/Library/Documents/JobSearch/training_data/gguf")
 # Model name used in the GGUF path and the printed `ollama create` / `ollama run` commands.
 OLLAMA_NAME     = "alex-cover-writer"
 # System prompt used for every training example and written into the Ollama Modelfile.
 SYSTEM_PROMPT = (
    "You are Alex Rivera's personal cover letter writer. "
    "Write professional, warm, and results-focused cover letters in Alex's voice. "
    "Draw on her background in customer success, technical account management, "
    "and revenue operations. Be specific and avoid generic filler."
 )
 # ── Args ──────────────────────────────────────────────────────────────────────
 # Parsed at module level: this file is a script, not an importable module.
 parser = argparse.ArgumentParser()
 parser.add_argument("--model",  default=DEFAULT_MODEL, help="Base model (HF repo id or local path)")
 parser.add_argument("--epochs", type=int, default=10,  help="Training epochs (default: 10)")
 parser.add_argument("--rank",   type=int, default=16,  help="LoRA rank (default: 16)")
 parser.add_argument("--batch",  type=int, default=2,   help="Per-device batch size (default: 2)")
 parser.add_argument("--no-gguf", action="store_true",  help="Skip GGUF export")
 parser.add_argument("--max-length", type=int, default=1024, help="Max token length (default: 1024)")
 args = parser.parse_args()
 # Startup banner echoing the effective settings.
 print(f"\n{'='*60}")
 print(f"  Alex Cover Letter Fine-Tuner")
 print(f"  Base model : {args.model}")
 print(f"  Epochs     : {args.epochs}")
 print(f"  LoRA rank  : {args.rank}")
 print(f"  Dataset    : {LETTERS_JSONL}")
 print(f"{'='*60}\n")
 # ── Load dataset ──────────────────────────────────────────────────────────────
 # Abort early with a hint when the corpus has not been generated yet.
 if not LETTERS_JSONL.exists():
    sys.exit(
        f"ERROR: Dataset not found at {LETTERS_JSONL}\n"
        "Run: conda run -n job-seeker python scripts/prepare_training_data.py"
    )
 # One JSON object per non-blank line.
 jsonl_lines = LETTERS_JSONL.read_text().splitlines()
 records = [json.loads(entry) for entry in jsonl_lines if entry.strip()]
 print(f"Loaded {len(records)} training examples.")
 # Convert to chat format expected by SFTTrainer
 def to_messages(rec: dict) -> dict:
    """Wrap one training record as a three-turn chat (system, user, assistant).

    ``rec`` must provide "instruction" (user turn) and "output" (assistant turn).
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", rec["instruction"]),
        ("assistant", rec["output"]),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}
 chat_data = [to_messages(r) for r in records]
 # ── Load model with unsloth ────────────────────────────────────────────────────
 # unsloth is optional: when the import fails the script falls back to plain
 # transformers + PEFT (and the GGUF export later is skipped).
 try:
    from unsloth import FastLanguageModel
    USE_UNSLOTH = True
 except ImportError:
    USE_UNSLOTH = False
    print("WARNING: unsloth not found — falling back to standard transformers + PEFT")
    print("  Install: pip install 'unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git'")
 import torch
 if USE_UNSLOTH:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name   = args.model,
        max_seq_length = args.max_length,
        load_in_4bit = True,          # QLoRA — fits 7-9B in 8 GB VRAM
        dtype        = None,          # auto-detect
        device_map   = {"": 0},       # pin everything to GPU 0; avoids accelerate None-device bug
    )
    # Attach LoRA adapters to the attention and MLP projections; alpha is 2x the rank.
    model = FastLanguageModel.get_peft_model(
        model,
        r              = args.rank,
        lora_alpha     = args.rank * 2,
        lora_dropout   = 0,      # 0 = full unsloth kernel patching (faster)
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
        bias           = "none",
        use_gradient_checkpointing = "unsloth",
    )
 else:
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, TaskType
    # 4-bit quantised load with bfloat16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # NOTE(review): this path uses device_map="auto", unlike the unsloth path's
    # {"": 0} — presumably safe because CUDA_VISIBLE_DEVICES defaults to "0"; confirm.
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        quantization_config=bnb_config,
        device_map="auto",
    )
    # No target_modules given here, and dropout is 0.05 rather than 0 as in
    # the unsloth path.
    lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank * 2,
        lora_dropout=0.05,
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
 # ── Build HF Dataset ──────────────────────────────────────────────────────────
 from datasets import Dataset
 raw = Dataset.from_list(chat_data)
 # 90/10 train/eval split; the fixed seed keeps the split reproducible.
 split = raw.train_test_split(test_size=0.1, seed=42)
 train_ds = split["train"]
 eval_ds  = split["test"]
 print(f"Train: {len(train_ds)}  Eval: {len(eval_ds)}")
 # formatting_func must ALWAYS return a list of strings.
 # Unsloth tests it with a single example dict; during training it gets batches.
 # Gemma 2 has no "system" role — fold it into the first user turn.
 def _apply_template(msgs):
    """Render one conversation to text with the tokenizer's chat template.

    A leading system turn is removed first; when a user turn follows it, the
    system text is prepended to that user turn, separated by a blank line.
    """
    turns = list(msgs)
    if turns and turns[0]["role"] == "system":
        system_turn, *turns = turns
        system_text = system_turn["content"]
        if turns and turns[0]["role"] == "user":
            merged = f"{system_text}\n\n{turns[0]['content']}"
            turns[0] = {"role": "user", "content": merged}
    return tokenizer.apply_chat_template(turns, tokenize=False, add_generation_prompt=False)
 def formatting_func(example):
    """Return the templated text for one example or a batch, always as a list.

    ``example["messages"]`` is either a single conversation (a list of role
    dicts) or a batch (a list of such lists); the first element's type tells
    the two apart.
    """
    messages = example["messages"]
    is_single = bool(messages) and isinstance(messages[0], dict)
    conversations = [messages] if is_single else messages
    return [_apply_template(conversation) for conversation in conversations]
 # ── Train ─────────────────────────────────────────────────────────────────────
 from trl import SFTTrainer, SFTConfig
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    formatting_func=formatting_func,
    args=SFTConfig(
        output_dir                  = str(OUTPUT_DIR),
        num_train_epochs            = args.epochs,
        per_device_train_batch_size = args.batch,
        # Targets an effective batch of about 8 samples per optimizer step
        # (exactly 8 when --batch divides 8); never fewer than 1 accumulation step.
        gradient_accumulation_steps = max(1, 8 // args.batch),
        learning_rate               = 2e-4,
        warmup_ratio                = 0.1,
        lr_scheduler_type           = "cosine",
        # Exactly one of fp16/bf16 is enabled, chosen by hardware bf16 support.
        fp16                        = not torch.cuda.is_bf16_supported(),
        bf16                        = torch.cuda.is_bf16_supported(),
        logging_steps               = 5,
        # Evaluate and checkpoint once per epoch; the two strategies match,
        # as load_best_model_at_end needs.
        eval_strategy               = "epoch",
        save_strategy               = "epoch",
        load_best_model_at_end      = True,
        max_length                  = args.max_length,
        report_to                   = "none",
        push_to_hub                 = False,        # local only
    ),
 )
 print("\nStarting training…")
 trainer.train()
 print("Training complete.")
 # ── Save adapter ──────────────────────────────────────────────────────────────
 # The LoRA adapter and tokenizer are always saved, whether or not GGUF export runs.
 adapter_path = OUTPUT_DIR / "adapter"
 model.save_pretrained(str(adapter_path))
 tokenizer.save_pretrained(str(adapter_path))
 print(f"\nLoRA adapter saved to: {adapter_path}")
 # ── GGUF export ───────────────────────────────────────────────────────────────
 # Only possible on the unsloth path (save_pretrained_gguf is called on its model).
 if not args.no_gguf and USE_UNSLOTH:
    GGUF_DIR.mkdir(parents=True, exist_ok=True)
    gguf_path = GGUF_DIR / f"{OLLAMA_NAME}.gguf"
    print(f"\nExporting GGUF → {gguf_path} …")
    model.save_pretrained_gguf(
        str(GGUF_DIR / OLLAMA_NAME),
        tokenizer,
        quantization_method="q4_k_m",
    )
    # unsloth names the file automatically — find it.  Search recursively,
    # because the export target above is a path below GGUF_DIR, and take the
    # most recently modified file: glob order is arbitrary, so the first hit
    # could be a stale export from an earlier run.
    gguf_files = list(GGUF_DIR.rglob("*.gguf"))
    if gguf_files:
        gguf_path = max(gguf_files, key=lambda p: p.stat().st_mtime)
        print(f"GGUF written: {gguf_path}")
    else:
        print("GGUF export may have succeeded — check GGUF_DIR above.")
 else:
    gguf_path = None
 # ── Print next steps ──────────────────────────────────────────────────────────
 print(f"\n{'='*60}")
 print("  DONE — next steps to load into Ollama:")
 print(f"{'='*60}")
 # With a GGUF file on disk: write an Ollama Modelfile and print the commands
 # to register and test the model.  Otherwise print manual conversion steps.
 if gguf_path and gguf_path.exists():
    modelfile = OUTPUT_DIR / "Modelfile"
    # The Modelfile embeds the same SYSTEM_PROMPT used for the training examples.
    modelfile.write_text(f"""FROM {gguf_path}
 SYSTEM \"\"\"
 {SYSTEM_PROMPT}
 \"\"\"
 PARAMETER temperature 0.7
 PARAMETER top_p 0.9
 PARAMETER num_ctx 32768
 """)
    print(f"\n1. Modelfile written to: {modelfile}")
    print(f"\n2. Create the Ollama model:")
    print(f"     ollama create {OLLAMA_NAME} -f {modelfile}")
    print(f"\n3. Test it:")
    print(f"     ollama run {OLLAMA_NAME} 'Write a cover letter for a Senior Customer Success Manager position at Acme Corp.'")
    print(f"\n4. Update llm.yaml to use '{OLLAMA_NAME}:latest' as the ollama model,")
    print(f"   then pick it in Settings → LLM Backends → Ollama → Model.")
 else:
    # Reached with --no-gguf, without unsloth, or when no GGUF file exists at gguf_path.
    print(f"\n  Adapter only (no GGUF). To convert manually:")
    print(f"  1. Merge adapter:")
    print(f"       conda run -n ogma python -c \"")
    print(f"         from peft import AutoPeftModelForCausalLM")
    print(f"         m = AutoPeftModelForCausalLM.from_pretrained('{adapter_path}')")
    print(f"         m.merge_and_unload().save_pretrained('{OUTPUT_DIR}/merged')\"")
    print(f"  2. Convert to GGUF using textgen env's convert_hf_to_gguf.py")
    print(f"  3. ollama create {OLLAMA_NAME} -f Modelfile")
 print()
--- a/scripts/generate_cover_letter.py
+++ b/scripts/generate_cover_letter.py
@ -0,0 +1,224 @@
 # scripts/generate_cover_letter.py
 """
 Generate a cover letter in Alex's voice using few-shot examples from her corpus.
 Usage:
    conda run -n job-seeker python scripts/generate_cover_letter.py \
        --title "Director of Customer Success" \
        --company "Acme Corp" \
        --description "We are looking for..."
    Or pass a staging DB job ID:
        conda run -n job-seeker python scripts/generate_cover_letter.py --job-id 42
 """
 import argparse
 import re
 import sys
 from pathlib import Path
 # Directory globbed by load_corpus() for past cover letters (hard-coded absolute path).
 LETTERS_DIR = Path("/Library/Documents/JobSearch")
 # Filename pattern for corpus letters; load_corpus() derives the company name
 # from whatever precedes "Cover Letter" in the file stem.
 LETTER_GLOB = "*Cover Letter*.md"
 # Background injected into every prompt so the model has Alex's facts
 # (build_prompt() places this, stripped, at the very start of the prompt).
 SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader.
 Background:
 - 6+ years in customer success, technical account management, and CS leadership
 - Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95
 - Also founder of M3 Consulting, a CS advisory practice for SaaS startups
 - Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere
 - Based in San Francisco Bay Area; open to remote/hybrid
 - Pronouns: any
 Voice guidelines:
 - Warm, confident, and specific — never generic
 - Opens with "I'm delighted/thrilled to apply for [role] at [company]."
 - 3–4 focused paragraphs, ~250–350 words total
 - Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric)
 - Para 3: genuine connection to THIS company's mission/product
 - Closes with "Thank you for considering my application." + warm sign-off
 - Never use: "I am writing to express my interest", "passionate about making a difference",
  "I look forward to hearing from you", or any hollow filler phrases
 """
 # ── Mission-alignment detection ───────────────────────────────────────────────
 # When a company/JD signals one of these preferred industries, the cover letter
 # prompt injects a hint so Para 3 can reflect genuine personal connection.
 # This does NOT disclose any personal disability or family information.
 _MISSION_SIGNALS: dict[str, list[str]] = {
    "music": [
        "music", "spotify", "tidal", "soundcloud", "bandcamp", "apple music",
        "distrokid", "cd baby", "landr", "beatport", "reverb", "vinyl",
        "streaming", "artist", "label", "live nation", "ticketmaster", "aeg",
        "songkick", "concert", "venue", "festival", "audio", "podcast",
        "studio", "record", "musician", "playlist",
    ],
    "animal_welfare": [
        "animal", "shelter", "rescue", "humane society", "spca", "aspca",
        "veterinary", "vet ", "wildlife", "pet ", "adoption", "foster",
        "dog", "cat", "feline", "canine", "sanctuary", "zoo",
    ],
    "education": [
        "education", "school", "learning", "student", "edtech", "classroom",
        "curriculum", "tutoring", "academic", "university", "kids", "children",
        "youth", "literacy", "khan academy", "duolingo", "chegg", "coursera",
        "instructure", "canvas lms", "clever", "district", "teacher",
        "k-12", "k12", "grade", "pedagogy",
    ],
 }
 _MISSION_NOTES: dict[str, str] = {
    "music": (
        "This company is in the music industry, which is one of Alex's genuinely "
        "ideal work environments — she has a real personal passion for the music scene. "
        "Para 3 should warmly and specifically reflect this authentic alignment, not as "
        "a generic fan statement, but as an honest statement of where she'd love to apply "
        "her CS skills."
    ),
    "animal_welfare": (
        "This organization works in animal welfare/rescue — one of Alex's dream-job "
        "domains and a genuine personal passion. Para 3 should reflect this authentic "
        "connection warmly and specifically, tying her CS skills to this mission."
    ),
    "education": (
        "This company works in children's education or EdTech — one of Alex's ideal "
        "work domains, reflecting genuine personal values around learning and young people. "
        "Para 3 should reflect this authentic connection specifically and warmly."
    ),
 }
 def detect_mission_alignment(company: str, description: str) -> str | None:
    """Return a mission hint string if company/JD matches a preferred industry, else None."""
    text = f"{company} {description}".lower()
    for industry, signals in _MISSION_SIGNALS.items():
        if any(sig in text for sig in signals):
            return _MISSION_NOTES[industry]
    return None
 def load_corpus() -> list[dict]:
    """Read every cover letter matching LETTER_GLOB under LETTERS_DIR.
    Files are visited in sorted path order and empty ones are skipped. Each
    entry is a dict with keys ``path``, ``company`` and ``text``.
    """
    letters: list[dict] = []
    for letter_path in sorted(LETTERS_DIR.glob(LETTER_GLOB)):
        body = letter_path.read_text(encoding="utf-8", errors="ignore").strip()
        if body:
            # Company comes from the filename: "Tailscale Cover Letter.md" → "Tailscale"
            name = re.sub(r"\s*Cover Letter.*", "", letter_path.stem, flags=re.IGNORECASE)
            letters.append({"path": letter_path, "company": name.strip(), "text": body})
    return letters
 def find_similar_letters(job_description: str, corpus: list[dict], top_k: int = 3) -> list[dict]:
    """Return the top_k letters most similar to the job description by TF-IDF cosine sim.
    ``corpus`` entries must carry a ``text`` key. The result is ordered from
    most to least similar; an empty corpus yields an empty list.
    """
    # Guard before the imports: with nothing to rank there is no reason to
    # require (or pay the import cost of) scikit-learn.
    if not corpus:
        return []
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    # Row 0 is the query; rows 1..n are the corpus letters in their given order.
    docs = [job_description] + [c["text"] for c in corpus]
    vectorizer = TfidfVectorizer(stop_words="english", max_features=500)
    tfidf = vectorizer.fit_transform(docs)
    sims = cosine_similarity(tfidf[0:1], tfidf[1:])[0]
    ranked = sorted(zip(sims, corpus), key=lambda x: x[0], reverse=True)
    return [entry for _, entry in ranked[:top_k]]
 def build_prompt(
    title: str,
    company: str,
    description: str,
    examples: list[dict],
    mission_hint: str | None = None,
 ) -> str:
    parts = [SYSTEM_CONTEXT.strip(), ""]
    if examples:
        parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n")
        for i, ex in enumerate(examples, 1):
            parts.append(f"--- Example {i} ({ex['company']}) ---")
            parts.append(ex["text"])
            parts.append("")
        parts.append("=== END EXAMPLES ===\n")
    if mission_hint:
        parts.append(f"⭐ Mission alignment note (for Para 3): {mission_hint}\n")
    parts.append(f"Now write a new cover letter for:")
    parts.append(f"  Role: {title}")
    parts.append(f"  Company: {company}")
    if description:
        snippet = description[:1500].strip()
        parts.append(f"\nJob description excerpt:\n{snippet}")
    parts.append("\nWrite the full cover letter now:")
    return "\n".join(parts)
 def generate(title: str, company: str, description: str = "", _router=None) -> str:
    """Produce a cover letter for the given role and return its text.
    Loads the letter corpus, picks style examples similar to the description
    (or to "title company" when no description is given), adds a mission hint
    when one is detected, and sends the assembled prompt to the router.
    ``_router`` lets callers (e.g. tests) supply a pre-built router; when it is
    None an LLMRouter is imported and created on demand. Progress lines go to stderr.
    """
    letters = load_corpus()
    query = description or f"{title} {company}"
    examples = find_similar_letters(query, letters)
    mission_hint = detect_mission_alignment(company, description)
    if mission_hint:
        print(f"[cover-letter] Mission alignment detected for {company}", file=sys.stderr)
    prompt = build_prompt(title, company, description, examples, mission_hint=mission_hint)
    router = _router
    if router is None:
        # Lazy import: make the project root importable, then build the real router.
        sys.path.insert(0, str(Path(__file__).parent.parent))
        from scripts.llm_router import LLMRouter
        router = LLMRouter()
    example_companies = [e["company"] for e in examples]
    print(f"[cover-letter] Generating for: {title} @ {company}", file=sys.stderr)
    print(f"[cover-letter] Style examples: {example_companies}", file=sys.stderr)
    return router.complete(prompt).strip()
 def main() -> None:
    """CLI entry point: build a cover letter from flags or from a staging-DB job.
    ``--title``/``--company``/``--description`` may be given directly, or
    ``--job-id`` loads the row from the jobs table; explicit flags take
    precedence over DB values. The letter is printed to stdout unless
    ``--output`` names a file. Exits with status 1 when the job id is unknown.
    """
    parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice")
    parser.add_argument("--title", help="Job title")
    parser.add_argument("--company", help="Company name")
    parser.add_argument("--description", default="", help="Job description text")
    parser.add_argument("--job-id", type=int, help="Load job from staging.db by ID")
    parser.add_argument("--output", help="Write output to this file path")
    args = parser.parse_args()
    title, company, description = args.title, args.company, args.description
    if args.job_id is not None:
        import sqlite3
        from contextlib import closing
        # Running this file directly puts scripts/ (not the project root) on
        # sys.path, so make the `scripts` package importable before using it.
        project_root = str(Path(__file__).parent.parent)
        if project_root not in sys.path:
            sys.path.insert(0, project_root)
        from scripts.db import DEFAULT_DB
        # closing() guarantees the connection is released even if the query raises.
        with closing(sqlite3.connect(DEFAULT_DB)) as conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone()
        if not row:
            print(f"No job with id={args.job_id} in staging.db", file=sys.stderr)
            sys.exit(1)
        job = dict(row)
        # `or ""` also covers NULL columns, which come back as None.
        title = title or job.get("title") or ""
        company = company or job.get("company") or ""
        description = description or job.get("description") or ""
    if not title or not company:
        parser.error("--title and --company are required (or use --job-id)")
    letter = generate(title, company, description)
    if args.output:
        # Letters routinely contain non-ASCII punctuation; don't depend on the locale encoding.
        Path(args.output).write_text(letter, encoding="utf-8")
        print(f"Saved to {args.output}", file=sys.stderr)
    else:
        print(letter)
 if __name__ == "__main__":
    main()
--- a/scripts/imap_sync.py
+++ b/scripts/imap_sync.py
@ -0,0 +1,906 @@
 # scripts/imap_sync.py
 """
 IMAP email sync — associates recruitment emails with job applications.
 Safety / privacy design:
  - Only imports emails that pass BOTH checks:
      1. Sender or subject contains the exact company name (or derived domain)
      2. Subject contains at least one recruitment keyword
  - Fuzzy / partial company name matches are rejected
  - Emails between known personal contacts are never imported
  - Only the INBOX and Sent folders are touched, plus the optional `todo_label` folder when one is configured
  - Credentials stored in config/email.yaml (gitignored)
 Config: config/email.yaml  (see config/email.yaml.example)
 Usage:
    conda run -n job-seeker python scripts/imap_sync.py
    conda run -n job-seeker python scripts/imap_sync.py --job-id 42
    conda run -n job-seeker python scripts/imap_sync.py --dry-run
 """
 import email
 import imaplib
 import re
 import sys
 from datetime import datetime, timedelta
 from email.header import decode_header as _raw_decode_header
 from pathlib import Path
 from typing import Optional
 from urllib.parse import urlparse
 import yaml
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.db import DEFAULT_DB, init_db, get_interview_jobs, add_contact, get_contacts
 from scripts.llm_router import LLMRouter
 # Shared router instance, created once at import time and used by both
 # classify_stage_signal() and extract_lead_info().
 _CLASSIFIER_ROUTER = LLMRouter()
 # System prompt for classify_stage_signal(): asks for exactly one category name.
 _CLASSIFY_SYSTEM = (
    "You are an email classifier. Classify the recruitment email into exactly ONE of these categories:\n"
    "  interview_scheduled, offer_received, rejected, positive_response, survey_received, neutral\n\n"
    "Rules:\n"
    "- interview_scheduled: recruiter wants to book a call/interview\n"
    "- offer_received: job offer is being extended\n"
    "- rejected: explicitly not moving forward\n"
    "- positive_response: interested/impressed but no interview booked yet\n"
    "- survey_received: link or request to complete a survey, assessment, or questionnaire\n"
    "- neutral: auto-confirmation, generic update, no clear signal\n\n"
    "Respond with ONLY the category name. No explanation."
 )
 # The six labels the classifier may return, in the order classify_stage_signal()
 # searches the model response for them.
 _CLASSIFY_LABELS = [
    "interview_scheduled", "offer_received", "rejected",
    "positive_response", "survey_received", "neutral",
 ]
 # Email credentials/config file, resolved relative to the project root.
 CONFIG_PATH = Path(__file__).parent.parent / "config" / "email.yaml"
 # ── Recruitment keyword filter ────────────────────────────────────────────────
 # An email must match at least one of these in its subject line to be imported.
 # _has_recruitment_keyword() tests each entry as a lowercase substring of the
 # subject, so short entries (e.g. "role") also hit longer words containing them.
 RECRUITMENT_KEYWORDS = {
    # Application lifecycle
    "interview", "application", "applicant", "apply", "applied",
    "position", "opportunity", "role", "opening", "vacancy",
    "offer", "offer letter", "schedule", "scheduling",
    "screening", "screen", "phone screen", "video call",
    "assessment", "hiring", "hired", "recruiter", "recruitment",
    "talent", "candidate", "recruiting", "next steps", "follow up", "follow-up",
    "onboarding", "start date", "background check", "reference",
    "congratulations", "unfortunately", "decision", "update",
    # Job board / ATS notifications
    "viewed your profile", "interested in your background",
    "job alert", "new job", "job match", "job opportunity",
    "your application", "application received", "application status",
    "application update", "we received", "thank you for applying",
    "thanks for applying", "moved forward", "moving forward",
    "not moving forward", "decided to", "other candidates",
    "keep your resume", "keep you in mind",
    # Recruiter outreach
    "reaching out", "i came across", "your experience",
    "connect with you", "exciting opportunity", "great fit",
    "perfect fit", "right fit", "strong fit", "ideal candidate",
 }
 # ── Rejection / ATS-confirm phrase filter ─────────────────────────────────────
 # Checked against subject + first 1500 chars of body BEFORE calling any LLM.
 # Covers the cases phi3:mini consistently mis-classifies as "neutral".
 # Lowercase phrases searched (as substrings) in the subject plus the opening of
 # the body by _has_rejection_or_ats_signal().
 _REJECTION_PHRASES = [
    # Explicit rejection — safe to check subject + body
    "not moving forward", "decided not to move forward",
    "not selected", "not be moving forward", "will not be moving forward",
    "unfortunately", "regret to inform", "regret to let you know",
    "decided to go with other", "decided to pursue other",
    "other candidates", "other applicants", "position has been filled",
    "filled the position", "no longer moving forward",
    "we have decided", "we've decided", "after careful consideration",
    "at this time we", "at this point we",
    "we will not", "we won't be", "we are not able",
    "wish you the best", "best of luck in your",
    "keep your resume on file",
 ]
 # ATS-confirm phrases — checked against SUBJECT ONLY.
 # Do NOT check these in the body: recruiters often quote ATS thread history,
 # so "thank you for applying" can appear in a genuine follow-up body.
 _ATS_CONFIRM_SUBJECTS = [
    "application received", "application confirmation",
    "thanks for applying", "thank you for applying",
    "thank you for your application",
    "we received your application",
    "application has been received",
    "has received your application",
    "successfully submitted",
    "your application for",
    "you applied to",
 ]
 # Phrases that immediately identify a non-recruitment email (retail, spam, etc.)
 # Searched in the same subject + body-opening text as _REJECTION_PHRASES.
 _SPAM_PHRASES = [
    # Retail / commerce offers
    "special offer", "private offer", "exclusive offer", "limited time offer",
    "limited-time offer", "sent you a special offer", "sent you an offer",
    "holiday offer", "seasonal offer", "membership offer",
    "round trip from $", "bonus points",
    "% off", "% discount", "save up to", "free shipping",
    "unsubscribe", "view in browser", "view this email in",
    "update your preferences", "email preferences",
    # LinkedIn apply confirmations & digests (not new inbound leads)
    "your application was sent to",
    "your application was viewed by",
    "application updates this week",
    "don't forget to complete your application",
    "view your application updates",
    "you have new application updates",
    # Indeed apply confirmations
    "indeed application:",
    # DocuSign / e-signature
    "requests you to sign",
    "has sent you a reminder",
    "please sign",
    # Security / MFA codes
    "security code for your application",
    "verification code",
 ]
 # Subject prefixes that identify non-job emails
 # (compared against the lower-cased, stripped subject with str.startswith).
 _SPAM_SUBJECT_PREFIXES = [
    "@",                    # "@user sent you a special offer" — Depop / social commerce
    "re: fw:",              # forwarded chains unlikely to be first-contact recruitment
    "accepted:",            # Google Calendar accepted invite
    "notification:",        # Google Calendar notification
    "[meeting reminder]",   # Google Calendar meeting reminder
    "updated invitation:",  # Google Calendar update
    "[updated]",            # Google Calendar update
    "reminder:",            # Generic reminder (AAA digital interview reminders, etc.)
    "📄",                   # Newsletter/article emoji prefix
    "invitation from",      # Google Calendar invite forwarded by name
 ]
 # Unicode-safe "don't forget" variants (Gmail renders typographic apostrophes)
 _DONT_FORGET_VARIANTS = [
    "don't forget to complete your application",          # straight apostrophe (U+0027)
    "don\u2019t forget to complete your application",    # right single quotation mark (U+2019)
    "don\u2018t forget to complete your application",    # left single quotation mark (U+2018)
 ]
 def _has_rejection_or_ats_signal(subject: str, body: str) -> bool:
    """Tell whether an email is a rejection, an ATS auto-confirmation, or non-recruitment spam.
    Checks run cheapest first: subject prefixes, subject-only ATS phrases,
    then rejection/spam phrases and the "don't forget" apostrophe variants in
    the subject plus the first 1500 characters of the body.
    """
    subj = subject.lower().strip()
    # str.startswith accepts a tuple of candidate prefixes (Depop "@user", calendar notices, …)
    if subj.startswith(tuple(_SPAM_SUBJECT_PREFIXES)):
        return True
    # ATS confirmations are recognised from the subject alone
    for confirm in _ATS_CONFIRM_SUBJECTS:
        if confirm in subj:
            return True
    opening = body[:1500]
    haystack = f"{subj} {opening.lower()}"
    for phrase in (*_REJECTION_PHRASES, *_SPAM_PHRASES):
        if phrase in haystack:
            return True
    # Last resort: typographic-apostrophe spellings of the LinkedIn nag line
    raw = f"{subject} {opening}".lower()
    return any(variant in raw for variant in _DONT_FORGET_VARIANTS)
 # Legal entity suffixes to strip when normalising company names
 # The pattern is anchored at the end of the string, so only one trailing suffix
 # is removed, together with an optional leading comma and trailing period.
 _LEGAL_SUFFIXES = re.compile(
    r",?\s*\b(Inc|LLC|Ltd|Limited|Corp|Corporation|Co|GmbH|AG|plc|PLC|SAS|SA|NV|BV|LP|LLP)\b\.?\s*$",
    re.IGNORECASE,
 )
 # Job-board SLDs that must never be used as company-match search terms.
 # A LinkedIn job URL has domain "linkedin.com" → SLD "linkedin", which would
 # incorrectly match every LinkedIn notification email against every LinkedIn job.
 # Consulted by _company_search_terms() before a domain-derived term is added.
 _JOB_BOARD_SLDS = {
    "linkedin", "indeed", "glassdoor", "ziprecruiter", "monster",
    "careerbuilder", "dice", "simplyhired", "wellfound", "angellist",
    "greenhouse", "lever", "workday", "taleo", "icims", "smartrecruiters",
    "bamboohr", "ashby", "rippling", "jobvite", "workable", "gusto",
    "paylocity", "paycom", "adp", "breezy", "recruitee", "jazz",
 }
 # ── Helpers ───────────────────────────────────────────────────────────────────
 def _decode_str(value: Optional[str]) -> str:
    """Decode an RFC2047-encoded header value to a plain Python string.
    Returns "" for a missing/empty value. Undecodable bytes are replaced, an
    unknown charset label falls back to UTF-8, and runs of whitespace
    (including header folding) are collapsed to single spaces so that later
    substring matching is not defeated by line wraps or doubled spaces.
    """
    if not value:
        return ""
    decoded: list[str] = []
    for chunk, charset in _raw_decode_header(value):
        if isinstance(chunk, bytes):
            try:
                decoded.append(chunk.decode(charset or "utf-8", errors="replace"))
            except LookupError:
                # Bogus/unknown charset label (e.g. "unknown-8bit"): degrade
                # gracefully instead of aborting the whole sync on one header.
                decoded.append(chunk.decode("utf-8", errors="replace"))
        else:
            decoded.append(str(chunk))
    # decode_header keeps the whitespace around unencoded parts, so a plain
    # " ".join would double it; split/join normalises every run to one space.
    return " ".join(" ".join(decoded).split())
 def _extract_domain(url_or_email: str) -> str:
    """
    Pull the bare domain from a URL (https://company.com/jobs/...) or
    an email address (recruiter@company.com, "Name <recruiter@company.com>").
    The result is lower-cased, without a leading "www.", port, credentials or
    path. Returns '' if none found (including for None/empty input).
    """
    candidate = (url_or_email or "").strip()
    if not candidate:
        return ""
    has_scheme = "://" in candidate
    # Only treat "@" as an email separator when this is not a URL; URLs may
    # legitimately contain "@" in their path or query (e.g. /@handle/…).
    if "@" in candidate and not has_scheme:
        return candidate.split("@")[-1].split(">")[0].strip().lower()
    try:
        # A leading "//" makes urlparse read scheme-less input as a host, not a path.
        parsed = urlparse(candidate if has_scheme else f"//{candidate}")
        host = parsed.hostname or ""
    except ValueError:
        return ""
    # strip www.
    return re.sub(r"^www\.", "", host.lower())
 def _normalise_company(company: str) -> str:
    """Return the company name without a trailing legal suffix or surrounding whitespace."""
    without_suffix = _LEGAL_SUFFIXES.sub("", company)
    return without_suffix.strip()
 def _company_search_terms(company: str, job_url: str = "") -> list[str]:
    """
    Return a list of strings that must appear (case-insensitively) in the
    email's from-address or subject for it to be considered a match.
    We are deliberately conservative:
      - Use the full normalised company name (not just the first word)
      - Also include the company domain derived from the job URL, but ONLY
        when the domain belongs to the actual company (not a job board).
        LinkedIn jobs link to linkedin.com — if we used "linkedin" as a term
        we'd match every LinkedIn notification email against every LinkedIn job.
    The domain term is the label directly left of the public suffix
    ("careers.acme.com" → "acme", "acme.co.uk" → "acme"), never a generic
    sub-domain such as "careers", "jobs" or "boards".
    """
    terms: list[str] = []
    clean = _normalise_company(company)
    if len(clean) >= 3:
        terms.append(clean.lower())
    domain = _extract_domain(job_url)
    if domain and len(domain) > 4:
        labels = [label for label in domain.split(".") if label]
        # Heuristic for two-part public suffixes (co.uk, com.au, …): step one
        # label further left so the registrable name is picked.
        two_part_suffix = (
            len(labels) >= 3
            and len(labels[-1]) == 2
            and labels[-2] in {"co", "com", "org", "net", "gov", "edu", "ac"}
        )
        if two_part_suffix:
            sld = labels[-3]
        elif len(labels) >= 2:
            sld = labels[-2]
        else:
            sld = labels[0] if labels else ""
        if len(sld) >= 3 and sld not in terms and sld not in _JOB_BOARD_SLDS:
            terms.append(sld)
    return terms
 def _has_recruitment_keyword(subject: str) -> bool:
    """Tell whether any RECRUITMENT_KEYWORDS entry occurs in the lower-cased subject."""
    lowered = subject.lower()
    for keyword in RECRUITMENT_KEYWORDS:
        if keyword in lowered:
            return True
    return False
 def _email_is_relevant(from_addr: str, subject: str, search_terms: list[str]) -> bool:
    """
    Two-gate relevance filter; an email is kept only when both gates pass:
      Gate 1 — a company search term occurs in the from-address or the subject
      Gate 2 — the subject contains a recruitment keyword
    Requiring both avoids importing unrelated emails that merely mention a
    company name in passing.
    """
    haystack = f"{from_addr} {subject}".lower()
    company_hit = False
    for term in search_terms:
        if term in haystack:
            company_hit = True
            break
    if not company_hit:
        return False
    return _has_recruitment_keyword(subject)
 def _get_existing_message_ids(job_id: int, db_path: Path) -> set[str]:
    """Collect the non-empty message ids of the contacts already stored for a job."""
    known: set[str] = set()
    for contact in get_contacts(db_path, job_id=job_id):
        if contact.get("message_id"):
            known.add(contact.get("message_id", ""))
    return known
 def classify_stage_signal(subject: str, body: str) -> Optional[str]:
    """Classify an inbound email into a pipeline stage signal.
    Sends the subject and the first 400 characters of the body to the shared
    classifier router (model override ``llama3.1:8b``, fallback order
    ``ollama_research``) and returns the first entry of ``_CLASSIFY_LABELS``
    found in the reply. Returns ``"neutral"`` when the reply names no known
    label, or ``None`` when the call fails; the failure is reported on stderr.
    """
    try:
        prompt = f"Subject: {subject}\n\nEmail: {body[:400]}"
        raw = _CLASSIFIER_ROUTER.complete(
            prompt,
            system=_CLASSIFY_SYSTEM,
            model_override="llama3.1:8b",
            fallback_order=["ollama_research"],
        )
        # Strip <think> blocks (in case a reasoning model slips through)
        text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
        text = text.lower().strip()
        # Substring search already covers the "reply starts with the label" case.
        return next((label for label in _CLASSIFY_LABELS if label in text), "neutral")
    except Exception as exc:
        # Best-effort by design (callers treat None as "skip"), but leave a
        # trace so a down or misconfigured backend is not silently invisible.
        print(f"[imap] classify_stage_signal failed: {exc}", file=sys.stderr)
        return None
 # System prompt for extract_lead_info(): the model must answer with a JSON object
 # holding "company" and "title", using nulls for anything that is not new outreach.
 _EXTRACT_SYSTEM = (
    "Extract the hiring company name and job title from this recruitment email, "
    "but ONLY if it represents genuine new recruiter outreach — i.e. a recruiter "
    "contacting you about an open role for the first time.\n\n"
    "Return {\"company\": null, \"title\": null} if the email is any of:\n"
    "  - A rejection or 'not moving forward' notice\n"
    "  - An ATS auto-confirmation ('we received your application')\n"
    "  - A status update for an application already in progress\n"
    "  - A generic job-alert digest or newsletter\n"
    "  - A follow-up you sent, not a reply from a recruiter\n\n"
    "Otherwise respond with ONLY valid JSON: "
    '{"company": "Company Name", "title": "Job Title"}.'
 )
 def extract_lead_info(subject: str, body: str,
                      from_addr: str) -> tuple[Optional[str], Optional[str]]:
    """Ask the LLM for (company, title) of an unmatched recruitment email.
    The prompt carries the sender, the subject and the first 600 characters of
    the body. The first ``{...}`` span in the reply is parsed as JSON; empty or
    null fields become None. Any failure yields (None, None).
    """
    import json as _json
    try:
        excerpt = body[:600]
        prompt = f"From: {from_addr}\nSubject: {subject}\n\nEmail excerpt:\n{excerpt}"
        raw = _CLASSIFIER_ROUTER.complete(
            prompt,
            system=_EXTRACT_SYSTEM,
            fallback_order=["ollama_research"],
        )
        # Drop any <think> block before looking for the JSON object
        cleaned = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
        found = re.search(r'\{.*\}', cleaned, re.DOTALL)
        if found is None:
            return None, None
        payload = _json.loads(found.group())
        return payload.get("company") or None, payload.get("title") or None
    except Exception:
        return None, None
 # Keywords that indicate an email in a curated label needs attention.
 # Intentionally separate from RECRUITMENT_KEYWORDS — these are action-oriented.
 # _has_todo_keyword() tests each entry as a substring of the lower-cased subject.
 _TODO_LABEL_KEYWORDS = {
    "action needed", "action required",
    "please complete", "please submit", "please respond", "please reply",
    "response needed", "response required",
    "next steps", "next step",
    "follow up", "follow-up",
    "deadline", "by end of",
    "your offer", "offer letter",
    "background check", "reference check",
    "onboarding", "start date",
    "congrats", "congratulations",
    "we'd like to", "we would like to",
    "interview", "schedule", "scheduling",
 }
 def _has_todo_keyword(subject: str) -> bool:
    """Tell whether any _TODO_LABEL_KEYWORDS entry occurs in the lower-cased subject."""
    lowered = subject.lower()
    for keyword in _TODO_LABEL_KEYWORDS:
        if keyword in lowered:
            return True
    return False
 # Sender address used to recognise LinkedIn Job Alert digests; compared as a
 # substring of the lower-cased from-address.
 _LINKEDIN_ALERT_SENDER = "jobalerts-noreply@linkedin.com"
 # Social-proof / nav lines to skip when parsing alert blocks
 # (a line is dropped when it contains any of these, case-insensitively).
 _ALERT_SKIP_PHRASES = {
    "school alumni", "apply with", "actively hiring", "manage alerts",
    "view all jobs", "your job alert", "new jobs match",
    "unsubscribe", "linkedin corporation",
 }
 def parse_linkedin_alert(body: str) -> list[dict]:
    """
    Extract job cards from the plain-text body of a LinkedIn Job Alert digest.
    The body is split on separator lines of 10+ dashes. A card is kept when its
    first "View job:" line carries a /jobs/view/<id> URL and at least two
    content lines remain after noise is removed. Each result is a dict with
    keys title, company, location and url, where url is canonicalized to
    https://www.linkedin.com/jobs/view/<id>/ (tracking parameters stripped).
    """
    view_job_re = re.compile(r"View job:\s*(https?://\S+)", re.IGNORECASE)
    results: list[dict] = []
    for chunk in re.split(r"\n\s*-{10,}\s*\n", body):
        rows = [row for row in (raw.strip() for raw in chunk.strip().splitlines()) if row]
        # Only the first "View job:" line counts, even if it lacks a job id
        first_link = next((hit for hit in map(view_job_re.search, rows) if hit), None)
        if first_link is None:
            continue
        id_match = re.search(r"/jobs/view/(\d+)", first_link.group(1))
        if id_match is None:
            continue
        canonical = f"https://www.linkedin.com/jobs/view/{id_match.group(1)}/"
        # Remove link lines and social-proof / navigation noise
        kept: list[str] = []
        for row in rows:
            lowered = row.lower()
            if lowered.startswith("view job:") or row.startswith("http"):
                continue
            if any(noise in lowered for noise in _ALERT_SKIP_PHRASES):
                continue
            kept.append(row)
        if len(kept) < 2:
            continue
        results.append({
            "title": kept[0],
            "company": kept[1],
            "location": kept[2] if len(kept) > 2 else "",
            "url": canonical,
        })
    return results
 def _scan_todo_label(conn: imaplib.IMAP4, cfg: dict, db_path: Path,
                     active_jobs: list[dict],
                     known_message_ids: set) -> int:
    """Scan the configured Gmail label for action emails, matching them to pipeline jobs.
    Two gates per email:
      1. A company search term appears in the from-address, the subject, or
         the first 300 characters of the body
      2. Subject contains a TODO-label action keyword
    Each matched job receives an inbound contact row. The stage signal is
    classified once per email and stored on every one of those rows unless it
    is "neutral" or classification failed. ``known_message_ids`` is updated in
    place with each imported message id.
    Returns the number of emails imported (an email attached to several jobs
    counts once); 0 when no label is configured or the label holds no mail.
    """
    # `or ""` tolerates a key that is present but empty (None) in the config.
    label = (cfg.get("todo_label") or "").strip()
    if not label:
        return 0
    lookback = int(cfg.get("lookback_days", 90))
    since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y")
    # Search the label folder for any emails (no keyword pre-filter — it's curated)
    uids = _search_folder(conn, label, "ALL", since)
    if not uids:
        return 0
    # Build a lookup: search_term → [job, ...] for all active jobs
    term_to_jobs: dict[str, list[dict]] = {}
    for job in active_jobs:
        for term in _company_search_terms(job.get("company") or "", job.get("url") or ""):
            term_to_jobs.setdefault(term, []).append(job)
    added = 0
    for uid in uids:
        parsed = _parse_message(conn, uid)
        if not parsed:
            continue
        mid = parsed["message_id"]
        if mid in known_message_ids:
            continue
        # Gate 1: company name match — from_addr + subject + first 300 chars of body
        # Body fallback catches ATS emails (e.g. noreply@greenhouse.io) where the
        # company name only appears in the email body, not the sender or subject.
        combined = (
            parsed["from_addr"] + " " +
            parsed["subject"] + " " +
            parsed["body"][:300]
        ).lower()
        # Keyed by job id so a job reached through several terms is kept once,
        # in first-match order.
        matched: dict[int, dict] = {}
        for term, jobs in term_to_jobs.items():
            if term in combined:
                for job in jobs:
                    matched.setdefault(job["id"], job)
        if not matched:
            continue
        # Gate 2: action keyword in subject
        if not _has_todo_keyword(parsed["subject"]):
            continue
        # The signal depends only on the email, so ask the LLM once rather
        # than once per matched job.
        signal = classify_stage_signal(parsed["subject"], parsed["body"])
        for job in matched.values():
            contact_id = add_contact(
                db_path, job_id=job["id"], direction="inbound",
                subject=parsed["subject"],
                from_addr=parsed["from_addr"],
                to_addr=parsed["to_addr"],
                body=parsed["body"],
                received_at=parsed["date"][:16] if parsed["date"] else since,
                message_id=mid,
            )
            if signal and signal != "neutral":
                _update_contact_signal(db_path, contact_id, signal)
        known_message_ids.add(mid)
        added += 1
        first_company = next(iter(matched.values())).get("company")
        print(f"[imap] TODO label → {first_company} — {parsed['subject'][:60]}")
    return added
 def _scan_unmatched_leads(conn: imaplib.IMAP4, cfg: dict,
                           db_path: Path,
                           known_message_ids: set) -> int:
    """Scan INBOX for recruitment emails not matched to any pipeline job.
    LinkedIn Job Alert digests are split into one pending job per card and a
    ``scrape_url`` task is submitted for each inserted card.  Every other
    message must pass the subject keyword gate, the phrase-based
    rejection/ATS filter and the LLM stage classifier; the LLM then extracts
    company/title and the email is inserted as a pending job together with an
    inbound contact row.
    Args:
        conn: Logged-in IMAP connection.
        cfg: Email config; ``lookback_days`` (default 90) bounds the search.
        db_path: SQLite database that jobs and contacts are written to.
        known_message_ids: Message-IDs that were already processed.  Mutated in
            place: messages handled here are added to it.
    Returns:
        The count of new leads inserted.
    """
    import hashlib
    from scripts.db import get_existing_urls, insert_job, add_contact as _add_contact
    lookback = int(cfg.get("lookback_days", 90))
    since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y")
    broad_terms = ["interview", "opportunity", "offer letter", "job offer", "application", "recruiting"]
    all_uids: set = set()
    for term in broad_terms:
        uids = _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since)
        all_uids.update(uids)
    existing_urls = get_existing_urls(db_path)
    new_leads = 0
    for uid in all_uids:
        parsed = _parse_message(conn, uid)
        if not parsed:
            continue
        mid = parsed["message_id"]
        if mid in known_message_ids:
            continue
        # ── LinkedIn Job Alert digest — parse each card individually ──────
        if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
            cards = parse_linkedin_alert(parsed["body"])
            for card in cards:
                if card["url"] in existing_urls:
                    continue
                job_id = insert_job(db_path, {
                    "title": card["title"],
                    "company": card["company"],
                    "url": card["url"],
                    "source": "linkedin",
                    "location": card["location"],
                    "is_remote": 0,
                    "salary": "",
                    "description": "",
                    "date_found": datetime.now().isoformat()[:10],
                })
                if job_id:
                    # Imported lazily, only when a card was actually inserted.
                    from scripts.task_runner import submit_task
                    submit_task(db_path, "scrape_url", job_id)
                    existing_urls.add(card["url"])
                    new_leads += 1
                    print(f"[imap] LinkedIn alert → {card['company']} — {card['title']}")
            known_message_ids.add(mid)
            continue  # skip normal LLM extraction path
        if not _has_recruitment_keyword(parsed["subject"]):
            continue
        # Fast phrase-based rejection / ATS-confirm filter (catches what phi3 misses)
        if _has_rejection_or_ats_signal(parsed["subject"], parsed["body"]):
            continue
        # LLM classification as secondary gate — skip on rejection or classifier failure
        signal = classify_stage_signal(parsed["subject"], parsed["body"])
        if signal is None or signal == "rejected":
            continue
        company, title = extract_lead_info(
            parsed["subject"], parsed["body"], parsed["from_addr"]
        )
        if not company:
            continue
        from_domain = _extract_domain(parsed["from_addr"]) or "unknown"
        # Built-in hash() of a str is salted per interpreter process, so it
        # would yield a different URL for the same email on every run and the
        # existing_urls check below could never match.  Use a stable digest.
        mid_hash = hashlib.sha256(mid.encode("utf-8", errors="replace")).hexdigest()[:10]
        synthetic_url = f"email://{from_domain}/{mid_hash}"
        if synthetic_url in existing_urls:
            continue
        job_id = insert_job(db_path, {
            "title": title or "(untitled)",
            "company": company,
            "url": synthetic_url,
            "source": "email",
            "location": "",
            "is_remote": 0,
            "salary": "",
            "description": parsed["body"][:2000],
            "date_found": datetime.now().isoformat()[:10],
        })
        if job_id:
            _add_contact(db_path, job_id=job_id, direction="inbound",
                         subject=parsed["subject"],
                         from_addr=parsed["from_addr"],
                         body=parsed["body"],
                         received_at=parsed["date"][:16] if parsed["date"] else "",
                         message_id=mid)
            known_message_ids.add(mid)
            existing_urls.add(synthetic_url)
            new_leads += 1
    return new_leads
 # ── IMAP connection ───────────────────────────────────────────────────────────
 def load_config() -> dict:
    """Read the email settings stored at CONFIG_PATH.
    Returns the parsed YAML mapping, or an empty dict when the file parses to
    nothing.  Raises FileNotFoundError when the file does not exist.
    """
    if CONFIG_PATH.exists():
        raw_yaml = CONFIG_PATH.read_text()
        settings = yaml.safe_load(raw_yaml)
        return settings if settings else {}
    missing_msg = (
        f"Email config not found: {CONFIG_PATH}\n"
        f"Copy config/email.yaml.example → config/email.yaml and fill it in."
    )
    raise FileNotFoundError(missing_msg)
 def connect(cfg: dict) -> imaplib.IMAP4:
    """Open an IMAP session described by *cfg* and log in.
    Reads ``host`` (default imap.gmail.com), ``port`` (default 993) and
    ``use_ssl`` (default True); ``username`` and ``password`` are required.
    """
    if cfg.get("use_ssl", True):
        client_cls = imaplib.IMAP4_SSL
    else:
        client_cls = imaplib.IMAP4
    server_host = cfg.get("host", "imap.gmail.com")
    server_port = int(cfg.get("port", 993))
    session = client_cls(server_host, server_port)
    session.login(cfg["username"], cfg["password"])
    return session
 def _detect_sent_folder(conn: imaplib.IMAP4) -> str:
    """Try to auto-detect the Sent folder name.
    Each LIST response line is parsed into (flags, mailbox name).  A mailbox
    carrying the ``\\Sent`` special-use flag wins; otherwise the first
    well-known candidate whose name matches a mailbox *exactly*
    (case-insensitively) is used.  The previous substring test over the whole
    listing returned "Sent" for a server that only has "Sent Items".
    Returns the mailbox name as reported by the server (unquoted), or "Sent"
    when nothing matches or the listing fails.
    """
    import re
    candidates = ["[Gmail]/Sent Mail", "Sent", "Sent Items", "Sent Messages", "INBOX.Sent"]
    # LIST line layout: (<flags>) <hierarchy delimiter | NIL> <mailbox name>
    list_line = re.compile(
        r'^\((?P<flags>[^)]*)\)\s+(?:"(?:[^"\\]|\\.)*"|NIL)\s+(?P<name>.+)$'
    )
    try:
        _, folder_list = conn.list()
        flagged = None
        names_by_lower: dict = {}
        for raw in folder_list or []:
            if not isinstance(raw, bytes):
                continue  # literal continuation tuples are not plain LIST lines
            parsed = list_line.match(raw.decode("utf-8", errors="replace").strip())
            if not parsed:
                continue
            name = parsed.group("name").strip()
            if len(name) >= 2 and name[0] == '"' and name[-1] == '"':
                # Strip the quotes and undo backslash escaping.
                name = re.sub(r"\\(.)", r"\1", name[1:-1])
            names_by_lower.setdefault(name.lower(), name)
            if flagged is None and "\\sent" in parsed.group("flags").lower().split():
                flagged = name
        if flagged:
            return flagged
        for candidate in candidates:
            found = names_by_lower.get(candidate.lower())
            if found:
                return found
    except Exception:
        # Best-effort detection: fall through to the conventional default.
        pass
    return "Sent"
 def _quote_folder(name: str) -> str:
    """Quote an IMAP folder name when it cannot be sent as a bare atom.
    A name is quoted if it is empty or contains a space, a control character
    or one of ``( ) { % * " \\``.  Internal backslashes and double-quotes are
    escaped per RFC 3501.  Names made only of atom-safe characters are
    returned unchanged.
    e.g. 'TO DO JOBS' → '"TO DO JOBS"', 'My "Jobs"' → '"My \\"Jobs\\""'
    """
    needs_quoting = name == "" or any(
        ch in ' (){%*"\\' or ord(ch) < 0x20 or ord(ch) == 0x7F
        for ch in name
    )
    if needs_quoting:
        # Escape backslashes first so the quotes' own escapes are not doubled.
        escaped = name.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'
    return name
 def _search_folder(conn: imaplib.IMAP4, folder: str, criteria: str,
                    since: str) -> list[bytes]:
    """Open *folder* read-only and run a SINCE-bounded SEARCH.
    Returns the matching message identifiers exactly as ``conn.search``
    reports them, or an empty list on any error.
    """
    try:
        conn.select(_quote_folder(folder), readonly=True)
        query = f'(SINCE "{since}" {criteria})'
        _, payload = conn.search(None, query)
        if not payload or not payload[0]:
            return []
        return payload[0].split()
    except Exception:
        return []
 def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]:
    """Fetch and parse one message.  Returns None on failure.
    The body is the first ``text/plain`` part (or the whole payload for a
    non-multipart message), decoded with the charset the part declares and
    truncated to 4000 characters.  Messages without a Message-ID are dropped
    because they cannot be deduplicated.
    Returns a dict with ``message_id``, ``subject``, ``from_addr``,
    ``to_addr``, ``date`` and ``body``.
    """
    def _payload_text(part) -> str:
        # Decode with the declared charset; the old code always assumed UTF-8
        # and garbled e.g. ISO-8859-1 or Windows-1252 mail.
        try:
            payload = part.get_payload(decode=True)
            if payload is None:
                return ""
            charset = part.get_content_charset() or "utf-8"
            try:
                return payload.decode(charset, errors="replace")
            except LookupError:
                # Unknown/misspelled charset label: fall back to UTF-8.
                return payload.decode("utf-8", errors="replace")
        except Exception:
            return ""
    try:
        _, data = conn.fetch(uid, "(RFC822)")
        if not data or not data[0]:
            return None
        msg = email.message_from_bytes(data[0][1])
        body = ""
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    body = _payload_text(part)
                    break
        else:
            body = _payload_text(msg)
        mid = msg.get("Message-ID", "").strip()
        if not mid:
            return None  # No Message-ID → can't dedup; skip to avoid repeat inserts
        return {
            "message_id": mid,
            "subject":    _decode_str(msg.get("Subject")),
            "from_addr":  _decode_str(msg.get("From")),
            "to_addr":    _decode_str(msg.get("To")),
            "date":       _decode_str(msg.get("Date")),
            "body":       body[:4000],
        }
    except Exception:
        return None
 # ── Per-job sync ──────────────────────────────────────────────────────────────
 def _update_contact_signal(db_path: Path, contact_id: int, signal: str) -> None:
    """Write a stage signal onto an existing contact row.
    Sets ``job_contacts.stage_signal`` for the row whose ``id`` equals
    *contact_id*.  The connection is closed even when the UPDATE or the commit
    raises, so a failing write no longer leaks a handle.
    """
    import sqlite3 as _sqlite3
    conn = _sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE job_contacts SET stage_signal = ? WHERE id = ?",
            (signal, contact_id),
        )
        conn.commit()
    finally:
        conn.close()
 def sync_job_emails(job: dict, conn: imaplib.IMAP4, cfg: dict,
                     db_path: Path, dry_run: bool = False) -> tuple[int, int]:
    """
    Sync recruitment emails for one job.
    For every search term derived from the job's company/URL, INBOX is
    searched for inbound mail and the Sent folder for outbound mail.  Messages
    that are new (by Message-ID) and relevant are stored as contacts; inbound
    ones are also classified and any non-neutral stage signal is recorded.
    With ``dry_run=True`` nothing is written, but matches are still counted.
    Returns (inbound_added, outbound_added).
    """
    company = (job.get("company") or "").strip()
    if not company:
        return 0, 0
    search_terms = _company_search_terms(company, job.get("url", ""))
    if not search_terms:
        return 0, 0
    lookback = int(cfg.get("lookback_days", 90))
    since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y")
    existing_ids = _get_existing_message_ids(job["id"], db_path)
    # Loop-invariant: resolve the Sent folder once instead of issuing an IMAP
    # LIST for every search term.
    sent_folder = cfg.get("sent_folder") or _detect_sent_folder(conn)
    inbound = outbound = 0
    for term in search_terms:
        # ── INBOX — inbound ───────────────────────────────────────────────
        uids = _search_folder(
            conn, "INBOX",
            f'(OR FROM "{term}" SUBJECT "{term}")',
            since,
        )
        for uid in uids:
            parsed = _parse_message(conn, uid)
            if not parsed:
                continue
            if parsed["message_id"] in existing_ids:
                continue
            if not _email_is_relevant(parsed["from_addr"], parsed["subject"], search_terms):
                continue
            if not dry_run:
                contact_id = add_contact(
                    db_path, job_id=job["id"], direction="inbound",
                    subject=parsed["subject"], from_addr=parsed["from_addr"],
                    to_addr=parsed["to_addr"], body=parsed["body"],
                    received_at=parsed["date"][:16] if parsed["date"] else since,
                    message_id=parsed["message_id"],
                )
                signal = classify_stage_signal(parsed["subject"], parsed["body"])
                if signal and signal != "neutral":
                    _update_contact_signal(db_path, contact_id, signal)
            # Remember the id so a later term does not count the message again.
            existing_ids.add(parsed["message_id"])
            inbound += 1
        # ── Sent — outbound ───────────────────────────────────────────────
        uids = _search_folder(
            conn, sent_folder,
            f'(OR TO "{term}" SUBJECT "{term}")',
            since,
        )
        for uid in uids:
            parsed = _parse_message(conn, uid)
            if not parsed:
                continue
            if parsed["message_id"] in existing_ids:
                continue
            if not _email_is_relevant(parsed["to_addr"], parsed["subject"], search_terms):
                continue
            if not dry_run:
                add_contact(
                    db_path, job_id=job["id"], direction="outbound",
                    subject=parsed["subject"], from_addr=parsed["from_addr"],
                    to_addr=parsed["to_addr"], body=parsed["body"],
                    received_at=parsed["date"][:16] if parsed["date"] else since,
                    message_id=parsed["message_id"],
                )
            existing_ids.add(parsed["message_id"])
            outbound += 1
    return inbound, outbound
 # ── Main entry ────────────────────────────────────────────────────────────────
 def sync_all(db_path: Path = DEFAULT_DB,
              dry_run: bool = False,
              job_ids: Optional[list[int]] = None,
              on_stage=None) -> dict:
    """
    Sync emails for all active pipeline jobs (or a specific subset).
    Args:
        db_path: SQLite database to read jobs from and write contacts to.
        dry_run: Passed through to the per-job sync; matches are counted but
            not stored.
        job_ids: When given, only active jobs with these ids are synced.
        on_stage: Optional callback invoked with a short progress label.
    Returns a summary dict:
        {"synced": N, "inbound": N, "outbound": N, "new_leads": N,
         "todo_attached": N, "errors": [...]}
    When there are no matching active jobs the function returns a zeroed
    summary without connecting to the mail server.
    """
    def _stage(msg: str) -> None:
        if on_stage:
            on_stage(msg)
    cfg = load_config()
    init_db(db_path)
    jobs_by_stage = get_interview_jobs(db_path)
    active_stages = ["applied", "phone_screen", "interviewing", "offer", "hired"]
    all_active = [j for stage in active_stages for j in jobs_by_stage.get(stage, [])]
    if job_ids:
        all_active = [j for j in all_active if j["id"] in job_ids]
    if not all_active:
        return {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "todo_attached": 0, "errors": []}
    _stage("connecting")
    print(f"[imap] Connecting to {cfg.get('host', 'imap.gmail.com')} …")
    conn = connect(cfg)
    # Same keys as the early-return summary above, so callers see one shape.
    summary = {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "todo_attached": 0, "errors": []}
    try:
        for i, job in enumerate(all_active, 1):
            _stage(f"job {i}/{len(all_active)}")
            try:
                inb, out = sync_job_emails(job, conn, cfg, db_path, dry_run=dry_run)
                if inb + out > 0:
                    summary["synced"] += 1
                summary["inbound"]  += inb
                summary["outbound"] += out
                label = "DRY-RUN " if dry_run else ""
                # A missing company is None; formatting None with ":30s" raises
                # TypeError, which used to be recorded as a sync error.
                company_label = str(job.get("company") or "")
                print(f"[imap] {label}{company_label:30s}  +{inb} in  +{out} out")
            except Exception as e:
                msg = f"{job.get('company')}: {e}"
                summary["errors"].append(msg)
                print(f"[imap] ERROR — {msg}")
        _stage("scanning todo label")
        from scripts.db import get_all_message_ids
        known_mids = get_all_message_ids(db_path)
        summary["todo_attached"] = _scan_todo_label(conn, cfg, db_path, all_active, known_mids)
        _stage("scanning leads")
        summary["new_leads"] = _scan_unmatched_leads(conn, cfg, db_path, known_mids)
    finally:
        # Best-effort logout: a failure here must not mask the real outcome.
        try:
            conn.logout()
        except Exception:
            pass
    return summary
 if __name__ == "__main__":
    # Command-line entry point: parse the flags, run one sync, print a summary.
    import argparse
    cli = argparse.ArgumentParser(description="Sync IMAP emails to job contacts")
    cli.add_argument("--job-id", type=int, nargs="+", help="Sync only these job IDs")
    cli.add_argument("--dry-run", action="store_true", help="Show matches without saving")
    opts = cli.parse_args()
    result = sync_all(dry_run=opts.dry_run, job_ids=opts.job_id)
    # The error count is appended only when at least one job failed.
    error_suffix = f", {len(result['errors'])} errors" if result["errors"] else ""
    print(f"\n[imap] Done — {result['synced']} jobs updated, "
          f"{result['inbound']} inbound, {result['outbound']} outbound"
          + error_suffix)
--- a/scripts/llm_router.py
+++ b/scripts/llm_router.py
@ -0,0 +1,170 @@
 """
 LLM abstraction layer with priority fallback chain.
 Reads config/llm.yaml. Tries backends in order; falls back on any error.
 """
 import os
 import yaml
 import requests
 from pathlib import Path
 from openai import OpenAI
 # Default router config: config/llm.yaml under the directory two levels above this file.
 CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml"
 class LLMRouter:
    """Try the configured LLM backends in priority order until one answers.
    The YAML config must provide ``backends`` (backend name → settings) and
    ``fallback_order`` (list of backend names).  Handled backend ``type``
    values are ``vision_service``, ``openai_compat`` and ``anthropic``.
    """
    def __init__(self, config_path: Path = CONFIG_PATH):
        """Load the router configuration from *config_path*."""
        with open(config_path) as f:
            self.config = yaml.safe_load(f)
    def _is_reachable(self, base_url: str) -> bool:
        """Quick health-check ping. Returns True if backend is up."""
        # Strip a trailing /v1 so the probe goes to <base>/health.
        health_url = base_url.rstrip("/").removesuffix("/v1") + "/health"
        try:
            resp = requests.get(health_url, timeout=2)
            return resp.status_code < 500
        except Exception:
            return False
    def _resolve_model(self, client: OpenAI, model: str) -> str:
        """Resolve __auto__ to the first model served by vLLM."""
        if model != "__auto__":
            return model
        models = client.models.list()
        return models.data[0].id
    @staticmethod
    def _image_media_type(img: str) -> str:
        """Guess the MIME type of a base64-encoded image (PNG unless JPEG)."""
        # The JPEG magic bytes FF D8 FF always encode to the prefix "/9j/".
        if img.startswith("/9j/"):
            return "image/jpeg"
        return "image/png"
    def _call_vision_service(self, name: str, backend: dict, prompt: str,
                             system: str | None, model_override: str | None,
                             images: list[str] | None) -> str:
        """POST the prompt and the first image to the service's /analyze endpoint."""
        resp = requests.post(
            backend["base_url"].rstrip("/") + "/analyze",
            json={
                "prompt": prompt,
                "image_base64": images[0] if images else "",
            },
            timeout=60,
        )
        resp.raise_for_status()
        print(f"[LLMRouter] Used backend: {name} (vision_service)")
        return resp.json()["text"]
    def _call_openai_compat(self, name: str, backend: dict, prompt: str,
                            system: str | None, model_override: str | None,
                            images: list[str] | None) -> str:
        """Run a chat completion against an OpenAI-compatible endpoint."""
        client = OpenAI(
            base_url=backend["base_url"],
            api_key=backend.get("api_key") or "any",
        )
        model = self._resolve_model(client, model_override or backend["model"])
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        if images and backend.get("supports_images", False):
            content = [{"type": "text", "text": prompt}]
            for img in images:
                content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:{self._image_media_type(img)};base64,{img}"},
                })
            messages.append({"role": "user", "content": content})
        else:
            messages.append({"role": "user", "content": prompt})
        resp = client.chat.completions.create(
            model=model, messages=messages
        )
        print(f"[LLMRouter] Used backend: {name} ({model})")
        return resp.choices[0].message.content
    def _call_anthropic(self, name: str, backend: dict, prompt: str,
                        system: str | None, model_override: str | None,
                        images: list[str] | None) -> str:
        """Run a completion through the Anthropic SDK (model_override is not applied)."""
        import anthropic as _anthropic
        client = _anthropic.Anthropic(api_key=os.environ.get(backend["api_key_env"], ""))
        if images and backend.get("supports_images", False):
            content = []
            for img in images:
                content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": self._image_media_type(img),
                        "data": img,
                    },
                })
            content.append({"type": "text", "text": prompt})
        else:
            content = prompt
        kwargs: dict = {
            "model": backend["model"],
            "max_tokens": 4096,
            "messages": [{"role": "user", "content": content}],
        }
        if system:
            kwargs["system"] = system
        msg = client.messages.create(**kwargs)
        print(f"[LLMRouter] Used backend: {name}")
        return msg.content[0].text
    def complete(self, prompt: str, system: str | None = None,
                 model_override: str | None = None,
                 fallback_order: list[str] | None = None,
                 images: list[str] | None = None) -> str:
        """
        Generate a completion. Tries each backend in fallback_order.
        model_override: when set, replaces the configured model for
        openai_compat backends (e.g. pass a research-specific ollama model).
        fallback_order: when set, overrides config fallback_order for this
        call (e.g. pass config["research_fallback_order"] for research tasks).
        images: optional list of base64-encoded PNG/JPG strings. When provided,
        backends without supports_images=true are skipped. vision_service backends
        are only tried when images is provided.
        Backends that are missing from the config, disabled, unreachable,
        lacking their API key or of an unknown type are skipped with a log line.
        Raises RuntimeError if all backends are exhausted.
        """
        order = fallback_order if fallback_order is not None else self.config["fallback_order"]
        backends = self.config["backends"]
        for name in order:
            backend = backends.get(name)
            if backend is None:
                # A typo in the order list should not abort the whole chain.
                print(f"[LLMRouter] {name}: not defined in config, skipping")
                continue
            if not backend.get("enabled", True):
                print(f"[LLMRouter] {name}: disabled, skipping")
                continue
            supports_images = backend.get("supports_images", False)
            backend_type = backend.get("type")
            is_vision_service = backend_type == "vision_service"
            # vision_service only used when images provided
            if is_vision_service and not images:
                print(f"[LLMRouter] {name}: vision_service skipped (no images)")
                continue
            # non-vision backends skipped when images provided and they don't support it
            if images and not supports_images and not is_vision_service:
                print(f"[LLMRouter] {name}: no image support, skipping")
                continue
            if backend_type in ("vision_service", "openai_compat"):
                if not self._is_reachable(backend["base_url"]):
                    print(f"[LLMRouter] {name}: unreachable, skipping")
                    continue
                call = self._call_vision_service if is_vision_service else self._call_openai_compat
            elif backend_type == "anthropic":
                if not os.environ.get(backend["api_key_env"], ""):
                    print(f"[LLMRouter] {name}: {backend['api_key_env']} not set, skipping")
                    continue
                call = self._call_anthropic
            else:
                print(f"[LLMRouter] {name}: unknown backend type {backend_type!r}, skipping")
                continue
            try:
                return call(name, backend, prompt, system, model_override, images)
            except Exception as e:
                # Any backend failure falls through to the next one in the order.
                print(f"[LLMRouter] {name}: error — {e}, trying next")
                continue
        raise RuntimeError("All LLM backends exhausted")
 # Module-level singleton for convenience
 _router: LLMRouter | None = None
 def complete(prompt: str, system: str | None = None,
              model_override: str | None = None,
              fallback_order: list[str] | None = None,
              images: list[str] | None = None) -> str:
    """Generate a completion through a lazily created shared LLMRouter.
    The optional ``model_override``, ``fallback_order`` and ``images``
    arguments are forwarded to ``LLMRouter.complete`` only when they are set,
    so a plain ``complete(prompt, system)`` call behaves exactly as before.
    """
    global _router
    if _router is None:
        _router = LLMRouter()
    optional = {
        "model_override": model_override,
        "fallback_order": fallback_order,
        "images": images,
    }
    extras = {key: value for key, value in optional.items() if value is not None}
    return _router.complete(prompt, system, **extras)
--- a/scripts/manage-ui.sh
+++ b/scripts/manage-ui.sh
@ -0,0 +1,106 @@
 #!/usr/bin/env bash
 # scripts/manage-ui.sh — manage the Streamlit job-seeker web UI
 # Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs]
 # Abort on command errors, unset variables and failed pipeline stages.
 set -euo pipefail
 # Repository root: the parent of the directory containing this script.
 REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # NOTE(review): absolute, machine-specific path to the streamlit executable.
 STREAMLIT_BIN="/devl/miniconda3/envs/job-seeker/bin/streamlit"
 # Script passed to `streamlit run`.
 APP_ENTRY="$REPO_DIR/app/app.py"
 # PID and log files are kept in the repository root.
 PID_FILE="$REPO_DIR/.streamlit.pid"
 LOG_FILE="$REPO_DIR/.streamlit.log"
 # Listening port; override with the STREAMLIT_PORT environment variable.
 PORT="${STREAMLIT_PORT:-8501}"
 start() {
    # Launch Streamlit in the background, record its PID in $PID_FILE and
    # verify after two seconds that the process is still alive.
    # Exits 1 (after showing the log tail) when the process died.
    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
        return 0
    fi
    echo "Starting Streamlit on http://localhost:$PORT …"
    "$STREAMLIT_BIN" run "$APP_ENTRY" \
        --server.port "$PORT" \
        --server.headless true \
        --server.fileWatcherType none \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 2
    if is_running; then
        echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
    else
        echo "Failed to start. Check logs: $LOG_FILE"
        # Do not leave a PID file that points at a dead process.
        rm -f "$PID_FILE"
        tail -20 "$LOG_FILE"
        exit 1
    fi
 }
 stop() {
    # Stop the process recorded in $PID_FILE: SIGTERM first, SIGKILL if it is
    # still alive one second later. The PID file is removed in every case.
    if is_running; then
        PID=$(cat "$PID_FILE")
        echo "Stopping PID $PID …"
        kill "$PID" 2>/dev/null || true
        sleep 1
        if kill -0 "$PID" 2>/dev/null; then
            kill -9 "$PID" 2>/dev/null || true
        fi
        rm -f "$PID_FILE"
        echo "Stopped."
    else
        echo "Not running."
        rm -f "$PID_FILE"
    fi
 }
 # Stop the UI, pause one second, then start it again.
 restart() { stop; sleep 1; start; }
 status() {
    # Print whether the UI is running and, if so, its PID and URL.
    if ! is_running; then
        echo "Not running."
        return 0
    fi
    echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT"
 }
 logs() {
    # Show the last 50 lines of the log, or a notice when there is no log yet.
    if [[ ! -f "$LOG_FILE" ]]; then
        echo "No log file found at $LOG_FILE"
        return 0
    fi
    tail -50 "$LOG_FILE"
 }
 is_running() {
    # Succeed (0) when $PID_FILE exists and names a live process, else fail (1).
    # Side effect: sets the global PID to the file's content.
    if [[ ! -f "$PID_FILE" ]]; then
        return 1
    fi
    PID=$(cat "$PID_FILE")
    kill -0 "$PID" 2>/dev/null && return 0
    return 1
 }
 # Sub-command comes from the first argument; anything else prints the usage text.
 CMD="${1:-help}"
 case "$CMD" in
    start)   start ;;
    stop)    stop ;;
    restart) restart ;;
    status)  status ;;
    logs)    logs ;;
    *)
        # Unknown or missing sub-command: show help.
        echo "Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs]"
        echo ""
        echo "  start    Start the Streamlit UI (default port: $PORT)"
        echo "  stop     Stop the running UI"
        echo "  restart  Stop then start"
        echo "  status   Show whether it's running"
        echo "  logs     Tail the last 50 lines of the log"
        echo ""
        echo "  STREAMLIT_PORT=8502 bash scripts/manage-ui.sh start  (custom port)"
        ;;
 esac
--- a/scripts/manage-vision.sh
+++ b/scripts/manage-vision.sh
@ -0,0 +1,113 @@
 #!/usr/bin/env bash
 # scripts/manage-vision.sh — manage the moondream2 vision service
 # Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs
 #
 # First-time setup:
 #   conda env create -f scripts/vision_service/environment.yml
 #
 # On first start, moondream2 is downloaded from HuggingFace (~1.8GB).
 # Model stays resident in memory between requests.
 # Abort on command errors, unset variables and failed pipeline stages.
 set -euo pipefail
 # Name of the conda environment that provides uvicorn for the service.
 CONDA_ENV="job-seeker-vision"
 # NOTE(review): absolute, machine-specific path into that conda environment.
 UVICORN_BIN="/devl/miniconda3/envs/${CONDA_ENV}/bin/uvicorn"
 # PID and log files live under /tmp.
 PID_FILE="/tmp/vision-service.pid"
 LOG_FILE="/tmp/vision-service.log"
 # Port the service listens on.
 PORT=8002
 # Directory of this script and its parent (used as working dir and PYTHONPATH).
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(dirname "$SCRIPT_DIR")"
 is_running() {
    # Succeed (0) when $PID_FILE exists and names a live process, else fail (1).
    # Side effect: sets the global PID to the file's content.
    if [[ ! -f "$PID_FILE" ]]; then
        return 1
    fi
    PID=$(cat "$PID_FILE")
    kill -0 "$PID" 2>/dev/null && return 0
    return 1
 }
 start() {
    # Launch the vision service with uvicorn in the background, record its PID
    # and verify after two seconds that it is still alive.
    # Exits 1 when uvicorn is missing or the process died right away.
    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE"))."
        return 0
    fi
    [[ -f "$UVICORN_BIN" ]] || {
        echo "ERROR: conda env '$CONDA_ENV' not found."
        echo "Install with: conda env create -f scripts/vision_service/environment.yml"
        exit 1
    }
    echo "Starting vision service (moondream2) on port $PORT…"
    cd "$REPO_ROOT"
    PYTHONPATH="$REPO_ROOT" "$UVICORN_BIN" \
        scripts.vision_service.main:app \
        --host 0.0.0.0 \
        --port "$PORT" \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 2
    if ! is_running; then
        echo "Failed to start. Check logs: $LOG_FILE"
        tail -20 "$LOG_FILE"
        rm -f "$PID_FILE"
        exit 1
    fi
    echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
    echo "Health: http://localhost:$PORT/health"
 }
 stop() {
    # Stop the process recorded in $PID_FILE: SIGTERM first, SIGKILL if it is
    # still alive two seconds later. The PID file is removed in every case.
    if is_running; then
        PID=$(cat "$PID_FILE")
        echo "Stopping PID $PID…"
        kill "$PID" 2>/dev/null || true
        sleep 2
        if kill -0 "$PID" 2>/dev/null; then
            kill -9 "$PID" 2>/dev/null || true
        fi
        rm -f "$PID_FILE"
        echo "Stopped."
    else
        echo "Not running."
        rm -f "$PID_FILE"
    fi
 }
 restart() {
    # Stop the service, pause one second, then start it again.
    stop
    sleep 1
    start
 }
 status() {
    # Print whether the service is running; when it is, also pretty-print the
    # /health response (failures of that probe are ignored).
    if ! is_running; then
        echo "Not running."
        return 0
    fi
    echo "Running (PID $(cat "$PID_FILE")) — http://localhost:$PORT"
    curl -s "http://localhost:$PORT/health" | python3 -m json.tool 2>/dev/null || true
 }
 logs() {
    # Show the last 50 lines of the log, or a notice when there is no log yet.
    if [[ ! -f "$LOG_FILE" ]]; then
        echo "No log file at $LOG_FILE"
        return 0
    fi
    tail -50 "$LOG_FILE"
 }
 # Sub-command comes from the first argument; anything else prints the usage text.
 CMD="${1:-help}"
 case "$CMD" in
    start)   start ;;
    stop)    stop ;;
    restart) restart ;;
    status)  status ;;
    logs)    logs ;;
    *)
        # Unknown or missing sub-command: show help.
        echo "Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs"
        echo ""
        echo "  Manages the moondream2 vision service on port $PORT."
        echo "  First-time setup: conda env create -f scripts/vision_service/environment.yml"
        ;;
 esac
--- a/scripts/manage-vllm.sh
+++ b/scripts/manage-vllm.sh
@ -0,0 +1,160 @@
 #!/usr/bin/env bash
 # scripts/manage-vllm.sh — manage the vLLM inference server
 # Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]
 # Abort on command errors, unset variables and failed pipeline stages.
 set -euo pipefail
 # NOTE(review): absolute, machine-specific path to the Python interpreter used to run vLLM.
 VLLM_BIN="/devl/miniconda3/envs/vllm/bin/python"
 # Directory whose immediate sub-directories are treated as models.
 MODEL_DIR="/Library/Assets/LLM/vllm/models"
 # PID, log and "currently loaded model" marker files live under /tmp.
 PID_FILE="/tmp/vllm-server.pid"
 LOG_FILE="/tmp/vllm-server.log"
 MODEL_FILE="/tmp/vllm-server.model"
 # Port the server listens on.
 PORT=8000
 # Value exported as CUDA_VISIBLE_DEVICES for the server process.
 GPU=1
 _list_model_names() {
    # Print the sorted names of the directories directly inside $MODEL_DIR.
    # Prints nothing (and succeeds) when $MODEL_DIR does not exist.
    [[ -d "$MODEL_DIR" ]] || return 0
    find "$MODEL_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort
 }
 is_running() {
    # Succeed (0) when $PID_FILE exists and names a live process, else fail (1).
    # Side effect: sets the global PID to the file's content.
    if [[ ! -f "$PID_FILE" ]]; then
        return 1
    fi
    PID=$(cat "$PID_FILE")
    kill -0 "$PID" 2>/dev/null && return 0
    return 1
 }
 start() {
    # Start the vLLM OpenAI-compatible server for one model.
    #   $1 (optional): model directory name under $MODEL_DIR, or an absolute
    #                  path; defaults to the first name from _list_model_names.
    # Exits 1 when no model can be resolved or the process dies within 3 s.
    local chosen="${1:-}"
    local model_dir
    if [[ -z "$chosen" ]]; then
        chosen=$(_list_model_names | head -1)
        [[ -n "$chosen" ]] || { echo "No models found in $MODEL_DIR"; exit 1; }
    fi
    case "$chosen" in
        /*)
            # Absolute path: use it directly and report only its base name.
            model_dir="$chosen"
            chosen=$(basename "$model_dir")
            ;;
        *)
            model_dir="$MODEL_DIR/$chosen"
            ;;
    esac
    [[ -d "$model_dir" ]] || { echo "Model not found: $model_dir"; exit 1; }
    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
        return 0
    fi
    echo "Starting vLLM with model: $chosen (GPU $GPU, port $PORT)…"
    echo "$chosen" > "$MODEL_FILE"
    # Ouro LoopLM uses total_ut_steps=4 which multiplies KV cache by 4x vs a standard
    # transformer. On 8 GiB GPUs: 1.4B models support ~4096 tokens; 2.6B only ~928.
    CUDA_VISIBLE_DEVICES="$GPU" "$VLLM_BIN" -m vllm.entrypoints.openai.api_server \
        --model "$model_dir" \
        --trust-remote-code \
        --max-model-len 3072 \
        --gpu-memory-utilization 0.75 \
        --enforce-eager \
        --max-num-seqs 8 \
        --port "$PORT" \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 3
    if ! is_running; then
        echo "Failed to start. Check logs: $LOG_FILE"
        tail -20 "$LOG_FILE"
        rm -f "$PID_FILE" "$MODEL_FILE"
        exit 1
    fi
    echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
 }
 stop() {
    # Stop the process recorded in $PID_FILE: SIGTERM first, SIGKILL if it is
    # still alive two seconds later. A running server's PID and model marker
    # files are removed; when nothing runs only the PID file is cleared.
    if is_running; then
        PID=$(cat "$PID_FILE")
        echo "Stopping PID $PID …"
        kill "$PID" 2>/dev/null || true
        sleep 2
        if kill -0 "$PID" 2>/dev/null; then
            kill -9 "$PID" 2>/dev/null || true
        fi
        rm -f "$PID_FILE" "$MODEL_FILE"
        echo "Stopped."
    else
        echo "Not running."
        rm -f "$PID_FILE"
    fi
 }
 restart() {
    # Stop the server, pause one second, then start it again with the
    # optional model given as $1.
    local next_model="${1:-}"
    stop
    sleep 1
    start "$next_model"
 }
 status() {
    # Print whether the server is running; when it is, include the PID, the
    # URL and (if the marker file exists) the loaded model name.
    if ! is_running; then
        echo "Not running."
        return 0
    fi
    local model=""
    if [[ -f "$MODEL_FILE" ]]; then
        model=" — model: $(cat "$MODEL_FILE")"
    fi
    echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT$model"
 }
 logs() {
    # Show the last 50 lines of the log, or a notice when there is no log yet.
    if [[ ! -f "$LOG_FILE" ]]; then
        echo "No log file found at $LOG_FILE"
        return 0
    fi
    tail -50 "$LOG_FILE"
 }
 list() {
    # Print a heading followed by one "  - <name>" bullet per available model.
    echo "Available models in $MODEL_DIR:"
    _list_model_names | while read -r entry; do
        printf '  - %s\n' "$entry"
    done
 }
 # Sub-command comes from the first argument; the optional second argument is
 # the model for start/restart. Anything else prints the usage text.
 CMD="${1:-help}"
 case "$CMD" in
    start)   start "${2:-}" ;;
    stop)    stop ;;
    restart) restart "${2:-}" ;;
    status)  status ;;
    logs)    logs ;;
    list)    list ;;
    *)
        # Unknown or missing sub-command: show help.
        echo "Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]"
        echo ""
        echo "  start [model]    Start vLLM with the specified model (default: first in $MODEL_DIR)"
        echo "  stop             Stop the running vLLM server"
        echo "  restart [model]  Stop then start (pass a new model name to swap)"
        echo "  status           Show whether it's running and which model is loaded"
        echo "  logs             Tail the last 50 lines of the log"
        echo "  list             List available models"
        echo ""
        echo "  GPU:  $GPU (CUDA_VISIBLE_DEVICES)"
        echo "  Port: $PORT"
        ;;
 esac
--- a/scripts/match.py
+++ b/scripts/match.py
@ -0,0 +1,156 @@
 """
 Resume match scoring.
 Two modes:
  1. SQLite batch — score every unscored job (any status) that has a description in staging.db
     Usage: python scripts/match.py
  2. Notion single — score one Notion page by URL/ID and write results back
     Usage: python scripts/match.py <notion-page-url-or-id>
 """
 import re
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import requests
 import yaml
 from bs4 import BeautifulSoup
 from notion_client import Client
 # Directory holding the YAML config files; notion.yaml is read from here.
 CONFIG_DIR = Path(__file__).parent.parent / "config"
 # PDF resume parsed by read_resume_text().
 # NOTE(review): absolute, machine-specific path — presumably meant to become configurable.
 RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf")
 def load_notion() -> tuple[Client, dict]:
    """Create a Notion client from ``config/notion.yaml``.

    Returns:
        ``(client, field_map)`` — a client authenticated with the file's
        ``token`` entry, and the file's ``field_map`` entry.
    """
    config_text = (CONFIG_DIR / "notion.yaml").read_text()
    settings = yaml.safe_load(config_text)
    client = Client(auth=settings["token"])
    return client, settings["field_map"]
 def extract_page_id(url_or_id: str) -> str:
    """Extract the 32-character Notion page ID from a URL or a raw ID.

    The ID is looked up at the *end of the last path segment* (query string
    and fragment ignored), in either plain 32-hex or hyphenated UUID form.
    Anchoring there matters: a title slug ending in a hex letter
    (``.../Job-Page-<id>``) would otherwise merge with the ID once hyphens
    are removed and shift the match by one or more characters.

    Args:
        url_or_id: A Notion page URL, a hyphenated UUID, or a bare 32-hex ID.

    Returns:
        The ID as 32 lowercase hex characters, or the stripped input when no
        ID can be found.
    """
    candidate = url_or_id.strip()
    # Last path segment without "?query" / "#fragment" and trailing slashes.
    path_only = re.split(r"[?#]", candidate, maxsplit=1)[0].rstrip("/")
    tail = path_only.rsplit("/", 1)[-1]
    anchored = re.search(
        r"(?:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
        r"|[0-9a-f]{32})$",
        tail,
        re.IGNORECASE,
    )
    if anchored:
        return anchored.group(0).replace("-", "").lower()
    # Fallback: first 32-hex run anywhere in the hyphen-stripped input.
    loose = re.search(r"[0-9a-f]{32}", candidate.replace("-", ""), re.IGNORECASE)
    return loose.group(0).lower() if loose else candidate
 def get_job_url_from_notion(notion: Client, page_id: str, url_field: str) -> str:
    """Return the URL stored in property ``url_field`` of a Notion page.

    Retrieves the page through ``notion.pages.retrieve`` and yields an empty
    string when the property's ``url`` value is empty or null.
    """
    properties = notion.pages.retrieve(page_id)["properties"]
    stored_url = properties[url_field]["url"]
    return stored_url if stored_url else ""
 def extract_job_description(url: str) -> str:
    """Download a job listing page and return its visible text.

    Script, style, nav, header and footer elements are removed before the
    text is extracted; all whitespace runs are collapsed to single spaces.
    Raises whatever ``requests`` raises for network or HTTP-status errors.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    response.raise_for_status()
    document = BeautifulSoup(response.text, "html.parser")
    for element in document.find_all(["script", "style", "nav", "header", "footer"]):
        element.decompose()
    words = document.get_text(separator=" ").split()
    return " ".join(words)
 def read_resume_text() -> str:
    """Return the text of every page of the PDF at ``RESUME_PATH``, space-joined.

    Pages for which no text can be extracted contribute an empty string.
    """
    import pypdf
    pdf = pypdf.PdfReader(str(RESUME_PATH))
    page_texts = [page.extract_text() or "" for page in pdf.pages]
    return " ".join(page_texts)
 def match_score(resume_text: str, job_text: str) -> tuple[float, list[str]]:
    """Score a resume against a job description with TF-IDF cosine similarity.

    Both texts are vectorised together (English stop words removed, at most
    200 features). Keyword gaps are the job's highest-weighted terms whose
    weight in the resume vector is zero, i.e. terms the vectorizer did not
    find in the resume at all. Using the vectorizer's own tokenisation for
    both sides avoids false gaps caused by punctuation or case
    ("Python," vs "python") that a plain whitespace split would produce.

    Args:
        resume_text: Plain text of the resume.
        job_text: Plain text of the job description.

    Returns:
        ``(score, gaps)`` — score in 0–100 rounded to one decimal, and up to
        ten missing job terms ordered by descending job weight.

    Raises:
        ValueError: Propagated from scikit-learn when neither text contains
            any usable term (empty vocabulary).
    """
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    vectorizer = TfidfVectorizer(stop_words="english", max_features=200)
    tfidf = vectorizer.fit_transform([resume_text, job_text])
    score = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]) * 100
    feature_names = vectorizer.get_feature_names_out()
    resume_weights = tfidf[0].toarray()[0]
    job_weights = tfidf[1].toarray()[0]
    # Thirty strongest job terms, strongest first.
    top_indices = np.argsort(job_weights)[::-1][:30]
    gaps = [
        str(feature_names[i])
        for i in top_indices
        # A zero resume weight means the term never occurs in the resume.
        if job_weights[i] > 0 and resume_weights[i] == 0
    ][:10]
    return round(score, 1), gaps
 def write_match_to_notion(notion: Client, page_id: str, score: float, gaps: list[str], fm: dict) -> None:
    """Write the score and keyword gaps onto a Notion page.

    ``fm`` supplies the Notion property names under the keys ``match_score``
    (number property) and ``keyword_gaps`` (rich-text property, gaps joined
    with ``", "``).
    """
    updated = {
        fm["match_score"]: {"number": score},
        fm["keyword_gaps"]: {"rich_text": [{"text": {"content": ", ".join(gaps)}}]},
    }
    notion.pages.update(page_id=page_id, properties=updated)
 def run_match(page_url_or_id: str) -> None:
    """Score a single Notion page and write the result back to it.

    Steps: resolve the page ID, read the job URL from the page, fetch the
    listing text, score it against the resume, then update the page.
    Progress is printed with a ``[match]`` prefix.
    """
    client, field_map = load_notion()
    page_id = extract_page_id(page_url_or_id)
    print(f"[match] Page ID: {page_id}")

    job_url = get_job_url_from_notion(client, page_id, field_map["url"])
    print(f"[match] Fetching job description from: {job_url}")

    posting_text = extract_job_description(job_url)
    cv_text = read_resume_text()
    score, gaps = match_score(cv_text, posting_text)
    print(f"[match] Score: {score}/100")
    print(f"[match] Keyword gaps: {', '.join(gaps) or 'none'}")

    write_match_to_notion(client, page_id, score, gaps, field_map)
    print("[match] Written to Notion.")
 def score_pending_jobs(db_path: Path = None) -> int:
    """Score every unscored job in SQLite that has a stored description.

    Selects rows whose ``match_score`` is NULL and whose ``description`` is
    neither NULL, empty, nor the literal string ``'nan'`` (no status filter),
    scores each against the resume and writes ``match_score`` and
    ``keyword_gaps`` back through ``write_match_scores``.

    Args:
        db_path: SQLite database path; ``scripts.db.DEFAULT_DB`` when None.

    Returns:
        Number of jobs scored. A job whose scoring or write fails is reported
        on stdout and skipped, so it is not counted.
    """
    import sqlite3
    from scripts.db import DEFAULT_DB, write_match_scores
    if db_path is None:
        db_path = DEFAULT_DB
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, title, company, description FROM jobs "
            "WHERE match_score IS NULL "
            "AND description IS NOT NULL AND description != '' AND description != 'nan'"
        ).fetchall()
    finally:
        # Close the handle even when the query fails (e.g. missing table).
        conn.close()
    if not rows:
        print("[match] No unscored jobs with descriptions found.")
        return 0
    resume_text = read_resume_text()
    scored = 0
    for row in rows:
        job_id, title, company, description = row["id"], row["title"], row["company"], row["description"]
        try:
            score, gaps = match_score(resume_text, description)
            write_match_scores(db_path, job_id, score, ", ".join(gaps))
            print(f"[match] {title} @ {company}: {score}/100  gaps: {', '.join(gaps) or 'none'}")
            scored += 1
        except Exception as e:
            # Best-effort batch: one bad row must not stop the rest.
            print(f"[match] Error scoring job {job_id}: {e}")
    print(f"[match] Done — {scored} jobs scored.")
    return scored
 if __name__ == "__main__":
    # No CLI argument: batch-score jobs in SQLite.
    # Otherwise the first argument is treated as a Notion page URL or ID.
    if len(sys.argv) < 2:
        score_pending_jobs()
    else:
        run_match(sys.argv[1])
--- a/scripts/prepare_training_data.py
+++ b/scripts/prepare_training_data.py
@ -0,0 +1,134 @@
 # scripts/prepare_training_data.py
 """
 Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning.
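 Outputs a JSONL file where each line is:
  {"instruction": "Write a cover letter for the [role] position at [company].",
   "output": "<letter text with the 'Dear …' greeting line removed>",
   "source_file": "<name of the cover letter file>"}
 When no role can be extracted, the instruction falls back to
 "Write a cover letter for a position at [company]."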
 Usage:
    conda run -n job-seeker python scripts/prepare_training_data.py
    conda run -n job-seeker python scripts/prepare_training_data.py --output /path/to/out.jsonl
 """
 import argparse
 import json
 import re
 import sys
 from pathlib import Path
 # Directory searched (non-recursively) for cover letter files.
 # NOTE(review): absolute, machine-specific path; --letters-dir overrides it on the CLI.
 LETTERS_DIR = Path("/Library/Documents/JobSearch")
 # Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
 LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]
 # Default JSONL destination, inside the letters directory.
 DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl"
 # Patterns that appear in opening sentences to extract role
 # They are tried in order and the first acceptable match wins, so the more
 # specific pattern comes first.
 ROLE_PATTERNS = [
    r"apply for (?:the )?(.+?) (?:position|role|opportunity) at",
    r"apply for (?:the )?(.+?) (?:at|with)\b",
 ]
 def extract_role_from_text(text: str, patterns: "list[str] | None" = None) -> str:
    """Extract the role title from the opening of a cover letter.

    Only the first 600 characters are searched (the greeting line, if any, is
    part of that window). Whitespace — including line breaks — is collapsed
    to single spaces first, so a sentence that is hard-wrapped in the file
    ("…the Senior Data\\nEngineer role at…") still matches.

    Args:
        text: Full cover letter text.
        patterns: Regexes whose first group captures the role, tried in
            order, case-insensitively. Defaults to ``ROLE_PATTERNS``.

    Returns:
        The first captured role of one to six words, trailing periods
        removed, or ``""`` when nothing acceptable matches.
    """
    if patterns is None:
        patterns = ROLE_PATTERNS
    search_text = " ".join(text[:600].split())
    for pattern in patterns:
        m = re.search(pattern, search_text, re.IGNORECASE)
        if m:
            role = m.group(1).strip().rstrip(".")
            # Filter out noise — a real role title is at most six words.
            if 1 <= len(role.split()) <= 6:
                return role
    return ""
 def extract_company_from_filename(stem: str) -> str:
    """Return the company part of a cover letter filename stem.

    Everything from "Cover Letter" (any capitalisation) onwards, plus the
    whitespace just before it, is dropped; the remainder is stripped.
    """
    suffix = re.compile(r"\s*Cover Letter.*", re.IGNORECASE)
    company = suffix.sub("", stem)
    return company.strip()
 def strip_greeting(text: str) -> str:
    """Drop everything up to and including the first 'Dear …' line.

    Returns the remaining lines (body and sign-off) stripped of surrounding
    whitespace. When no line starts with "dear " (case-insensitive, after
    stripping), the whole text is returned stripped.
    """
    all_lines = text.splitlines()
    greeting_at = next(
        (idx for idx, ln in enumerate(all_lines) if ln.strip().lower().startswith("dear ")),
        None,
    )
    if greeting_at is None:
        return text.strip()
    remainder = all_lines[greeting_at + 1:]
    # Skip the blank lines that directly follow the greeting.
    first_content = 0
    while first_content < len(remainder) and not remainder[first_content].strip():
        first_content += 1
    return "\n".join(remainder[first_content:]).strip()
 def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
    """Turn every cover letter in ``letters_dir`` into a training record.

    Files matching any of ``LETTER_GLOBS`` are collected once each and
    processed in sorted path order. Letters shorter than 100 characters
    (after stripping) are skipped. Each record carries ``instruction``,
    ``output`` (the letter without its greeting) and ``source_file``.
    """
    # A dict keeps one entry per path even when both globs match the same file.
    unique_paths: dict[Path, None] = {}
    for pattern in LETTER_GLOBS:
        for found in letters_dir.glob(pattern):
            unique_paths.setdefault(found, None)

    records = []
    for letter_path in sorted(unique_paths):
        text = letter_path.read_text(encoding="utf-8", errors="ignore").strip()
        if len(text) < 100:
            continue
        company = extract_company_from_filename(letter_path.stem)
        role = extract_role_from_text(text)
        body = strip_greeting(text)
        # Fall back to a generic instruction when no role could be extracted.
        instruction = (
            f"Write a cover letter for the {role} position at {company}."
            if role
            else f"Write a cover letter for a position at {company}."
        )
        records.append({
            "instruction": instruction,
            "output": body,
            "source_file": letter_path.name,
        })
    return records
 def write_jsonl(records: list[dict], output_path: Path) -> None:
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines.

    Parent directories are created as needed; an existing file is
    overwritten. Non-ASCII characters are written unescaped.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
        )
 def main() -> None:
    """CLI entry point: build the training records and write or summarise them.

    With ``--stats`` only a summary is printed (totals, how many letters had
    a role extracted, average length, one line per file) and nothing is
    written. Otherwise the records go to ``--output`` as JSONL, followed by
    next-step hints.
    """
    parser = argparse.ArgumentParser(description="Prepare LoRA training data from cover letter corpus")
    parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSONL path")
    parser.add_argument("--letters-dir", default=str(LETTERS_DIR), help="Directory of cover letters")
    parser.add_argument("--stats", action="store_true", help="Print statistics and exit")
    args = parser.parse_args()

    records = build_records(Path(args.letters_dir))

    if not args.stats:
        output_path = Path(args.output)
        write_jsonl(records, output_path)
        print(f"Wrote {len(records)} training records to {output_path}")
        print()
        print("Next step for LoRA fine-tuning:")
        print("  1. Download base model: huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct")
        print("  2. Fine-tune with TRL: see docs/plans/lora-finetune.md (to be created)")
        print("  3. Or use HuggingFace Jobs: bash scripts/manage-ui.sh — hugging-face-model-trainer skill")
        return

    print(f"Total letters: {len(records)}")
    # Records that fell back to the generic instruction had no role extracted.
    generic = sum(
        1 for r in records if r["instruction"].startswith("Write a cover letter for a position")
    )
    with_role = len(records) - generic
    print(f"Role extracted: {with_role}/{len(records)}")
    total_chars = sum(len(r["output"]) for r in records)
    avg_len = total_chars / max(len(records), 1)
    print(f"Avg letter length: {avg_len:.0f} chars")
    for r in records:
        print(f"  {r['source_file']!r:55s} → {r['instruction'][:70]}")
 # Run the CLI only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/scripts/scrape_url.py
+++ b/scripts/scrape_url.py
@ -0,0 +1,228 @@
 # scripts/scrape_url.py
 """
 Scrape a job listing from its URL and update the job record.
 Supports:
  - LinkedIn  (guest jobs API — no auth required)
  - Indeed    (HTML parse)
  - Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py)
  - Generic   (JSON-LD → og:tags fallback)
 Usage (background task — called by task_runner):
    from scripts.scrape_url import scrape_job_url
    scrape_job_url(db_path, job_id)
 """
 import json
 import re
 import sqlite3
 import sys
 from pathlib import Path
 from typing import Optional
 from urllib.parse import urlparse, urlencode, parse_qsl
 import requests
 from bs4 import BeautifulSoup
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.db import DEFAULT_DB, update_job_fields
 # Query-string keys removed by canonicalize_url() (campaign/tracking parameters).
 _STRIP_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
    "trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig",
    "eid", "otpToken", "ssid", "fmid",
 }
 # Browser-like User-Agent sent with every scraping request in this module.
 _HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
 }
 # Timeout passed to requests.get (seconds).
 _TIMEOUT = 12
 def _detect_board(url: str) -> str:
    """Return 'linkedin', 'indeed', 'glassdoor', or 'generic'."""
    url_lower = url.lower()
    if "linkedin.com" in url_lower:
        return "linkedin"
    if "indeed.com" in url_lower:
        return "indeed"
    if "glassdoor.com" in url_lower:
        return "glassdoor"
    return "generic"
 def _extract_linkedin_job_id(url: str) -> Optional[str]:
    """Extract numeric job ID from a LinkedIn job URL."""
    m = re.search(r"/jobs/view/(\d+)", url)
    return m.group(1) if m else None
 def canonicalize_url(url: str) -> str:
    """Strip tracking parameters from a job URL and return a canonical form.

    LinkedIn:  https://www.linkedin.com/jobs/view/<id>/?trk=...  →  https://www.linkedin.com/jobs/view/<id>/
    Others:    every query parameter named in ``_STRIP_PARAMS`` is removed;
               the rest of the URL is kept.

    The LinkedIn rewrite is applied only when the URL's *hostname* is
    linkedin.com or a subdomain of it. A substring test would turn an
    unrelated link such as ``https://acme.com/jobs/view/7?utm_source=linkedin.com``
    into a LinkedIn job URL.
    """
    url = url.strip()
    try:
        # Second attempt covers scheme-less input like "linkedin.com/jobs/view/1".
        host = urlparse(url).hostname or urlparse("//" + url).hostname or ""
    except ValueError:
        host = ""
    if host == "linkedin.com" or host.endswith(".linkedin.com"):
        job_id = _extract_linkedin_job_id(url)
        if job_id:
            return f"https://www.linkedin.com/jobs/view/{job_id}/"
    parsed = urlparse(url)
    clean_qs = urlencode([(k, v) for k, v in parse_qsl(parsed.query) if k not in _STRIP_PARAMS])
    return parsed._replace(query=clean_qs).geturl()
 def _scrape_linkedin(url: str) -> dict:
    """Scrape a LinkedIn posting through the guest jobs API (no auth required).

    Returns the non-empty values among title, company, location and
    description plus ``source="linkedin"``; returns ``{}`` when the URL holds
    no job ID. Network and HTTP-status errors from ``requests`` propagate.
    """
    posting_id = _extract_linkedin_job_id(url)
    if not posting_id:
        return {}
    endpoint = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{posting_id}"
    response = requests.get(endpoint, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    def first_text(tag_name, css_class):
        """Stripped text of the first matching element, or '' when absent."""
        node = page.find(tag_name, class_=css_class)
        return node.get_text(strip=True) if node else ""

    title = first_text("h2", "top-card-layout__title")
    # The company name is rendered as a link or, failing that, a plain span.
    company = first_text("a", "topcard__org-name-link") or first_text("span", "topcard__org-name-link")
    location = first_text("span", "topcard__flavor--bullet")
    markup = page.find("div", class_="show-more-less-html__markup")
    description = markup.get_text(separator="\n", strip=True) if markup else ""

    scraped = {
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "source": "linkedin",
    }
    return {field: value for field, value in scraped.items() if value}
 def _scrape_indeed(url: str) -> dict:
    """Fetch an Indeed job page and parse it via JSON-LD / og: meta tags.

    Network and HTTP-status errors from ``requests`` propagate.
    """
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    parsed_fields = _parse_json_ld_or_og(response.text)
    return parsed_fields if parsed_fields else {}
 def _scrape_glassdoor(url: str) -> dict:
    """Fetch a Glassdoor job description by re-using JobSpy's Glassdoor scraper.

    The listing ID is taken from the ``jl=<digits>`` part of the URL.

    Returns:
        ``{"description": ...}`` on success; ``{}`` when the URL carries no
        listing ID, the description is empty, or anything fails. Failures
        are best-effort but are now reported on stdout instead of being
        dropped silently.
    """
    m = re.search(r"jl=(\d+)", url)
    if not m:
        return {}
    try:
        from jobspy.glassdoor import Glassdoor
        from jobspy.glassdoor.constant import fallback_token, headers
        from jobspy.model import ScraperInput, Site
        from jobspy.util import create_session
        scraper = Glassdoor()
        scraper.base_url = "https://www.glassdoor.com/"
        scraper.session = create_session(has_retry=True)
        token = scraper._get_csrf_token()
        # NOTE(review): this mutates JobSpy's module-level headers dict;
        # presumably the scraper reads the token from there — confirm.
        headers["gd-csrf-token"] = token if token else fallback_token
        scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
        description = scraper._fetch_job_description(int(m.group(1)))
        return {"description": description} if description else {}
    except Exception as exc:
        # Relies on JobSpy internals that may change; keep the failure
        # non-fatal but visible in the log.
        print(f"[scrape_url] Glassdoor scrape failed for {url}: {exc}")
        return {}
 def _json_ld_candidates(data):
    """Yield each dict node of a parsed JSON-LD payload, including ``@graph`` members."""
    nodes = data if isinstance(data, list) else [data]
    for node in nodes:
        if not isinstance(node, dict):
            continue
        yield node
        graph = node.get("@graph")
        if isinstance(graph, list):
            yield from (member for member in graph if isinstance(member, dict))
 def _job_posting_fields(posting: dict) -> dict:
    """Map a schema.org JobPosting node to job fields, dropping empty values."""
    org = posting.get("hiringOrganization") or {}
    if isinstance(org, list):
        org = org[0] if org else {}
    # hiringOrganization may be an Organization object or a plain name.
    company = org.get("name", "") if isinstance(org, dict) else str(org)

    loc = posting.get("jobLocation") or {}
    if isinstance(loc, list):
        loc = loc[0] if loc else {}
    addr = (loc.get("address") or {}) if isinstance(loc, dict) else {}
    if isinstance(addr, dict):
        location = (
            addr.get("addressLocality", "") or
            addr.get("addressRegion", "") or
            addr.get("addressCountry", "")
        )
        # addressCountry may itself be an object with a name.
        if isinstance(location, dict):
            location = location.get("name", "")
    else:
        # address given as plain text
        location = str(addr)

    base_salary = posting.get("baseSalary")
    return {k: v for k, v in {
        "title": posting.get("title", ""),
        "company": company,
        "location": location,
        "description": posting.get("description", ""),
        "salary": str(base_salary) if base_salary else "",
    }.items() if v}
 def _parse_json_ld_or_og(html: str) -> dict:
    """Extract job fields from JSON-LD structured data, then og: meta tags.

    Each ``application/ld+json`` script is searched for a JobPosting node:
    at top level, inside a list, or inside an ``@graph`` array; ``@type``
    may be a string or a list. The first JobPosting found decides the
    result (title, company, location, description, salary — empty values
    omitted). Scripts with malformed JSON are skipped. Organisation and
    address values given as plain strings are accepted rather than causing
    the posting to be discarded.

    When no JobPosting exists, falls back to ``og:title`` (or ``<title>``)
    and ``og:description``.
    """
    soup = BeautifulSoup(html, "html.parser")
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (TypeError, ValueError):
            # Malformed or empty JSON-LD block: try the next one.
            continue
        for node in _json_ld_candidates(data):
            node_type = node.get("@type")
            if node_type == "JobPosting" or (
                isinstance(node_type, list) and "JobPosting" in node_type
            ):
                return _job_posting_fields(node)

    def _meta(prop):
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        return tag.get("content", "") if tag else ""

    title_tag = soup.find("title")
    title = _meta("og:title") or (title_tag.get_text(strip=True) if title_tag else "")
    description = _meta("og:description")
    return {k: v for k, v in {"title": title, "description": description}.items() if v}
 def _scrape_generic(url: str) -> dict:
    """Fetch any job page and parse it via JSON-LD / og: meta tags.

    Network and HTTP-status errors from ``requests`` propagate.
    """
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    parsed_fields = _parse_json_ld_or_og(response.text)
    return parsed_fields if parsed_fields else {}
 def scrape_job_url(db_path: Path = DEFAULT_DB, job_id: int = None) -> dict:
    """Fetch the job listing at the stored URL and update the job record.

    Args:
        db_path: SQLite database holding the ``jobs`` table.
        job_id: Row ID of the job to scrape; ``None`` is a no-op.

    Returns:
        The dict of fields scraped. Empty when the job does not exist, its
        URL does not start with "http", or scraping fails.

    Scraping failures do not raise — they are printed and the job row is
    left as-is. Database errors (reading the URL, writing the fields) do
    propagate.
    """
    if job_id is None:
        return {}
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        # Release the handle even when the query itself fails.
        conn.close()
    if not row:
        return {}
    url = row["url"] or ""
    if not url.startswith("http"):
        return {}
    board = _detect_board(url)
    try:
        if board == "linkedin":
            fields = _scrape_linkedin(url)
        elif board == "indeed":
            fields = _scrape_indeed(url)
        elif board == "glassdoor":
            fields = _scrape_glassdoor(url)
        else:
            fields = _scrape_generic(url)
    except requests.RequestException as exc:
        print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}")
        return {}
    except Exception as exc:
        print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}")
        return {}
    if fields:
        # Never overwrite the stored URL with a scraped value.
        fields.pop("url", None)
        update_job_fields(db_path, job_id, fields)
        print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}")
    return fields
--- a/scripts/sync.py
+++ b/scripts/sync.py
@ -0,0 +1,97 @@
 # scripts/sync.py
 """
 Push approved and applied jobs from SQLite staging to Notion, then mark them as synced.
 Usage:
    conda run -n job-seeker python scripts/sync.py
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import yaml
 from datetime import datetime
 from notion_client import Client
 from scripts.db import DEFAULT_DB, get_jobs_by_status, update_job_status
 CONFIG_DIR = Path(__file__).parent.parent / "config"
 def load_notion_config() -> dict:
    """Return the parsed contents of ``config/notion.yaml``."""
    config_file = CONFIG_DIR / "notion.yaml"
    return yaml.safe_load(config_file.read_text())
 def _build_properties(job: dict, fm: dict, include_optional: bool = True) -> dict:
    """Build the Notion properties dict for a job. Optional fields (match_score,
    keyword_gaps) are included by default but can be dropped for DBs that don't
    have those columns yet."""
    props = {
        fm["title_field"]: {"title": [{"text": {"content": job.get("salary") or job.get("title", "")}}]},
        fm["job_title"]:   {"rich_text": [{"text": {"content": job.get("title", "")}}]},
        fm["company"]:     {"rich_text": [{"text": {"content": job.get("company", "")}}]},
        fm["url"]:         {"url": job.get("url") or None},
        fm["source"]:      {"multi_select": [{"name": job.get("source", "unknown").title()}]},
        fm["status"]:      {"select": {"name": fm["status_new"]}},
        fm["remote"]:      {"checkbox": bool(job.get("is_remote", 0))},
        fm["date_found"]:  {"date": {"start": job.get("date_found", datetime.now().isoformat()[:10])}},
    }
    if include_optional:
        score = job.get("match_score")
        if score is not None and fm.get("match_score"):
            props[fm["match_score"]] = {"number": score}
        gaps = job.get("keyword_gaps")
        if gaps and fm.get("keyword_gaps"):
            props[fm["keyword_gaps"]] = {"rich_text": [{"text": {"content": gaps}}]}
    return props
 def sync_to_notion(db_path: Path = DEFAULT_DB) -> int:
    """Push all approved and applied jobs to Notion. Returns count synced.

    Each job becomes one page in the configured Notion database. If page
    creation fails with a validation error (typically a missing optional
    column), it is retried with the core fields only. Jobs whose page was
    created are switched to status ``synced``.

    The status update runs in a ``finally`` block: if the run is interrupted
    part-way, the jobs already created in Notion are still recorded as
    synced, so the next run does not create duplicate pages for them.
    """
    cfg = load_notion_config()
    notion = Client(auth=cfg["token"])
    db_id = cfg["database_id"]
    fm = cfg["field_map"]
    approved = get_jobs_by_status(db_path, "approved")
    applied = get_jobs_by_status(db_path, "applied")
    pending_sync = approved + applied
    if not pending_sync:
        print("[sync] No approved/applied jobs to sync.")
        return 0
    synced_ids = []
    try:
        for job in pending_sync:
            try:
                notion.pages.create(
                    parent={"database_id": db_id},
                    properties=_build_properties(job, fm, include_optional=True),
                )
                synced_ids.append(job["id"])
                print(f"[sync] + {job.get('title')} @ {job.get('company')}")
            except Exception as e:
                err = str(e)
                # Notion returns 400 validation_error when a property column doesn't exist yet.
                # Fall back to core fields only and warn the user.
                if "validation_error" in err or "Could not find property" in err:
                    try:
                        notion.pages.create(
                            parent={"database_id": db_id},
                            properties=_build_properties(job, fm, include_optional=False),
                        )
                        synced_ids.append(job["id"])
                        print(f"[sync] + {job.get('title')} @ {job.get('company')} "
                              f"(skipped optional fields — add Match Score / Keyword Gaps columns to Notion DB)")
                    except Exception as e2:
                        print(f"[sync] Error syncing {job.get('url')}: {e2}")
                else:
                    print(f"[sync] Error syncing {job.get('url')}: {e}")
    finally:
        # Persist what was pushed even when the loop is interrupted.
        update_job_status(db_path, synced_ids, "synced")
    print(f"[sync] Done — {len(synced_ids)} jobs synced to Notion.")
    return len(synced_ids)
 # Run a sync against the default database when executed as a script.
 if __name__ == "__main__":
    sync_to_notion()
--- a/scripts/task_runner.py
+++ b/scripts/task_runner.py
@ -0,0 +1,155 @@
 # scripts/task_runner.py
 """
 Background task runner for LLM generation tasks.
 Submitting a task inserts a row in background_tasks and spawns a daemon thread.
 The thread calls the appropriate generator, writes results to existing tables,
 and marks the task completed or failed.
 Deduplication: only one queued/running task per (task_type, job_id) is allowed.
 Different task types for the same job run concurrently (e.g. cover letter + research).
 """
 import sqlite3
 import threading
 from pathlib import Path
 from scripts.db import (
    DEFAULT_DB,
    insert_task,
    update_task_status,
    update_task_stage,
    update_cover_letter,
    save_research,
 )
 def submit_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                 job_id: int = None) -> tuple[int, bool]:
    """Queue a background task and start its worker thread.

    ``insert_task`` decides whether the task is new. Only a new task gets a
    daemon thread running ``_run_task``.

    Returns:
        ``(task_id, True)`` when a new task was queued and a thread spawned;
        ``(existing_id, False)`` when an identical task is already in-flight.
    """
    task_id, is_new = insert_task(db_path, task_type, job_id)
    if not is_new:
        return task_id, is_new
    worker = threading.Thread(
        target=_run_task,
        args=(db_path, task_id, task_type, job_id),
        daemon=True,
    )
    worker.start()
    return task_id, is_new
 def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int) -> None:
    """Thread body: run the generator and persist the result.

    Loads the job row (when ``job_id`` is truthy), marks the task
    ``running``, dispatches on ``task_type`` and finally marks the task
    ``completed`` or ``failed``.

    The ``error=`` argument of ``update_task_status`` doubles as the result
    message: branches that have a summary to report write their own
    ``completed`` status with that message and return early; the remaining
    branches (cover_letter, company_research) fall through to the plain
    ``completed`` update at the end of the ``try`` block.

    Never raises: any exception, including ``SystemExit``, is recorded as a
    ``failed`` status with the exception text.
    """
    # job_id == 0 means a global task (e.g. discovery) with no associated job row.
    job: dict = {}
    if job_id:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
        conn.close()
        if row is None:
            # The task goes straight to failed; it is never marked running.
            update_task_status(db_path, task_id, "failed", error=f"Job {job_id} not found")
            return
        job = dict(row)
    update_task_status(db_path, task_id, "running")
    try:
        # Generator modules are imported inside their branch, i.e. only when needed.
        if task_type == "discovery":
            from scripts.discover import run_discovery
            new_count = run_discovery(db_path)
            n = new_count or 0
            update_task_status(
                db_path, task_id, "completed",
                error=f"{n} new listing{'s' if n != 1 else ''} added",
            )
            return
        elif task_type == "cover_letter":
            from scripts.generate_cover_letter import generate
            result = generate(
                job.get("title", ""),
                job.get("company", ""),
                job.get("description", ""),
            )
            update_cover_letter(db_path, job_id, result)
        elif task_type == "company_research":
            from scripts.company_research import research_company
            result = research_company(
                job,
                # Progress callback: stores the current stage on the task row.
                on_stage=lambda s: update_task_stage(db_path, task_id, s),
            )
            save_research(db_path, job_id=job_id, **result)
        elif task_type == "enrich_descriptions":
            from scripts.enrich_descriptions import enrich_all_descriptions
            r = enrich_all_descriptions(db_path)
            errs = len(r.get("errors", []))
            msg = (
                f"{r['succeeded']} description(s) fetched, {r['failed']} failed"
                + (f", {errs} error(s)" if errs else "")
            )
            update_task_status(db_path, task_id, "completed", error=msg)
            return
        elif task_type == "scrape_url":
            from scripts.scrape_url import scrape_job_url
            fields = scrape_job_url(db_path, job_id)
            # NOTE(review): an empty `fields` (scrape failed) is still reported
            # as completed, with the job URL as the message — confirm intended.
            title = fields.get("title") or job.get("url", "?")
            company = fields.get("company", "")
            msg = f"{title}" + (f" @ {company}" if company else "")
            update_task_status(db_path, task_id, "completed", error=msg)
            # Auto-enrich company/salary for Craigslist jobs
            # Re-read the row: the scrape above may have just filled in company.
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            job_row = conn.execute(
                "SELECT source, company FROM jobs WHERE id=?", (job_id,)
            ).fetchone()
            conn.close()
            if job_row and job_row["source"] == "craigslist" and not job_row["company"]:
                submit_task(db_path, "enrich_craigslist", job_id)
            return
        elif task_type == "enrich_craigslist":
            from scripts.enrich_descriptions import enrich_craigslist_fields
            extracted = enrich_craigslist_fields(db_path, job_id)
            company = extracted.get("company", "")
            msg = f"company={company}" if company else "no company found"
            update_task_status(db_path, task_id, "completed", error=msg)
            return
        elif task_type == "email_sync":
            try:
                from scripts.imap_sync import sync_all
                result = sync_all(db_path,
                                  on_stage=lambda s: update_task_stage(db_path, task_id, s))
                leads = result.get("new_leads", 0)
                todo  = result.get("todo_attached", 0)
                errs  = len(result.get("errors", []))
                msg = (
                    f"{result['synced']} jobs updated, "
                    f"+{result['inbound']} in, +{result['outbound']} out"
                    + (f", {leads} new lead(s)" if leads else "")
                    + (f", {todo} todo attached" if todo else "")
                    + (f", {errs} error(s)" if errs else "")
                )
                update_task_status(db_path, task_id, "completed", error=msg)
                return
            except FileNotFoundError:
                # Reported to the user as a missing email configuration.
                update_task_status(db_path, task_id, "failed",
                                   error="Email not configured — go to Settings → Email")
                return
        else:
            raise ValueError(f"Unknown task_type: {task_type!r}")
        # Reached only by branches that did not return early.
        update_task_status(db_path, task_id, "completed")
    except BaseException as exc:
        # BaseException catches SystemExit (from companyScraper sys.exit calls)
        # in addition to regular exceptions.
        update_task_status(db_path, task_id, "failed", error=str(exc))
--- a/scripts/test_email_classify.py
+++ b/scripts/test_email_classify.py
@ -0,0 +1,159 @@
 #!/usr/bin/env python
 """
 Compare email classifiers across models on a live sample from IMAP.
 Usage:
    conda run -n job-seeker python scripts/test_email_classify.py
    conda run -n job-seeker python scripts/test_email_classify.py --limit 30
    conda run -n job-seeker python scripts/test_email_classify.py --dry-run  # phrase filter only, no LLM
 Outputs a table: Subject | RK (recruitment keyword ✓/✗) | Phrase (BLOCK/pass) | phi3 | llama3 | vllm
 """
 import argparse
 import re
 import sys
 from datetime import datetime, timedelta
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.imap_sync import (
    load_config, connect, _search_folder, _parse_message,
    _has_recruitment_keyword, _has_rejection_or_ats_signal,
    _CLASSIFY_SYSTEM, _CLASSIFY_LABELS,
    _REJECTION_PHRASES, _SPAM_PHRASES, _ATS_CONFIRM_SUBJECTS, _SPAM_SUBJECT_PREFIXES,
 )
 from scripts.llm_router import LLMRouter
 # Single router instance shared by every classification call.
 _ROUTER = LLMRouter()
 # Output column name → (model_override, fallback_order) passed to _ROUTER.complete().
 MODELS = {
    "phi3":    ("phi3:mini",     ["ollama_research"]),
    "llama3":  ("llama3.1:8b",  ["ollama_research"]),
    "vllm":    ("__auto__",     ["vllm"]),
 }
 # Subject search terms used to collect the sample of emails from INBOX.
 BROAD_TERMS = ["interview", "opportunity", "offer letter", "job offer", "application", "recruiting"]
 def _classify(subject: str, body: str, model_override: str, fallback_order: list) -> str:
    """Classify one email with the given model and return a table cell.

    The prompt holds the subject and the first 600 characters of the body.
    Any ``<think>…</think>`` section is removed from the reply before it is
    matched. Returns the first label of ``_CLASSIFY_LABELS`` contained in
    the reply, ``"? (<reply start>)"`` when none is, or ``"ERR: …"`` (message
    cut to 20 characters) when the call raises.
    """
    try:
        reply = _ROUTER.complete(
            f"Subject: {subject}\n\nEmail: {body[:600]}",
            system=_CLASSIFY_SYSTEM,
            model_override=model_override,
            fallback_order=fallback_order,
        )
        without_reasoning = re.sub(r"<think>.*?</think>", "", reply, flags=re.DOTALL)
        text = without_reasoning.lower().strip()
        matches = (
            label for label in _CLASSIFY_LABELS
            if text.startswith(label) or label in text
        )
        for label in matches:
            return label
        return f"? ({text[:30]})"
    except Exception as e:
        return f"ERR: {e!s:.20}"
 def _short(s: str, n: int = 55) -> str:
    return s if len(s) <= n else s[:n - 1] + "…"
 def _explain_block(subject: str, body: str) -> str:
    """Name the first rule that matches this email.

    Checked in order: spam subject prefixes, ATS confirmation subjects, then
    rejection and spam phrases in the subject plus the first 800 characters
    of the body (all lower-cased). Returns "unknown" when nothing matches.
    """
    lowered_subject = subject.lower().strip()

    prefix = next((p for p in _SPAM_SUBJECT_PREFIXES if lowered_subject.startswith(p)), None)
    if prefix is not None:
        return f"subject prefix: {prefix!r}"

    ats = next((p for p in _ATS_CONFIRM_SUBJECTS if p in lowered_subject), None)
    if ats is not None:
        return f"ATS subject: {ats!r}"

    haystack = lowered_subject + " " + body[:800].lower()
    phrase = next((p for p in _REJECTION_PHRASES + _SPAM_PHRASES if p in haystack), None)
    if phrase is not None:
        return f"phrase: {phrase!r}"
    return "unknown"
 def main():
    """Sample recent emails from IMAP and print a classifier comparison table.

    Emails whose subject matches any of ``BROAD_TERMS`` within ``--days`` are
    collected (unique UIDs, at most ``--limit``). For each one the table
    shows the recruitment-keyword check and the phrase filter; unless
    ``--dry-run`` is given, emails that pass both are also classified by
    every model in ``MODELS``.

    The IMAP connection is logged out in a ``finally`` block, so it is
    released even when fetching or classification raises part-way through.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=20, help="Max emails to test")
    parser.add_argument("--days", type=int, default=90)
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip LLM calls — show phrase filter only")
    parser.add_argument("--verbose", action="store_true",
                        help="Show which phrase triggered each BLOCK")
    args = parser.parse_args()
    cfg = load_config()
    since = (datetime.now() - timedelta(days=args.days)).strftime("%d-%b-%Y")
    print(f"Connecting to {cfg.get('host')} …")
    conn = connect(cfg)
    try:
        # Collect unique UIDs across broad terms (dict keeps first-seen order)
        all_uids: dict[bytes, None] = {}
        for term in BROAD_TERMS:
            for uid in _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since):
                all_uids[uid] = None
        sample = list(all_uids.keys())[: args.limit]
        print(f"Fetched {len(all_uids)} matching UIDs, testing {len(sample)}\n")
        # Header
        if args.dry_run:
            print(f"{'Subject':<56}  {'RK':3}  {'Phrase':7}")
            print("-" * 72)
        else:
            print(f"{'Subject':<56}  {'RK':3}  {'Phrase':7}  {'phi3':<20}  {'llama3':<20}  {'vllm':<20}")
            print("-" * 130)
        passed = skipped = 0
        for uid in sample:
            parsed = _parse_message(conn, uid)
            if not parsed:
                continue
            subj = parsed["subject"]
            body = parsed["body"]
            has_rk      = _has_recruitment_keyword(subj)
            phrase_block = _has_rejection_or_ats_signal(subj, body)
            if args.dry_run:
                rk_mark = "✓" if has_rk else "✗"
                pb_mark = "BLOCK" if phrase_block else "pass"
                line = f"{_short(subj):<56}  {rk_mark:3}  {pb_mark:7}"
                if phrase_block and args.verbose:
                    reason = _explain_block(subj, body)
                    line += f"  [{reason}]"
                print(line)
                continue
            if phrase_block or not has_rk:
                # Filtered before the LLM stage: print dashes for the model columns.
                skipped += 1
                rk_mark = "✓" if has_rk else "✗"
                pb_mark = "BLOCK" if phrase_block else "pass"
                print(f"{_short(subj):<56}  {rk_mark:3}  {pb_mark:7}  {'—':<20}  {'—':<20}  {'—':<20}")
                continue
            passed += 1
            results = {}
            for name, (model, fallback) in MODELS.items():
                results[name] = _classify(subj, body, model, fallback)
            pb_mark = "pass"
            print(f"{_short(subj):<56}  {'✓':3}  {pb_mark:7}  "
                  f"{results['phi3']:<20}  {results['llama3']:<20}  {results['vllm']:<20}")
        if not args.dry_run:
            print(f"\nPhrase-blocked or no-keyword: {skipped}  |  Reached LLMs: {passed}")
    finally:
        # Best-effort logout: a failure here must not mask the real error.
        try:
            conn.logout()
        except Exception:
            pass
 # Run the comparison only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/scripts/vision_service/environment.yml
+++ b/scripts/vision_service/environment.yml
@ -0,0 +1,17 @@
 name: job-seeker-vision
 channels:
  - conda-forge
  - defaults
 dependencies:
  - python=3.11
  - pip
  - pip:
    - torch>=2.0.0
    - torchvision>=0.15.0
    - transformers>=4.40.0
    - accelerate>=0.26.0
    - bitsandbytes>=0.43.0
    - einops>=0.7.0
    - Pillow>=10.0.0
    - fastapi>=0.110.0
    - "uvicorn[standard]>=0.27.0"
--- a/scripts/vision_service/main.py
+++ b/scripts/vision_service/main.py
@ -0,0 +1,98 @@
 """
 Vision service — moondream2 inference for survey screenshot analysis.
 Start: bash scripts/manage-vision.sh start
 Or directly: conda run -n job-seeker-vision uvicorn scripts.vision_service.main:app --port 8002
 First run downloads moondream2 from HuggingFace (~1.8GB).
 Model is loaded lazily on first /analyze request and stays resident.
 GPU is used if available (CUDA); falls back to CPU.
 4-bit quantization on GPU keeps VRAM footprint ~1.5GB.
 """
 import base64
 import io
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 # FastAPI application object; the route handlers below register on it.
 app = FastAPI(title="Job Seeker Vision Service")
 # Module-level model state — lazy loaded on first /analyze request
 _model = None  # assigned by _load_model(); None until a model has been loaded
 _tokenizer = None  # assigned by _load_model()
 _device = "cpu"  # _load_model() switches this to "cuda" when CUDA is available
 _loading = False  # True while _load_model() is running; surfaced by /health
 def _load_model() -> None:
    """Load moondream2 and its tokenizer into the module-level state.

    Returns immediately when a model is already resident. Otherwise loads a
    4-bit quantized model when CUDA is available and a plain model on CPU.

    The globals are published only after BOTH the model and the tokenizer
    have loaded, so a failed tokenizer load cannot leave a model behind with
    ``_tokenizer`` still None. ``_loading`` is reset even when loading raises,
    so /health does not keep reporting "loading" after a failed attempt.

    Raises:
        Any exception raised while importing torch/transformers or while
        loading the model/tokenizer; it propagates unchanged.
    """
    global _model, _tokenizer, _device, _loading
    if _model is not None:
        return
    _loading = True
    try:
        print("[vision] Loading moondream2…")
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        model_id = "vikhyatk/moondream2"
        revision = "2025-01-09"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cuda":
            from transformers import BitsAndBytesConfig
            bnb = BitsAndBytesConfig(load_in_4bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_id, revision=revision,
                quantization_config=bnb,
                trust_remote_code=True,
                device_map="auto",
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_id, revision=revision,
                trust_remote_code=True,
            )
            model.to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        # Publish last, and _model very last: "_model is not None" is the
        # signal that loading is complete.
        _device = device
        _tokenizer = tokenizer
        _model = model
    finally:
        _loading = False
    print(f"[vision] moondream2 ready on {_device}")
 # Request body for POST /analyze.
 class AnalyzeRequest(BaseModel):
    prompt: str  # question passed to the model together with the image
    image_base64: str  # base64 text; analyze() decodes it and opens it with PIL
 # Response body for POST /analyze.
 class AnalyzeResponse(BaseModel):
    text: str  # the answer returned by the model
@app.get("/health")
 def health():
    import torch
    return {
        "status": "loading" if _loading else "ok",
        "model": "moondream2",
        "gpu": torch.cuda.is_available(),
        "loaded": _model is not None,
    }
@app.post("/analyze", response_model=AnalyzeResponse)
 def analyze(req: AnalyzeRequest):
    from PIL import Image
    import torch
    _load_model()
    try:
        image_data = base64.b64decode(req.image_base64)
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image: {e}")
    with torch.no_grad():
        enc_image = _model.encode_image(image)
        answer = _model.answer_question(enc_image, req.prompt, _tokenizer)
    return AnalyzeResponse(text=answer)
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/test_company_research.py
+++ b/tests/test_company_research.py
@ -0,0 +1,84 @@
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.company_research import _score_experiences, _build_resume_context, _load_resume_and_keywords
 # Shared fixture: a three-entry resume. Only the UpGuard bullets contain the
 # ARR / QBR / enterprise / security terms that appear in KEYWORDS.
 RESUME = {
    "experience_details": [
        {
            "position": "Lead Technical Account Manager",
            "company": "UpGuard",
            "employment_period": "10/2022 - 05/2023",
            "key_responsibilities": [
                {"r1": "Managed enterprise security accounts worth $2M ARR"},
                {"r2": "Led QBR cadence with C-suite stakeholders"},
            ],
        },
        {
            "position": "Founder and Principal Consultant",
            "company": "M3 Consulting Services",
            "employment_period": "07/2023 - Present",
            "key_responsibilities": [
                {"r1": "Revenue operations consulting for SaaS clients"},
                {"r2": "Built customer success frameworks"},
            ],
        },
        {
            "position": "Customer Success Manager",
            "company": "Generic Co",
            "employment_period": "01/2020 - 09/2022",
            "key_responsibilities": [
                {"r1": "Managed SMB portfolio"},
            ],
        },
    ]
 }
 # Keyword list passed to the scoring helpers under test.
 KEYWORDS = ["ARR", "QBR", "enterprise", "security", "stakeholder"]
 # Sample job description text passed alongside KEYWORDS.
 JD = "Looking for a TAM with enterprise ARR experience and QBR facilitation skills."
 def test_score_experiences_returns_sorted():
    """The UpGuard entry is expected to come first in the scored result."""
    experiences = RESUME["experience_details"]
    ranked = _score_experiences(experiences, KEYWORDS, JD)
    best = ranked[0]
    assert best["company"] == "UpGuard"
 def test_score_experiences_adds_score_key():
    """Every scored entry carries an integer under the 'score' key."""
    ranked = _score_experiences(RESUME["experience_details"], KEYWORDS, JD)
    assert all(isinstance(entry["score"], int) for entry in ranked)
 def test_build_resume_context_top2_in_full():
    """The two best-scoring experiences are rendered with their bullet text."""
    context = _build_resume_context(RESUME, KEYWORDS, JD)
    expected_fragments = (
        "Lead Technical Account Manager",
        "Managed enterprise security accounts",
        "Founder and Principal Consultant",
    )
    for fragment in expected_fragments:
        assert fragment in context
 def test_build_resume_context_rest_condensed():
    """Lower-ranked experiences show up as one-liners without their bullets."""
    context = _build_resume_context(RESUME, KEYWORDS, JD)
    for fragment in ("Also in Alex", "Generic Co"):
        assert fragment in context
    # The Generic Co bullet text itself must be left out.
    assert "Managed SMB portfolio" not in context
 def test_upguard_nda_low_score():
    """With unrelated keywords the context uses 'enterprise security vendor'."""
    unrelated_keywords = ["python", "kubernetes"]
    unrelated_jd = "python kubernetes devops"
    context = _build_resume_context(RESUME, unrelated_keywords, unrelated_jd)
    assert "enterprise security vendor" in context
 def test_load_resume_and_keywords_returns_lists():
    """_load_resume_and_keywords yields a (dict, list of str) pair."""
    loaded = _load_resume_and_keywords()
    resume, keywords = loaded
    assert isinstance(resume, dict)
    assert isinstance(keywords, list)
    for keyword in keywords:
        assert isinstance(keyword, str)
--- a/tests/test_cover_letter.py
+++ b/tests/test_cover_letter.py
@ -0,0 +1,120 @@
 # tests/test_cover_letter.py
 import pytest
 from pathlib import Path
 from unittest.mock import patch, MagicMock
 # ── prepare_training_data tests ──────────────────────────────────────────────
 def test_extract_role_from_text():
    """The role title is extracted from the letter's opening sentence."""
    from scripts.prepare_training_data import extract_role_from_text
    letter = "Dear Tailscale Hiring Team,\n\nI'm delighted to apply for the Customer Support Manager position at Tailscale."
    role = extract_role_from_text(letter)
    assert role == "Customer Support Manager"
 def test_extract_role_handles_missing():
    """A letter that names no role yields the empty string."""
    from scripts.prepare_training_data import extract_role_from_text
    role = extract_role_from_text("Dear Team,\n\nHello there.")
    assert role == ""
 def test_extract_company_from_filename():
    """The 'Cover Letter' suffix (with or without .md) is stripped."""
    from scripts.prepare_training_data import extract_company_from_filename
    cases = {
        "Tailscale Cover Letter": "Tailscale",
        "Dagster Labs Cover Letter.md": "Dagster Labs",
    }
    for filename, company in cases.items():
        assert extract_company_from_filename(filename) == company
 def test_strip_greeting():
    """The 'Dear X,' line is dropped and the body is returned."""
    from scripts.prepare_training_data import strip_greeting
    letter = "Dear Hiring Team,\n\nI'm delighted to apply for the CSM role.\n\nBest regards,\nAlex"
    body = strip_greeting(letter)
    assert body.startswith("I'm delighted")
    assert "Dear" not in body
 def test_build_records_from_tmp_corpus(tmp_path):
    """A one-letter corpus directory produces exactly one training record."""
    from scripts.prepare_training_data import build_records
    letter_path = tmp_path / "Acme Corp Cover Letter.md"
    letter_path.write_text(
        "Dear Acme Hiring Team,\n\n"
        "I'm delighted to apply for the Director of Customer Success position at Acme Corp. "
        "With six years of experience, I bring strong skills.\n\n"
        "Best regards,\nAlex Rivera"
    )
    records = build_records(tmp_path)
    assert len(records) == 1
    record = records[0]
    instruction = record["instruction"]
    assert "Acme Corp" in instruction
    assert "Director of Customer Success" in instruction
    assert record["output"].startswith("I'm delighted")
 def test_build_records_skips_empty_files(tmp_path):
    """Empty and near-empty letter files yield no records."""
    from scripts.prepare_training_data import build_records
    stub_files = (
        ("Empty Cover Letter.md", ""),
        ("Tiny Cover Letter.md", "Hi"),
    )
    for filename, content in stub_files:
        (tmp_path / filename).write_text(content)
    assert len(build_records(tmp_path)) == 0
 # ── generate_cover_letter tests ───────────────────────────────────────────────
 def test_find_similar_letters_returns_top_k():
    """With top_k=2 and four letters, two come back and 'Beta' is not one of them."""
    from scripts.generate_cover_letter import find_similar_letters
    corpus = [
        {"company": "Acme", "text": "customer success technical account management SaaS"},
        {"company": "Beta", "text": "software engineering backend python"},
        {"company": "Gamma", "text": "customer onboarding enterprise NPS"},
        {"company": "Delta", "text": "customer success manager renewal QBR"},
    ]
    query = "customer success manager enterprise SaaS"
    matches = find_similar_letters(query, corpus, top_k=2)
    assert len(matches) == 2
    # The software-engineering letter should lose to the customer-success ones.
    matched_companies = {match["company"] for match in matches}
    assert "Beta" not in matched_companies
 def test_load_corpus_returns_list():
    """load_corpus returns a list of letter dicts; skipped when LETTERS_DIR is absent."""
    from scripts.generate_cover_letter import load_corpus, LETTERS_DIR
    if not LETTERS_DIR.exists():
        pytest.skip("LETTERS_DIR not present in this environment")
    corpus = load_corpus()
    assert isinstance(corpus, list)
    if corpus:
        first = corpus[0]
        assert "company" in first
        assert "text" in first
 def test_generate_calls_llm_router():
    """generate() calls the injected router's complete() once and returns its text."""
    from scripts.generate_cover_letter import generate
    router = MagicMock()
    router.complete.return_value = "Dear Hiring Team,\n\nI'm delighted to apply.\n\nWarm regards,\nAlex Rivera"
    stub_corpus = [
        {"company": "Acme", "text": "I'm delighted to apply for the CSM role at Acme."},
    ]
    corpus_patch = patch("scripts.generate_cover_letter.load_corpus", return_value=stub_corpus)
    with corpus_patch:
        letter = generate(
            "Customer Success Manager", "TestCo", "Looking for a CSM",
            _router=router,
        )
    router.complete.assert_called_once()
    assert "Alex Rivera" in letter
--- a/tests/test_craigslist.py
+++ b/tests/test_craigslist.py
@ -0,0 +1,211 @@
 """Tests for Craigslist RSS scraper."""
 from datetime import datetime, timezone, timedelta
 from email.utils import format_datetime
 from unittest.mock import patch, MagicMock
 import xml.etree.ElementTree as ET
 import pytest
 import requests
 # ── RSS fixture helpers ────────────────────────────────────────────────────────
 def _make_rss(items: list[dict]) -> bytes:
    """Serialize item dicts into an <rss><channel><item>… UTF-8 XML document.

    Each dict becomes one <item>; each key/value pair becomes a child element
    named after the key, with the value as its text.
    """
    root = ET.Element("rss")
    channel = ET.SubElement(root, "channel")
    for fields in items:
        item_el = ET.SubElement(channel, "item")
        for tag, text in fields.items():
            ET.SubElement(item_el, tag).text = text
    return ET.tostring(root, encoding="utf-8", xml_declaration=True)
 def _pubdate(hours_ago: float = 1.0) -> str:
    """Format the UTC instant ``hours_ago`` hours before now as an RFC 2822 date."""
    age = timedelta(hours=hours_ago)
    posted_at = datetime.now(tz=timezone.utc) - age
    return format_datetime(posted_at)
 def _mock_resp(content: bytes, status_code: int = 200) -> MagicMock:
    """Build a stand-in HTTP response with the given body and status code.

    For status codes of 400 and above, ``raise_for_status()`` raises
    ``requests.HTTPError``; otherwise it is a no-op mock.
    """
    response = MagicMock()
    response.status_code = status_code
    response.content = content
    if status_code >= 400:
        response.raise_for_status = MagicMock(
            side_effect=requests.HTTPError(f"HTTP {status_code}")
        )
    else:
        response.raise_for_status = MagicMock()
    return response
 # ── Fixtures ──────────────────────────────────────────────────────────────────
 # Feed with a single posting dated one hour before import time.
 _SAMPLE_RSS = _make_rss([{
    "title": "Customer Success Manager",
    "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1234567890.html",
    "description": "Great CSM role at Acme Corp. Salary $120k.",
    "pubDate": _pubdate(1),
 }])
 # Feed with two distinct postings (different links), one and two hours old.
 _TWO_ITEM_RSS = _make_rss([
    {
        "title": "Customer Success Manager",
        "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1111111111.html",
        "description": "CSM role 1.",
        "pubDate": _pubdate(1),
    },
    {
        "title": "Account Manager",
        "link": "https://sfbay.craigslist.org/jjj/d/am-role/2222222222.html",
        "description": "AM role.",
        "pubDate": _pubdate(2),
    },
 ])
 # Feed whose only posting is 500 hours old — older than every hours_old
 # value used by the tests in this file (240 and 48).
 _OLD_ITEM_RSS = _make_rss([{
    "title": "Old Job",
    "link": "https://sfbay.craigslist.org/jjj/d/old-job/9999999999.html",
    "description": "Very old posting.",
    "pubDate": _pubdate(hours_ago=500),
 }])
 # Config with two metros and an explicit category.
 _TWO_METRO_CONFIG = {
    "metros": ["sfbay", "newyork"],
    "location_map": {
        "San Francisco Bay Area, CA": "sfbay",
        "New York, NY": "newyork",
    },
    "category": "jjj",
 }
 # Config with one metro; note that it carries no "category" key.
 _SINGLE_METRO_CONFIG = {
    "metros": ["sfbay"],
    "location_map": {"San Francisco Bay Area, CA": "sfbay"},
 }
 # Default search profile passed to scrape() by most tests.
 _PROFILE = {"titles": ["Customer Success Manager"], "hours_old": 240}
 # ── Tests ─────────────────────────────────────────────────────────────────────
 def test_scrape_returns_empty_on_missing_config():
    """When _load_config raises FileNotFoundError, scrape() returns []."""
    from scripts.custom_boards import craigslist
    missing_config = patch(
        "scripts.custom_boards.craigslist._load_config",
        side_effect=FileNotFoundError("config not found"),
    )
    with missing_config:
        jobs = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
    assert jobs == []
 def test_scrape_remote_hits_all_metros():
    """A 'Remote' search fetches once per configured metro."""
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_TWO_METRO_CONFIG,
    )
    get_patch = patch(
        "scripts.custom_boards.craigslist.requests.get",
        return_value=_mock_resp(_SAMPLE_RSS),
    )
    with config_patch, get_patch as mock_get:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "Remote")
    assert mock_get.call_count == 2
    fetched_urls = [call.args[0] for call in mock_get.call_args_list]
    for metro in ("sfbay", "newyork"):
        assert any(metro in url for url in fetched_urls)
    assert all(job["is_remote"] for job in jobs)
 def test_scrape_location_map_resolves():
    """A location present in location_map leads to a single sfbay fetch."""
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_TWO_METRO_CONFIG,
    )
    get_patch = patch(
        "scripts.custom_boards.craigslist.requests.get",
        return_value=_mock_resp(_SAMPLE_RSS),
    )
    with config_patch, get_patch as mock_get:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
    assert mock_get.call_count == 1
    fetched_url = mock_get.call_args.args[0]
    assert "sfbay" in fetched_url
    assert len(jobs) == 1
    only_job = jobs[0]
    assert only_job["is_remote"] is False
 def test_scrape_location_not_in_map_returns_empty():
    """An unmapped location returns [] and performs no HTTP request."""
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_SINGLE_METRO_CONFIG,
    )
    get_patch = patch("scripts.custom_boards.craigslist.requests.get")
    with config_patch, get_patch as mock_get:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "Portland, OR")
    assert jobs == []
    mock_get.assert_not_called()
 def test_hours_old_filter():
    """A 500-hour-old posting is dropped when the profile allows 48 hours."""
    short_window_profile = {"titles": ["Customer Success Manager"], "hours_old": 48}
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_SINGLE_METRO_CONFIG,
    )
    get_patch = patch(
        "scripts.custom_boards.craigslist.requests.get",
        return_value=_mock_resp(_OLD_ITEM_RSS),
    )
    with config_patch, get_patch:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(short_window_profile, "San Francisco Bay Area, CA")
    assert jobs == []
 def test_dedup_within_run():
    """No URL appears twice when both metros serve the same posting."""
    shared_posting = {
        "title": "CSM Role",
        "link": "https://sfbay.craigslist.org/jjj/d/csm/1234.html",
        "description": "Same job.",
        "pubDate": _pubdate(1),
    }
    duplicate_feed = _make_rss([shared_posting])
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_TWO_METRO_CONFIG,
    )
    get_patch = patch(
        "scripts.custom_boards.craigslist.requests.get",
        return_value=_mock_resp(duplicate_feed),
    )
    with config_patch, get_patch:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "Remote")
    seen_urls = [job["url"] for job in jobs]
    assert len(seen_urls) == len(set(seen_urls))
 def test_http_error_graceful():
    """A RequestException from requests.get results in [] rather than a raise."""
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_SINGLE_METRO_CONFIG,
    )
    failing_get = patch(
        "scripts.custom_boards.craigslist.requests.get",
        side_effect=requests.RequestException("timeout"),
    )
    with config_patch, failing_get:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
    assert jobs == []
 def test_malformed_xml_graceful():
    """A response body that is not XML results in [] rather than a raise."""
    garbage_response = MagicMock()
    garbage_response.content = b"this is not xml <<<<"
    garbage_response.raise_for_status = MagicMock()
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_SINGLE_METRO_CONFIG,
    )
    get_patch = patch(
        "scripts.custom_boards.craigslist.requests.get",
        return_value=garbage_response,
    )
    with config_patch, get_patch:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
    assert jobs == []
 def test_results_wanted_cap():
    """scrape() returns at most results_wanted postings."""
    config_patch = patch(
        "scripts.custom_boards.craigslist._load_config",
        return_value=_TWO_METRO_CONFIG,
    )
    get_patch = patch(
        "scripts.custom_boards.craigslist.requests.get",
        return_value=_mock_resp(_TWO_ITEM_RSS),
    )
    with config_patch, get_patch:
        from scripts.custom_boards import craigslist
        jobs = craigslist.scrape(_PROFILE, "Remote", results_wanted=1)
    assert len(jobs) <= 1
--- a/tests/test_db.py
+++ b/tests/test_db.py
@ -0,0 +1,560 @@
 import pytest
 import sqlite3
 from pathlib import Path
 from unittest.mock import patch
 def test_init_db_creates_jobs_table(tmp_path):
    """init_db creates a 'jobs' table in a fresh database file.

    Only the table's existence is checked here, not its column layout.
    """
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'"
        ).fetchone()
    finally:
        # Close before asserting (and even if the query raises) so a failing
        # test never leaves an open handle on the temporary database.
        conn.close()
    assert row is not None
 def test_insert_job_returns_id(tmp_path):
    """Inserting a new job yields a positive integer row id."""
    from scripts.db import init_db, insert_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    new_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://example.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "$100k", "description": "Great role", "date_found": "2026-02-20",
    })
    assert isinstance(new_id, int) and new_id > 0
 def test_insert_job_skips_duplicate_url(tmp_path):
    """A second insert with the same URL returns None."""
    from scripts.db import init_db, insert_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://example.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    insert_job(db_path, posting)
    assert insert_job(db_path, posting) is None
 def test_get_jobs_by_status(tmp_path):
    """After approving the only job, 'approved' holds one row and 'pending' none."""
    from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://example.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    })
    update_job_status(db_path, [job_id], "approved")
    counts = {
        status: len(get_jobs_by_status(db_path, status))
        for status in ("approved", "pending")
    }
    assert counts == {"approved": 1, "pending": 0}
 def test_update_job_status_batch(tmp_path):
    """One update_job_status call moves all three inserted jobs to 'rejected'."""
    from scripts.db import init_db, insert_job, update_job_status, get_jobs_by_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_ids = [
        insert_job(db_path, {
            "title": f"Job {n}", "company": "Co", "url": f"https://example.com/{n}",
            "source": "indeed", "location": "Remote", "is_remote": True,
            "salary": "", "description": "", "date_found": "2026-02-20",
        })
        for n in range(3)
    ]
    update_job_status(db_path, job_ids, "rejected")
    rejected = get_jobs_by_status(db_path, "rejected")
    assert len(rejected) == 3
 def test_migrate_db_adds_columns_to_existing_db(tmp_path):
    """_migrate_db adds cover_letter and applied_at to a jobs table lacking them."""
    from scripts.db import _migrate_db
    legacy_db = tmp_path / "legacy.db"
    # Build a jobs table that predates the two columns under test.
    setup = sqlite3.connect(legacy_db)
    setup.execute("""CREATE TABLE jobs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT, company TEXT, url TEXT UNIQUE, status TEXT DEFAULT 'pending'
    )""")
    setup.commit()
    setup.close()
    _migrate_db(legacy_db)
    check = sqlite3.connect(legacy_db)
    table_info = check.execute("PRAGMA table_info(jobs)").fetchall()
    check.close()
    # Index 1 of each PRAGMA table_info row is the column name.
    column_names = {column[1] for column in table_info}
    for expected in ("cover_letter", "applied_at"):
        assert expected in column_names
 def test_update_cover_letter(tmp_path):
    """Text stored via update_cover_letter is read back on the job row."""
    from scripts.db import init_db, insert_job, update_cover_letter, get_jobs_by_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    letter = "Dear Hiring Manager,\nGreat role!"
    update_cover_letter(db_path, job_id, letter)
    pending = get_jobs_by_status(db_path, "pending")
    assert pending[0]["cover_letter"] == letter
 def test_mark_applied_sets_status_and_date(tmp_path):
    """mark_applied moves the job to 'applied' and fills applied_at."""
    from scripts.db import init_db, insert_job, mark_applied, get_jobs_by_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    mark_applied(db_path, [job_id])
    applied_rows = get_jobs_by_status(db_path, "applied")
    assert len(applied_rows) == 1
    row = applied_rows[0]
    assert row["status"] == "applied"
    assert row["applied_at"] is not None
 # ── background_tasks tests ────────────────────────────────────────────────────
 def test_init_db_creates_background_tasks_table(tmp_path):
    """init_db creates a 'background_tasks' table in a fresh database file."""
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    # sqlite3 is already imported at module level; no local import is needed.
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='background_tasks'"
        ).fetchone()
    finally:
        # Close before asserting (and even if the query raises) so a failing
        # test never leaves an open handle on the temporary database.
        conn.close()
    assert row is not None
 def test_insert_task_returns_id_and_true(tmp_path):
    """A first insert_task call yields a positive id and the 'new' flag."""
    from scripts.db import init_db, insert_job, insert_task
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    task_id, created = insert_task(db_path, "cover_letter", job_id)
    assert isinstance(task_id, int) and task_id > 0
    assert created is True
 def test_insert_task_deduplicates_active_task(tmp_path):
    """Re-submitting the same task type for a job returns the existing id, not new."""
    from scripts.db import init_db, insert_job, insert_task
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    original_id, _ = insert_task(db_path, "cover_letter", job_id)
    repeat_id, created = insert_task(db_path, "cover_letter", job_id)
    assert repeat_id == original_id
    assert created is False
 def test_insert_task_allows_different_types_same_job(tmp_path):
    """cover_letter and company_research tasks for one job are both created as new."""
    from scripts.db import init_db, insert_job, insert_task
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    created_flags = [
        insert_task(db_path, task_type, job_id)[1]
        for task_type in ("cover_letter", "company_research")
    ]
    for created in created_flags:
        assert created is True
 def test_update_task_status_running(tmp_path):
    """Moving a task to 'running' records the status and fills started_at."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    task_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, task_id, "running")
    conn = sqlite3.connect(db_path)
    cursor = conn.execute("SELECT status, started_at FROM background_tasks WHERE id=?", (task_id,))
    status, started_at = cursor.fetchone()
    conn.close()
    assert status == "running"
    assert started_at is not None
 def test_update_task_status_completed(tmp_path):
    """Moving a task to 'completed' records the status and fills finished_at."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    task_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, task_id, "completed")
    conn = sqlite3.connect(db_path)
    cursor = conn.execute("SELECT status, finished_at FROM background_tasks WHERE id=?", (task_id,))
    status, finished_at = cursor.fetchone()
    conn.close()
    assert status == "completed"
    assert finished_at is not None
 def test_update_task_status_failed_stores_error(tmp_path):
    """A 'failed' update keeps the error text and fills finished_at."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    task_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, task_id, "failed", error="LLM timeout")
    conn = sqlite3.connect(db_path)
    cursor = conn.execute("SELECT status, error, finished_at FROM background_tasks WHERE id=?", (task_id,))
    status, error, finished_at = cursor.fetchone()
    conn.close()
    assert status == "failed"
    assert error == "LLM timeout"
    assert finished_at is not None
 def test_get_active_tasks_returns_only_active(tmp_path):
    """Completed tasks are excluded; the remaining row carries job company and title."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status, get_active_tasks
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    pending_task_id, _ = insert_task(db_path, "cover_letter", job_id)
    finished_task_id, _ = insert_task(db_path, "company_research", job_id)
    update_task_status(db_path, finished_task_id, "completed")
    active = get_active_tasks(db_path)
    assert len(active) == 1
    only_task = active[0]
    assert only_task["id"] == pending_task_id
    assert only_task["company"] == "Acme"
    assert only_task["title"] == "CSM"
 def test_get_task_for_job_returns_latest(tmp_path):
    """With two cover_letter tasks for a job, the later one is returned."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status, get_task_for_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    older_id, _ = insert_task(db_path, "cover_letter", job_id)
    # Completing the first task lets a second one of the same type be queued.
    update_task_status(db_path, older_id, "completed")
    newer_id, _ = insert_task(db_path, "cover_letter", job_id)
    latest = get_task_for_job(db_path, "cover_letter", job_id)
    assert latest is not None
    assert latest["id"] == newer_id
 def test_get_task_for_job_returns_none_when_absent(tmp_path):
    """A job with no tasks yields None from get_task_for_job."""
    from scripts.db import init_db, insert_job, get_task_for_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    posting = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db_path, posting)
    task = get_task_for_job(db_path, "cover_letter", job_id)
    assert task is None
 # ── company_research new-column tests ─────────────────────────────────────────
 def test_company_research_has_new_columns(tmp_path):
    """After init_db, company_research carries all four extended brief columns."""
    from scripts.db import init_db
    db_file = tmp_path / "test.db"
    init_db(db_file)
    conn = sqlite3.connect(db_file)
    table_info = conn.execute("PRAGMA table_info(company_research)").fetchall()
    conn.close()
    # Index 1 of every PRAGMA table_info row is the column name.
    column_names = [info[1] for info in table_info]
    for expected in ("tech_brief", "funding_brief", "competitors_brief", "red_flags"):
        assert expected in column_names
 def test_save_and_get_research_new_fields(tmp_path):
    """The four new brief fields written by save_research come back from get_research."""
    from scripts.db import get_research, init_db, insert_job, save_research
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "TAM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-21",
    }
    job_id = insert_job(db_file, posting)
    research_fields = {
        "company_brief": "overview",
        "ceo_brief": "ceo",
        "talking_points": "points",
        "raw_output": "raw",
        "tech_brief": "tech stack",
        "funding_brief": "series B",
        "competitors_brief": "vs competitors",
        "red_flags": "none",
    }
    save_research(db_file, job_id=job_id, **research_fields)
    stored = get_research(db_file, job_id=job_id)
    # Only the four extended columns are under test here.
    for column in ("tech_brief", "funding_brief", "competitors_brief", "red_flags"):
        assert stored[column] == research_fields[column]
 # ── stage_signal / suggestion_dismissed tests ─────────────────────────────────
 def test_stage_signal_columns_exist(tmp_path):
    """job_contacts gains stage_signal and suggestion_dismissed once init_db has run."""
    from scripts.db import init_db
    db_file = tmp_path / "test.db"
    init_db(db_file)
    conn = sqlite3.connect(db_file)
    table_info = conn.execute("PRAGMA table_info(job_contacts)").fetchall()
    conn.close()
    # Index 1 of every PRAGMA table_info row is the column name.
    column_names = {info[1] for info in table_info}
    for expected in ("stage_signal", "suggestion_dismissed"):
        assert expected in column_names
 def test_add_contact_with_stage_signal(tmp_path):
    """A stage_signal passed to add_contact is readable back through get_contacts."""
    from scripts.db import add_contact, get_contacts, init_db, insert_job
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-21",
    }
    job_id = insert_job(db_file, posting)
    add_contact(
        db_file,
        job_id=job_id,
        direction="inbound",
        subject="Interview invite",
        stage_signal="interview_scheduled",
    )
    stored = get_contacts(db_file, job_id=job_id)
    first_contact = stored[0]
    assert first_contact["stage_signal"] == "interview_scheduled"
 def test_get_unread_stage_signals(tmp_path):
    """Neutral and dismissed contacts are left out of get_unread_stage_signals."""
    from scripts.db import (
        add_contact,
        dismiss_stage_signal,
        get_unread_stage_signals,
        init_db,
        insert_job,
    )
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-21",
    }
    job_id = insert_job(db_file, posting)
    # One actionable signal and one neutral one; only the former should surface.
    interview_contact = add_contact(
        db_file,
        job_id=job_id,
        direction="inbound",
        subject="Interview invite",
        stage_signal="interview_scheduled",
    )
    add_contact(
        db_file,
        job_id=job_id,
        direction="inbound",
        subject="Auto-confirm",
        stage_signal="neutral",
    )
    unread = get_unread_stage_signals(db_file, job_id)
    assert len(unread) == 1
    assert unread[0]["stage_signal"] == "interview_scheduled"
    # Dismissing the remaining signal empties the result.
    dismiss_stage_signal(db_file, interview_contact)
    assert get_unread_stage_signals(db_file, job_id) == []
 def test_get_email_leads(tmp_path):
    """Of a linkedin job and an email job, get_email_leads returns just the email one."""
    from scripts.db import get_email_leads, init_db, insert_job
    db_file = tmp_path / "test.db"
    init_db(db_file)
    board_posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-21",
    }
    email_posting = {
        "title": "TAM",
        "company": "Wiz",
        "url": "email://wiz.com/abc123",
        "source": "email",
        "location": "",
        "is_remote": 0,
        "salary": "",
        "description": "Hi Alex…",
        "date_found": "2026-02-21",
    }
    insert_job(db_file, board_posting)
    insert_job(db_file, email_posting)
    leads = get_email_leads(db_file)
    assert len(leads) == 1
    only_lead = leads[0]
    assert only_lead["company"] == "Wiz"
    assert only_lead["source"] == "email"
 def test_get_all_message_ids(tmp_path):
    """Every message_id stored via add_contact shows up in get_all_message_ids."""
    from scripts.db import add_contact, get_all_message_ids, init_db, insert_job
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-21",
    }
    job_id = insert_job(db_file, posting)
    expected_ids = ("<msg-001@acme.com>", "<msg-002@acme.com>")
    for message_id in expected_ids:
        add_contact(db_file, job_id=job_id, message_id=message_id)
    stored_ids = get_all_message_ids(db_file)
    for message_id in expected_ids:
        assert message_id in stored_ids
 # ── survey_responses tests ────────────────────────────────────────────────────
 def test_survey_responses_table_created(tmp_path):
    """init_db creates the survey_responses table.

    The sqlite_master lookup runs inside try/finally so the connection is
    closed even if the query raises, and the assertion happens only after the
    connection has been released. The module-level sqlite3 import is used,
    as in the other schema tests in this file.
    """
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    try:
        table_row = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='survey_responses'"
        ).fetchone()
    finally:
        # Close before asserting so a failing test does not leak the handle.
        conn.close()
    assert table_row is not None
 def test_survey_at_column_exists(tmp_path):
    """jobs table has a survey_at column after init_db.

    Column names are collected inside try/finally so the connection is always
    closed before the assertion runs; the module-level sqlite3 import is used,
    as in the other schema tests in this file.
    """
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    try:
        # Index 1 of each PRAGMA table_info row is the column name.
        cols = [row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()]
    finally:
        # Close before asserting so a failing test does not leak the handle.
        conn.close()
    assert "survey_at" in cols
 def test_insert_and_get_survey_response(tmp_path):
    """A row written by insert_survey_response is returned by get_survey_responses."""
    from scripts.db import (
        get_survey_responses,
        init_db,
        insert_job,
        insert_survey_response,
    )
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-23",
    }
    job_id = insert_job(db_file, posting)
    survey_fields = {
        "survey_name": "Culture Fit",
        "source": "text_paste",
        "raw_input": "Q1: A B C",
        "mode": "quick",
        "llm_output": "1. B — collaborative",
        "reported_score": "82%",
    }
    new_row_id = insert_survey_response(db_file, job_id=job_id, **survey_fields)
    assert isinstance(new_row_id, int)
    stored = get_survey_responses(db_file, job_id=job_id)
    assert len(stored) == 1
    only_response = stored[0]
    assert only_response["survey_name"] == "Culture Fit"
    assert only_response["reported_score"] == "82%"
 def test_get_interview_jobs_includes_survey(tmp_path):
    """A job moved to status 'survey' appears under the 'survey' key of get_interview_jobs."""
    from scripts.db import get_interview_jobs, init_db, insert_job, update_job_status
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/2",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-23",
    }
    job_id = insert_job(db_file, posting)
    update_job_status(db_file, [job_id], "survey")
    by_stage = get_interview_jobs(db_file)
    survey_ids = [job["id"] for job in by_stage.get("survey", [])]
    assert job_id in survey_ids
 def test_advance_to_survey_sets_survey_at(tmp_path):
    """Advancing an applied job to 'survey' updates its status and fills survey_at."""
    from scripts.db import (
        advance_to_stage,
        get_job_by_id,
        init_db,
        insert_job,
        update_job_status,
    )
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/3",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-23",
    }
    job_id = insert_job(db_file, posting)
    # Start from 'applied', then advance one stage.
    update_job_status(db_file, [job_id], "applied")
    advance_to_stage(db_file, job_id=job_id, stage="survey")
    advanced = get_job_by_id(db_file, job_id=job_id)
    assert advanced["status"] == "survey"
    assert advanced["survey_at"] is not None
 def test_update_job_fields(tmp_path):
    """update_job_fields overwrites the placeholder values of a manually added job."""
    import sqlite3
    from scripts.db import init_db, insert_job, update_job_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    placeholder = {
        "title": "Importing…",
        "company": "",
        "url": "https://example.com/job/1",
        "source": "manual",
        "location": "",
        "description": "",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, placeholder)
    scraped_fields = {
        "title": "Customer Success Manager",
        "company": "Acme Corp",
        "location": "San Francisco, CA",
        "description": "Great role.",
        "salary": "$120k",
        "is_remote": 1,
    }
    update_job_fields(db_file, job_id, scraped_fields)
    # Read the row back directly rather than through a scripts.db helper.
    conn = sqlite3.connect(db_file)
    conn.row_factory = sqlite3.Row
    fetched = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    stored = dict(fetched)
    conn.close()
    assert stored["title"] == "Customer Success Manager"
    assert stored["company"] == "Acme Corp"
    assert stored["description"] == "Great role."
    assert stored["is_remote"] == 1
 def test_update_job_fields_ignores_unknown_columns(tmp_path):
    """An unrecognised key does not raise, and the known key is still written."""
    import sqlite3
    from scripts.db import init_db, insert_job, update_job_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    placeholder = {
        "title": "Importing…",
        "company": "",
        "url": "https://example.com/job/2",
        "source": "manual",
        "location": "",
        "description": "",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, placeholder)
    # The call itself must succeed despite the bogus column name.
    mixed_fields = {"title": "Real Title", "nonexistent_col": "ignored"}
    update_job_fields(db_file, job_id, mixed_fields)
    conn = sqlite3.connect(db_file)
    conn.row_factory = sqlite3.Row
    fetched = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    stored = dict(fetched)
    conn.close()
    assert stored["title"] == "Real Title"
--- a/tests/test_discover.py
+++ b/tests/test_discover.py
@ -0,0 +1,185 @@
 # tests/test_discover.py
 import pytest
 from unittest.mock import patch, MagicMock
 import pandas as pd
 from pathlib import Path
 # One job posting used as the single row of the DataFrame built by
 # make_jobs_df(), which the tests return from the patched scrape_jobs().
 # It is also passed directly to push_to_notion().
 # NOTE(review): key names (job_url, site, min_amount, ...) presumably mirror
 # the scraper's DataFrame columns — confirm against scripts/discover.py.
 SAMPLE_JOB = {
    "title": "Customer Success Manager",
    "company": "Acme Corp",
    "location": "Remote",
    "is_remote": True,
    "job_url": "https://linkedin.com/jobs/view/123456",
    "site": "linkedin",
    "min_amount": 90000,
    "max_amount": 120000,
    "salary_source": "$90,000 - $120,000",
    "description": "Great CS role",
 }
 # Field map handed to push_to_notion() and embedded in SAMPLE_NOTION_CFG.
 # test_push_to_notion_sets_status_new relies on the "status" and
 # "status_new" entries.
 SAMPLE_FM = {
    "title_field": "Salary", "job_title": "Job Title", "company": "Company Name",
    "url": "Role Link", "source": "Job Source", "status": "Status of Application",
    "status_new": "Application Submitted", "date_found": "Date Found",
    "remote": "Remote", "match_score": "Match Score",
    "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description",
 }
 # Second element of the tuple returned by the patched load_config().
 SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM}
 # First element of the tuple returned by the patched load_config():
 # a single search profile that lists one standard board.
 SAMPLE_PROFILES_CFG = {
    "profiles": [{"name": "cs", "titles": ["Customer Success Manager"],
                  "locations": ["Remote"], "boards": ["linkedin"],
                  "results_per_board": 5, "hours_old": 72}]
 }
 def make_jobs_df(jobs=None):
    """Build the DataFrame that the patched scrape_jobs() returns.

    Args:
        jobs: Optional list of job dicts, one per row. When omitted (None),
            a single-row frame containing SAMPLE_JOB is produced.

    Returns:
        A pandas DataFrame with one row per job dict. An explicitly passed
        empty list yields an empty DataFrame instead of silently falling
        back to the sample row.
    """
    # Compare against None rather than truthiness so that [] is honoured.
    if jobs is None:
        jobs = [SAMPLE_JOB]
    return pd.DataFrame(jobs)
 def test_discover_writes_to_sqlite(tmp_path):
    """A scraped posting ends up as one pending row in the SQLite staging db."""
    from scripts.discover import run_discovery
    from scripts.db import get_jobs_by_status
    staging_db = tmp_path / "test.db"
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=make_jobs_df())
    client_patch = patch("scripts.discover.Client")
    with config_patch, scrape_patch, client_patch:
        run_discovery(db_path=staging_db)
    pending = get_jobs_by_status(staging_db, "pending")
    assert len(pending) == 1
    assert pending[0]["title"] == "Customer Success Manager"
 def test_discover_skips_duplicate_urls(tmp_path):
    """A scraped job whose URL is already stored is not inserted a second time."""
    from scripts.discover import run_discovery
    from scripts.db import init_db, insert_job, get_jobs_by_status
    staging_db = tmp_path / "test.db"
    init_db(staging_db)
    # Same URL as SAMPLE_JOB["job_url"], stored ahead of the discovery run.
    already_stored = {
        "title": "Old",
        "company": "X",
        "url": "https://linkedin.com/jobs/view/123456",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-01-01",
    }
    insert_job(staging_db, already_stored)
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=make_jobs_df())
    client_patch = patch("scripts.discover.Client")
    with config_patch, scrape_patch, client_patch:
        run_discovery(db_path=staging_db)
    pending = get_jobs_by_status(staging_db, "pending")
    # Still just the pre-existing row.
    assert len(pending) == 1
 def test_discover_pushes_new_jobs(tmp_path):
    """Legacy path: with notion_push=True, push_to_notion is called once for the one new job."""
    from scripts.discover import run_discovery
    staging_db = tmp_path / "test.db"
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=make_jobs_df())
    push_patch = patch("scripts.discover.push_to_notion")
    urls_patch = patch("scripts.discover.get_existing_urls", return_value=set())
    client_patch = patch("scripts.discover.Client")
    with config_patch, scrape_patch, push_patch as mock_push, urls_patch, client_patch:
        run_discovery(db_path=staging_db, notion_push=True)
    assert mock_push.call_count == 1
 def test_push_to_notion_sets_status_new():
    """The page created by push_to_notion carries the field map's status_new value as its status."""
    from scripts.discover import push_to_notion
    notion = MagicMock()
    push_to_notion(notion, "fake-db-id", SAMPLE_JOB, SAMPLE_FM)
    # Inspect the keyword arguments of the pages.create call.
    properties = notion.pages.create.call_args.kwargs["properties"]
    status_name = properties["Status of Application"]["select"]["name"]
    assert status_name == "Application Submitted"
 # ── Custom boards integration ─────────────────────────────────────────────────
 # Profile config with no standard boards and a single custom board
 # ("adzuna"); used as the first element of the patched load_config() result.
 _PROFILE_WITH_CUSTOM = {
    "profiles": [{
        "name": "cs", "titles": ["Customer Success Manager"],
        "locations": ["Remote"], "boards": [],
        "custom_boards": ["adzuna"],
        "results_per_board": 5, "hours_old": 72,
    }]
 }
 # The job dict returned by the fake "adzuna" scraper that the tests register
 # in the patched CUSTOM_SCRAPERS mapping.
 _ADZUNA_JOB = {
    "title": "Customer Success Manager",
    "company": "TestCo",
    "url": "https://www.adzuna.com/jobs/details/999",
    "source": "adzuna",
    "location": "Remote",
    "is_remote": True,
    "salary": "$90,000 – $120,000",
    "description": "Great remote CSM role",
 }
 def test_discover_custom_board_inserts_jobs(tmp_path):
    """Jobs returned by a registered custom_boards scraper are inserted and counted."""
    from scripts.discover import run_discovery
    from scripts.db import get_jobs_by_status
    staging_db = tmp_path / "test.db"
    # Fake scraper registry: "adzuna" always yields the one canned job.
    fake_scrapers = {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame())
    registry_patch = patch("scripts.discover.CUSTOM_SCRAPERS", fake_scrapers)
    client_patch = patch("scripts.discover.Client")
    with config_patch, scrape_patch, registry_patch, client_patch:
        inserted = run_discovery(db_path=staging_db)
    assert inserted == 1
    pending = get_jobs_by_status(staging_db, "pending")
    first_job = pending[0]
    assert first_job["title"] == "Customer Success Manager"
    assert first_job["source"] == "adzuna"
 def test_discover_custom_board_skips_unknown(tmp_path, capsys):
    """An unregistered custom board name is reported on stdout as an unknown scraper."""
    from scripts.discover import run_discovery
    unknown_board_profile = {
        "name": "cs",
        "titles": ["CSM"],
        "locations": ["Remote"],
        "boards": [],
        "custom_boards": ["nonexistent_board"],
        "results_per_board": 5,
        "hours_old": 72,
    }
    profiles_cfg = {"profiles": [unknown_board_profile]}
    staging_db = tmp_path / "test.db"
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(profiles_cfg, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame())
    client_patch = patch("scripts.discover.Client")
    with config_patch, scrape_patch, client_patch:
        run_discovery(db_path=staging_db)
    stdout_text = capsys.readouterr().out
    assert "nonexistent_board" in stdout_text
    assert "Unknown scraper" in stdout_text
 def test_discover_custom_board_deduplicates(tmp_path):
    """A custom-board job whose URL is already stored is skipped and not counted."""
    from scripts.discover import run_discovery
    from scripts.db import init_db, insert_job, get_jobs_by_status
    staging_db = tmp_path / "test.db"
    init_db(staging_db)
    # Same URL as _ADZUNA_JOB, stored ahead of the discovery run.
    already_stored = {
        "title": "CSM",
        "company": "TestCo",
        "url": "https://www.adzuna.com/jobs/details/999",
        "source": "adzuna",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-01-01",
    }
    insert_job(staging_db, already_stored)
    fake_scrapers = {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame())
    registry_patch = patch("scripts.discover.CUSTOM_SCRAPERS", fake_scrapers)
    client_patch = patch("scripts.discover.Client")
    with config_patch, scrape_patch, registry_patch, client_patch:
        inserted = run_discovery(db_path=staging_db)
    # Nothing new was added; the pre-existing row is the only pending job.
    assert inserted == 0
    assert len(get_jobs_by_status(staging_db, "pending")) == 1
--- a/tests/test_enrich_descriptions.py
+++ b/tests/test_enrich_descriptions.py
@ -0,0 +1,96 @@
 # tests/test_enrich_descriptions.py
 """Tests for scripts/enrich_descriptions.py — enrich_craigslist_fields()."""
 from unittest.mock import patch, MagicMock
 import sqlite3
 def test_enrich_craigslist_fields_skips_non_craigslist(tmp_path):
    """For a linkedin-sourced job the result is {} and LLMRouter is never instantiated."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "",
        "url": "https://example.com/1",
        "source": "linkedin",
        "location": "",
        "description": "Some company here.",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, posting)
    with patch("scripts.llm_router.LLMRouter") as router_cls:
        extracted = enrich_craigslist_fields(db_file, job_id)
    assert extracted == {}
    router_cls.assert_not_called()
 def test_enrich_craigslist_fields_skips_populated_company(tmp_path):
    """A craigslist job that already has a company yields {} with no LLMRouter call."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme Corp",
        "url": "https://sfbay.craigslist.org/jjj/d/1.html",
        "source": "craigslist",
        "location": "",
        "description": "Join Acme Corp today.",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, posting)
    with patch("scripts.llm_router.LLMRouter") as router_cls:
        extracted = enrich_craigslist_fields(db_file, job_id)
    assert extracted == {}
    router_cls.assert_not_called()
 def test_enrich_craigslist_fields_skips_empty_description(tmp_path):
    """A craigslist job with an empty description yields {} with no LLMRouter call."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "",
        "url": "https://sfbay.craigslist.org/jjj/d/2.html",
        "source": "craigslist",
        "location": "",
        "description": "",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, posting)
    with patch("scripts.llm_router.LLMRouter") as router_cls:
        extracted = enrich_craigslist_fields(db_file, job_id)
    assert extracted == {}
    router_cls.assert_not_called()
 def test_enrich_craigslist_fields_extracts_and_updates(tmp_path):
    """A JSON reply from the router is returned as a dict and written to the jobs row."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "",
        "url": "https://sfbay.craigslist.org/jjj/d/3.html",
        "source": "craigslist",
        "location": "",
        "description": "Join Acme Corp. Pay: $120k/yr.",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, posting)
    # Router instance whose complete() replies with well-formed JSON.
    fake_router = MagicMock()
    fake_router.complete.return_value = '{"company": "Acme Corp", "salary": "$120k/yr"}'
    with patch("scripts.llm_router.LLMRouter", return_value=fake_router):
        extracted = enrich_craigslist_fields(db_file, job_id)
    assert extracted == {"company": "Acme Corp", "salary": "$120k/yr"}
    conn = sqlite3.connect(db_file)
    stored = conn.execute(
        "SELECT company, salary FROM jobs WHERE id=?", (job_id,)
    ).fetchone()
    conn.close()
    assert stored[0] == "Acme Corp"
    assert stored[1] == "$120k/yr"
 def test_enrich_craigslist_fields_handles_bad_llm_json(tmp_path):
    """A non-JSON reply from the router produces {} and no exception."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "",
        "url": "https://sfbay.craigslist.org/jjj/d/4.html",
        "source": "craigslist",
        "location": "",
        "description": "Great opportunity.",
        "date_found": "2026-02-24",
    }
    job_id = insert_job(db_file, posting)
    # Router instance whose complete() replies with free text instead of JSON.
    fake_router = MagicMock()
    fake_router.complete.return_value = "Sorry, I cannot extract that."
    with patch("scripts.llm_router.LLMRouter", return_value=fake_router):
        extracted = enrich_craigslist_fields(db_file, job_id)
    assert extracted == {}
--- a/tests/test_imap_sync.py
+++ b/tests/test_imap_sync.py
@ -0,0 +1,330 @@
 """Tests for imap_sync helpers (no live IMAP connection required)."""
 import pytest
 from unittest.mock import patch, MagicMock
 def test_classify_stage_signal_interview():
    """A router reply of 'interview_scheduled' is passed through by classify_stage_signal."""
    from scripts.imap_sync import classify_stage_signal
    subject = "Let's schedule a call"
    body = "Hi Alex, we'd love to book a 30-min phone screen with you."
    with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as fake_router:
        fake_router.complete.return_value = "interview_scheduled"
        label = classify_stage_signal(subject, body)
    assert label == "interview_scheduled"
 def test_classify_stage_signal_returns_none_on_error():
    """If the router's complete() raises, classify_stage_signal returns None."""
    from scripts.imap_sync import classify_stage_signal
    with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as fake_router:
        fake_router.complete.side_effect = RuntimeError("model not loaded")
        label = classify_stage_signal("subject", "body")
    assert label is None
 def test_classify_stage_signal_strips_think_tags():
    """A leading <think>...</think> block in the router reply does not affect the label."""
    from scripts.imap_sync import classify_stage_signal
    subject = "Update on your application"
    body = "We went with another candidate."
    with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as fake_router:
        fake_router.complete.return_value = "<think>Let me think...</think>\nrejected"
        label = classify_stage_signal(subject, body)
    assert label == "rejected"
 def test_normalise_company():
    """Legal suffixes are removed; a name without one comes back unchanged."""
    from scripts.imap_sync import _normalise_company
    cases = (
        ("DataStax, Inc.", "DataStax"),
        ("Wiz Ltd", "Wiz"),
        ("Crusoe Energy", "Crusoe Energy"),
    )
    for raw_name, expected in cases:
        assert _normalise_company(raw_name) == expected
 def test_company_search_terms_excludes_job_board_sld():
    """Job-board second-level domains never become match terms; a company's own does."""
    from scripts.imap_sync import _company_search_terms
    # Posting hosted on LinkedIn: "linkedin" is excluded, the company name is kept.
    linkedin_terms = _company_search_terms(
        "Bamboo Health", "https://www.linkedin.com/jobs/view/123"
    )
    assert "linkedin" not in linkedin_terms
    assert "bamboo health" in linkedin_terms
    # Posting hosted on the company's own domain: its SLD is a valid term.
    own_domain_terms = _company_search_terms("Crusoe Energy", "https://crusoe.ai/jobs/456")
    assert "crusoe" in own_domain_terms
    # Posting hosted on Indeed: "indeed" is excluded, the company name is kept.
    indeed_terms = _company_search_terms(
        "DoorDash", "https://www.indeed.com/viewjob?jk=abc"
    )
    assert "indeed" not in indeed_terms
    assert "doordash" in indeed_terms
 def test_has_recruitment_keyword():
    """Recruiting-style subjects match; an unrelated subject does not."""
    from scripts.imap_sync import _has_recruitment_keyword
    recruiting_subjects = (
        "Interview Invitation — Senior TAM",
        "Your application with DataStax",
    )
    for subject in recruiting_subjects:
        assert _has_recruitment_keyword(subject)
    assert not _has_recruitment_keyword("Team lunch tomorrow")
 def test_extract_lead_info_returns_company_and_title():
    """A JSON reply from the router is turned into a (company, title) tuple."""
    from scripts.imap_sync import extract_lead_info
    with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as fake_router:
        fake_router.complete.return_value = '{"company": "Wiz", "title": "Senior TAM"}'
        lead = extract_lead_info(
            "Senior TAM at Wiz",
            "Hi Alex, we have a role…",
            "recruiter@wiz.com",
        )
    assert lead == ("Wiz", "Senior TAM")
 def test_extract_lead_info_returns_none_on_bad_json():
    """A router reply that is not JSON results in (None, None)."""
    from scripts.imap_sync import extract_lead_info
    with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as fake_router:
        fake_router.complete.return_value = "I cannot determine the company."
        lead = extract_lead_info("Job opportunity", "blah", "noreply@example.com")
    assert lead == (None, None)
 def test_classify_labels_includes_survey_received():
    """The classifier's label collection contains 'survey_received'."""
    from scripts.imap_sync import _CLASSIFY_LABELS as known_labels
    assert "survey_received" in known_labels
 def test_classify_stage_signal_returns_survey_received():
    """A router reply of 'survey_received' is passed through by classify_stage_signal."""
    from scripts.imap_sync import classify_stage_signal
    subject = "Complete our culture survey"
    body = "Please fill out this form"
    # Uses the module-level `patch` import.
    with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as fake_router:
        fake_router.complete.return_value = "survey_received"
        label = classify_stage_signal(subject, body)
    assert label == "survey_received"
 def test_sync_job_emails_classifies_inbound(tmp_path):
    """One inbound message is counted and its classified stage_signal is stored on the contact."""
    from scripts.db import init_db, insert_job, get_contacts
    from scripts.imap_sync import sync_job_emails
    db_file = tmp_path / "test.db"
    init_db(db_file)
    posting = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://acme.com/jobs/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-21",
    }
    job_id = insert_job(db_file, posting)
    job = {"id": job_id, "company": "Acme", "url": "https://acme.com/jobs/1"}
    # Raw RFC 822 message as the fake IMAP fetch will hand it back.
    fake_msg_bytes = (
        b"From: recruiter@acme.com\r\n"
        b"To: alex@example.com\r\n"
        b"Subject: Interview Invitation\r\n"
        b"Message-ID: <unique-001@acme.com>\r\n"
        b"\r\n"
        b"Hi Alex, we'd like to schedule a phone screen."
    )
    # Stand-in IMAP connection: select/search succeed with one message,
    # and fetch returns the raw bytes above.
    imap = MagicMock()
    imap.select.return_value = ("OK", [b"1"])
    imap.search.return_value = ("OK", [b"1"])
    imap.fetch.return_value = ("OK", [(b"1 (RFC822 {123})", fake_msg_bytes)])
    classifier_patch = patch(
        "scripts.imap_sync.classify_stage_signal",
        return_value="interview_scheduled",
    )
    with classifier_patch:
        inbound_count, _outbound_count = sync_job_emails(
            job, imap, {"lookback_days": 90}, db_file
        )
    assert inbound_count == 1
    stored = get_contacts(db_file, job_id=job_id)
    assert stored[0]["stage_signal"] == "interview_scheduled"
 def test_parse_linkedin_alert_extracts_jobs():
    """parse_linkedin_alert returns one dict per job card in a LinkedIn alert body.

    The fixture holds a preamble followed by two cards, each ending in a
    "View job:" line and a dashed separator. The second card has extra lines
    between its location and its "View job:" line.
    """
    from scripts.imap_sync import parse_linkedin_alert
    body = """\
 Your job alert for customer success manager in United States
 New jobs match your preferences.
 Manage alerts: https://www.linkedin.com/comm/jobs/alerts?...
 Customer Success Manager
 Reflow
 California, United States
 View job: https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc%3D%3D&refId=xyz
 ---------------------------------------------------------
 Customer Engagement Manager
 Bitwarden
 United States
 2 school alumni
 Apply with resume & profile
 View job: https://www.linkedin.com/comm/jobs/view/4359824983/?trackingId=def%3D%3D
 ---------------------------------------------------------
 """
    jobs = parse_linkedin_alert(body)
    assert len(jobs) == 2
    # First card: title, company and location come from its first three lines.
    assert jobs[0]["title"] == "Customer Success Manager"
    assert jobs[0]["company"] == "Reflow"
    assert jobs[0]["location"] == "California, United States"
    # Expected URL has neither the "/comm" path segment nor the query string.
    assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/4376518925/"
    # Second card: the extra lines must not disturb title/company/url.
    assert jobs[1]["title"] == "Customer Engagement Manager"
    assert jobs[1]["company"] == "Bitwarden"
    assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/4359824983/"
 def test_parse_linkedin_alert_skips_blocks_without_view_job():
    """A card with no "View job:" line is dropped; the following valid card is kept."""
    from scripts.imap_sync import parse_linkedin_alert
    # First block has no "View job:" line; the second block does.
    body = """\
 Customer Success Manager
 Some Company
 United States
 ---------------------------------------------------------
 Valid Job Title
 Valid Company
 Remote
 View job: https://www.linkedin.com/comm/jobs/view/1111111/?x=y
 ---------------------------------------------------------
 """
    jobs = parse_linkedin_alert(body)
    assert len(jobs) == 1
    assert jobs[0]["title"] == "Valid Job Title"
 def test_parse_linkedin_alert_empty_body():
    """An empty body and a body without any job card both parse to an empty list."""
    from scripts.imap_sync import parse_linkedin_alert
    for body in ("", "No jobs here."):
        assert parse_linkedin_alert(body) == []
 # ── _scan_unmatched_leads integration ─────────────────────────────────────────
 # LinkedIn job-alert body with a two-line preamble and two job cards, each
 # terminated by a "View job:" line and a dashed separator.
 _ALERT_BODY = """\
 Your job alert for customer success manager in United States
 New jobs match your preferences.
 Customer Success Manager
 Acme Corp
 California, United States
 View job: https://www.linkedin.com/comm/jobs/view/9999001/?trackingId=abc
 ---------------------------------------------------------
 Director of Customer Success
 Beta Inc
 Remote
 View job: https://www.linkedin.com/comm/jobs/view/9999002/?trackingId=def
 ---------------------------------------------------------
 """
 # Message dict returned by the patched scripts.imap_sync._parse_message in
 # the _scan_unmatched_leads tests; its body is _ALERT_BODY.
 _ALERT_EMAIL = {
    "message_id": "<alert-001@linkedin.com>",
    "from_addr": "jobalerts-noreply@linkedin.com",
    "to_addr": "alex@example.com",
    "subject": "2 new jobs for customer success manager",
    "body": _ALERT_BODY,
    "date": "2026-02-24 12:00:00",
 }
 def test_scan_unmatched_leads_linkedin_alert_inserts_jobs(tmp_path):
    """_scan_unmatched_leads detects a LinkedIn alert and inserts each job card.

    The folder search and message parsing are patched so the single message
    found is _ALERT_EMAIL; submit_task is patched so no task is really queued.
    The test checks the returned count, the mutated known_ids set, the rows
    written to the jobs table, and the task submissions.
    """
    import sqlite3
    from unittest.mock import patch, MagicMock
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn_mock = MagicMock()
    # The function under test is imported while the patches are active.
    with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \
         patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \
         patch("scripts.task_runner.submit_task") as mock_submit:
        from scripts.imap_sync import _scan_unmatched_leads
        known_ids: set = set()
        new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, known_ids)
    # One lead per job card in _ALERT_BODY.
    assert new_leads == 2
    # Message ID added so it won't be reprocessed
    assert "<alert-001@linkedin.com>" in known_ids
    # Both jobs inserted with correct fields
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    jobs = conn.execute("SELECT * FROM jobs ORDER BY id").fetchall()
    conn.close()
    assert len(jobs) == 2
    assert jobs[0]["title"] == "Customer Success Manager"
    assert jobs[0]["company"] == "Acme Corp"
    # Stored URL has neither the "/comm" path segment nor the query string.
    assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/9999001/"
    assert jobs[0]["source"] == "linkedin"
    assert jobs[1]["title"] == "Director of Customer Success"
    assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/9999002/"
    # scrape_url task submitted for each inserted job
    assert mock_submit.call_count == 2
    # The task type is the second positional argument of each submit_task call.
    task_types = [call.args[1] for call in mock_submit.call_args_list]
    assert task_types == ["scrape_url", "scrape_url"]
 def test_scan_unmatched_leads_linkedin_alert_skips_duplicates(tmp_path):
    """URLs already in the DB are not re-inserted.

    One of the two alert URLs is stored beforehand, so only the other card
    counts as a new lead and only one task is submitted.
    """
    from unittest.mock import patch, MagicMock
    from scripts.db import init_db, insert_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    # Pre-insert one of the two URLs
    insert_job(db_path, {
        "title": "Customer Success Manager", "company": "Acme Corp",
        "url": "https://www.linkedin.com/jobs/view/9999001/",
        "source": "linkedin", "location": "", "is_remote": 0,
        "salary": "", "description": "", "date_found": "2026-02-24",
    })
    conn_mock = MagicMock()
    # The function under test is imported while the patches are active.
    with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \
         patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \
         patch("scripts.task_runner.submit_task") as mock_submit:
        from scripts.imap_sync import _scan_unmatched_leads
        new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, set())
    # Only one new job (the duplicate was skipped)
    assert new_leads == 1
    assert mock_submit.call_count == 1
 def test_scan_unmatched_leads_linkedin_alert_skips_llm_path(tmp_path):
    """After a LinkedIn alert email, the LLM extraction path is never reached.

    extract_lead_info is patched so the test can assert it was not called
    while _ALERT_EMAIL is processed.
    """
    from unittest.mock import patch, MagicMock
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn_mock = MagicMock()
    # The function under test is imported while the patches are active.
    with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \
         patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \
         patch("scripts.task_runner.submit_task"), \
         patch("scripts.imap_sync.extract_lead_info") as mock_llm:
        from scripts.imap_sync import _scan_unmatched_leads
        _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, set())
    # LLM extraction must never be called for alert emails
    mock_llm.assert_not_called()
--- a/tests/test_llm_router.py
+++ b/tests/test_llm_router.py
@ -0,0 +1,135 @@
 import pytest
 from unittest.mock import patch, MagicMock
 from pathlib import Path
 import yaml
 # LLM router config used by these tests: config/llm.yaml under the directory
 # one level above this file's own directory.
 CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml"
 def test_config_loads():
    """The LLM config file parses as YAML and carries the required keys.

    Requires a ``backends`` entry and a ``fallback_order`` entry with at
    least one item.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (e.g. cp1252 on Windows).
    cfg = yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8"))
    assert "fallback_order" in cfg
    assert "backends" in cfg
    assert len(cfg["fallback_order"]) >= 1
 def test_router_uses_first_reachable_backend():
    """An unreachable backend is skipped and the next responding one answers."""
    from scripts.llm_router import LLMRouter
    router = LLMRouter(CONFIG_PATH)
    fake_completion = MagicMock()
    fake_completion.choices[0].message.content = "hello"
    fake_model = MagicMock()
    fake_model.id = "test-model"
    # First reachability probe fails, every later one succeeds
    reachability = [False, True, True, True, True]
    with patch.object(router, "_is_reachable", side_effect=reachability):
        with patch("scripts.llm_router.OpenAI") as MockOpenAI:
            client = MockOpenAI.return_value
            client.chat.completions.create.return_value = fake_completion
            client.models.list.return_value.data = [fake_model]
            result = router.complete("say hello")
    assert result == "hello"
 def test_router_raises_when_all_backends_fail():
    """With no backend reachable, complete() raises RuntimeError."""
    from scripts.llm_router import LLMRouter
    router = LLMRouter(CONFIG_PATH)
    unreachable = patch.object(router, "_is_reachable", return_value=False)
    expect_exhausted = pytest.raises(RuntimeError, match="All LLM backends exhausted")
    with unreachable, expect_exhausted:
        router.complete("say hello")
 def test_is_reachable_returns_false_on_connection_error():
    """A ConnectionError from requests.get makes _is_reachable return False."""
    from scripts.llm_router import LLMRouter
    import requests
    router = LLMRouter(CONFIG_PATH)
    failing_get = patch("scripts.llm_router.requests.get", side_effect=requests.ConnectionError)
    with failing_get:
        reachable = router._is_reachable("http://localhost:9999/v1")
    assert reachable is False
 def test_complete_skips_backend_without_image_support(tmp_path):
    """With images= supplied, a backend lacking supports_images is passed over.

    The config lists an image-less ``ollama`` backend first and an
    image-capable ``vision_service`` second; the answer must come from the
    mocked POST response.
    """
    import yaml
    from scripts.llm_router import LLMRouter
    from unittest.mock import patch, MagicMock
    ollama_backend = {
        "type": "openai_compat",
        "base_url": "http://localhost:11434/v1",
        "model": "llava",
        "api_key": "ollama",
        "enabled": True,
        "supports_images": False,
    }
    vision_backend = {
        "type": "vision_service",
        "base_url": "http://localhost:8002",
        "enabled": True,
        "supports_images": True,
    }
    cfg = {
        "fallback_order": ["ollama", "vision_service"],
        "backends": {"ollama": ollama_backend, "vision_service": vision_backend},
    }
    cfg_file = tmp_path / "llm.yaml"
    cfg_file.write_text(yaml.dump(cfg))
    analyze_resp = MagicMock()
    analyze_resp.status_code = 200
    analyze_resp.json.return_value = {"text": "B — collaborative"}
    with patch("scripts.llm_router.requests.get") as mock_get:
        with patch("scripts.llm_router.requests.post") as mock_post:
            # Any GET (the health check) reports 200
            mock_get.return_value = MagicMock(status_code=200)
            mock_post.return_value = analyze_resp
            router = LLMRouter(config_path=cfg_file)
            result = router.complete("Which option?", images=["base64data"])
    assert result == "B — collaborative"
    # The vision_service POST must have been issued
    assert mock_post.called
 def test_complete_without_images_skips_vision_service(tmp_path):
    """A text-only completion never POSTs to a vision_service backend."""
    import yaml
    from scripts.llm_router import LLMRouter
    from unittest.mock import patch, MagicMock
    vision_backend = {
        "type": "vision_service",
        "base_url": "http://localhost:8002",
        "enabled": True,
        "supports_images": True,
    }
    cfg = {
        "fallback_order": ["vision_service"],
        "backends": {"vision_service": vision_backend},
    }
    cfg_file = tmp_path / "llm.yaml"
    cfg_file.write_text(yaml.dump(cfg))
    router = LLMRouter(config_path=cfg_file)
    with patch("scripts.llm_router.requests.post") as mock_post:
        try:
            router.complete("text only prompt")
        except RuntimeError:
            # Running out of backends is the expected outcome here
            pass
        mock_post.assert_not_called()
--- a/tests/test_match.py
+++ b/tests/test_match.py
@ -0,0 +1,47 @@
 import pytest
 from unittest.mock import patch, MagicMock
 def test_extract_job_description_from_url():
    """extract_job_description returns the page's visible text for a URL."""
    from scripts.match import extract_job_description
    page_html = "<html><body><p>We need a CSM with Salesforce.</p></body></html>"
    with patch("scripts.match.requests.get") as mock_get:
        response = mock_get.return_value
        response.text = page_html
        response.raise_for_status = MagicMock()
        result = extract_job_description("https://example.com/job/123")
    for keyword in ("CSM", "Salesforce"):
        assert keyword in result
 def test_score_is_between_0_and_100():
    """match_score yields a score within [0, 100] plus a list of keyword gaps."""
    from scripts.match import match_score
    resume = "Customer Success Manager with Salesforce experience"
    posting = "Looking for a Customer Success Manager who knows Salesforce and Gainsight"
    score, gaps = match_score(resume_text=resume, job_text=posting)
    assert 0 <= score <= 100
    assert isinstance(gaps, list)
 def test_write_score_to_notion():
    """write_match_to_notion issues one page update carrying the match score."""
    from scripts.match import write_match_to_notion
    field_map = {
        "match_score": "Match Score",
        "keyword_gaps": "Keyword Gaps",
    }
    notion = MagicMock()
    write_match_to_notion(notion, "page-id-abc", 85.5, ["Gainsight", "Churnzero"], field_map)
    notion.pages.update.assert_called_once()
    # call_args is an (args, kwargs) pair; only the keyword arguments matter here
    _, update_kwargs = notion.pages.update.call_args
    assert update_kwargs["page_id"] == "page-id-abc"
    assert update_kwargs["properties"]["Match Score"]["number"] == 85.5
--- a/tests/test_scrape_url.py
+++ b/tests/test_scrape_url.py
@ -0,0 +1,135 @@
 """Tests for URL-based job scraping."""
 from unittest.mock import patch, MagicMock
 def _make_db(tmp_path, url="https://www.linkedin.com/jobs/view/99999/"):
    """Create a test DB under ``tmp_path`` with one placeholder job for ``url``.

    Returns a ``(db_path, job_id)`` tuple.
    """
    from scripts.db import init_db, insert_job
    db = tmp_path / "test.db"
    init_db(db)
    placeholder = {
        "title": "Importing…",
        "company": "",
        "url": url,
        "source": "manual",
        "location": "",
        "description": "",
        "date_found": "2026-02-24",
    }
    return db, insert_job(db, placeholder)
 def test_canonicalize_url_linkedin():
    """Tracking query parameters are removed from a LinkedIn job URL."""
    from scripts.scrape_url import canonicalize_url
    clean = "https://www.linkedin.com/jobs/view/4376518925/"
    messy = clean + "?trk=eml-email_job_alert&refId=abc%3D%3D&trackingId=xyz"
    assert canonicalize_url(messy) == clean
 def test_canonicalize_url_linkedin_comm():
    """A LinkedIn /comm/ job link canonicalizes to the plain /jobs/view/ URL."""
    from scripts.scrape_url import canonicalize_url
    expected = "https://www.linkedin.com/jobs/view/4376518925/"
    cleaned = canonicalize_url("https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc")
    assert cleaned == expected
 def test_canonicalize_url_generic_strips_utm():
    """utm_* parameters are dropped from a generic URL; other parameters stay."""
    from scripts.scrape_url import canonicalize_url
    cleaned = canonicalize_url(
        "https://jobs.example.com/post/42?utm_source=linkedin&utm_medium=email&jk=real_param"
    )
    assert "utm_source" not in cleaned
    assert "real_param" in cleaned
 def test_detect_board_linkedin():
    """LinkedIn URLs, with or without the www prefix, map to "linkedin"."""
    from scripts.scrape_url import _detect_board
    linkedin_urls = (
        "https://www.linkedin.com/jobs/view/12345/",
        "https://linkedin.com/jobs/view/12345/?tracking=abc",
    )
    for job_url in linkedin_urls:
        assert _detect_board(job_url) == "linkedin"
 def test_detect_board_indeed():
    """An Indeed viewjob URL maps to "indeed"."""
    from scripts.scrape_url import _detect_board
    board = _detect_board("https://www.indeed.com/viewjob?jk=abc123")
    assert board == "indeed"
 def test_detect_board_glassdoor():
    """A Glassdoor job-listing URL maps to "glassdoor"."""
    from scripts.scrape_url import _detect_board
    board = _detect_board("https://www.glassdoor.com/job-listing/foo-bar-123.htm")
    assert board == "glassdoor"
 def test_detect_board_generic():
    """A URL on an unrecognized host maps to "generic"."""
    from scripts.scrape_url import _detect_board
    board = _detect_board("https://jobs.example.com/posting/42")
    assert board == "generic"
 def test_extract_linkedin_job_id():
    """The numeric job id is pulled from LinkedIn URLs; other URLs give None."""
    from scripts.scrape_url import _extract_linkedin_job_id
    urls_with_id = (
        "https://www.linkedin.com/jobs/view/4376518925/",
        "https://www.linkedin.com/comm/jobs/view/4376518925/?tracking=x",
    )
    for job_url in urls_with_id:
        assert _extract_linkedin_job_id(job_url) == "4376518925"
    assert _extract_linkedin_job_id("https://example.com/no-id") is None
 def test_scrape_linkedin_updates_job(tmp_path):
    """Scraping a LinkedIn page returns the parsed fields and persists them.

    ``requests.get`` is patched to serve a canned LinkedIn-style page; the
    test checks both the dict returned by ``scrape_job_url`` and the stored
    ``jobs`` row.
    """
    import sqlite3
    db, job_id = _make_db(tmp_path)
    linkedin_html = """<html><head></head><body>
        <h2 class="top-card-layout__title">Customer Success Manager</h2>
        <a class="topcard__org-name-link">Acme Corp</a>
        <span class="topcard__flavor--bullet">San Francisco, CA</span>
        <div class="show-more-less-html__markup">Exciting CSM role with great benefits.</div>
    </body></html>"""
    mock_resp = MagicMock()
    mock_resp.text = linkedin_html
    mock_resp.raise_for_status = MagicMock()
    with patch("scripts.scrape_url.requests.get", return_value=mock_resp):
        from scripts.scrape_url import scrape_job_url
        result = scrape_job_url(db, job_id)
    assert result.get("title") == "Customer Success Manager"
    assert result.get("company") == "Acme Corp"
    assert "CSM role" in result.get("description", "")
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        stored = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        # Always release the handle, even if the query fails, so the temp DB
        # file is not left open.
        conn.close()
    # Fail with a clear assertion rather than a TypeError from dict(None)
    assert stored is not None
    row = dict(stored)
    assert row["title"] == "Customer Success Manager"
    assert row["company"] == "Acme Corp"
 def test_scrape_url_generic_json_ld(tmp_path):
    """A generic page's JobPosting JSON-LD supplies the title and company."""
    db, job_id = _make_db(tmp_path, url="https://jobs.example.com/post/42")
    json_ld_html = """<html><head>
        <script type="application/ld+json">
        {"@type": "JobPosting", "title": "TAM Role", "description": "Tech account mgmt.",
         "hiringOrganization": {"name": "TechCo"},
         "jobLocation": {"address": {"addressLocality": "Austin, TX"}}}
        </script>
    </head><body></body></html>"""
    page_response = MagicMock()
    page_response.text = json_ld_html
    page_response.raise_for_status = MagicMock()
    get_patch = patch("scripts.scrape_url.requests.get", return_value=page_response)
    with get_patch:
        from scripts.scrape_url import scrape_job_url
        scraped = scrape_job_url(db, job_id)
    assert scraped.get("title") == "TAM Role"
    assert scraped.get("company") == "TechCo"
 def test_scrape_url_graceful_on_http_error(tmp_path):
    """A failing HTTP request does not raise and does not remove the job row."""
    import sqlite3
    import requests as req
    db, job_id = _make_db(tmp_path)
    with patch("scripts.scrape_url.requests.get", side_effect=req.RequestException("timeout")):
        from scripts.scrape_url import scrape_job_url
        result = scrape_job_url(db, job_id)
    # A dict comes back instead of an exception
    assert isinstance(result, dict)
    conn = sqlite3.connect(db)
    try:
        row = conn.execute("SELECT id FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        # Close even when the query raises so the temp DB handle is not leaked
        conn.close()
    # The job row must still exist after the failed scrape
    assert row is not None
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@ -0,0 +1,88 @@
 # tests/test_sync.py
 import pytest
 from unittest.mock import patch, MagicMock
 from pathlib import Path
 # Field map fixture: internal field key -> Notion property name (or status value).
 SAMPLE_FM = {
    "title_field": "Salary",
    "job_title": "Job Title",
    "company": "Company Name",
    "url": "Role Link",
    "source": "Job Source",
    "status": "Status of Application",
    "status_new": "Application Submitted",
    "date_found": "Date Found",
    "remote": "Remote",
    "match_score": "Match Score",
    "keyword_gaps": "Keyword Gaps",
    "notes": "Notes",
    "job_description": "Job Description",
 }
 # Notion config fixture handed to the patched load_notion_config in these tests.
 SAMPLE_NOTION_CFG = {
    "token": "secret_test",
    "database_id": "fake-db-id",
    "field_map": SAMPLE_FM,
 }
 def test_sync_pushes_approved_jobs(tmp_path):
    """An approved job is pushed to Notion once and ends up with status synced."""
    from scripts.sync import sync_to_notion
    from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    approved_job = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://example.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "$100k",
        "description": "Good role",
        "date_found": "2026-02-20",
    }
    row_id = insert_job(db_path, approved_job)
    update_job_status(db_path, [row_id], "approved")
    notion_client = MagicMock()
    notion_client.pages.create.return_value = {"id": "notion-page-abc"}
    config_patch = patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG)
    client_patch = patch("scripts.sync.Client", return_value=notion_client)
    with config_patch, client_patch:
        count = sync_to_notion(db_path=db_path)
    assert count == 1
    notion_client.pages.create.assert_called_once()
    assert len(get_jobs_by_status(db_path, "synced")) == 1
 def test_sync_falls_back_to_core_fields_on_validation_error(tmp_path):
    """A validation_error on the first create triggers a second, successful attempt.

    The first ``pages.create`` call raises; the retry returns a page, and
    the job must still be counted and marked synced.
    """
    from scripts.sync import sync_to_notion
    from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    approved_job = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://example.com/2",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-20",
    }
    row_id = insert_job(db_path, approved_job)
    update_job_status(db_path, [row_id], "approved")
    # Call 1 fails with a validation_error, call 2 (the fallback) succeeds
    create_outcomes = [
        Exception("validation_error: Could not find property with name: Match Score"),
        {"id": "notion-page-fallback"},
    ]
    notion_client = MagicMock()
    notion_client.pages.create.side_effect = create_outcomes
    config_patch = patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG)
    client_patch = patch("scripts.sync.Client", return_value=notion_client)
    with config_patch, client_patch:
        count = sync_to_notion(db_path=db_path)
    assert count == 1
    assert notion_client.pages.create.call_count == 2
    assert len(get_jobs_by_status(db_path, "synced")) == 1
 def test_sync_returns_zero_when_nothing_approved(tmp_path):
    """With no approved jobs in the DB, sync_to_notion reports 0."""
    from scripts.sync import sync_to_notion
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    config_patch = patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG)
    client_patch = patch("scripts.sync.Client")
    with config_patch, client_patch:
        synced_count = sync_to_notion(db_path=db_path)
    assert synced_count == 0
--- a/tests/test_task_runner.py
+++ b/tests/test_task_runner.py
@ -0,0 +1,210 @@
 import threading
 import time
 import pytest
 from pathlib import Path
 from unittest.mock import patch
 import sqlite3
 def _make_db(tmp_path):
    """Create a test DB under ``tmp_path`` containing one sample job.

    Returns a ``(db_path, job_id)`` tuple.
    """
    from scripts.db import init_db, insert_job
    db = tmp_path / "test.db"
    init_db(db)
    sample_job = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "Great role.",
        "date_found": "2026-02-20",
    }
    return db, insert_job(db, sample_job)
 def test_submit_task_returns_id_and_true(tmp_path):
    """A first submission yields a positive integer task id and is_new=True."""
    db, job_id = _make_db(tmp_path)
    with patch("scripts.task_runner._run_task"):  # keep the LLM out of the test
        from scripts.task_runner import submit_task
        outcome = submit_task(db, "cover_letter", job_id)
    task_id, is_new = outcome
    assert isinstance(task_id, int)
    assert task_id > 0
    assert is_new is True
 def test_submit_task_deduplicates(tmp_path):
    """Submitting the same task twice returns the first id with is_new=False."""
    db, job_id = _make_db(tmp_path)
    with patch("scripts.task_runner._run_task"):
        from scripts.task_runner import submit_task
        original_id, _ = submit_task(db, "cover_letter", job_id)
        repeat_id, repeat_is_new = submit_task(db, "cover_letter", job_id)
    assert repeat_id == original_id
    assert repeat_is_new is False
 def test_run_task_cover_letter_success(tmp_path):
    """_run_task completes a cover_letter task and stores the letter on the job.

    The generator is patched to return fixed text; the task row must end up
    ``completed`` with no error, and the ``jobs.cover_letter`` column must
    hold that text.
    """
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job
    task_id, _ = insert_task(db, "cover_letter", job_id)
    with patch("scripts.generate_cover_letter.generate", return_value="Dear Hiring Manager,\nGreat fit!"):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "cover_letter", job_id)
    task = get_task_for_job(db, "cover_letter", job_id)
    assert task["status"] == "completed"
    assert task["error"] is None
    conn = sqlite3.connect(db)
    try:
        row = conn.execute("SELECT cover_letter FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        # Close even when the query raises so the temp DB handle is not leaked
        conn.close()
    assert row[0] == "Dear Hiring Manager,\nGreat fit!"
 def test_run_task_company_research_success(tmp_path):
    """_run_task completes a company_research task and stores the research."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job, get_research
    task_id, _ = insert_task(db, "company_research", job_id)
    canned_research = {
        "raw_output": "raw",
        "company_brief": "brief",
        "ceo_brief": "ceo",
        "talking_points": "points",
    }
    research_patch = patch("scripts.company_research.research_company", return_value=canned_research)
    with research_patch:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "company_research", job_id)
    assert get_task_for_job(db, "company_research", job_id)["status"] == "completed"
    stored = get_research(db, job_id=job_id)
    assert stored["company_brief"] == "brief"
 def test_run_task_marks_failed_on_exception(tmp_path):
    """A generator exception leaves the task failed with the message recorded."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job
    task_id, _ = insert_task(db, "cover_letter", job_id)
    failing_generate = patch(
        "scripts.generate_cover_letter.generate",
        side_effect=RuntimeError("LLM timeout"),
    )
    with failing_generate:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "cover_letter", job_id)
    failed_task = get_task_for_job(db, "cover_letter", job_id)
    assert failed_task["status"] == "failed"
    assert "LLM timeout" in failed_task["error"]
 def test_run_task_discovery_success(tmp_path):
    """A discovery task completes and records the new-listing count in ``error``."""
    from scripts.db import init_db, insert_task, get_task_for_job
    db = tmp_path / "test.db"
    init_db(db)
    # Discovery is not tied to a job, so job id 0 is used throughout
    task_id, _ = insert_task(db, "discovery", 0)
    discovery_patch = patch("scripts.discover.run_discovery", return_value=7)
    with discovery_patch:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "discovery", 0)
    finished = get_task_for_job(db, "discovery", 0)
    assert finished["status"] == "completed"
    assert "7 new listings" in finished["error"]
 def test_run_task_email_sync_success(tmp_path):
    """An email_sync task completes and its summary text mentions the synced count."""
    db, _ = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job
    task_id, _ = insert_task(db, "email_sync", 0)
    sync_result = {
        "synced": 3,
        "inbound": 5,
        "outbound": 2,
        "new_leads": 1,
        "errors": [],
    }
    sync_patch = patch("scripts.imap_sync.sync_all", return_value=sync_result)
    with sync_patch:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "email_sync", 0)
    finished = get_task_for_job(db, "email_sync", 0)
    assert finished["status"] == "completed"
    assert "3 jobs" in finished["error"]
 def test_run_task_email_sync_file_not_found(tmp_path):
    """A FileNotFoundError from sync_all fails the task with an email-related message."""
    db, _ = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job
    task_id, _ = insert_task(db, "email_sync", 0)
    missing_config = patch(
        "scripts.imap_sync.sync_all",
        side_effect=FileNotFoundError("config/email.yaml"),
    )
    with missing_config:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "email_sync", 0)
    failed_task = get_task_for_job(db, "email_sync", 0)
    assert failed_task["status"] == "failed"
    assert "email" in failed_task["error"].lower()
 def test_submit_task_actually_completes(tmp_path):
    """Integration: submit_task spawns a thread that completes asynchronously.

    The cover-letter generator is patched to return fixed text. The test
    polls the task row until it reaches a terminal status ("completed" or
    "failed") and then requires that status to be "completed".
    """
    db, job_id = _make_db(tmp_path)
    from scripts.db import get_task_for_job
    with patch("scripts.generate_cover_letter.generate", return_value="Cover letter text"):
        from scripts.task_runner import submit_task
        task_id, _ = submit_task(db, "cover_letter", job_id)
        # Poll inside the patch context so the stub stays active while the
        # task runs: up to 50 checks, 0.1s apart (max ~5s).
        for _ in range(50):
            task = get_task_for_job(db, "cover_letter", job_id)
            if task and task["status"] in ("completed", "failed"):
                break
            time.sleep(0.1)
    # Re-read the row so the assertion checks the latest stored state
    task = get_task_for_job(db, "cover_letter", job_id)
    assert task["status"] == "completed"
 def test_run_task_enrich_craigslist_success(tmp_path):
    """An enrich_craigslist task calls enrich_craigslist_fields and completes.

    The enrichment function is patched; the test verifies it is invoked
    exactly once with ``(db, job_id)`` and that the task row ends up
    ``completed``.
    """
    # The unused local MagicMock import was dropped; only `patch` is needed here.
    from scripts.db import init_db, insert_job, insert_task, get_task_for_job
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/9.html",
        "source": "craigslist", "location": "", "description": "Join Acme Corp. Pay: $100k.",
        "date_found": "2026-02-24",
    })
    task_id, _ = insert_task(db, "enrich_craigslist", job_id)
    with patch("scripts.enrich_descriptions.enrich_craigslist_fields",
               return_value={"company": "Acme Corp", "salary": "$100k"}) as mock_enrich:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "enrich_craigslist", job_id)
    mock_enrich.assert_called_once_with(db, job_id)
    task = get_task_for_job(db, "enrich_craigslist", job_id)
    assert task["status"] == "completed"
 def test_scrape_url_submits_enrich_craigslist_for_craigslist_job(tmp_path):
    """A finished scrape_url task for a Craigslist job with no company queues enrich_craigslist.

    ``scrape_job_url`` is stubbed to return an empty company, and
    ``submit_task`` is replaced by a mock so the follow-up submission can be
    inspected.
    """
    from scripts.db import init_db, insert_job, insert_task
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/10.html",
        "source": "craigslist", "location": "", "description": "",
        "date_found": "2026-02-24",
    })
    task_id, _ = insert_task(db, "scrape_url", job_id)
    # Replacing submit_task with a mock is what keeps real worker threads from
    # being spawned; the (99, True) return mimics a (task_id, is_new) result.
    with patch("scripts.scrape_url.scrape_job_url", return_value={"title": "CSM", "company": ""}), \
         patch("scripts.task_runner.submit_task", return_value=(99, True)) as mock_submit:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "scrape_url", job_id)
    # submit_task should have been called with enrich_craigslist for this job
    assert mock_submit.called
    submitted_args = mock_submit.call_args[0]
    assert submitted_args[1] == "enrich_craigslist"
    assert submitted_args[2] == job_id
		`@ -0,0 +1 @@`
							`# Custom job board scrapers — each module exposes scrape(profile, location, results_wanted) -> list[dict]`