commit f11a38eb0b84eda973c675c4235a5251ae010916 Author: pyr0ball Date: Tue Feb 24 18:25:39 2026 -0800 chore: seed Peregrine from personal job-seeker (pre-generalization) App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..75174d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +.env +config/notion.yaml +config/tokens.yaml +config/email.yaml +config/adzuna.yaml +config/craigslist.yaml +__pycache__/ +*.pyc +.pytest_cache/ +output/ +aihawk/ +resume_matcher/ +staging.db +.streamlit.log +.streamlit.pid +.coverage +log/ +unsloth_compiled_cache/ +data/survey_screenshots/* +!data/survey_screenshots/.gitkeep diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..84b09f7 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,212 @@ +# Job Seeker Platform — Claude Context + +## Project +Automated job discovery + resume matching + application pipeline for Alex Rivera. + +Full pipeline: +``` +JobSpy → discover.py → SQLite (staging.db) → match.py → Job Review UI +→ Apply Workspace (cover letter + PDF) → Interviews kanban +→ phone_screen → interviewing → offer → hired + ↓ + Notion DB (synced via sync.py) +``` + +## Environment +- Python env: `conda run -n job-seeker ` — always use this, never bare python +- Run tests: `/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v` + (use direct binary — `conda run pytest` can spawn runaway processes) +- Run discovery: `conda run -n job-seeker python scripts/discover.py` +- Recreate env: `conda env create -f environment.yml` +- pytest.ini scopes test collection to `tests/` only — never widen this + +## ⚠️ AIHawk env isolation — CRITICAL +- NEVER `pip install -r aihawk/requirements.txt` into the job-seeker env +- AIHawk pulls torch + CUDA (~7GB) which causes OOM during test runs +- AIHawk must run in its own env: `conda create -n aihawk-env python=3.12` +- job-seeker env must stay lightweight (no torch, no 
sentence-transformers, no CUDA) + +## Web UI (Streamlit) +- Run: `bash scripts/manage-ui.sh start` → http://localhost:8501 +- Manage: `start | stop | restart | status | logs` +- Direct binary: `/devl/miniconda3/envs/job-seeker/bin/streamlit run app/app.py` +- Entry point: `app/app.py` (uses `st.navigation()` — do NOT run `app/Home.py` directly) +- `staging.db` is gitignored — SQLite staging layer between discovery and Notion + +### Pages +| Page | File | Purpose | +|------|------|---------| +| Home | `app/Home.py` | Dashboard, discovery trigger, danger-zone purge | +| Job Review | `app/pages/1_Job_Review.py` | Batch approve/reject with sorting | +| Settings | `app/pages/2_Settings.py` | LLM backends, search profiles, Notion, services | +| Resume Profile | Settings → Resume Profile tab | Edit AIHawk YAML profile (was standalone `3_Resume_Editor.py`) | +| Apply Workspace | `app/pages/4_Apply.py` | Cover letter gen + PDF export + mark applied + reject listing | +| Interviews | `app/pages/5_Interviews.py` | Kanban: phone_screen→interviewing→offer→hired | +| Interview Prep | `app/pages/6_Interview_Prep.py` | Live reference sheet during calls + Practice Q&A | +| Survey Assistant | `app/pages/7_Survey.py` | Culture-fit survey help: text paste + screenshot (moondream2) | + +## Job Status Pipeline +``` +pending → approved/rejected (Job Review) +approved → applied (Apply Workspace — mark applied) +approved → rejected (Apply Workspace — reject listing button) +applied → survey (Interviews — "📋 Survey" button; pre-kanban section) +applied → phone_screen (Interviews — triggers company research) +survey → phone_screen (Interviews — after survey completed) +phone_screen → interviewing +interviewing → offer +offer → hired +any stage → rejected (rejection_stage captured for analytics) +applied/approved → synced (sync.py → Notion) +``` + +## SQLite Schema (`staging.db`) +### `jobs` table key columns +- Standard: `id, title, company, url, source, location, is_remote, salary, 
description` +- Scores: `match_score, keyword_gaps` +- Dates: `date_found, applied_at, survey_at, phone_screen_at, interviewing_at, offer_at, hired_at` +- Interview: `interview_date, rejection_stage` +- Content: `cover_letter, notion_page_id` + +### Additional tables +- `job_contacts` — email thread log per job (direction, subject, from/to, body, received_at) +- `company_research` — LLM-generated brief per job (company_brief, ceo_brief, talking_points, raw_output, accessibility_brief) +- `background_tasks` — async LLM task queue (task_type, job_id, status: queued/running/completed/failed) +- `survey_responses` — per-job Q&A pairs (survey_name, received_at, source, raw_input, image_path, mode, llm_output, reported_score) + +## Scripts +| Script | Purpose | +|--------|---------| +| `scripts/discover.py` | JobSpy + custom board scrape → SQLite insert | +| `scripts/custom_boards/adzuna.py` | Adzuna Jobs API (app_id + app_key in config/adzuna.yaml) | +| `scripts/custom_boards/theladders.py` | The Ladders scraper via curl_cffi + __NEXT_DATA__ SSR parse | +| `scripts/match.py` | Resume keyword matching → match_score | +| `scripts/sync.py` | Push approved/applied jobs to Notion | +| `scripts/llm_router.py` | LLM fallback chain (reads config/llm.yaml) | +| `scripts/generate_cover_letter.py` | Cover letter via LLM; detects mission-aligned companies (music/animal welfare/education) and injects Para 3 hint | +| `scripts/company_research.py` | Pre-interview brief via LLM + optional SearXNG scrape; includes Inclusion & Accessibility section | +| `scripts/prepare_training_data.py` | Extract cover letter JSONL for fine-tuning | +| `scripts/finetune_local.py` | Unsloth QLoRA fine-tune on local GPU | +| `scripts/db.py` | All SQLite helpers (single source of truth) | +| `scripts/task_runner.py` | Background thread executor — `submit_task(db, type, job_id)` dispatches daemon threads for LLM jobs | +| `scripts/vision_service/main.py` | FastAPI moondream2 inference on port 8002; 
`manage-vision.sh` lifecycle | + +## LLM Router +- Config: `config/llm.yaml` +- Cover letter fallback order: `claude_code → ollama (alex-cover-writer:latest) → vllm → copilot → anthropic` +- Research fallback order: `claude_code → vllm (__auto__, ouroboros) → ollama_research (llama3.1:8b) → ...` +- `alex-cover-writer:latest` is cover-letter only — it doesn't follow structured markdown prompts for research +- `LLMRouter.complete()` accepts `fallback_order=` override for per-task routing +- `LLMRouter.complete()` accepts `images: list[str]` (base64) — vision backends only; non-vision backends skipped when images present +- Vision fallback order config key: `vision_fallback_order: [vision_service, claude_code, anthropic]` +- `vision_service` backend type: POST to `/analyze`; skipped automatically when no images provided +- Claude Code wrapper: `/Library/Documents/Post Fight Processing/server-openai-wrapper-v2.js` +- Copilot wrapper: `/Library/Documents/Post Fight Processing/manage-copilot.sh start` + +## Fine-Tuned Model +- Model: `alex-cover-writer:latest` registered in Ollama +- Base: `unsloth/Llama-3.2-3B-Instruct` (QLoRA, rank 16, 10 epochs) +- Training data: 62 cover letters from `/Library/Documents/JobSearch/` +- JSONL: `/Library/Documents/JobSearch/training_data/cover_letters.jsonl` +- Adapter: `/Library/Documents/JobSearch/training_data/finetune_output/adapter/` +- Merged: `/Library/Documents/JobSearch/training_data/gguf/alex-cover-writer/` +- Re-train: `conda run -n ogma python scripts/finetune_local.py` + (uses `ogma` env with unsloth + trl; pin to GPU 0 with `CUDA_VISIBLE_DEVICES=0`) + +## Background Tasks +- Cover letter gen and company research run as daemon threads via `scripts/task_runner.py` +- Tasks survive page navigation; results written to existing tables when done +- On server restart, `app.py` startup clears any stuck `running`/`queued` rows to `failed` +- Dedup: only one queued/running task per `(task_type, job_id)` at a time +- Sidebar 
indicator (`app/app.py`) polls every 3s via `@st.fragment(run_every=3)` +- ⚠️ Streamlit fragment + sidebar: use `with st.sidebar: _fragment()` — sidebar context must WRAP the call, not be inside the fragment body + +## Vision Service +- Script: `scripts/vision_service/main.py` (FastAPI, port 8002) +- Model: `vikhyatk/moondream2` revision `2025-01-09` — lazy-loaded on first `/analyze` (~1.8GB download) +- GPU: 4-bit quantization when CUDA available (~1.5GB VRAM); CPU fallback +- Conda env: `job-seeker-vision` — separate from job-seeker (torch + transformers live here) +- Create env: `conda env create -f scripts/vision_service/environment.yml` +- Manage: `bash scripts/manage-vision.sh start|stop|restart|status|logs` +- Survey page degrades gracefully to text-only when vision service is down +- ⚠️ Never install vision deps (torch, bitsandbytes, transformers) into the job-seeker env + +## Company Research +- Script: `scripts/company_research.py` +- Auto-triggered when a job moves to `phone_screen` in the Interviews kanban +- Three-phase: (1) SearXNG company scrape → (1b) SearXNG news snippets → (2) LLM synthesis +- SearXNG scraper: `/Library/Development/scrapers/companyScraper.py` +- SearXNG Docker: run `docker compose up -d` from `/Library/Development/scrapers/SearXNG/` (port 8888) +- `beautifulsoup4` and `fake-useragent` are installed in job-seeker env (required for scraper) +- News search hits `/search?format=json` — JSON format must be enabled in `searxng-config/settings.yml` +- ⚠️ `settings.yml` owned by UID 977 (container user) — use `docker cp` to update, not direct writes +- ⚠️ `settings.yml` requires `use_default_settings: true` at the top or SearXNG fails schema validation +- `companyScraper` calls `sys.exit()` on missing deps — use `except BaseException` not `except Exception` + +## Email Classifier Labels +Six labels: `interview_request`, `rejection`, `offer`, `follow_up`, `survey_received`, `other` +- `survey_received` — links or requests to complete a 
culture-fit survey/assessment + +## Services (managed via Settings → Services tab) +| Service | Port | Notes | +|---------|------|-------| +| Streamlit UI | 8501 | `bash scripts/manage-ui.sh start` | +| Ollama | 11434 | `sudo systemctl start ollama` | +| Claude Code Wrapper | 3009 | `manage-services.sh start` in Post Fight Processing | +| GitHub Copilot Wrapper | 3010 | `manage-copilot.sh start` in Post Fight Processing | +| vLLM Server | 8000 | Manual start only | +| SearXNG | 8888 | `docker compose up -d` in scrapers/SearXNG/ | +| Vision Service | 8002 | `bash scripts/manage-vision.sh start` — moondream2 survey screenshot analysis | + +## Notion +- DB: "Tracking Job Applications" (ID: `1bd75cff-7708-8007-8c00-f1de36620a0a`) +- `config/notion.yaml` is gitignored (live token); `.example` is committed +- Field names are non-obvious — always read from `field_map` in `config/notion.yaml` +- "Salary" = Notion title property (unusual — it's the page title field) +- "Job Source" = `multi_select` type +- "Role Link" = URL field +- "Status of Application" = status field; new listings use "Application Submitted" +- Sync pushes `approved` + `applied` jobs; marks them `synced` after + +## Key Config Files +- `config/notion.yaml` — gitignored, has token + field_map +- `config/notion.yaml.example` — committed template +- `config/search_profiles.yaml` — titles, locations, boards, custom_boards, exclude_keywords, mission_tags (per profile) +- `config/llm.yaml` — LLM backend priority chain + enabled flags +- `config/tokens.yaml` — gitignored, stores HF token (chmod 600) +- `config/adzuna.yaml` — gitignored, Adzuna API app_id + app_key +- `config/adzuna.yaml.example` — committed template + +## Custom Job Board Scrapers +- `scripts/custom_boards/adzuna.py` — Adzuna Jobs API; credentials in `config/adzuna.yaml` +- `scripts/custom_boards/theladders.py` — The Ladders SSR scraper; needs `curl_cffi` installed +- Scrapers registered in `CUSTOM_SCRAPERS` dict in `discover.py` +- Activated 
per-profile via `custom_boards: [adzuna, theladders]` in `search_profiles.yaml` +- `enrich_all_descriptions()` in `enrich_descriptions.py` covers all sources (not just Glassdoor) +- Home page "Fill Missing Descriptions" button dispatches `enrich_descriptions` task + +## Mission Alignment & Accessibility +- Preferred industries: music, animal welfare, children's education (hardcoded in `generate_cover_letter.py`) +- `detect_mission_alignment(company, description)` injects a Para 3 hint into cover letters for aligned companies +- Company research includes an "Inclusion & Accessibility" section (8th section of the brief) in every brief +- Accessibility search query in `_SEARCH_QUERIES` hits SearXNG for ADA/ERG/disability signals +- `accessibility_brief` column in `company_research` table; shown in Interview Prep under ♿ section +- This info is for personal decision-making ONLY — never disclosed in applications +- In generalization: these become `profile.mission_industries` + `profile.accessibility_priority` in `user.yaml` + +## Document Rule +Resumes and cover letters live in `/Library/Documents/JobSearch/` or Notion — never committed to this repo. + +## AIHawk (LinkedIn Easy Apply) +- Cloned to `aihawk/` (gitignored) +- Config: `aihawk/data_folder/plain_text_resume.yaml` — search FILL_IN for gaps +- Self-ID: non-binary, pronouns any, no disability/drug-test disclosure +- Run: `conda run -n job-seeker python aihawk/main.py` +- Playwright: `conda run -n job-seeker python -m playwright install chromium` + +## Git Remote +- Forgejo self-hosted at https://git.opensourcesolarpunk.com (username: pyr0ball) +- `git remote add origin https://git.opensourcesolarpunk.com/pyr0ball/job-seeker.git` + +## Subagents +Use `general-purpose` subagent type (not `Bash`) when tasks require file writes. 
def _queue_url_imports(db_path: Path, urls: list) -> int:
    """Insert each new URL as a manual job and queue a ``scrape_url`` task for it.

    Every entry of *urls* is stripped and passed through ``canonicalize_url``.
    An entry is skipped when its canonical form does not start with ``http``,
    when it is already reported by ``get_existing_urls(db_path)``, or when the
    same canonical URL was already handled earlier in this call.

    Args:
        db_path: Database handed through to ``get_existing_urls``,
            ``insert_job`` and ``submit_task``.
        urls: Raw URL strings (pasted lines or CSV cell values).

    Returns:
        The number of jobs for which ``insert_job`` returned a truthy id and a
        ``scrape_url`` task was submitted.
    """
    from datetime import datetime
    from scripts.scrape_url import canonicalize_url

    # Work on a private copy so URLs handled in this batch can be remembered
    # without mutating whatever get_existing_urls() returned.
    seen = set(get_existing_urls(db_path))
    queued = 0
    for raw_url in urls:
        url = canonicalize_url(raw_url.strip())
        if not url.startswith("http") or url in seen:
            continue
        # Record the URL before inserting: without this, the same link listed
        # twice in one paste/CSV would be inserted and queued twice.
        seen.add(url)
        job_id = insert_job(db_path, {
            "title": "Importing…",  # placeholder row; presumably overwritten by the scrape task
            "company": "",
            "url": url,
            "source": "manual",
            "location": "",
            "description": "",
            "date_found": datetime.now().isoformat()[:10],  # YYYY-MM-DD
        })
        if job_id:
            submit_task(db_path, "scrape_url", job_id)
            queued += 1
    return queued
_disc_task["status"] == "completed": + _dismissible(f"disc_{_disc_task['id']}", "completed", + f"✅ Discovery complete — {_disc_task.get('error', '')}. Head to Job Review.") + elif _disc_task and _disc_task["status"] == "failed": + _dismissible(f"disc_{_disc_task['id']}", "failed", + f"Discovery failed: {_disc_task.get('error', '')}") + +with enrich_col: + st.subheader("Enrich Descriptions") + st.caption("Re-fetch missing descriptions for any listing (LinkedIn, Indeed, Glassdoor, Adzuna, The Ladders, generic).") + + _enrich_task = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0) + _enrich_running = _enrich_task and _enrich_task["status"] in ("queued", "running") + + if st.button("🔍 Fill Missing Descriptions", use_container_width=True, type="primary", + disabled=bool(_enrich_running)): + submit_task(DEFAULT_DB, "enrich_descriptions", 0) + st.rerun() + + if _enrich_running: + @st.fragment(run_every=4) + def _enrich_status(): + t = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Fetching descriptions…") + else: + st.rerun() + _enrich_status() + elif _enrich_task and _enrich_task["status"] == "completed": + _dismissible(f"enrich_{_enrich_task['id']}", "completed", + f"✅ {_enrich_task.get('error', 'Done')}") + elif _enrich_task and _enrich_task["status"] == "failed": + _dismissible(f"enrich_{_enrich_task['id']}", "failed", + f"Enrich failed: {_enrich_task.get('error', '')}") + +with mid: + unscored = sum(1 for j in __import__("scripts.db", fromlist=["get_jobs_by_status"]) + .get_jobs_by_status(DEFAULT_DB, "pending") + if j.get("match_score") is None and j.get("description")) + st.subheader("Score Listings") + st.caption(f"Run TF-IDF match scoring against Alex's resume. 
{unscored} pending job{'s' if unscored != 1 else ''} unscored.") + if st.button("📊 Score All Unscored Jobs", use_container_width=True, type="primary", + disabled=unscored == 0): + with st.spinner("Scoring…"): + result = subprocess.run( + ["conda", "run", "-n", "job-seeker", "python", "scripts/match.py"], + capture_output=True, text=True, + cwd=str(Path(__file__).parent.parent), + ) + if result.returncode == 0: + st.success("Scoring complete!") + st.code(result.stdout) + else: + st.error("Scoring failed.") + st.code(result.stderr) + st.rerun() + +with right: + approved_count = get_job_counts(DEFAULT_DB).get("approved", 0) + st.subheader("Send to Notion") + st.caption("Push all approved jobs to your Notion tracking database.") + if approved_count == 0: + st.info("No approved jobs yet. Review and approve some listings first.") + else: + if st.button( + f"📤 Sync {approved_count} approved job{'s' if approved_count != 1 else ''} → Notion", + use_container_width=True, type="primary", + ): + with st.spinner("Syncing to Notion…"): + from scripts.sync import sync_to_notion + count = sync_to_notion(DEFAULT_DB) + st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!") + st.rerun() + +st.divider() + +# ── Email Sync ──────────────────────────────────────────────────────────────── +email_left, email_right = st.columns([3, 1]) + +with email_left: + st.subheader("Sync Emails") + st.caption("Pull inbound recruiter emails and match them to active applications. 
" + "New recruiter outreach is added to your Job Review queue.") + +with email_right: + _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0) + _email_running = _email_task and _email_task["status"] in ("queued", "running") + + if st.button("📧 Sync Emails", use_container_width=True, type="primary", + disabled=bool(_email_running)): + submit_task(DEFAULT_DB, "email_sync", 0) + st.rerun() + + if _email_running: + @st.fragment(run_every=4) + def _email_status(): + t = get_task_for_job(DEFAULT_DB, "email_sync", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Syncing emails…") + else: + st.rerun() + _email_status() + elif _email_task and _email_task["status"] == "completed": + _dismissible(f"email_{_email_task['id']}", "completed", + f"✅ {_email_task.get('error', 'Done')}") + elif _email_task and _email_task["status"] == "failed": + _dismissible(f"email_{_email_task['id']}", "failed", + f"Sync failed: {_email_task.get('error', '')}") + +st.divider() + +# ── Add Jobs by URL ─────────────────────────────────────────────────────────── +add_left, _add_right = st.columns([3, 1]) +with add_left: + st.subheader("Add Jobs by URL") + st.caption("Paste job listing URLs to import and scrape in the background. " + "Supports LinkedIn, Indeed, Glassdoor, and most job boards.") + +url_tab, csv_tab = st.tabs(["Paste URLs", "Upload CSV"]) + +with url_tab: + url_text = st.text_area( + "urls", + placeholder="https://www.linkedin.com/jobs/view/1234567/\nhttps://www.indeed.com/viewjob?jk=abc", + height=100, + label_visibility="collapsed", + ) + if st.button("📥 Add Jobs", key="add_urls_btn", use_container_width=True, + disabled=not (url_text or "").strip()): + _urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")] + if _urls: + _n = _queue_url_imports(DEFAULT_DB, _urls) + if _n: + st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. 
@st.fragment(run_every=3)
def _scrape_status():
    """Render the state of ``scrape_url`` tasks updated in the last five minutes.

    Reads up to 20 rows from ``background_tasks`` joined to ``jobs`` (newest
    first) and shows one message per row: info while running, success when
    completed, error when failed. Rows in any other status produce no output,
    and nothing is rendered when the query returns no rows.
    """
    import sqlite3 as _sq
    from contextlib import closing

    # closing() releases the connection even when the query raises. This
    # fragment is re-run every 3 seconds, so a leaked handle per failure would
    # accumulate quickly.
    with closing(_sq.connect(DEFAULT_DB)) as conn:
        conn.row_factory = _sq.Row
        rows = conn.execute(
            """SELECT bt.status, bt.error, j.title, j.company, j.url
               FROM background_tasks bt
               JOIN jobs j ON j.id = bt.job_id
               WHERE bt.task_type = 'scrape_url'
                 AND bt.updated_at >= datetime('now', '-5 minutes')
               ORDER BY bt.updated_at DESC LIMIT 20"""
        ).fetchall()

    if not rows:
        return

    st.caption("Recent URL imports:")
    for r in rows:
        status = r["status"]
        if status == "running":
            st.info(f"⏳ Scraping {r['url']}")
        elif status == "completed":
            # Fall back to the URL when the title column is NULL/empty instead
            # of failing on None + str.
            label = (r["title"] or r["url"]) + (f" @ {r['company']}" if r["company"] else "")
            st.success(f"✅ {label}")
        elif status == "failed":
            st.error(f"❌ {r['url']} — {r['error'] or 'scrape failed'}")
" + "Applied and synced jobs are never touched." + ) + + purge_col, rescrape_col, email_col, tasks_col = st.columns(4) + + with purge_col: + st.markdown("**Purge pending & rejected**") + st.caption("Removes all _pending_ and _rejected_ listings so the next discovery starts fresh.") + if st.button("🗑 Purge Pending + Rejected", use_container_width=True): + st.session_state["confirm_purge"] = "partial" + + if st.session_state.get("confirm_purge") == "partial": + st.warning("Are you sure? This cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge", type="primary", use_container_width=True): + deleted = purge_jobs(DEFAULT_DB, statuses=["pending", "rejected"]) + st.success(f"Purged {deleted} jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with email_col: + st.markdown("**Purge email data**") + st.caption("Clears all email thread logs and email-sourced pending jobs so the next sync starts fresh.") + if st.button("📧 Purge Email Data", use_container_width=True): + st.session_state["confirm_purge"] = "email" + + if st.session_state.get("confirm_purge") == "email": + st.warning("This deletes all email contacts and email-sourced jobs. Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge emails", type="primary", use_container_width=True): + contacts, jobs = purge_email_data(DEFAULT_DB) + st.success(f"Purged {contacts} email contacts, {jobs} email jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with tasks_col: + _active = get_active_tasks(DEFAULT_DB) + st.markdown("**Kill stuck tasks**") + st.caption(f"Force-fail all queued/running background tasks. 
Currently **{len(_active)}** active.") + if st.button("⏹ Kill All Tasks", use_container_width=True, disabled=len(_active) == 0): + killed = kill_stuck_tasks(DEFAULT_DB) + st.success(f"Killed {killed} task(s).") + st.rerun() + + with rescrape_col: + st.markdown("**Purge all & re-scrape**") + st.caption("Wipes _all_ non-applied, non-synced jobs then immediately runs a fresh discovery.") + if st.button("🔄 Purge All + Re-scrape", use_container_width=True): + st.session_state["confirm_purge"] = "full" + + if st.session_state.get("confirm_purge") == "full": + st.warning("This will delete ALL pending, approved, and rejected jobs, then re-scrape. Applied and synced records are kept.") + c1, c2 = st.columns(2) + if c1.button("Yes, wipe + scrape", type="primary", use_container_width=True): + purge_jobs(DEFAULT_DB, statuses=["pending", "approved", "rejected"]) + submit_task(DEFAULT_DB, "discovery", 0) + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + st.divider() + + pending_col, nonremote_col, approved_col, _ = st.columns(4) + + with pending_col: + st.markdown("**Purge pending review**") + st.caption("Removes only _pending_ listings, keeping your rejected history intact.") + if st.button("🗑 Purge Pending Only", use_container_width=True): + st.session_state["confirm_purge"] = "pending_only" + + if st.session_state.get("confirm_purge") == "pending_only": + st.warning("Deletes all pending jobs. Rejected jobs are kept. 
Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge pending", type="primary", use_container_width=True): + deleted = purge_jobs(DEFAULT_DB, statuses=["pending"]) + st.success(f"Purged {deleted} pending jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with nonremote_col: + st.markdown("**Purge non-remote**") + st.caption("Removes pending/approved/rejected jobs where remote is not set. Keeps anything already in the pipeline.") + if st.button("🏢 Purge On-site Jobs", use_container_width=True): + st.session_state["confirm_purge"] = "non_remote" + + if st.session_state.get("confirm_purge") == "non_remote": + st.warning("Deletes all non-remote jobs not yet applied to. Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge on-site", type="primary", use_container_width=True): + deleted = purge_non_remote(DEFAULT_DB) + st.success(f"Purged {deleted} non-remote jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with approved_col: + st.markdown("**Purge approved (unapplied)**") + st.caption("Removes _approved_ jobs you haven't applied to yet — e.g. to reset after a review pass.") + if st.button("🗑 Purge Approved", use_container_width=True): + st.session_state["confirm_purge"] = "approved_only" + + if st.session_state.get("confirm_purge") == "approved_only": + st.warning("Deletes all approved-but-not-applied jobs. 
Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge approved", type="primary", use_container_width=True): + deleted = purge_jobs(DEFAULT_DB, statuses=["approved"]) + st.success(f"Purged {deleted} approved jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + st.divider() + + archive_col1, archive_col2, _, _ = st.columns(4) + + with archive_col1: + st.markdown("**Archive remaining**") + st.caption( + "Move all _pending_ and _rejected_ jobs to archived status. " + "Archived jobs stay in the DB for dedup — they just won't appear in Job Review." + ) + if st.button("📦 Archive Pending + Rejected", use_container_width=True): + st.session_state["confirm_purge"] = "archive_remaining" + + if st.session_state.get("confirm_purge") == "archive_remaining": + st.info("Jobs will be archived (not deleted) — URLs are kept for dedup.") + c1, c2 = st.columns(2) + if c1.button("Yes, archive", type="primary", use_container_width=True): + archived = archive_jobs(DEFAULT_DB, statuses=["pending", "rejected"]) + st.success(f"Archived {archived} jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with archive_col2: + st.markdown("**Archive approved (unapplied)**") + st.caption("Archive _approved_ listings you decided to skip — keeps history without cluttering the apply queue.") + if st.button("📦 Archive Approved", use_container_width=True): + st.session_state["confirm_purge"] = "archive_approved" + + if st.session_state.get("confirm_purge") == "archive_approved": + st.info("Approved jobs will be archived (not deleted).") + c1, c2 = st.columns(2) + if c1.button("Yes, archive approved", type="primary", use_container_width=True): + archived = archive_jobs(DEFAULT_DB, statuses=["approved"]) + st.success(f"Archived 
@st.cache_resource
def _startup() -> None:
    """One-time startup cleanup, memoised through ``st.cache_resource``.

    1. Marks every ``queued``/``running`` row in ``background_tasks`` as
       ``failed`` with the error text "Interrupted by server restart".
    2. Best effort: if ``http://localhost:8888/`` answers with HTTP 200,
       re-submits a ``company_research`` task for each job in an active
       interview stage whose research row has ``scrape_used`` NULL or 0.

    Any exception raised in step 2 is swallowed on purpose so that an
    unreachable service or missing dependency never blocks startup.
    """
    conn = sqlite3.connect(DEFAULT_DB)
    try:
        conn.execute(
            "UPDATE background_tasks SET status='failed', error='Interrupted by server restart',"
            " finished_at=datetime('now') WHERE status IN ('queued','running')"
        )
        conn.commit()

        # Auto-recovery: re-run LLM-only research when SearXNG is available
        try:
            import requests as _req
            if _req.get("http://localhost:8888/", timeout=3).status_code == 200:
                from scripts.task_runner import submit_task
                _ACTIVE_STAGES = ("phone_screen", "interviewing", "offer", "hired")
                # One "?" placeholder per stage keeps the IN (...) clause parameterised.
                rows = conn.execute(
                    """SELECT cr.job_id FROM company_research cr
                       JOIN jobs j ON j.id = cr.job_id
                       WHERE (cr.scrape_used IS NULL OR cr.scrape_used = 0)
                         AND j.status IN ({})""".format(",".join("?" * len(_ACTIVE_STAGES))),
                    _ACTIVE_STAGES,
                ).fetchall()
                for (job_id,) in rows:
                    submit_task(str(DEFAULT_DB), "company_research", job_id)
        except Exception:
            pass  # never block startup
    finally:
        # Close on every path, including a failing UPDATE/commit above.
        conn.close()
@st.fragment(run_every=3)
def _task_indicator():
    """Render a live summary of active background tasks.

    Runs as a Streamlit fragment that re-executes every 3 seconds. Draws
    nothing when ``get_active_tasks`` returns no rows. The caller provides
    the ``st.sidebar`` context around the call; this function must not
    write to ``st.sidebar`` itself.
    """
    active = get_active_tasks(DEFAULT_DB)
    if not active:
        return

    # Friendly names for known task types; anything else is title-cased.
    friendly_names = {
        "cover_letter": "Cover letter",
        "company_research": "Research",
        "email_sync": "Email sync",
        "discovery": "Discovery",
        "enrich_descriptions": "Enriching",
        "scrape_url": "Scraping URL",
        "enrich_craigslist": "Enriching listing",
    }

    st.divider()
    st.markdown(f"**⏳ {len(active)} task(s) running**")

    for task in active:
        icon = "🕐"
        if task["status"] == "running":
            icon = "⏳"

        kind = task["task_type"]
        if kind in friendly_names:
            label = friendly_names[kind]
        else:
            label = kind.replace("_", " ").title()

        # Prefer the live stage text; fall back to the company name, if any.
        stage = task.get("stage") or ""
        if stage:
            detail = f" · {stage}"
        elif task.get("company"):
            detail = f" — {task.get('company')}"
        else:
            detail = ""

        st.caption(f"{icon} {label}{detail}")
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

import streamlit as st
from scripts.db import (
    DEFAULT_DB, init_db, get_jobs_by_status, update_job_status,
    update_cover_letter, mark_applied, get_email_leads,
)

st.title("📋 Job Review")

init_db(DEFAULT_DB)

_email_leads = get_email_leads(DEFAULT_DB)

# Sort label -> (key function, reverse flag). Missing values sort as "" / 0.
_SORT_OPTIONS = {
    "Date Found (newest)": (lambda j: j.get("date_found") or "", True),
    "Date Found (oldest)": (lambda j: j.get("date_found") or "", False),
    "Match Score (high→low)": (lambda j: j.get("match_score") or 0, True),
    "Match Score (low→high)": (lambda j: j.get("match_score") or 0, False),
    "Company A–Z": (lambda j: (j.get("company") or "").lower(), False),
    "Title A–Z": (lambda j: (j.get("title") or "").lower(), False),
}

# ── Sidebar filters ────────────────────────────────────────────────────────────
with st.sidebar:
    st.header("Filters")
    show_status = st.selectbox(
        "Show",
        ["pending", "approved", "applied", "rejected", "synced"],
        index=0,
    )
    remote_only = st.checkbox("Remote only", value=False)
    min_score = st.slider("Min match score", 0, 100, 0)

    st.header("Sort")
    sort_by = st.selectbox("Sort by", list(_SORT_OPTIONS), index=0)

jobs = get_jobs_by_status(DEFAULT_DB, show_status)

# Email leads get their own section in the pending view, so drop them from the
# main list *before* counting. Otherwise "Showing N" over-counts, and the
# empty-list stop below would hide the leads whenever the filters remove
# every other pending job.
_show_leads = show_status == "pending" and bool(_email_leads)
if show_status == "pending":
    jobs = [j for j in jobs if j.get("source") != "email"]

if remote_only:
    jobs = [j for j in jobs if j.get("is_remote")]
if min_score > 0:
    jobs = [j for j in jobs if (j.get("match_score") or 0) >= min_score]

# Apply sort
_sort_key, _sort_desc = _SORT_OPTIONS[sort_by]
jobs = sorted(jobs, key=_sort_key, reverse=_sort_desc)

# Stop only when there is truly nothing to render (no cards and no leads).
if not jobs and not _show_leads:
    st.info(f"No {show_status} jobs matching your filters.")
    st.stop()

st.caption(f"Showing {len(jobs)} {show_status} job{'s' if len(jobs) != 1 else ''}")
st.divider()

# ── Email leads (pending view only) ────────────────────────────────────────────
if _show_leads:
    st.subheader(f"📧 Email Leads ({len(_email_leads)})")
    st.caption(
        "Inbound recruiter emails not yet matched to a scraped listing. "
        "Approve to add to Job Review; Reject to dismiss."
    )
    for lead in _email_leads:
        lead_id = lead["id"]
        with st.container(border=True):
            left_l, right_l = st.columns([7, 3])
            with left_l:
                st.markdown(f"**{lead['title']}** — {lead['company']}")
                badge_cols = st.columns(4)
                badge_cols[0].caption("📧 Email Lead")
                badge_cols[1].caption(f"📅 {lead.get('date_found', '')}")
                if lead.get("description"):
                    with st.expander("📄 Email excerpt", expanded=False):
                        st.text(lead["description"][:500])
            with right_l:
                if st.button("✅ Approve", key=f"el_approve_{lead_id}",
                             type="primary", use_container_width=True):
                    update_job_status(DEFAULT_DB, [lead_id], "approved")
                    st.rerun()
                if st.button("❌ Reject", key=f"el_reject_{lead_id}",
                             use_container_width=True):
                    update_job_status(DEFAULT_DB, [lead_id], "rejected")
                    st.rerun()
    st.divider()

# ── Job cards ──────────────────────────────────────────────────────────────────
for job in jobs:
    job_id = job["id"]

    # Traffic-light badge for the match score (None means "not scored yet").
    score = job.get("match_score")
    if score is None:
        score_badge = "⬜ No score"
    elif score >= 70:
        score_badge = f"🟢 {score:.0f}%"
    elif score >= 40:
        score_badge = f"🟡 {score:.0f}%"
    else:
        score_badge = f"🔴 {score:.0f}%"

    remote_badge = "🌐 Remote" if job.get("is_remote") else "🏢 On-site"
    src = (job.get("source") or "").lower()
    source_badge = f"🤖 {src.title()}" if src == "linkedin" else f"👤 {src.title() or 'Manual'}"

    with st.container(border=True):
        left, right = st.columns([7, 3])

        # ── Left: job info ─────────────────────────────────────────────────────
        with left:
            st.markdown(f"**{job['title']}** — {job['company']}")

            badge_cols = st.columns(4)
            badge_cols[0].caption(remote_badge)
            badge_cols[1].caption(source_badge)
            badge_cols[2].caption(score_badge)
            badge_cols[3].caption(f"📅 {job.get('date_found', '')}")

            if job.get("keyword_gaps"):
                st.caption(f"**Keyword gaps:** {job['keyword_gaps']}")

            # Cover letter expander (approved view)
            if show_status == "approved":
                # The draft lives in session state under the same key as the
                # text area, seeded once from the stored cover letter.
                _cl_key = f"cl_{job_id}"
                if _cl_key not in st.session_state:
                    st.session_state[_cl_key] = job.get("cover_letter") or ""

                cl_exists = bool(st.session_state[_cl_key])
                with st.expander("📝 Cover Letter", expanded=cl_exists):
                    gen_label = "Regenerate" if cl_exists else "Generate Cover Letter"
                    if st.button(gen_label, key=f"gen_{job_id}"):
                        with st.spinner("Generating via LLM…"):
                            try:
                                from scripts.generate_cover_letter import generate as _gen
                                st.session_state[_cl_key] = _gen(
                                    job.get("title", ""),
                                    job.get("company", ""),
                                    job.get("description", ""),
                                )
                                st.rerun()
                            except Exception as e:
                                st.error(f"Generation failed: {e}")

                    st.text_area(
                        "cover_letter_edit",
                        key=_cl_key,
                        height=300,
                        label_visibility="collapsed",
                    )
                    save_col, _ = st.columns([2, 5])
                    if save_col.button("💾 Save draft", key=f"save_cl_{job_id}"):
                        update_cover_letter(DEFAULT_DB, job_id, st.session_state[_cl_key])
                        st.success("Saved!")

            # Applied date + cover letter preview (applied/synced)
            if show_status in ("applied", "synced") and job.get("applied_at"):
                st.caption(f"✅ Applied: {job['applied_at']}")
            if show_status in ("applied", "synced") and job.get("cover_letter"):
                with st.expander("📝 Cover Letter (sent)"):
                    st.text(job["cover_letter"])

        # ── Right: actions ─────────────────────────────────────────────────────
        with right:
            if job.get("url"):
                st.link_button("View listing →", job["url"], use_container_width=True)
            if job.get("salary"):
                st.caption(f"💰 {job['salary']}")

            if show_status == "pending":
                if st.button("✅ Approve", key=f"approve_{job_id}",
                             type="primary", use_container_width=True):
                    update_job_status(DEFAULT_DB, [job_id], "approved")
                    st.rerun()
                if st.button("❌ Reject", key=f"reject_{job_id}",
                             use_container_width=True):
                    update_job_status(DEFAULT_DB, [job_id], "rejected")
                    st.rerun()

            elif show_status == "approved":
                if st.button("🚀 Apply →", key=f"apply_page_{job_id}",
                             type="primary", use_container_width=True):
                    st.session_state["apply_job_id"] = job_id
                    st.switch_page("pages/4_Apply.py")
                if st.button("✅ Mark Applied", key=f"applied_{job_id}",
                             use_container_width=True):
                    # Persist any unsaved draft before changing the status.
                    cl_text = st.session_state.get(f"cl_{job_id}", "")
                    if cl_text:
                        update_cover_letter(DEFAULT_DB, job_id, cl_text)
                    mark_applied(DEFAULT_DB, [job_id])
                    st.rerun()
def load_yaml(path: Path) -> dict:
    """Return the parsed YAML content of *path*, or ``{}``.

    An absent file and an empty document both yield an empty dict. The file
    is read as UTF-8 so it round-trips with ``save_yaml`` regardless of the
    platform's default encoding.
    """
    if not path.exists():
        return {}
    return yaml.safe_load(path.read_text(encoding="utf-8")) or {}


def save_yaml(path: Path, data: dict) -> None:
    """Serialise *data* to *path* as block-style YAML.

    ``allow_unicode=True`` makes the dump emit non-ASCII characters as-is,
    so the file is written explicitly as UTF-8 rather than with the locale
    default encoding.
    """
    path.write_text(
        yaml.dump(data, default_flow_style=False, allow_unicode=True),
        encoding="utf-8",
    )


def _parse_suggestions(raw: str) -> dict:
    """Extract the first JSON object in *raw* and normalise it.

    Always returns a dict with exactly the keys ``suggested_titles`` and
    ``suggested_excludes``, each a list of non-empty, stripped strings.
    Anything that is not a list of strings is dropped, so callers can call
    string methods on the items without checking them.
    """
    import json

    keys = ("suggested_titles", "suggested_excludes")
    decoder = json.JSONDecoder()

    # Try each "{" in turn: raw_decode stops at the end of the first complete
    # object, so prose (even prose containing braces) after the JSON is ignored.
    start = raw.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(raw, start)
        except ValueError:
            start = raw.find("{", start + 1)
            continue
        result = {}
        for key in keys:
            items = obj.get(key)
            if not isinstance(items, list):
                items = []
            result[key] = [s.strip() for s in items if isinstance(s, str) and s.strip()]
        return result

    return {key: [] for key in keys}


def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
    """Ask the LLM for additional job titles and exclusion keywords.

    Args:
        current_titles: Titles already being searched; listed in the prompt.
        resume_path: Resume YAML. When it exists, up to the first three
            ``experience_details`` entries (position, company, up to five
            skills each) are summarised as background for the prompt;
            otherwise a generic fallback description is used.

    Returns:
        ``{"suggested_titles": [...], "suggested_excludes": [...]}``. Both
        lists are empty when the reply contains no parseable JSON object.
    """
    from scripts.llm_router import LLMRouter

    resume_context = ""
    if resume_path.exists():
        resume = load_yaml(resume_path)
        lines = []
        for exp in (resume.get("experience_details") or [])[:3]:
            pos = exp.get("position", "")
            co = exp.get("company", "")
            skills = ", ".join((exp.get("skills_acquired") or [])[:5])
            lines.append(f"- {pos} at {co}: {skills}")
        resume_context = "\n".join(lines)

    titles_str = "\n".join(f"- {t}" for t in current_titles)
    prompt = f"""You are helping a job seeker optimize their search criteria.

Their background (from resume):
{resume_context or "Customer success and technical account management leader"}

Current job titles being searched:
{titles_str}

Suggest:
1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)

Return ONLY valid JSON in this exact format:
{{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""

    result = LLMRouter().complete(prompt).strip()
    return _parse_suggestions(result)
in titles_text.splitlines() if t.strip()] + with st.spinner("Asking LLM for suggestions…"): + suggestions = _suggest_search_terms(current, RESUME_PATH) + st.session_state["_sp_suggestions"] = suggestions + + if st.session_state.get("_sp_suggestions"): + sugg = st.session_state["_sp_suggestions"] + s_titles = sugg.get("suggested_titles", []) + s_excl = sugg.get("suggested_excludes", []) + + existing_titles = {t.lower() for t in titles_text.splitlines() if t.strip()} + existing_excl = {e.lower() for e in st.session_state.get("_sp_excludes", "").splitlines() if e.strip()} + + if s_titles: + st.caption("**Suggested titles** — click to add:") + cols = st.columns(min(len(s_titles), 4)) + for i, title in enumerate(s_titles): + with cols[i % 4]: + if title.lower() not in existing_titles: + if st.button(f"+ {title}", key=f"sp_add_title_{i}"): + st.session_state["_sp_titles"] = ( + st.session_state.get("_sp_titles", "").rstrip("\n") + f"\n{title}" + ) + st.rerun() + else: + st.caption(f"✓ {title}") + + if s_excl: + st.caption("**Suggested exclusions** — click to add:") + cols2 = st.columns(min(len(s_excl), 4)) + for i, kw in enumerate(s_excl): + with cols2[i % 4]: + if kw.lower() not in existing_excl: + if st.button(f"+ {kw}", key=f"sp_add_excl_{i}"): + st.session_state["_sp_excludes"] = ( + st.session_state.get("_sp_excludes", "").rstrip("\n") + f"\n{kw}" + ) + st.rerun() + else: + st.caption(f"✓ {kw}") + + if st.button("✕ Clear suggestions", key="sp_clear_sugg"): + st.session_state.pop("_sp_suggestions", None) + st.rerun() + + st.subheader("Locations") + locations_text = st.text_area( + "One location per line", + value="\n".join(p.get("locations", [])), + height=100, + ) + + st.subheader("Exclude Keywords") + st.caption("Jobs whose **title or description** contain any of these words are silently dropped before entering the queue. Case-insensitive.") + exclude_text = st.text_area( + "One keyword or phrase per line", + key="_sp_excludes", + height=150, + help="e.g. 
'sales', 'account executive', 'SDR'", + ) + + st.subheader("Job Boards") + board_options = ["linkedin", "indeed", "glassdoor", "zip_recruiter", "google"] + selected_boards = st.multiselect( + "Standard boards (via JobSpy)", board_options, + default=[b for b in p.get("boards", board_options) if b in board_options], + help="Google Jobs aggregates listings from many sources and often finds roles the other boards miss.", + ) + + _custom_board_options = ["adzuna", "theladders"] + _custom_board_labels = { + "adzuna": "Adzuna (free API — requires app_id + app_key in config/adzuna.yaml)", + "theladders": "The Ladders (curl_cffi scraper — $100K+ roles, requires curl_cffi)", + } + st.caption("**Custom boards** — scrapers built into this app, not part of JobSpy.") + selected_custom = st.multiselect( + "Custom boards", + options=_custom_board_options, + default=[b for b in p.get("custom_boards", []) if b in _custom_board_options], + format_func=lambda b: _custom_board_labels.get(b, b), + ) + + col1, col2 = st.columns(2) + results_per = col1.slider("Results per board", 5, 100, p.get("results_per_board", 25)) + hours_old = col2.slider("How far back to look (hours)", 24, 720, p.get("hours_old", 72)) + + if st.button("💾 Save search settings", type="primary"): + profiles[0] = { + **p, + "titles": [t.strip() for t in titles_text.splitlines() if t.strip()], + "locations": [loc.strip() for loc in locations_text.splitlines() if loc.strip()], + "boards": selected_boards, + "custom_boards": selected_custom, + "results_per_board": results_per, + "hours_old": hours_old, + "exclude_keywords": [k.strip() for k in exclude_text.splitlines() if k.strip()], + } + save_yaml(SEARCH_CFG, {"profiles": profiles}) + st.session_state["_sp_hash"] = "" # force re-seed on next load + st.session_state.pop("_sp_suggestions", None) + st.success("Search settings saved!") + + st.divider() + + # ── Blocklist ────────────────────────────────────────────────────────────── + with st.expander("🚫 Blocklist — 
companies, industries, and locations I will never work at", expanded=False): + st.caption( + "Listings matching any rule below are **silently dropped before entering the review queue**, " + "across all search profiles and custom boards. Changes take effect on the next discovery run." + ) + bl = load_yaml(BLOCKLIST_CFG) + + bl_companies = st.text_area( + "Company names (partial match, one per line)", + value="\n".join(bl.get("companies", [])), + height=120, + help="e.g. 'Amazon' blocks any listing where the company name contains 'amazon' (case-insensitive).", + key="bl_companies", + ) + bl_industries = st.text_area( + "Industry / content keywords (one per line)", + value="\n".join(bl.get("industries", [])), + height=100, + help="Blocked if the keyword appears in the company name OR job description. " + "e.g. 'gambling', 'crypto', 'tobacco', 'defense contractor'.", + key="bl_industries", + ) + bl_locations = st.text_area( + "Location strings to exclude (one per line)", + value="\n".join(bl.get("locations", [])), + height=80, + help="e.g. 
def _ollama_api_root(base_url: str) -> str:
    """Return *base_url* without trailing slashes or a trailing ``/v1`` segment."""
    # str.rstrip("/v1") would strip any run of the characters "/", "v" and "1",
    # which also eats the end of ports or hosts such as ":11431" or "gpu-srv".
    return base_url.rstrip("/").removesuffix("/v1").rstrip("/")


def _ollama_models(base_url: str) -> list[str]:
    """Fetch installed model names from the Ollama /api/tags endpoint.

    Args:
        base_url: Backend URL as configured; a trailing ``/v1`` is removed
            before ``/api/tags`` is appended.

    Returns:
        The ``name`` of each entry under ``models`` in the JSON response, or
        an empty list when the request fails, is not OK, or cannot be parsed.
    """
    try:
        r = _req.get(_ollama_api_root(base_url) + "/api/tags", timeout=2)
        if r.ok:
            return [m["name"] for m in r.json().get("models", [])]
    except Exception:
        # Best effort: the caller falls back to a manual text input.
        pass
    return []
" + "First enabled + reachable backend wins on each call.") + + updated_backends = {} + + for name in all_names: + b = backends.get(name, {}) + enabled = b.get("enabled", True) + label = name.replace("_", " ").title() + pos = new_order.index(name) + 1 if name in new_order else "—" + header = f"{'🟢' if enabled else '⚫'} **{pos}. {label}**" + + with st.expander(header, expanded=False): + col_tog, col_up, col_dn, col_spacer = st.columns([2, 1, 1, 4]) + + new_enabled = col_tog.checkbox("Enabled", value=enabled, key=f"{name}_enabled") + + # Up / Down only apply to backends currently in the order + if name in new_order: + idx = new_order.index(name) + if col_up.button("↑", key=f"{name}_up", disabled=idx == 0): + new_order[idx], new_order[idx - 1] = new_order[idx - 1], new_order[idx] + st.session_state["_llm_order"] = new_order + st.rerun() + if col_dn.button("↓", key=f"{name}_dn", disabled=idx == len(new_order) - 1): + new_order[idx], new_order[idx + 1] = new_order[idx + 1], new_order[idx] + st.session_state["_llm_order"] = new_order + st.rerun() + + if b.get("type") == "openai_compat": + url = st.text_input("URL", value=b.get("base_url", ""), key=f"{name}_url") + + # Ollama gets a live model picker; other backends get a text input + if name == "ollama": + ollama_models = _ollama_models(b.get("base_url", "http://localhost:11434")) + current_model = b.get("model", "") + if ollama_models: + options = ollama_models + idx_default = options.index(current_model) if current_model in options else 0 + model = st.selectbox( + "Model", + options, + index=idx_default, + key=f"{name}_model", + help="Lists models currently installed in Ollama. 
Pull new ones with `ollama pull `.", + ) + else: + st.caption("_Ollama not reachable — enter model name manually_") + model = st.text_input("Model", value=current_model, key=f"{name}_model") + else: + model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model") + + updated_backends[name] = {**b, "base_url": url, "model": model, "enabled": new_enabled} + elif b.get("type") == "anthropic": + model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model") + updated_backends[name] = {**b, "model": model, "enabled": new_enabled} + else: + updated_backends[name] = {**b, "enabled": new_enabled} + + if b.get("type") == "openai_compat": + if st.button(f"Test connection", key=f"test_{name}"): + with st.spinner("Testing…"): + try: + from scripts.llm_router import LLMRouter + r = LLMRouter() + reachable = r._is_reachable(b.get("base_url", "")) + if reachable: + st.success("Reachable ✓") + else: + st.warning("Not reachable ✗") + except Exception as e: + st.error(f"Error: {e}") + + st.divider() + st.caption("Current priority: " + " → ".join( + f"{'✓' if backends.get(n, {}).get('enabled', True) else '✗'} {n}" + for n in new_order + )) + + if st.button("💾 Save LLM settings", type="primary"): + save_yaml(LLM_CFG, {**cfg, "backends": updated_backends, "fallback_order": new_order}) + st.session_state.pop("_llm_order", None) + st.session_state.pop("_llm_order_cfg_key", None) + st.success("LLM settings saved!") + +# ── Notion tab ──────────────────────────────────────────────────────────────── +with tab_notion: + cfg = load_yaml(NOTION_CFG) if NOTION_CFG.exists() else {} + + st.subheader("Notion Connection") + token = st.text_input( + "Integration Token", + value=cfg.get("token", ""), + type="password", + help="Find this at notion.so/my-integrations → your integration → Internal Integration Token", + ) + db_id = st.text_input( + "Database ID", + value=cfg.get("database_id", ""), + help="The 32-character ID from your Notion database URL", + ) + + col_save, 
col_test = st.columns(2) + if col_save.button("💾 Save Notion settings", type="primary"): + save_yaml(NOTION_CFG, {**cfg, "token": token, "database_id": db_id}) + st.success("Notion settings saved!") + + if col_test.button("🔌 Test connection"): + with st.spinner("Connecting…"): + try: + from notion_client import Client + n = Client(auth=token) + db = n.databases.retrieve(db_id) + st.success(f"Connected to: **{db['title'][0]['plain_text']}**") + except Exception as e: + st.error(f"Connection failed: {e}") + +# ── Services tab ─────────────────────────────────────────────────────────────── +with tab_services: + import socket + import subprocess as _sp + + TOKENS_CFG = CONFIG_DIR / "tokens.yaml" + PFP_DIR = Path("/Library/Documents/Post Fight Processing") + + # Service definitions: (display_name, port, start_cmd, stop_cmd, notes) + SERVICES = [ + { + "name": "Streamlit UI", + "port": 8501, + "start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-ui.sh"), "start"], + "stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-ui.sh"), "stop"], + "cwd": str(Path(__file__).parent.parent.parent), + "note": "Job Seeker web interface", + }, + { + "name": "Ollama (local LLM)", + "port": 11434, + "start": ["sudo", "systemctl", "start", "ollama"], + "stop": ["sudo", "systemctl", "stop", "ollama"], + "cwd": "/", + "note": "Local inference engine — systemd service", + }, + { + "name": "Claude Code Wrapper", + "port": 3009, + "start": ["bash", str(PFP_DIR / "manage-services.sh"), "start"], + "stop": ["bash", str(PFP_DIR / "manage-services.sh"), "stop"], + "cwd": str(PFP_DIR), + "note": "OpenAI-compat proxy → Claude Code (port 3009)", + }, + { + "name": "GitHub Copilot Wrapper", + "port": 3010, + "start": ["bash", str(PFP_DIR / "manage-copilot.sh"), "start"], + "stop": ["bash", str(PFP_DIR / "manage-copilot.sh"), "stop"], + "cwd": str(PFP_DIR), + "note": "OpenAI-compat proxy → GitHub Copilot (port 3010)", + }, + { + "name": "vLLM Server", + 
def _port_open(port: int) -> bool:
    """Report whether a TCP connection to 127.0.0.1:*port* succeeds.

    Attempts the connection with a one-second timeout and closes it
    immediately. Any ``OSError`` counts as "not open".
    """
    target = ("127.0.0.1", port)
    try:
        with socket.create_connection(target, timeout=1):
            reachable = True
    except OSError:
        reachable = False
    return reachable
vLLM) + if "model_dir" in svc: + _mdir = Path(svc["model_dir"]) + _models = ( + sorted(d.name for d in _mdir.iterdir() if d.is_dir()) + if _mdir.exists() else [] + ) + _mk = f"svc_model_{svc['port']}" + _loaded_file = Path("/tmp/vllm-server.model") + _loaded = _loaded_file.read_text().strip() if (_loaded_file.exists()) else "" + if _models: + _default = _models.index(_loaded) if _loaded in _models else 0 + st.selectbox( + "Model", + _models, + index=_default, + key=_mk, + disabled=up, + help="Model to load on start. Stop then Start to swap models.", + ) + else: + st.caption(f"_No models found in {svc['model_dir']}_") + + with right_col: + if svc["start"] is None: + st.caption("_Manual start only_") + elif up: + if st.button("⏹ Stop", key=f"svc_stop_{svc['port']}", use_container_width=True): + with st.spinner(f"Stopping {svc['name']}…"): + r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"]) + if r.returncode == 0: + st.success("Stopped.") + else: + st.error(f"Error: {r.stderr or r.stdout}") + st.rerun() + else: + # Build start command, appending selected model for services with model_dir + _start_cmd = list(svc["start"]) + if "model_dir" in svc: + _sel = st.session_state.get(f"svc_model_{svc['port']}") + if _sel: + _start_cmd.append(_sel) + if st.button("▶ Start", key=f"svc_start_{svc['port']}", use_container_width=True, type="primary"): + with st.spinner(f"Starting {svc['name']}…"): + r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"]) + if r.returncode == 0: + st.success("Started!") + else: + st.error(f"Error: {r.stderr or r.stdout}") + st.rerun() + + st.divider() + st.subheader("🤗 Hugging Face") + st.caption( + "Used for uploading training data and running fine-tune jobs on HF infrastructure. " + "Token is stored in `config/tokens.yaml` (git-ignored). " + "Create a **write-permission** token at huggingface.co/settings/tokens." 
def _field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str:
    """Render a text input, preceded by a warning when the value needs filling in.

    A value needs attention when it is empty (including ``None``) or starts
    with the ``FILL_IN`` placeholder prefix.

    Args:
        label: Label shown on the input.
        value: Current value; falsy values are shown as an empty input and
            other values are converted to text.
        key: Streamlit widget key.
        help: Optional tooltip text. The name shadows the builtin but is kept
            because callers pass it by keyword.
        password: When true, the input is rendered as a password field.

    Returns:
        Whatever ``st.text_input`` returns for this widget.
    """
    # Flag exactly the cases where the rendered input is blank or a placeholder.
    needs_attention = not value or str(value).startswith("FILL_IN")
    if needs_attention:
        # Constant inline HTML (no user data) so the notice can be coloured.
        st.markdown(
            '<span style="color:#d97706;font-size:0.85em">⚠️ Needs attention</span>',
            unsafe_allow_html=True,
        )
    return st.text_input(label, value=str(value) if value else "", key=key, help=help,
                         type="password" if password else "default")
01/2022 - Present") + with _ec2: + _loc = st.text_input("Location", _exp.get("location", ""), key=f"rp_loc_{_i}") + _ind = st.text_input("Industry", _exp.get("industry", ""), key=f"rp_ind_{_i}") + # Each stored responsibility is a one-key dict (responsibility_N); read its value + # whatever N is, so entries saved with a repeated key still load instead of showing blank. + _resp_raw = st.text_area( + "Key Responsibilities (one per line)", + value="\n".join( + str(next(iter(r.values()), "") or "") if isinstance(r, dict) else str(r) + for r in (_exp.get("key_responsibilities") or []) + ), + key=f"rp_resp_{_i}", height=100, + ) + _skills_raw = st.text_input( + "Skills (comma-separated)", + value=", ".join(_exp.get("skills_acquired", [])), + key=f"rp_skills_{_i}", + ) + _updated_exp.append({ + "position": _pos, "company": _co, "employment_period": _period, + "location": _loc, "industry": _ind, + # Number the keys 1..N (responsibility_1, responsibility_2, …) instead of + # repeating responsibility_1 for every line, which lost lines 2+ on reload. + "key_responsibilities": [ + {f"responsibility_{_n}": _line} + for _n, _line in enumerate(filter(None, map(str.strip, _resp_raw.splitlines())), start=1) + ], + "skills_acquired": [s.strip() for s in _skills_raw.split(",") if s.strip()], + }) + st.divider() + + # ── Preferences ─────────────────────────────────────────────────────────── + with st.expander("⚙️ Preferences & Availability"): + _wp = _data.get("work_preferences", {}) + _sal = _data.get("salary_expectations", {}) + _avail = _data.get("availability", {}) + _pc1, _pc2 = st.columns(2) + with _pc1: + _salary_range = st.text_input("Salary Range (USD)", _sal.get("salary_range_usd", ""), + key="rp_salary", help="e.g. 
120000 - 180000") + _notice = st.text_input("Notice Period", _avail.get("notice_period", "2 weeks"), key="rp_notice") + with _pc2: + _remote = st.checkbox("Open to Remote", value=_wp.get("remote_work", "Yes") == "Yes", key="rp_remote") + _reloc = st.checkbox("Open to Relocation", value=_wp.get("open_to_relocation", "No") == "Yes", key="rp_reloc") + _assessments = st.checkbox("Willing to complete assessments", + value=_wp.get("willing_to_complete_assessments", "Yes") == "Yes", key="rp_assess") + _bg = st.checkbox("Willing to undergo background checks", + value=_wp.get("willing_to_undergo_background_checks", "Yes") == "Yes", key="rp_bg") + + # ── Self-ID ─────────────────────────────────────────────────────────────── + with st.expander("🏳️‍🌈 Self-Identification (optional)"): + _sid = _data.get("self_identification", {}) + _sc1, _sc2 = st.columns(2) + with _sc1: + _gender = st.text_input("Gender identity", _sid.get("gender", "Non-binary"), key="rp_gender") + _pronouns = st.text_input("Pronouns", _sid.get("pronouns", "Any"), key="rp_pronouns") + _ethnicity = _field("Ethnicity", _sid.get("ethnicity", ""), "rp_ethnicity") + with _sc2: + # A stored value outside the option list (hand-edited YAML, FILL_IN placeholder) + # would make list.index() raise ValueError and break the whole tab; show + # "Prefer not to say" for such values instead. + _vet_opts = ["No", "Yes", "Prefer not to say"] + _vet_saved = _sid.get("veteran", "No") + _veteran = st.selectbox("Veteran status", _vet_opts, + index=_vet_opts.index(_vet_saved if _vet_saved in _vet_opts else "Prefer not to say"), + key="rp_vet") + _dis_opts = ["Prefer not to say", "No", "Yes"] + _dis_saved = _sid.get("disability", "Prefer not to say") + _disability = st.selectbox("Disability disclosure", _dis_opts, + index=_dis_opts.index(_dis_saved if _dis_saved in _dis_opts else "Prefer not to say"), + key="rp_dis") + + st.divider() + if st.button("💾 Save Resume Profile", type="primary", use_container_width=True, key="rp_save"): + _data["personal_information"] = { + **_data.get("personal_information", {}), + "name": _name, "surname": _surname, "email": _email, "phone": _phone, + "city": _city, "zip_code": _zip_code, "linkedin": _linkedin, "date_of_birth": _dob, + } + _data["experience_details"] = _updated_exp + _data["salary_expectations"] = {"salary_range_usd": _salary_range} +
_data["availability"] = {"notice_period": _notice} + _data["work_preferences"] = { + **_data.get("work_preferences", {}), + "remote_work": "Yes" if _remote else "No", + "open_to_relocation": "Yes" if _reloc else "No", + "willing_to_complete_assessments": "Yes" if _assessments else "No", + "willing_to_undergo_background_checks": "Yes" if _bg else "No", + } + _data["self_identification"] = { + "gender": _gender, "pronouns": _pronouns, "veteran": _veteran, + "disability": _disability, "ethnicity": _ethnicity, + } + RESUME_PATH.write_text(yaml.dump(_data, default_flow_style=False, allow_unicode=True)) + st.success("✅ Resume profile saved!") + st.balloons() + +# ── Email tab ───────────────────────────────────────────────────────────────── +with tab_email: + EMAIL_CFG = CONFIG_DIR / "email.yaml" + EMAIL_EXAMPLE = CONFIG_DIR / "email.yaml.example" + + st.caption( + "Connect Alex's email via IMAP to automatically associate recruitment " + "emails with job applications. Only emails that mention the company name " + "AND contain a recruitment keyword are ever imported — no personal emails " + "are touched." 
+ ) + + if not EMAIL_CFG.exists(): + st.info("No email config found — fill in your credentials below and click **Save** to create it.") + + em_cfg = load_yaml(EMAIL_CFG) if EMAIL_CFG.exists() else {} + + col_a, col_b = st.columns(2) + with col_a: + em_host = st.text_input("IMAP Host", em_cfg.get("host", "imap.gmail.com"), key="em_host") + em_port = st.number_input("Port", value=int(em_cfg.get("port", 993)), + min_value=1, max_value=65535, key="em_port") + em_ssl = st.checkbox("Use SSL", value=em_cfg.get("use_ssl", True), key="em_ssl") + with col_b: + em_user = st.text_input("Username (email address)", em_cfg.get("username", ""), key="em_user") + em_pass = st.text_input("Password / App Password", em_cfg.get("password", ""), + type="password", key="em_pass") + em_sent = st.text_input("Sent folder (blank = auto-detect)", + em_cfg.get("sent_folder", ""), key="em_sent", + placeholder='e.g. "[Gmail]/Sent Mail"') + + em_days = st.slider("Look-back window (days)", 14, 365, + int(em_cfg.get("lookback_days", 90)), key="em_days") + + st.caption( + "**Gmail users:** create an App Password at " + "myaccount.google.com/apppasswords (requires 2-Step Verification). " + "Enable IMAP at Gmail Settings → Forwarding and POP/IMAP." 
+ ) + + col_save, col_test = st.columns(2) + + if col_save.button("💾 Save email settings", type="primary", key="em_save"): + save_yaml(EMAIL_CFG, { + "host": em_host, "port": int(em_port), "use_ssl": em_ssl, + "username": em_user, "password": em_pass, + "sent_folder": em_sent, "lookback_days": int(em_days), + }) + EMAIL_CFG.chmod(0o600) + st.success("Saved!") + + if col_test.button("🔌 Test connection", key="em_test"): + with st.spinner("Connecting…"): + try: + import imaplib as _imap + _imap_cls = _imap.IMAP4_SSL if em_ssl else _imap.IMAP4 + # timeout: an unreachable host must not hang the page indefinitely. + # The context manager issues LOGOUT on exit, so a failed login no longer + # leaves the connection open. + with _imap_cls(em_host, int(em_port), timeout=10) as _conn: + _conn.login(em_user, em_pass) + _conn.capability() + st.success(f"Connected successfully to {em_host}") + except Exception as e: + st.error(f"Connection failed: {e}") + +# ── Skills & Keywords tab ───────────────────────────────────────────────────── +with tab_skills: + st.subheader("🏷️ Skills & Keywords") + st.caption( + "These are matched against job descriptions to select Alex's most relevant " + "experience and highlight keyword overlap in the research brief."
+ ) + + if not KEYWORDS_CFG.exists(): + st.warning("resume_keywords.yaml not found — create it at config/resume_keywords.yaml") + else: + # An empty file may load as None rather than a dict — guard before .get(). + kw_data = load_yaml(KEYWORDS_CFG) or {} + + changed = False + for category in ["skills", "domains", "keywords"]: + st.markdown(f"**{category.title()}**") + # A key with no value may come back as None, and duplicate tags would collide on + # the rm_{category}_{tag} widget key below; normalise to a de-duplicated list. + tags: list[str] = list(dict.fromkeys(kw_data.get(category) or [])) + + if not tags: + st.caption("No tags yet — add one below.") + + # Render existing tags as removable chips (value-based keys for stability) + n_cols = min(max(len(tags), 1), 6) + cols = st.columns(n_cols) + to_remove = None + for i, tag in enumerate(tags): + with cols[i % n_cols]: + if st.button(f"× {tag}", key=f"rm_{category}_{tag}", use_container_width=True): + to_remove = tag + if to_remove: + tags.remove(to_remove) + kw_data[category] = tags + changed = True + + # Add new tag + new_col, btn_col = st.columns([4, 1]) + new_tag = new_col.text_input( + "Add", + key=f"new_{category}", + label_visibility="collapsed", + placeholder=f"Add {category[:-1] if category.endswith('s') else category}…", + ) + if btn_col.button("＋ Add", key=f"add_{category}"): + tag = new_tag.strip() + if tag and tag not in tags: + tags.append(tag) + kw_data[category] = tags + changed = True + + st.markdown("---") + + if changed: + save_yaml(KEYWORDS_CFG, kw_data) + st.success("Saved.") + st.rerun() diff --git a/app/pages/3_Resume_Editor.py b/app/pages/3_Resume_Editor.py new file mode 100644 index 0000000..092c2a3 --- /dev/null +++ b/app/pages/3_Resume_Editor.py @@ -0,0 +1,191 @@ +# app/pages/3_Resume_Editor.py +""" +Resume Editor — form-based editor for Alex's AIHawk profile YAML. +FILL_IN fields highlighted in amber.
+""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +import yaml + +st.set_page_config(page_title="Resume Editor", page_icon="📝", layout="wide") +st.title("📝 Resume Editor") +st.caption("Edit Alex's application profile used by AIHawk for LinkedIn Easy Apply.") + +RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" + +if not RESUME_PATH.exists(): + st.error(f"Resume file not found at `{RESUME_PATH}`. Is AIHawk cloned?") + st.stop() + +data = yaml.safe_load(RESUME_PATH.read_text()) or {} + + +def field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str: + """Render a text input, highlighted amber if value is FILL_IN or empty.""" + needs_attention = str(value).startswith("FILL_IN") or value == "" + if needs_attention: + st.markdown( + '

⚠️ Needs your attention

We need a CSM with Salesforce.

" + mock_get.return_value.raise_for_status = MagicMock() + result = extract_job_description("https://example.com/job/123") + + assert "CSM" in result + assert "Salesforce" in result + + +def test_score_is_between_0_and_100(): + """match_score returns a float in [0, 100].""" + from scripts.match import match_score + + # Provide minimal inputs that the scorer can handle + score, gaps = match_score( + resume_text="Customer Success Manager with Salesforce experience", + job_text="Looking for a Customer Success Manager who knows Salesforce and Gainsight", + ) + assert 0 <= score <= 100 + assert isinstance(gaps, list) + + +def test_write_score_to_notion(): + """write_match_to_notion updates the Notion page with score and gaps.""" + from scripts.match import write_match_to_notion + + mock_notion = MagicMock() + write_match_to_notion(mock_notion, "page-id-abc", 85.5, ["Gainsight", "Churnzero"]) + + mock_notion.pages.update.assert_called_once() + call_kwargs = mock_notion.pages.update.call_args[1] + assert call_kwargs["page_id"] == "page-id-abc" + score_val = call_kwargs["properties"]["Match Score"]["number"] + assert score_val == 85.5 +``` + +**Step 2: Run tests to verify they fail** + +```bash +conda run -n job-seeker pytest tests/test_match.py -v +``` + +Expected: `ImportError` — `scripts.match` doesn't exist. + +**Step 3: Write scripts/match.py** + +```python +# scripts/match.py +""" +Resume Matcher integration: score a Notion job listing against Alex's resume. +Writes Match Score and Keyword Gaps back to the Notion page. 
+ +Usage: + conda run -n job-seeker python scripts/match.py +""" +import re +import sys +from pathlib import Path + +import requests +import yaml +from bs4 import BeautifulSoup +from notion_client import Client + +CONFIG_DIR = Path(__file__).parent.parent / "config" +RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf") + + +def load_notion() -> tuple[Client, str]: + cfg = yaml.safe_load((CONFIG_DIR / "notion.yaml").read_text()) + return Client(auth=cfg["token"]), cfg["database_id"] + + +def extract_page_id(url_or_id: str) -> str: + """Extract 32-char Notion page ID from a URL or return as-is.""" + match = re.search(r"[0-9a-f]{32}", url_or_id.replace("-", "")) + if match: + return match.group(0) + return url_or_id.strip() + + +def get_job_url_from_notion(notion: Client, page_id: str) -> str: + page = notion.pages.retrieve(page_id) + return page["properties"]["URL"]["url"] + + +def extract_job_description(url: str) -> str: + """Fetch a job listing URL and return its visible text.""" + resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + for tag in soup(["script", "style", "nav", "header", "footer"]): + tag.decompose() + return " ".join(soup.get_text(separator=" ").split()) + + +def read_resume_text() -> str: + """Extract text from the ATS-clean PDF resume.""" + try: + import pypdf + reader = pypdf.PdfReader(str(RESUME_PATH)) + return " ".join(page.extract_text() or "" for page in reader.pages) + except ImportError: + import PyPDF2 + with open(RESUME_PATH, "rb") as f: + reader = PyPDF2.PdfReader(f) + return " ".join(p.extract_text() or "" for p in reader.pages) + + +def match_score(resume_text: str, job_text: str) -> tuple[float, list[str]]: + """ + Score resume against job description using TF-IDF keyword overlap. + Returns (score 0-100, list of keywords in job not found in resume). 
+ """ + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics.pairwise import cosine_similarity + import numpy as np + + vectorizer = TfidfVectorizer(stop_words="english", max_features=200) + tfidf = vectorizer.fit_transform([resume_text, job_text]) + score = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]) * 100 + + # Keyword gap: the highest-weighted job terms whose resume weight is zero. + # Both sides are compared in the vectorizer's own token space; matching against raw + # whitespace tokens reported false gaps whenever the resume had punctuation ("salesforce,"). + feature_names = vectorizer.get_feature_names_out() + resume_tfidf = tfidf[0].toarray()[0] + job_tfidf = tfidf[1].toarray()[0] + top_indices = np.argsort(job_tfidf)[::-1][:30] + gaps = [feature_names[i] for i in top_indices if job_tfidf[i] > 0 and resume_tfidf[i] == 0][:10] + + return round(score, 1), gaps + + +def write_match_to_notion(notion: Client, page_id: str, score: float, gaps: list[str]) -> None: + notion.pages.update( + page_id=page_id, + properties={ + "Match Score": {"number": score}, + "Keyword Gaps": {"rich_text": [{"text": {"content": ", ".join(gaps)}}]}, + }, + ) + + +def run_match(page_url_or_id: str) -> None: + notion, _ = load_notion() + page_id = extract_page_id(page_url_or_id) + + print(f"[match] Page ID: {page_id}") + job_url = get_job_url_from_notion(notion, page_id) + print(f"[match] Fetching job description from: {job_url}") + + job_text = extract_job_description(job_url) + resume_text = read_resume_text() + + score, gaps = match_score(resume_text, job_text) + print(f"[match] Score: {score}/100") + print(f"[match] Keyword gaps: {', '.join(gaps) or 'none'}") + + write_match_to_notion(notion, page_id, score, gaps) + print("[match] Written to Notion.") + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python scripts/match.py NOTION_PAGE_URL_OR_ID") + sys.exit(1) + run_match(sys.argv[1]) +``` + +**Step 4: Install sklearn (needed by match.py)** + +```bash +conda run -n job-seeker pip install scikit-learn beautifulsoup4 pypdf
+``` + +**Step 5: Run tests** + +```bash +conda run -n job-seeker pytest tests/test_match.py -v +``` + +Expected: 3 tests PASS. + +**Step 6: Commit** + +```bash +cd /devl/job-seeker +git add scripts/match.py tests/test_match.py +git commit -m "feat: add resume match scoring with Notion write-back" +``` + +--- + +## Task 8: Clone and Configure AIHawk + +**Step 1: Clone AIHawk** + +```bash +cd /devl/job-seeker +git clone https://github.com/feder-cr/Auto_Jobs_Applier_AIHawk.git aihawk +``` + +**Step 2: Install AIHawk dependencies** + +> **Note:** CLAUDE.md forbids installing AIHawk's requirements into the `job-seeker` env (they pull torch + CUDA and cause OOM during test runs); use a separate `aihawk-env` instead of the command below. + +```bash +conda run -n job-seeker pip install -r /devl/job-seeker/aihawk/requirements.txt +``` + +**Step 3: Install Playwright browsers (AIHawk uses Playwright for browser automation)** + +```bash +conda run -n job-seeker playwright install chromium +``` + +**Step 4: Create AIHawk personal info config** + +AIHawk reads the applicant profile from `data_folder/plain_text_resume.yaml`. Back up the shipped template before editing it: + +```bash +cp /devl/job-seeker/aihawk/data_folder/plain_text_resume.yaml \ + /devl/job-seeker/aihawk/data_folder/plain_text_resume.yaml.bak +``` + +Edit `/devl/job-seeker/aihawk/data_folder/plain_text_resume.yaml` with Alex's info. +Key fields to fill: +- `personal_information`: name, email, phone, linkedin, github (leave blank), location +- `work_experience`: pull from the SVG content already extracted +- `education`: Texas State University, Mass Communications & PR, 2012-2015 +- `skills`: Zendesk, Intercom, Asana, Jira, etc. + +**Step 5: Configure AIHawk to use the LLM router** + +AIHawk's config (`aihawk/data_folder/config.yaml`) has `llm_model_type` and `llm_model` fields. +Set them to use the local OpenAI-compatible endpoint: + +```yaml +# In aihawk/data_folder/config.yaml +llm_model_type: openai +llm_model: claude-code-terminal +openai_api_url: http://localhost:3009/v1 # or whichever backend is running +``` + +If 3009 is down, change to `http://localhost:11434/v1` (Ollama).
+ +**Step 6: Run AIHawk in dry-run mode first** + +```bash +conda run -n job-seeker python /devl/job-seeker/aihawk/main.py --help +``` + +Review the flags. Start with a test run before enabling real submissions. + +**Step 7: Commit the environment update** + +```bash +cd /devl/job-seeker +conda env export -n job-seeker > environment.yml +git add environment.yml +git commit -m "chore: update environment.yml with all installed packages" +``` + +--- + +## Task 9: End-to-End Smoke Test + +**Step 1: Run full test suite** + +```bash +conda run -n job-seeker pytest tests/ -v +``` + +Expected: all tests PASS. + +**Step 2: Run discovery** + +```bash +conda run -n job-seeker python scripts/discover.py +``` + +Expected: new listings appear in Notion with Status=New. + +**Step 3: Run match on one listing** + +Copy the URL of a Notion page from the DB and run: + +```bash +conda run -n job-seeker python scripts/match.py "https://www.notion.so/..." +``` + +Expected: Match Score and Keyword Gaps written back to that Notion page. 
+ +**Step 4: Commit anything left** + +```bash +cd /devl/job-seeker +git status +git add -p # stage only code/config, not secrets +git commit -m "chore: final smoke test cleanup" +``` + +--- + +## Quick Reference + +| Command | What it does | +|---|---| +| `conda run -n job-seeker python scripts/discover.py` | Scrape boards → push new listings to Notion | +| `conda run -n job-seeker python scripts/match.py ` | Score a listing → write back to Notion | +| `conda run -n job-seeker streamlit run resume_matcher/streamlit_app.py --server.port 8501` | Open Resume Matcher UI | +| `conda run -n job-seeker pytest tests/ -v` | Run all tests | +| `cd "/Library/Documents/Post Fight Processing" && ./manage.sh start` | Start Claude Code pipeline (port 3009) | +| `cd "/Library/Documents/Post Fight Processing" && ./manage-copilot.sh start` | Start Copilot wrapper (port 3010) | diff --git a/docs/plans/2026-02-20-ui-design.md b/docs/plans/2026-02-20-ui-design.md new file mode 100644 index 0000000..3088b0a --- /dev/null +++ b/docs/plans/2026-02-20-ui-design.md @@ -0,0 +1,148 @@ +# Job Seeker Platform — Web UI Design + +**Date:** 2026-02-20 +**Status:** Approved + +## Overview + +A Streamlit multi-page web UI that gives Alex (and her partner) a friendly interface to review scraped job listings, curate them before they hit Notion, edit search/LLM/Notion settings, and fill out her AIHawk application profile. Designed to be usable by anyone — no technical knowledge required. + +--- + +## Architecture & Data Flow + +``` +discover.py → SQLite staging.db (status: pending) + ↓ + Streamlit UI + review / approve / reject + ↓ + "Sync N approved jobs" button + ↓ + Notion DB (status: synced) +``` + +`discover.py` is modified to write to SQLite instead of directly to Notion. +A new `sync.py` handles the approved → Notion push. +`db.py` provides shared SQLite helpers used by both scripts and UI pages. 
+ +### SQLite Schema (`staging.db`, gitignored) + +```sql +CREATE TABLE jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT, + company TEXT, + url TEXT UNIQUE, + source TEXT, + location TEXT, + is_remote INTEGER, + salary TEXT, + description TEXT, + match_score REAL, + keyword_gaps TEXT, + date_found TEXT, + status TEXT DEFAULT 'pending', -- pending / approved / rejected / synced + notion_page_id TEXT +); +``` + +--- + +## Pages + +### Home (Dashboard) +- Stat cards: Pending / Approved / Rejected / Synced counts +- "Run Discovery" button — runs `discover.py` as subprocess, streams output +- "Sync N approved jobs → Notion" button — visible only when approved count > 0 +- Recent activity list (last 10 jobs found) + +### Job Review +- Filterable table/card view of pending jobs +- Filters: source (LinkedIn/Indeed/etc), remote only toggle, minimum match score slider +- Checkboxes for batch selection +- "Approve Selected" / "Reject Selected" buttons +- Rejected jobs hidden by default, togglable +- Match score shown as colored badge (green ≥70, amber 40–69, red <40) + +### Settings +Three tabs: + +**Search** — edit `config/search_profiles.yaml`: +- Job titles (add/remove tags) +- Locations (add/remove) +- Boards checkboxes +- Hours old slider +- Results per board slider + +**LLM Backends** — edit `config/llm.yaml`: +- Fallback order (drag or up/down arrows) +- Per-backend: URL, model name, enabled toggle +- "Test connection" button per backend + +**Notion** — edit `config/notion.yaml`: +- Token field (masked, show/hide toggle) +- Database ID +- "Test connection" button + +### Resume Editor +Sectioned form over `aihawk/data_folder/plain_text_resume.yaml`: +- **Personal Info** — name, email, phone, LinkedIn, city, zip +- **Education** — list of entries, add/remove buttons +- **Experience** — list of entries, add/remove buttons +- **Skills & Interests** — tag-style inputs +- **Preferences** — salary range, notice period, remote/relocation toggles +- 
**Self-Identification** — gender, pronouns, veteran, disability, ethnicity (with "prefer not to say" options) +- **Legal** — work authorization checkboxes + +`FILL_IN` fields highlighted in amber with "Needs your attention" note. +Save button writes back to YAML. No raw YAML shown by default. + +--- + +## Theme & Styling + +Central theme at `app/.streamlit/config.toml`: +- Dark base, accent color teal/green (job search = growth) +- Consistent font (Inter or system sans-serif) +- Responsive column layouts — usable on tablet/mobile +- No jargon — "Run Discovery" not "Execute scrape", "Sync to Notion" not "Push records" + +--- + +## File Layout + +``` +app/ +├── .streamlit/ +│ └── config.toml # central theme +├── Home.py # dashboard +└── pages/ + ├── 1_Job_Review.py + ├── 2_Settings.py + └── 3_Resume_Editor.py +scripts/ +├── db.py # new: SQLite helpers +├── sync.py # new: approved → Notion push +├── discover.py # modified: write to SQLite not Notion +├── match.py # unchanged +└── llm_router.py # unchanged +``` + +Run: `conda run -n job-seeker streamlit run app/Home.py` + +--- + +## New Dependencies + +None — `streamlit` already installed via resume_matcher deps. +`sqlite3` is Python stdlib. + +--- + +## Out of Scope + +- Real-time collaboration +- Mobile native app +- Cover letter editor (handled separately via LoRA fine-tune task) +- AIHawk trigger from UI (run manually for now) diff --git a/docs/plans/2026-02-20-ui-implementation.md b/docs/plans/2026-02-20-ui-implementation.md new file mode 100644 index 0000000..ba235ae --- /dev/null +++ b/docs/plans/2026-02-20-ui-implementation.md @@ -0,0 +1,1458 @@ +# Job Seeker Web UI Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Build a Streamlit web UI with SQLite staging so Alex can review scraped jobs, approve/batch-sync to Notion, edit settings, and complete her AIHawk profile. 
+ +**Architecture:** `discover.py` writes to a local SQLite `staging.db` instead of Notion directly. Streamlit pages read/write SQLite for job review, YAML files for settings and resume. A new `sync.py` pushes approved jobs to Notion on demand. + +**Tech Stack:** Python 3.12, Streamlit (already installed), sqlite3 (stdlib), pyyaml, notion-client, conda env `job-seeker` + +--- + +## Task 1: SQLite DB helpers (`db.py`) + +**Files:** +- Create: `scripts/db.py` +- Create: `tests/test_db.py` +- Modify: `.gitignore` (add `staging.db`) + +**Step 1: Add staging.db to .gitignore** + +```bash +echo "staging.db" >> /devl/job-seeker/.gitignore +``` + +**Step 2: Write failing tests** + +```python +# tests/test_db.py +import pytest +import sqlite3 +from pathlib import Path +from unittest.mock import patch + + +def test_init_db_creates_jobs_table(tmp_path): + """init_db creates a jobs table with correct schema.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + conn = sqlite3.connect(db_path) + cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'") + assert cursor.fetchone() is not None + conn.close() + + +def test_insert_job_returns_id(tmp_path): + """insert_job inserts a row and returns its id.""" + from scripts.db import init_db, insert_job + db_path = tmp_path / "test.db" + init_db(db_path) + job = { + "title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "$100k", "description": "Great role", "date_found": "2026-02-20", + } + row_id = insert_job(db_path, job) + assert isinstance(row_id, int) + assert row_id > 0 + + +def test_insert_job_skips_duplicate_url(tmp_path): + """insert_job returns None if URL already exists.""" + from scripts.db import init_db, insert_job + db_path = tmp_path / "test.db" + init_db(db_path) + job = {"title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", 
"location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20"} + insert_job(db_path, job) + result = insert_job(db_path, job) + assert result is None + + +def test_get_jobs_by_status(tmp_path): + """get_jobs_by_status returns only jobs with matching status.""" + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + db_path = tmp_path / "test.db" + init_db(db_path) + job = {"title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20"} + row_id = insert_job(db_path, job) + update_job_status(db_path, [row_id], "approved") + approved = get_jobs_by_status(db_path, "approved") + pending = get_jobs_by_status(db_path, "pending") + assert len(approved) == 1 + assert len(pending) == 0 + + +def test_update_job_status_batch(tmp_path): + """update_job_status updates multiple rows at once.""" + from scripts.db import init_db, insert_job, update_job_status, get_jobs_by_status + db_path = tmp_path / "test.db" + init_db(db_path) + ids = [] + for i in range(3): + job = {"title": f"Job {i}", "company": "Co", "url": f"https://example.com/{i}", + "source": "indeed", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20"} + ids.append(insert_job(db_path, job)) + update_job_status(db_path, ids, "rejected") + assert len(get_jobs_by_status(db_path, "rejected")) == 3 +``` + +**Step 3: Run tests — expect ImportError** + +```bash +conda run -n job-seeker pytest tests/test_db.py -v +``` + +Expected: `ModuleNotFoundError: No module named 'scripts.db'` + +**Step 4: Write `scripts/db.py`** + +```python +# scripts/db.py +""" +SQLite staging layer for job listings. 
+Jobs flow: pending → approved/rejected → synced +""" +import sqlite3 +from pathlib import Path +from typing import Optional + +DEFAULT_DB = Path(__file__).parent.parent / "staging.db" + +CREATE_JOBS = """ +CREATE TABLE IF NOT EXISTS jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT, + company TEXT, + url TEXT UNIQUE, + source TEXT, + location TEXT, + is_remote INTEGER DEFAULT 0, + salary TEXT, + description TEXT, + match_score REAL, + keyword_gaps TEXT, + date_found TEXT, + status TEXT DEFAULT 'pending', + notion_page_id TEXT +); +""" + + +def init_db(db_path: Path = DEFAULT_DB) -> None: + """Create tables if they don't exist.""" + conn = sqlite3.connect(db_path) + conn.execute(CREATE_JOBS) + conn.commit() + conn.close() + + +def insert_job(db_path: Path = DEFAULT_DB, job: dict = None) -> Optional[int]: + """ + Insert a job. Returns row id, or None if URL already exists. + """ + if job is None: + return None + conn = sqlite3.connect(db_path) + try: + cursor = conn.execute( + """INSERT INTO jobs + (title, company, url, source, location, is_remote, salary, description, date_found) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + job.get("title", ""), + job.get("company", ""), + job.get("url", ""), + job.get("source", ""), + job.get("location", ""), + int(bool(job.get("is_remote", False))), + job.get("salary", ""), + job.get("description", ""), + job.get("date_found", ""), + ), + ) + conn.commit() + return cursor.lastrowid + except sqlite3.IntegrityError: + return None # duplicate URL + finally: + conn.close() + + +def get_jobs_by_status(db_path: Path = DEFAULT_DB, status: str = "pending") -> list[dict]: + """Return all jobs with the given status as a list of dicts.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + cursor = conn.execute( + "SELECT * FROM jobs WHERE status = ? 
ORDER BY date_found DESC, id DESC", + (status,), + ) + rows = [dict(row) for row in cursor.fetchall()] + conn.close() + return rows + + +def get_job_counts(db_path: Path = DEFAULT_DB) -> dict: + """Return counts per status.""" + conn = sqlite3.connect(db_path) + cursor = conn.execute( + "SELECT status, COUNT(*) as n FROM jobs GROUP BY status" + ) + counts = {row[0]: row[1] for row in cursor.fetchall()} + conn.close() + return counts + + +def update_job_status(db_path: Path = DEFAULT_DB, ids: list[int] = None, status: str = "approved") -> None: + """Batch-update status for a list of job IDs.""" + if not ids: + return + conn = sqlite3.connect(db_path) + conn.execute( + f"UPDATE jobs SET status = ? WHERE id IN ({','.join('?' * len(ids))})", + [status] + list(ids), + ) + conn.commit() + conn.close() + + +def get_existing_urls(db_path: Path = DEFAULT_DB) -> set[str]: + """Return all URLs already in staging (any status).""" + conn = sqlite3.connect(db_path) + cursor = conn.execute("SELECT url FROM jobs") + urls = {row[0] for row in cursor.fetchall()} + conn.close() + return urls + + +def write_match_scores(db_path: Path = DEFAULT_DB, job_id: int = None, + score: float = 0.0, gaps: str = "") -> None: + """Write match score and keyword gaps back to a job row.""" + conn = sqlite3.connect(db_path) + conn.execute( + "UPDATE jobs SET match_score = ?, keyword_gaps = ? 
WHERE id = ?", + (score, gaps, job_id), + ) + conn.commit() + conn.close() +``` + +**Step 5: Run tests — expect 5 passing** + +```bash +conda run -n job-seeker pytest tests/test_db.py -v +``` + +Expected: `5 passed` + +**Step 6: Commit** + +```bash +cd /devl/job-seeker +git add scripts/db.py tests/test_db.py .gitignore +git commit -m "feat: add SQLite staging layer (db.py)" +``` + +--- + +## Task 2: Update `discover.py` to write to SQLite + +**Files:** +- Modify: `scripts/discover.py` +- Modify: `tests/test_discover.py` + +**Step 1: Update the tests** + +Replace the existing `tests/test_discover.py` with this version that tests SQLite writes: + +```python +# tests/test_discover.py +import pytest +from unittest.mock import patch, MagicMock +import pandas as pd +from pathlib import Path + +SAMPLE_JOB = { + "title": "Customer Success Manager", + "company": "Acme Corp", + "location": "Remote", + "is_remote": True, + "job_url": "https://linkedin.com/jobs/view/123456", + "site": "linkedin", + "min_amount": 90000, + "max_amount": 120000, + "salary_source": "$90,000 - $120,000", + "description": "Great CS role", +} + +SAMPLE_FM = { + "title_field": "Salary", "job_title": "Job Title", "company": "Company Name", + "url": "Role Link", "source": "Job Source", "status": "Status of Application", + "status_new": "Application Submitted", "date_found": "Date Found", + "remote": "Remote", "match_score": "Match Score", + "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description", +} + +SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM} +SAMPLE_PROFILES_CFG = { + "profiles": [{"name": "cs", "titles": ["Customer Success Manager"], + "locations": ["Remote"], "boards": ["linkedin"], + "results_per_board": 5, "hours_old": 72}] +} + + +def make_jobs_df(jobs=None): + return pd.DataFrame(jobs or [SAMPLE_JOB]) + + +def test_discover_writes_to_sqlite(tmp_path): + """run_discovery inserts new jobs into SQLite staging 
db.""" + from scripts.discover import run_discovery + from scripts.db import get_jobs_by_status + + db_path = tmp_path / "test.db" + with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \ + patch("scripts.discover.Client"): + run_discovery(db_path=db_path) + + jobs = get_jobs_by_status(db_path, "pending") + assert len(jobs) == 1 + assert jobs[0]["title"] == "Customer Success Manager" + + +def test_discover_skips_duplicate_urls(tmp_path): + """run_discovery does not insert a job whose URL is already in SQLite.""" + from scripts.discover import run_discovery + from scripts.db import init_db, insert_job, get_jobs_by_status + + db_path = tmp_path / "test.db" + init_db(db_path) + insert_job(db_path, { + "title": "Old", "company": "X", "url": "https://linkedin.com/jobs/view/123456", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-01-01", + }) + + with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \ + patch("scripts.discover.Client"): + run_discovery(db_path=db_path) + + jobs = get_jobs_by_status(db_path, "pending") + assert len(jobs) == 1 # only the pre-existing one, not a duplicate + + +def test_discover_pushes_new_jobs(): + """Legacy: discover still calls push_to_notion when notion_push=True.""" + from scripts.discover import run_discovery + import tempfile, os + db_path = Path(tempfile.mktemp(suffix=".db")) + try: + with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \ + patch("scripts.discover.push_to_notion") as mock_push, \ + patch("scripts.discover.Client"): + run_discovery(db_path=db_path, notion_push=True) + assert mock_push.call_count == 1 
+ finally: + if db_path.exists(): + os.unlink(db_path) + + +def test_push_to_notion_sets_status_new(): + """push_to_notion always sets Status to the configured status_new value.""" + from scripts.discover import push_to_notion + mock_notion = MagicMock() + push_to_notion(mock_notion, "fake-db-id", SAMPLE_JOB, SAMPLE_FM) + call_kwargs = mock_notion.pages.create.call_args[1] + status = call_kwargs["properties"]["Status of Application"]["select"]["name"] + assert status == "Application Submitted" +``` + +**Step 2: Run tests — some will fail** + +```bash +conda run -n job-seeker pytest tests/test_discover.py -v +``` + +Expected: `test_discover_writes_to_sqlite` and `test_discover_skips_duplicate_urls` fail. + +**Step 3: Update `scripts/discover.py`** + +Add `db_path` and `notion_push` parameters to `run_discovery`. Default writes to SQLite only: + +```python +# scripts/discover.py +""" +JobSpy → SQLite staging pipeline (default) or Notion (notion_push=True). + +Usage: + conda run -n job-seeker python scripts/discover.py +""" +import yaml +from datetime import datetime +from pathlib import Path + +import pandas as pd +from jobspy import scrape_jobs +from notion_client import Client + +from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls + +CONFIG_DIR = Path(__file__).parent.parent / "config" +NOTION_CFG = CONFIG_DIR / "notion.yaml" +PROFILES_CFG = CONFIG_DIR / "search_profiles.yaml" + + +def load_config() -> tuple[dict, dict]: + profiles = yaml.safe_load(PROFILES_CFG.read_text()) + notion_cfg = yaml.safe_load(NOTION_CFG.read_text()) + return profiles, notion_cfg + + +def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]: + """Return the set of all job URLs already tracked in Notion (for notion_push mode).""" + existing: set[str] = set() + has_more = True + start_cursor = None + while has_more: + kwargs: dict = {"database_id": db_id, "page_size": 100} + if start_cursor: + kwargs["start_cursor"] = 
start_cursor + resp = notion.databases.query(**kwargs) + for page in resp["results"]: + url = page["properties"].get(url_field, {}).get("url") + if url: + existing.add(url) + has_more = resp.get("has_more", False) + start_cursor = resp.get("next_cursor") + return existing + + +def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None: + """Create a new page in the Notion jobs database for a single listing.""" + min_amt = job.get("min_amount") + max_amt = job.get("max_amount") + if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)): + title_content = f"${int(min_amt):,} – ${int(max_amt):,}" + elif job.get("salary_source") and str(job["salary_source"]) not in ("nan", "None", ""): + title_content = str(job["salary_source"]) + else: + title_content = str(job.get("title", "Unknown")) + + job_url = str(job.get("job_url", "") or "") + if job_url in ("nan", "None"): + job_url = "" + + notion.pages.create( + parent={"database_id": db_id}, + properties={ + fm["title_field"]: {"title": [{"text": {"content": title_content}}]}, + fm["job_title"]: {"rich_text": [{"text": {"content": str(job.get("title", "Unknown"))}}]}, + fm["company"]: {"rich_text": [{"text": {"content": str(job.get("company", "") or "")}}]}, + fm["url"]: {"url": job_url or None}, + fm["source"]: {"multi_select": [{"name": str(job.get("site", "unknown")).title()}]}, + fm["status"]: {"select": {"name": fm["status_new"]}}, + fm["remote"]: {"checkbox": bool(job.get("is_remote", False))}, + fm["date_found"]: {"date": {"start": datetime.now().isoformat()[:10]}}, + }, + ) + + +def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False) -> None: + profiles_cfg, notion_cfg = load_config() + fm = notion_cfg["field_map"] + + # SQLite dedup + init_db(db_path) + existing_urls = db_existing_urls(db_path) + + # Notion dedup (only in notion_push mode) + notion = None + if notion_push: + notion = Client(auth=notion_cfg["token"]) + existing_urls |= get_existing_urls(notion, 
notion_cfg["database_id"], fm["url"]) + + print(f"[discover] {len(existing_urls)} existing listings") + new_count = 0 + + for profile in profiles_cfg["profiles"]: + print(f"\n[discover] Profile: {profile['name']}") + for location in profile["locations"]: + print(f" Scraping: {location}") + jobs: pd.DataFrame = scrape_jobs( + site_name=profile["boards"], + search_term=" OR ".join(f'"{t}"' for t in profile["titles"]), + location=location, + results_wanted=profile.get("results_per_board", 25), + hours_old=profile.get("hours_old", 72), + linkedin_fetch_description=True, + ) + + for _, job in jobs.iterrows(): + url = str(job.get("job_url", "") or "") + if not url or url in ("nan", "None") or url in existing_urls: + continue + + job_dict = job.to_dict() + + # Always write to SQLite staging + min_amt = job_dict.get("min_amount") + max_amt = job_dict.get("max_amount") + salary_str = "" + if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)): + salary_str = f"${int(min_amt):,} – ${int(max_amt):,}" + elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""): + salary_str = str(job_dict["salary_source"]) + + insert_job(db_path, { + "title": str(job_dict.get("title", "")), + "company": str(job_dict.get("company", "") or ""), + "url": url, + "source": str(job_dict.get("site", "")), + "location": str(job_dict.get("location", "") or ""), + "is_remote": bool(job_dict.get("is_remote", False)), + "salary": salary_str, + "description": str(job_dict.get("description", "") or ""), + "date_found": datetime.now().isoformat()[:10], + }) + + # Optionally also push straight to Notion + if notion_push: + push_to_notion(notion, notion_cfg["database_id"], job_dict, fm) + + existing_urls.add(url) + new_count += 1 + print(f" + {job.get('title')} @ {job.get('company')}") + + print(f"\n[discover] Done — {new_count} new listings staged.") + + +if __name__ == "__main__": + run_discovery() +``` + +**Step 4: Run tests — expect 4 passing** + 
+```bash +conda run -n job-seeker pytest tests/test_discover.py -v +``` + +Expected: `4 passed` + +**Step 5: Run full suite** + +```bash +conda run -n job-seeker pytest tests/ -v +``` + +Expected: all tests pass. + +**Step 6: Commit** + +```bash +cd /devl/job-seeker +git add scripts/discover.py tests/test_discover.py +git commit -m "feat: route discover.py through SQLite staging layer" +``` + +--- + +## Task 3: `sync.py` — approved → Notion push + +**Files:** +- Create: `scripts/sync.py` +- Create: `tests/test_sync.py` + +**Step 1: Write failing tests** + +```python +# tests/test_sync.py +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path + + +SAMPLE_FM = { + "title_field": "Salary", "job_title": "Job Title", "company": "Company Name", + "url": "Role Link", "source": "Job Source", "status": "Status of Application", + "status_new": "Application Submitted", "date_found": "Date Found", + "remote": "Remote", "match_score": "Match Score", + "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description", +} + +SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM} + +SAMPLE_JOB = { + "id": 1, "title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": 1, + "salary": "$100k", "description": "Good role", "match_score": 80.0, + "keyword_gaps": "Gainsight, Churnzero", "date_found": "2026-02-20", + "status": "approved", "notion_page_id": None, +} + + +def test_sync_pushes_approved_jobs(tmp_path): + """sync_to_notion pushes approved jobs and marks them synced.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + + db_path = tmp_path / "test.db" + init_db(db_path) + row_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + 
"salary": "$100k", "description": "Good role", "date_found": "2026-02-20", + }) + update_job_status(db_path, [row_id], "approved") + + mock_notion = MagicMock() + mock_notion.pages.create.return_value = {"id": "notion-page-abc"} + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client", return_value=mock_notion): + count = sync_to_notion(db_path=db_path) + + assert count == 1 + mock_notion.pages.create.assert_called_once() + synced = get_jobs_by_status(db_path, "synced") + assert len(synced) == 1 + + +def test_sync_returns_zero_when_nothing_approved(tmp_path): + """sync_to_notion returns 0 when there are no approved jobs.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client"): + count = sync_to_notion(db_path=db_path) + + assert count == 0 +``` + +**Step 2: Run tests — expect ImportError** + +```bash +conda run -n job-seeker pytest tests/test_sync.py -v +``` + +Expected: `ModuleNotFoundError: No module named 'scripts.sync'` + +**Step 3: Write `scripts/sync.py`** + +```python +# scripts/sync.py +""" +Push approved jobs from SQLite staging to Notion. + +Usage: + conda run -n job-seeker python scripts/sync.py +""" +import yaml +from pathlib import Path +from datetime import datetime + +from notion_client import Client + +from scripts.db import DEFAULT_DB, get_jobs_by_status, update_job_status + +CONFIG_DIR = Path(__file__).parent.parent / "config" + + +def load_notion_config() -> dict: + return yaml.safe_load((CONFIG_DIR / "notion.yaml").read_text()) + + +def sync_to_notion(db_path: Path = DEFAULT_DB) -> int: + """Push all approved jobs to Notion. 
Returns count synced.""" + cfg = load_notion_config() + notion = Client(auth=cfg["token"]) + db_id = cfg["database_id"] + fm = cfg["field_map"] + + approved = get_jobs_by_status(db_path, "approved") + if not approved: + print("[sync] No approved jobs to sync.") + return 0 + + synced_ids = [] + for job in approved: + try: + page = notion.pages.create( + parent={"database_id": db_id}, + properties={ + fm["title_field"]: {"title": [{"text": {"content": job.get("salary") or job.get("title", "")}}]}, + fm["job_title"]: {"rich_text": [{"text": {"content": job.get("title", "")}}]}, + fm["company"]: {"rich_text": [{"text": {"content": job.get("company", "")}}]}, + fm["url"]: {"url": job.get("url") or None}, + fm["source"]: {"multi_select": [{"name": job.get("source", "unknown").title()}]}, + fm["status"]: {"select": {"name": fm["status_new"]}}, + fm["remote"]: {"checkbox": bool(job.get("is_remote", 0))}, + fm["date_found"]: {"date": {"start": job.get("date_found", datetime.now().isoformat()[:10])}}, + fm["match_score"]: {"number": job.get("match_score")}, + fm["keyword_gaps"]: {"rich_text": [{"text": {"content": job.get("keyword_gaps") or ""}}]}, + }, + ) + synced_ids.append(job["id"]) + print(f"[sync] + {job.get('title')} @ {job.get('company')}") + except Exception as e: + print(f"[sync] Error syncing {job.get('url')}: {e}") + + update_job_status(db_path, synced_ids, "synced") + print(f"[sync] Done — {len(synced_ids)} jobs synced to Notion.") + return len(synced_ids) + + +if __name__ == "__main__": + sync_to_notion() +``` + +**Step 4: Run tests — expect 2 passing** + +```bash +conda run -n job-seeker pytest tests/test_sync.py -v +``` + +Expected: `2 passed` + +**Step 5: Full suite** + +```bash +conda run -n job-seeker pytest tests/ -v +``` + +Expected: all pass. 
+ +**Step 6: Commit** + +```bash +cd /devl/job-seeker +git add scripts/sync.py tests/test_sync.py +git commit -m "feat: add sync.py to push approved jobs from SQLite to Notion" +``` + +--- + +## Task 4: Streamlit theme + app scaffold + +**Files:** +- Create: `app/.streamlit/config.toml` +- Create: `app/Home.py` +- Create: `app/pages/1_Job_Review.py` (stub) +- Create: `app/pages/2_Settings.py` (stub) +- Create: `app/pages/3_Resume_Editor.py` (stub) + +No tests for Streamlit page rendering — test helper functions instead. + +**Step 1: Create theme** + +```toml +# app/.streamlit/config.toml +[theme] +base = "dark" +primaryColor = "#2DD4BF" # teal +backgroundColor = "#0F172A" # slate-900 +secondaryBackgroundColor = "#1E293B" # slate-800 +textColor = "#F1F5F9" # slate-100 +font = "sans serif" +``` + +**Step 2: Create `app/Home.py`** + +```python +# app/Home.py +""" +Job Seeker Dashboard — Home page. +Shows counts, Run Discovery button, and Sync to Notion button. +""" +import subprocess +import sys +from pathlib import Path + +import streamlit as st + +# Make scripts importable +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.db import DEFAULT_DB, init_db, get_job_counts + +st.set_page_config( + page_title="Alex's Job Search", + page_icon="🔍", + layout="wide", +) + +init_db(DEFAULT_DB) +counts = get_job_counts(DEFAULT_DB) + +st.title("🔍 Alex's Job Search") +st.caption("Discover → Review → Sync to Notion") + +st.divider() + +# Stat cards +col1, col2, col3, col4 = st.columns(4) +col1.metric("Pending Review", counts.get("pending", 0)) +col2.metric("Approved", counts.get("approved", 0)) +col3.metric("Synced to Notion", counts.get("synced", 0)) +col4.metric("Rejected", counts.get("rejected", 0)) + +st.divider() + +# Actions +left, right = st.columns(2) + +with left: + st.subheader("Find New Jobs") + st.caption("Scrapes all configured boards and adds new listings to your review queue.") + if st.button("🚀 Run Discovery", use_container_width=True, 
type="primary"): + with st.spinner("Scraping job boards…"): + result = subprocess.run( + ["conda", "run", "-n", "job-seeker", "python", "scripts/discover.py"], + capture_output=True, text=True, + cwd=str(Path(__file__).parent.parent), + ) + if result.returncode == 0: + st.success("Discovery complete! Head to Job Review to see new listings.") + st.code(result.stdout) + else: + st.error("Discovery failed.") + st.code(result.stderr) + +with right: + approved_count = counts.get("approved", 0) + st.subheader("Send to Notion") + st.caption("Push all approved jobs to your Notion tracking database.") + if approved_count == 0: + st.info("No approved jobs yet. Review and approve some listings first.") + else: + if st.button(f"📤 Sync {approved_count} approved job{'s' if approved_count != 1 else ''} → Notion", + use_container_width=True, type="primary"): + with st.spinner("Syncing to Notion…"): + from scripts.sync import sync_to_notion + count = sync_to_notion(DEFAULT_DB) + st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!") + st.rerun() +``` + +**Step 3: Create page stubs** + +```python +# app/pages/1_Job_Review.py +import streamlit as st +st.set_page_config(page_title="Job Review", page_icon="📋", layout="wide") +st.title("📋 Job Review") +st.info("Coming soon — Task 5") +``` + +```python +# app/pages/2_Settings.py +import streamlit as st +st.set_page_config(page_title="Settings", page_icon="⚙️", layout="wide") +st.title("⚙️ Settings") +st.info("Coming soon — Task 6") +``` + +```python +# app/pages/3_Resume_Editor.py +import streamlit as st +st.set_page_config(page_title="Resume Editor", page_icon="📝", layout="wide") +st.title("📝 Resume Editor") +st.info("Coming soon — Task 7") +``` + +**Step 4: Smoke test** + +```bash +conda run -n job-seeker streamlit run /devl/job-seeker/app/Home.py --server.headless true & +sleep 4 +curl -s http://localhost:8501 | grep -q "Alex" && echo "OK" || echo "FAIL" +kill %1 +``` + +Expected: `OK` + +**Step 5: Commit** + +```bash 
+cd /devl/job-seeker +git add app/ +git commit -m "feat: add Streamlit app scaffold with dark theme and dashboard" +``` + +--- + +## Task 5: Job Review page + +**Files:** +- Modify: `app/pages/1_Job_Review.py` + +No separate unit tests — logic is inline Streamlit. Test manually after implementing. + +**Step 1: Replace stub with full implementation** + +```python +# app/pages/1_Job_Review.py +""" +Job Review — browse pending listings, batch approve or reject. +""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +from scripts.db import DEFAULT_DB, init_db, get_jobs_by_status, update_job_status + +st.set_page_config(page_title="Job Review", page_icon="📋", layout="wide") +st.title("📋 Job Review") + +init_db(DEFAULT_DB) + +# Filters sidebar +with st.sidebar: + st.header("Filters") + show_status = st.selectbox("Show", ["pending", "approved", "rejected", "synced"], index=0) + remote_only = st.checkbox("Remote only", value=False) + min_score = st.slider("Min match score", 0, 100, 0) + st.divider() + st.caption("Use checkboxes to select jobs, then approve or reject in bulk.") + +jobs = get_jobs_by_status(DEFAULT_DB, show_status) + +# Apply filters +if remote_only: + jobs = [j for j in jobs if j.get("is_remote")] +if min_score > 0: + jobs = [j for j in jobs if (j.get("match_score") or 0) >= min_score] + +if not jobs: + st.info(f"No {show_status} jobs matching your filters.") + st.stop() + +st.caption(f"Showing {len(jobs)} {show_status} job{'s' if len(jobs) != 1 else ''}") + +# Batch action buttons (only relevant for pending) +if show_status == "pending": + col_a, col_b, col_c = st.columns([2, 2, 6]) + select_all = col_a.button("Select all", use_container_width=True) + clear_all = col_b.button("Clear all", use_container_width=True) + + if "selected_ids" not in st.session_state: + st.session_state.selected_ids = set() + if select_all: + st.session_state.selected_ids = {j["id"] for j in jobs} + if 
clear_all: + st.session_state.selected_ids = set() + + col_approve, col_reject, _ = st.columns([2, 2, 6]) + if col_approve.button("✅ Approve selected", use_container_width=True, type="primary", + disabled=not st.session_state.selected_ids): + update_job_status(DEFAULT_DB, list(st.session_state.selected_ids), "approved") + st.session_state.selected_ids = set() + st.success("Approved!") + st.rerun() + if col_reject.button("❌ Reject selected", use_container_width=True, + disabled=not st.session_state.selected_ids): + update_job_status(DEFAULT_DB, list(st.session_state.selected_ids), "rejected") + st.session_state.selected_ids = set() + st.success("Rejected.") + st.rerun() + +st.divider() + +# Job cards +for job in jobs: + score = job.get("match_score") + if score is None: + score_badge = "⬜ No score" + elif score >= 70: + score_badge = f"🟢 {score:.0f}%" + elif score >= 40: + score_badge = f"🟡 {score:.0f}%" + else: + score_badge = f"🔴 {score:.0f}%" + + remote_badge = "🌐 Remote" if job.get("is_remote") else "🏢 On-site" + source_badge = job.get("source", "").title() + + with st.container(border=True): + left, right = st.columns([8, 2]) + with left: + checked = st.checkbox( + f"**{job['title']}** — {job['company']}", + key=f"chk_{job['id']}", + value=job["id"] in st.session_state.get("selected_ids", set()), + ) + if checked: + st.session_state.setdefault("selected_ids", set()).add(job["id"]) + else: + st.session_state.setdefault("selected_ids", set()).discard(job["id"]) + + cols = st.columns(4) + cols[0].caption(remote_badge) + cols[1].caption(f"📌 {source_badge}") + cols[2].caption(score_badge) + cols[3].caption(f"📅 {job.get('date_found', '')}") + + if job.get("keyword_gaps"): + st.caption(f"**Keyword gaps:** {job['keyword_gaps']}") + + with right: + if job.get("url"): + st.link_button("View listing →", job["url"], use_container_width=True) + if job.get("salary"): + st.caption(f"💰 {job['salary']}") +``` + +**Step 2: Manual smoke test** + +```bash +conda run -n job-seeker 
streamlit run /devl/job-seeker/app/Home.py +``` + +Open http://localhost:8501, navigate to Job Review. Confirm filters and empty state work. + +**Step 3: Commit** + +```bash +cd /devl/job-seeker +git add app/pages/1_Job_Review.py +git commit -m "feat: add Job Review page with batch approve/reject" +``` + +--- + +## Task 6: Settings page + +**Files:** +- Modify: `app/pages/2_Settings.py` + +**Step 1: Replace stub** + +```python +# app/pages/2_Settings.py +""" +Settings — edit search profiles, LLM backends, and Notion connection. +""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +import yaml + +st.set_page_config(page_title="Settings", page_icon="⚙️", layout="wide") +st.title("⚙️ Settings") + +CONFIG_DIR = Path(__file__).parent.parent.parent / "config" +SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml" +LLM_CFG = CONFIG_DIR / "llm.yaml" +NOTION_CFG = CONFIG_DIR / "notion.yaml" + + +def load_yaml(path: Path) -> dict: + if path.exists(): + return yaml.safe_load(path.read_text()) or {} + return {} + + +def save_yaml(path: Path, data: dict) -> None: + path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True)) + + +tab_search, tab_llm, tab_notion = st.tabs(["🔎 Search", "🤖 LLM Backends", "📚 Notion"]) + +# ── Search tab ────────────────────────────────────────────────────────────── +with tab_search: + cfg = load_yaml(SEARCH_CFG) + profiles = cfg.get("profiles", [{}]) + p = profiles[0] # edit first profile for now + + st.subheader("Job Titles to Search") + titles_text = st.text_area( + "One title per line", + value="\n".join(p.get("titles", [])), + height=150, + help="JobSpy will search for any of these titles across all configured boards.", + ) + + st.subheader("Locations") + locations_text = st.text_area( + "One location per line", + value="\n".join(p.get("locations", [])), + height=100, + ) + + st.subheader("Job Boards") + board_options = ["linkedin", "indeed", "glassdoor", 
"zip_recruiter"] + selected_boards = st.multiselect( + "Active boards", board_options, + default=p.get("boards", board_options), + ) + + col1, col2 = st.columns(2) + results_per = col1.slider("Results per board", 5, 100, p.get("results_per_board", 25)) + hours_old = col2.slider("How far back to look (hours)", 24, 720, p.get("hours_old", 72)) + + if st.button("💾 Save search settings", type="primary"): + profiles[0] = { + **p, + "titles": [t.strip() for t in titles_text.splitlines() if t.strip()], + "locations": [l.strip() for l in locations_text.splitlines() if l.strip()], + "boards": selected_boards, + "results_per_board": results_per, + "hours_old": hours_old, + } + save_yaml(SEARCH_CFG, {"profiles": profiles}) + st.success("Search settings saved!") + +# ── LLM Backends tab ──────────────────────────────────────────────────────── +with tab_llm: + cfg = load_yaml(LLM_CFG) + backends = cfg.get("backends", {}) + fallback_order = cfg.get("fallback_order", list(backends.keys())) + + st.subheader("Fallback Order") + st.caption("Backends are tried top-to-bottom. 
First reachable one wins.") + st.write(" → ".join(fallback_order)) + + st.subheader("Backend Configuration") + updated_backends = {} + for name in fallback_order: + b = backends.get(name, {}) + with st.expander(f"**{name.replace('_', ' ').title()}**", expanded=False): + if b.get("type") == "openai_compat": + url = st.text_input("URL", value=b.get("base_url", ""), key=f"{name}_url") + model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model") + updated_backends[name] = {**b, "base_url": url, "model": model} + elif b.get("type") == "anthropic": + model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model") + updated_backends[name] = {**b, "model": model} + else: + updated_backends[name] = b + + if st.button(f"Test {name}", key=f"test_{name}"): + with st.spinner("Testing…"): + try: + import sys + sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + from scripts.llm_router import LLMRouter + r = LLMRouter() + reachable = r._is_reachable(b.get("base_url", "")) + st.success("Reachable ✓") if reachable else st.warning("Not reachable") + except Exception as e: + st.error(f"Error: {e}") + + if st.button("💾 Save LLM settings", type="primary"): + save_yaml(LLM_CFG, {**cfg, "backends": updated_backends}) + st.success("LLM settings saved!") + +# ── Notion tab ─────────────────────────────────────────────────────────────── +with tab_notion: + cfg = load_yaml(NOTION_CFG) if NOTION_CFG.exists() else {} + + st.subheader("Notion Connection") + token = st.text_input( + "Integration Token", + value=cfg.get("token", ""), + type="password", + help="Find this at notion.so/my-integrations → your integration → Internal Integration Token", + ) + db_id = st.text_input( + "Database ID", + value=cfg.get("database_id", ""), + help="The 32-character ID from your Notion database URL", + ) + + col_save, col_test = st.columns(2) + if col_save.button("💾 Save Notion settings", type="primary"): + save_yaml(NOTION_CFG, {**cfg, "token": token, "database_id": 
db_id}) + st.success("Notion settings saved!") + + if col_test.button("🔌 Test connection"): + with st.spinner("Connecting…"): + try: + from notion_client import Client + n = Client(auth=token) + db = n.databases.retrieve(db_id) + st.success(f"Connected to: **{db['title'][0]['plain_text']}**") + except Exception as e: + st.error(f"Connection failed: {e}") +``` + +**Step 2: Manual smoke test** + +Navigate to Settings in the running Streamlit app. Confirm all three tabs render, save/load works. + +**Step 3: Commit** + +```bash +cd /devl/job-seeker +git add app/pages/2_Settings.py +git commit -m "feat: add Settings page with search, LLM, and Notion tabs" +``` + +--- + +## Task 7: Resume Editor page + +**Files:** +- Modify: `app/pages/3_Resume_Editor.py` + +**Step 1: Replace stub** + +```python +# app/pages/3_Resume_Editor.py +""" +Resume Editor — form-based editor for Alex's AIHawk profile YAML. +FILL_IN fields highlighted in amber. +""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +import yaml + +st.set_page_config(page_title="Resume Editor", page_icon="📝", layout="wide") +st.title("📝 Resume Editor") +st.caption("Edit Alex's application profile used by AIHawk for LinkedIn Easy Apply.") + +RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" + +if not RESUME_PATH.exists(): + st.error(f"Resume file not found at `{RESUME_PATH}`. Is AIHawk cloned?") + st.stop() + +data = yaml.safe_load(RESUME_PATH.read_text()) or {} + + +def field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str: + """Render a text input, highlighted amber if value is FILL_IN.""" + needs_attention = str(value).startswith("FILL_IN") or value == "" + if needs_attention: + st.markdown( + f'
<div style="color:#F59E0B;font-size:0.8em">⚠️ Needs your attention</div>
', + unsafe_allow_html=True, + ) + return st.text_input(label, value=value or "", key=key, help=help, + type="password" if password else "default") + + +st.divider() + +# ── Personal Info ────────────────────────────────────────────────────────── +with st.expander("👤 Personal Information", expanded=True): + info = data.get("personal_information", {}) + col1, col2 = st.columns(2) + with col1: + name = field("First Name", info.get("name", ""), "pi_name") + email = field("Email", info.get("email", ""), "pi_email") + phone = field("Phone", info.get("phone", ""), "pi_phone") + city = field("City", info.get("city", ""), "pi_city") + with col2: + surname = field("Last Name", info.get("surname", ""), "pi_surname") + linkedin = field("LinkedIn URL", info.get("linkedin", ""), "pi_linkedin") + zip_code = field("Zip Code", info.get("zip_code", ""), "pi_zip") + dob = field("Date of Birth", info.get("date_of_birth", ""), "pi_dob", + help="Format: MM/DD/YYYY") + +# ── Education ───────────────────────────────────────────────────────────── +with st.expander("🎓 Education"): + edu_list = data.get("education_details", [{}]) + updated_edu = [] + for i, edu in enumerate(edu_list): + st.markdown(f"**Entry {i+1}**") + col1, col2 = st.columns(2) + with col1: + inst = field("Institution", edu.get("institution", ""), f"edu_inst_{i}") + field_study = st.text_input("Field of Study", edu.get("field_of_study", ""), key=f"edu_field_{i}") + start = st.text_input("Start Year", edu.get("start_date", ""), key=f"edu_start_{i}") + with col2: + level = st.selectbox("Degree Level", + ["Bachelor's Degree", "Master's Degree", "Some College", "Associate's Degree", "High School", "Other"], + index=["Bachelor's Degree", "Master's Degree", "Some College", "Associate's Degree", "High School", "Other"].index( + edu.get("education_level", "Some College") + ) if edu.get("education_level") in ["Bachelor's Degree", "Master's Degree", "Some College", "Associate's Degree", "High School", "Other"] else 2, + 
key=f"edu_level_{i}") + end = st.text_input("Completion Year", edu.get("year_of_completion", ""), key=f"edu_end_{i}") + updated_edu.append({ + "education_level": level, "institution": inst, "field_of_study": field_study, + "start_date": start, "year_of_completion": end, "final_evaluation_grade": "", "exam": {}, + }) + st.divider() + +# ── Experience ───────────────────────────────────────────────────────────── +with st.expander("💼 Work Experience"): + exp_list = data.get("experience_details", [{}]) + if "exp_count" not in st.session_state: + st.session_state.exp_count = len(exp_list) + if st.button("+ Add Experience Entry"): + st.session_state.exp_count += 1 + exp_list.append({}) + + updated_exp = [] + for i in range(st.session_state.exp_count): + exp = exp_list[i] if i < len(exp_list) else {} + st.markdown(f"**Position {i+1}**") + col1, col2 = st.columns(2) + with col1: + pos = field("Job Title", exp.get("position", ""), f"exp_pos_{i}") + company = field("Company", exp.get("company", ""), f"exp_co_{i}") + period = field("Employment Period", exp.get("employment_period", ""), f"exp_period_{i}", + help="e.g. 
01/2022 - Present") + with col2: + location = st.text_input("Location", exp.get("location", ""), key=f"exp_loc_{i}") + industry = st.text_input("Industry", exp.get("industry", ""), key=f"exp_ind_{i}") + + responsibilities = st.text_area( + "Key Responsibilities (one per line)", + value="\n".join( + r.get(f"responsibility_{j+1}", "") if isinstance(r, dict) else str(r) + for j, r in enumerate(exp.get("key_responsibilities", [])) + ), + key=f"exp_resp_{i}", height=100, + ) + skills = st.text_input( + "Skills (comma-separated)", + value=", ".join(exp.get("skills_acquired", [])), + key=f"exp_skills_{i}", + ) + resp_list = [{"responsibility_1": r.strip()} for r in responsibilities.splitlines() if r.strip()] + skill_list = [s.strip() for s in skills.split(",") if s.strip()] + updated_exp.append({ + "position": pos, "company": company, "employment_period": period, + "location": location, "industry": industry, + "key_responsibilities": resp_list, "skills_acquired": skill_list, + }) + st.divider() + +# ── Preferences ──────────────────────────────────────────────────────────── +with st.expander("⚙️ Preferences & Availability"): + wp = data.get("work_preferences", {}) + sal = data.get("salary_expectations", {}) + avail = data.get("availability", {}) + col1, col2 = st.columns(2) + with col1: + salary_range = st.text_input("Salary Range (USD)", sal.get("salary_range_usd", ""), key="pref_salary", + help="e.g. 
120000 - 180000") + notice = st.text_input("Notice Period", avail.get("notice_period", "2 weeks"), key="pref_notice") + with col2: + remote_work = st.checkbox("Open to Remote", value=wp.get("remote_work", "Yes") == "Yes", key="pref_remote") + relocation = st.checkbox("Open to Relocation", value=wp.get("open_to_relocation", "No") == "Yes", key="pref_reloc") + assessments = st.checkbox("Willing to complete assessments", + value=wp.get("willing_to_complete_assessments", "Yes") == "Yes", key="pref_assess") + bg_checks = st.checkbox("Willing to undergo background checks", + value=wp.get("willing_to_undergo_background_checks", "Yes") == "Yes", key="pref_bg") + +# ── Self-ID ──────────────────────────────────────────────────────────────── +with st.expander("🏳️‍🌈 Self-Identification (optional)"): + sid = data.get("self_identification", {}) + col1, col2 = st.columns(2) + with col1: + gender = st.text_input("Gender identity", sid.get("gender", "Non-binary"), key="sid_gender", + help="Select 'Non-binary' or 'Prefer not to say' when options allow") + pronouns = st.text_input("Pronouns", sid.get("pronouns", "Any"), key="sid_pronouns") + ethnicity = field("Ethnicity", sid.get("ethnicity", ""), "sid_ethnicity", + help="'Prefer not to say' is always an option") + with col2: + veteran = st.selectbox("Veteran status", ["No", "Yes", "Prefer not to say"], + index=["No", "Yes", "Prefer not to say"].index(sid.get("veteran", "No")), key="sid_vet") + disability = st.selectbox("Disability disclosure", ["Prefer not to say", "No", "Yes"], + index=["Prefer not to say", "No", "Yes"].index( + sid.get("disability", "Prefer not to say")), key="sid_dis") + st.caption("⚠️ Drug testing: set to No (medicinal cannabis for EDS). 
AIHawk will skip employers who require drug tests.") + +st.divider() + +# ── Save ─────────────────────────────────────────────────────────────────── +if st.button("💾 Save Resume Profile", type="primary", use_container_width=True): + data["personal_information"] = { + **data.get("personal_information", {}), + "name": name, "surname": surname, "email": email, "phone": phone, + "city": city, "zip_code": zip_code, "linkedin": linkedin, "date_of_birth": dob, + } + data["education_details"] = updated_edu + data["experience_details"] = updated_exp + data["salary_expectations"] = {"salary_range_usd": salary_range} + data["availability"] = {"notice_period": notice} + data["work_preferences"] = { + **data.get("work_preferences", {}), + "remote_work": "Yes" if remote_work else "No", + "open_to_relocation": "Yes" if relocation else "No", + "willing_to_complete_assessments": "Yes" if assessments else "No", + "willing_to_undergo_background_checks": "Yes" if bg_checks else "No", + "willing_to_undergo_drug_tests": "No", + } + data["self_identification"] = { + "gender": gender, "pronouns": pronouns, "veteran": veteran, + "disability": disability, "ethnicity": ethnicity, + } + RESUME_PATH.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True)) + st.success("✅ Profile saved!") + st.balloons() +``` + +**Step 2: Smoke test** + +Navigate to Resume Editor in the Streamlit app. Confirm all sections render and `FILL_IN` fields show amber warnings. 
+ +**Step 3: Commit** + +```bash +cd /devl/job-seeker +git add app/pages/3_Resume_Editor.py +git commit -m "feat: add Resume Editor page with form-based AIHawk YAML editor" +``` + +--- + +## Task 8: Wire up environment.yml and CLAUDE.md + +**Step 1: Export updated environment.yml** + +```bash +conda run -n job-seeker conda env export > /devl/job-seeker/environment.yml +``` + +**Step 2: Update CLAUDE.md with UI section** + +Add to `CLAUDE.md`: + +```markdown +## Web UI +- Run: `conda run -n job-seeker streamlit run app/Home.py` +- Opens at http://localhost:8501 +- staging.db is gitignored — SQLite staging layer between discovery and Notion +- Pages: Home (dashboard), Job Review, Settings, Resume Editor +``` + +**Step 3: Commit** + +```bash +cd /devl/job-seeker +git add environment.yml CLAUDE.md +git commit -m "chore: update environment.yml and CLAUDE.md for Streamlit UI" +``` + +--- + +## Quick Reference + +| Command | What it does | +|---|---| +| `conda run -n job-seeker streamlit run app/Home.py` | Launch the web UI at localhost:8501 | +| `conda run -n job-seeker python scripts/discover.py` | Scrape boards → SQLite staging | +| `conda run -n job-seeker python scripts/sync.py` | Push approved jobs → Notion | +| `conda run -n job-seeker pytest tests/ -v` | Run all tests | diff --git a/docs/plans/2026-02-21-background-tasks-design.md b/docs/plans/2026-02-21-background-tasks-design.md new file mode 100644 index 0000000..099055b --- /dev/null +++ b/docs/plans/2026-02-21-background-tasks-design.md @@ -0,0 +1,100 @@ +# Background Task Processing — Design + +**Date:** 2026-02-21 +**Status:** Approved + +## Problem + +Cover letter generation (`4_Apply.py`) and company research (`6_Interview_Prep.py`) call LLM scripts synchronously inside `st.spinner()`. If the user navigates away during generation, Streamlit abandons the in-progress call and the result is lost. 
Both results are already persisted to SQLite on completion, so if the task kept running in the background the result would be available on return. + +## Solution Overview + +Python threading + SQLite task table. When a user clicks Generate, a daemon thread is spawned immediately and the task is recorded in a new `background_tasks` table. The thread writes results to the existing tables (`jobs.cover_letter`, `company_research`) and marks itself complete/failed. All pages share a sidebar indicator that auto-refreshes while tasks are active. Individual pages show task-level status inline. + +## SQLite Schema + +New table `background_tasks` added in `scripts/db.py`: + +```sql +CREATE TABLE IF NOT EXISTS background_tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_type TEXT NOT NULL, -- "cover_letter" | "company_research" + job_id INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'queued', -- queued | running | completed | failed + error TEXT, + created_at DATETIME DEFAULT (datetime('now')), + started_at DATETIME, + finished_at DATETIME +) +``` + +## Deduplication Rule + +Before inserting a new task, check for an existing `queued` or `running` row with the same `(task_type, job_id)`. If one exists, reject the submission (return the existing task's id). Different task types for the same job (e.g. cover letter + research) are allowed to run concurrently. Different jobs of the same type are allowed concurrently. 
+ +## Components + +### `scripts/task_runner.py` (new) + +- `submit_task(db, task_type, job_id) -> int` — dedup check, insert row, spawn daemon thread, return task id +- `_run_task(db, task_id, task_type, job_id)` — thread body: mark running, call generator, save result, mark completed/failed +- `get_active_tasks(db) -> list[dict]` — all queued/running rows with job title+company joined +- `get_task_for_job(db, task_type, job_id) -> dict | None` — latest task row for a specific job+type + +### `scripts/db.py` (modified) + +- Add `init_background_tasks(conn)` called inside `init_db()` +- Add `insert_task`, `update_task_status`, `get_active_tasks`, `get_task_for_job` helpers + +### `app/app.py` (modified) + +- After `st.navigation()`, call `get_active_tasks()` and render sidebar indicator +- Use `st.fragment` with `time.sleep(3)` + `st.rerun(scope="fragment")` to poll while tasks are active +- Sidebar shows: `⏳ N task(s) running` count + per-task line (type + company name) +- Fragment polling stops when active task count reaches zero + +### `app/pages/4_Apply.py` (modified) + +- Generate button calls `submit_task(db, "cover_letter", job_id)` instead of running inline +- If a task is `queued`/`running` for the selected job, disable button and show inline status fragment (polls every 3s) +- On `completed`, load cover letter from `jobs` row (already saved by thread) +- On `failed`, show error message and re-enable button + +### `app/pages/6_Interview_Prep.py` (modified) + +- Generate/Refresh buttons call `submit_task(db, "company_research", job_id)` instead of running inline +- Same inline status fragment pattern as Apply page + +## Data Flow + +``` +User clicks Generate + → submit_task(db, type, job_id) + → dedup check (reject if already queued/running for same type+job) + → INSERT background_tasks row (status=queued) + → spawn daemon thread + → return task_id + → page shows inline "⏳ Queued…" fragment + +Thread runs + → UPDATE status=running, started_at=now + → call 
generate_cover_letter.generate() OR research_company() + → write result to jobs.cover_letter OR company_research table + → UPDATE status=completed, finished_at=now + (on exception: UPDATE status=failed, error=str(e)) + +Sidebar fragment (every 3s while active tasks > 0) + → get_active_tasks() → render count + list + → st.rerun(scope="fragment") + +Page fragment (every 3s while task for this job is running) + → get_task_for_job() → render status + → on completed: st.rerun() (full rerun to reload cover letter / research) +``` + +## What Is Not Changed + +- `generate_cover_letter.generate()` and `research_company()` are called unchanged from the thread +- `update_cover_letter()` and `save_research()` DB helpers are reused unchanged +- No new Python packages required +- No separate worker process — daemon threads die with the Streamlit server, but results already written to SQLite survive (caveat: a task interrupted mid-run keeps its `queued`/`running` status and, under the deduplication rule, blocks resubmission for that job until the row is reset) diff --git a/docs/plans/2026-02-21-background-tasks-plan.md b/docs/plans/2026-02-21-background-tasks-plan.md new file mode 100644 index 0000000..29a6b5e --- /dev/null +++ b/docs/plans/2026-02-21-background-tasks-plan.md @@ -0,0 +1,933 @@ +# Background Task Processing Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Replace synchronous LLM calls in Apply and Interview Prep pages with background threads so cover letter and research generation survive page navigation. + +**Architecture:** A new `background_tasks` SQLite table tracks task state. `scripts/task_runner.py` spawns daemon threads that call existing generator functions and write results via existing DB helpers. The Streamlit sidebar polls active tasks every 3s via `@st.fragment(run_every=3)`; individual pages show per-job status with the same pattern. 
+ +**Tech Stack:** Python `threading` (stdlib), SQLite, Streamlit `st.fragment` (≥1.37; only available as `st.experimental_fragment` in 1.33–1.36 — already installed) + +--- + +## Task 1: Add background_tasks table and DB helpers + +**Files:** +- Modify: `scripts/db.py` +- Test: `tests/test_db.py` + +### Step 1: Write the failing tests + +Add to `tests/test_db.py`: + +```python +# ── background_tasks tests ──────────────────────────────────────────────────── + +def test_init_db_creates_background_tasks_table(tmp_path): + """init_db creates a background_tasks table.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + import sqlite3 + conn = sqlite3.connect(db_path) + cur = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='background_tasks'" + ) + assert cur.fetchone() is not None + conn.close() + + +def test_insert_task_returns_id_and_true(tmp_path): + """insert_task returns (task_id, True) for a new task.""" + from scripts.db import init_db, insert_job, insert_task + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, is_new = insert_task(db_path, "cover_letter", job_id) + assert isinstance(task_id, int) and task_id > 0 + assert is_new is True + + +def test_insert_task_deduplicates_active_task(tmp_path): + """insert_task returns (existing_id, False) if a queued/running task already exists.""" + from scripts.db import init_db, insert_job, insert_task + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + first_id, _ = insert_task(db_path, "cover_letter", job_id) + second_id, is_new = 
insert_task(db_path, "cover_letter", job_id) + assert second_id == first_id + assert is_new is False + + +def test_insert_task_allows_different_types_same_job(tmp_path): + """insert_task allows cover_letter and company_research for the same job concurrently.""" + from scripts.db import init_db, insert_job, insert_task + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + _, cl_new = insert_task(db_path, "cover_letter", job_id) + _, res_new = insert_task(db_path, "company_research", job_id) + assert cl_new is True + assert res_new is True + + +def test_update_task_status_running(tmp_path): + """update_task_status('running') sets started_at.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status + import sqlite3 + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, task_id, "running") + conn = sqlite3.connect(db_path) + row = conn.execute("SELECT status, started_at FROM background_tasks WHERE id=?", (task_id,)).fetchone() + conn.close() + assert row[0] == "running" + assert row[1] is not None + + +def test_update_task_status_completed(tmp_path): + """update_task_status('completed') sets finished_at.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status + import sqlite3 + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": 
True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, task_id, "completed") + conn = sqlite3.connect(db_path) + row = conn.execute("SELECT status, finished_at FROM background_tasks WHERE id=?", (task_id,)).fetchone() + conn.close() + assert row[0] == "completed" + assert row[1] is not None + + +def test_update_task_status_failed_stores_error(tmp_path): + """update_task_status('failed') stores error message and sets finished_at.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status + import sqlite3 + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, task_id, "failed", error="LLM timeout") + conn = sqlite3.connect(db_path) + row = conn.execute("SELECT status, error, finished_at FROM background_tasks WHERE id=?", (task_id,)).fetchone() + conn.close() + assert row[0] == "failed" + assert row[1] == "LLM timeout" + assert row[2] is not None + + +def test_get_active_tasks_returns_only_active(tmp_path): + """get_active_tasks returns only queued/running tasks with job info joined.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status, get_active_tasks + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + active_id, _ = insert_task(db_path, "cover_letter", job_id) + done_id, _ = insert_task(db_path, "company_research", job_id) + update_task_status(db_path, done_id, 
"completed") + + tasks = get_active_tasks(db_path) + assert len(tasks) == 1 + assert tasks[0]["id"] == active_id + assert tasks[0]["company"] == "Acme" + assert tasks[0]["title"] == "CSM" + + +def test_get_task_for_job_returns_latest(tmp_path): + """get_task_for_job returns the most recent task for the given type+job.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status, get_task_for_job + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + first_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, first_id, "completed") + second_id, _ = insert_task(db_path, "cover_letter", job_id) # allowed since first is done + + task = get_task_for_job(db_path, "cover_letter", job_id) + assert task is not None + assert task["id"] == second_id + + +def test_get_task_for_job_returns_none_when_absent(tmp_path): + """get_task_for_job returns None when no task exists for that job+type.""" + from scripts.db import init_db, insert_job, get_task_for_job + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + assert get_task_for_job(db_path, "cover_letter", job_id) is None +``` + +### Step 2: Run tests to verify they fail + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py -v -k "background_tasks or insert_task or update_task_status or get_active_tasks or get_task_for_job" +``` + +Expected: FAIL with `ImportError: cannot import name 'insert_task'` + +### Step 3: Implement in scripts/db.py + +Add the DDL constant after `CREATE_COMPANY_RESEARCH`: + +```python 
+CREATE_BACKGROUND_TASKS = """ +CREATE TABLE IF NOT EXISTS background_tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_type TEXT NOT NULL, + job_id INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'queued', + error TEXT, + created_at DATETIME DEFAULT (datetime('now')), + started_at DATETIME, + finished_at DATETIME +) +""" +``` + +Add `conn.execute(CREATE_BACKGROUND_TASKS)` inside `init_db()`, after the existing three `conn.execute()` calls: + +```python +def init_db(db_path: Path = DEFAULT_DB) -> None: + """Create tables if they don't exist, then run migrations.""" + conn = sqlite3.connect(db_path) + conn.execute(CREATE_JOBS) + conn.execute(CREATE_JOB_CONTACTS) + conn.execute(CREATE_COMPANY_RESEARCH) + conn.execute(CREATE_BACKGROUND_TASKS) # ← add this line + conn.commit() + conn.close() + _migrate_db(db_path) +``` + +Add the four helper functions at the end of `scripts/db.py`: + +```python +# ── Background task helpers ─────────────────────────────────────────────────── + +def insert_task(db_path: Path = DEFAULT_DB, task_type: str = "", + job_id: int = None) -> tuple[int, bool]: + """Insert a new background task. + + Returns (task_id, True) if inserted, or (existing_id, False) if a + queued/running task for the same (task_type, job_id) already exists. + """ + conn = sqlite3.connect(db_path) + existing = conn.execute( + "SELECT id FROM background_tasks WHERE task_type=? AND job_id=? 
AND status IN ('queued','running')", + (task_type, job_id), + ).fetchone() + if existing: + conn.close() + return existing[0], False + cur = conn.execute( + "INSERT INTO background_tasks (task_type, job_id, status) VALUES (?, ?, 'queued')", + (task_type, job_id), + ) + task_id = cur.lastrowid + conn.commit() + conn.close() + return task_id, True + + +def update_task_status(db_path: Path = DEFAULT_DB, task_id: int = None, + status: str = "", error: Optional[str] = None) -> None: + """Update a task's status and set the appropriate timestamp.""" + now = datetime.now().isoformat()[:16] + conn = sqlite3.connect(db_path) + if status == "running": + conn.execute( + "UPDATE background_tasks SET status=?, started_at=? WHERE id=?", + (status, now, task_id), + ) + elif status in ("completed", "failed"): + conn.execute( + "UPDATE background_tasks SET status=?, finished_at=?, error=? WHERE id=?", + (status, now, error, task_id), + ) + else: + conn.execute("UPDATE background_tasks SET status=? WHERE id=?", (status, task_id)) + conn.commit() + conn.close() + + +def get_active_tasks(db_path: Path = DEFAULT_DB) -> list[dict]: + """Return all queued/running tasks with job title and company joined in.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute(""" + SELECT bt.*, j.title, j.company + FROM background_tasks bt + LEFT JOIN jobs j ON j.id = bt.job_id + WHERE bt.status IN ('queued', 'running') + ORDER BY bt.created_at ASC + """).fetchall() + conn.close() + return [dict(r) for r in rows] + + +def get_task_for_job(db_path: Path = DEFAULT_DB, task_type: str = "", + job_id: int = None) -> Optional[dict]: + """Return the most recent task row for a (task_type, job_id) pair, or None.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute( + """SELECT * FROM background_tasks + WHERE task_type=? AND job_id=? 
+ ORDER BY id DESC LIMIT 1""", + (task_type, job_id), + ).fetchone() + conn.close() + return dict(row) if row else None +``` + +### Step 4: Run tests to verify they pass + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py -v -k "background_tasks or insert_task or update_task_status or get_active_tasks or get_task_for_job" +``` + +Expected: all new tests PASS, no regressions + +### Step 5: Run full test suite + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v +``` + +Expected: all tests PASS + +### Step 6: Commit + +```bash +git add scripts/db.py tests/test_db.py +git commit -m "feat: add background_tasks table and DB helpers" +``` + +--- + +## Task 2: Create scripts/task_runner.py + +**Files:** +- Create: `scripts/task_runner.py` +- Test: `tests/test_task_runner.py` + +### Step 1: Write the failing tests + +Create `tests/test_task_runner.py`: + +```python +import threading +import time +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock +import sqlite3 + + +def _make_db(tmp_path): + from scripts.db import init_db, insert_job + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "Great role.", "date_found": "2026-02-20", + }) + return db, job_id + + +def test_submit_task_returns_id_and_true(tmp_path): + """submit_task returns (task_id, True) and spawns a thread.""" + db, job_id = _make_db(tmp_path) + with patch("scripts.task_runner._run_task"): # don't actually call LLM + from scripts.task_runner import submit_task + task_id, is_new = submit_task(db, "cover_letter", job_id) + assert isinstance(task_id, int) and task_id > 0 + assert is_new is True + + +def test_submit_task_deduplicates(tmp_path): + """submit_task returns (existing_id, False) for a duplicate in-flight task.""" + db, job_id = _make_db(tmp_path) + with 
patch("scripts.task_runner._run_task"): + from scripts.task_runner import submit_task + first_id, _ = submit_task(db, "cover_letter", job_id) + second_id, is_new = submit_task(db, "cover_letter", job_id) + assert second_id == first_id + assert is_new is False + + +def test_run_task_cover_letter_success(tmp_path): + """_run_task marks running→completed and saves cover letter to DB.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job, get_jobs_by_status + task_id, _ = insert_task(db, "cover_letter", job_id) + + with patch("scripts.generate_cover_letter.generate", return_value="Dear Hiring Manager,\nGreat fit!"): + from scripts.task_runner import _run_task + _run_task(db, task_id, "cover_letter", job_id) + + task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "completed" + assert task["error"] is None + + conn = sqlite3.connect(db) + row = conn.execute("SELECT cover_letter FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row[0] == "Dear Hiring Manager,\nGreat fit!" 
+ + +def test_run_task_company_research_success(tmp_path): + """_run_task marks running→completed and saves research to DB.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job, get_research + + task_id, _ = insert_task(db, "company_research", job_id) + fake_result = { + "raw_output": "raw", "company_brief": "brief", + "ceo_brief": "ceo", "talking_points": "points", + } + with patch("scripts.company_research.research_company", return_value=fake_result): + from scripts.task_runner import _run_task + _run_task(db, task_id, "company_research", job_id) + + task = get_task_for_job(db, "company_research", job_id) + assert task["status"] == "completed" + + research = get_research(db, job_id=job_id) + assert research["company_brief"] == "brief" + + +def test_run_task_marks_failed_on_exception(tmp_path): + """_run_task marks status=failed and stores error when generator raises.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "cover_letter", job_id) + + with patch("scripts.generate_cover_letter.generate", side_effect=RuntimeError("LLM timeout")): + from scripts.task_runner import _run_task + _run_task(db, task_id, "cover_letter", job_id) + + task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "failed" + assert "LLM timeout" in task["error"] + + +def test_submit_task_actually_completes(tmp_path): + """Integration: submit_task spawns a thread that completes asynchronously.""" + db, job_id = _make_db(tmp_path) + from scripts.db import get_task_for_job + + with patch("scripts.generate_cover_letter.generate", return_value="Cover letter text"): + from scripts.task_runner import submit_task + task_id, _ = submit_task(db, "cover_letter", job_id) + # Wait for thread to complete (max 5s) + for _ in range(50): + task = get_task_for_job(db, "cover_letter", job_id) + if task and task["status"] in ("completed", "failed"): + break + time.sleep(0.1) + 
+ task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "completed" +``` + +### Step 2: Run tests to verify they fail + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_task_runner.py -v +``` + +Expected: FAIL with `ModuleNotFoundError: No module named 'scripts.task_runner'` + +### Step 3: Implement scripts/task_runner.py + +Create `scripts/task_runner.py`: + +```python +# scripts/task_runner.py +""" +Background task runner for LLM generation tasks. + +Submitting a task inserts a row in background_tasks and spawns a daemon thread. +The thread calls the appropriate generator, writes results to existing tables, +and marks the task completed or failed. + +Deduplication: only one queued/running task per (task_type, job_id) is allowed. +Different task types for the same job run concurrently (e.g. cover letter + research). +""" +import sqlite3 +import threading +from pathlib import Path + +from scripts.db import ( + DEFAULT_DB, + insert_task, + update_task_status, + update_cover_letter, + save_research, +) + + +def submit_task(db_path: Path = DEFAULT_DB, task_type: str = "", + job_id: int = None) -> tuple[int, bool]: + """Submit a background LLM task. + + Returns (task_id, True) if a new task was queued and a thread spawned. + Returns (existing_id, False) if an identical task is already in-flight. 
+ """ + task_id, is_new = insert_task(db_path, task_type, job_id) + if is_new: + t = threading.Thread( + target=_run_task, + args=(db_path, task_id, task_type, job_id), + daemon=True, + ) + t.start() + return task_id, is_new + + +def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int) -> None: + """Thread body: run the generator and persist the result.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + if row is None: + update_task_status(db_path, task_id, "failed", error=f"Job {job_id} not found") + return + + job = dict(row) + update_task_status(db_path, task_id, "running") + + try: + if task_type == "cover_letter": + from scripts.generate_cover_letter import generate + result = generate( + job.get("title", ""), + job.get("company", ""), + job.get("description", ""), + ) + update_cover_letter(db_path, job_id, result) + + elif task_type == "company_research": + from scripts.company_research import research_company + result = research_company(job) + save_research(db_path, job_id=job_id, **result) + + else: + raise ValueError(f"Unknown task_type: {task_type!r}") + + update_task_status(db_path, task_id, "completed") + + except Exception as exc: + update_task_status(db_path, task_id, "failed", error=str(exc)) +``` + +### Step 4: Run tests to verify they pass + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_task_runner.py -v +``` + +Expected: all tests PASS + +### Step 5: Run full test suite + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v +``` + +Expected: all tests PASS + +### Step 6: Commit + +```bash +git add scripts/task_runner.py tests/test_task_runner.py +git commit -m "feat: add task_runner — background thread executor for LLM tasks" +``` + +--- + +## Task 3: Add sidebar task indicator to app/app.py + +**Files:** +- Modify: `app/app.py` + +No new tests needed — this is pure UI wiring. 
+ +### Step 1: Replace the contents of app/app.py + +Current file is 33 lines. Replace entirely with: + +```python +# app/app.py +""" +Streamlit entry point — uses st.navigation() to control the sidebar. +Main workflow pages are listed at the top; Settings is separated into +a "System" section so it doesn't crowd the navigation. + +Run: streamlit run app/app.py + bash scripts/manage-ui.sh start +""" +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import streamlit as st +from scripts.db import DEFAULT_DB, init_db, get_active_tasks + +st.set_page_config( + page_title="Job Seeker", + page_icon="💼", + layout="wide", +) + +init_db(DEFAULT_DB) + +# ── Background task sidebar indicator ───────────────────────────────────────── +@st.fragment(run_every=3) +def _task_sidebar() -> None: + tasks = get_active_tasks(DEFAULT_DB) + if not tasks: + return + with st.sidebar: + st.divider() + st.markdown(f"**⏳ {len(tasks)} task(s) running**") + for t in tasks: + icon = "⏳" if t["status"] == "running" else "🕐" + label = "Cover letter" if t["task_type"] == "cover_letter" else "Research" + st.caption(f"{icon} {label} — {t.get('company') or 'unknown'}") + +_task_sidebar() + +# ── Navigation ───────────────────────────────────────────────────────────────── +pages = { + "": [ + st.Page("Home.py", title="Home", icon="🏠"), + st.Page("pages/1_Job_Review.py", title="Job Review", icon="📋"), + st.Page("pages/4_Apply.py", title="Apply Workspace", icon="🚀"), + st.Page("pages/5_Interviews.py", title="Interviews", icon="🎯"), + st.Page("pages/6_Interview_Prep.py", title="Interview Prep", icon="📞"), + ], + "System": [ + st.Page("pages/2_Settings.py", title="Settings", icon="⚙️"), + ], +} + +pg = st.navigation(pages) +pg.run() +``` + +### Step 2: Smoke-test by running the UI + +```bash +bash /devl/job-seeker/scripts/manage-ui.sh restart +``` + +Navigate to http://localhost:8501 and confirm the app loads without error. 
The sidebar task indicator does not appear when no tasks are running (correct). Also submit a task and confirm the indicator keeps rendering on the 3-second refresh: Streamlit documents that calling `st.sidebar` inside an `@st.fragment` function is not supported, so if the refresh errors, remove the `with st.sidebar:` from the function body and wrap the `_task_sidebar()` call in `with st.sidebar:` instead. + +### Step 3: Commit + +```bash +git add app/app.py +git commit -m "feat: sidebar background task indicator with 3s auto-refresh" +``` + +--- + +## Task 4: Update 4_Apply.py to use background generation + +**Files:** +- Modify: `app/pages/4_Apply.py` + +No new unit tests — covered by existing test suite for DB layer. Smoke-test in browser. + +### Step 1: Add imports at the top of 4_Apply.py + +After the existing imports block (after `from scripts.db import ...`), add: + +```python +from scripts.db import get_task_for_job +from scripts.task_runner import submit_task +``` + +So the full import block becomes: + +```python +from scripts.db import ( + DEFAULT_DB, init_db, get_jobs_by_status, + update_cover_letter, mark_applied, + get_task_for_job, +) +from scripts.task_runner import submit_task +``` + +### Step 2: Replace the Generate button section + +Find this block (around line 174–185): + +```python + if st.button("✨ Generate / Regenerate", use_container_width=True): + with st.spinner("Generating via LLM…"): + try: + from scripts.generate_cover_letter import generate as _gen + st.session_state[_cl_key] = _gen( + job.get("title", ""), + job.get("company", ""), + job.get("description", ""), + ) + st.rerun() + except Exception as e: + st.error(f"Generation failed: {e}") +``` + +Replace with: + +```python + _cl_task = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id) + _cl_running = _cl_task and _cl_task["status"] in ("queued", "running") + + if st.button("✨ Generate / Regenerate", use_container_width=True, disabled=bool(_cl_running)): + submit_task(DEFAULT_DB, "cover_letter", selected_id) + st.rerun() + + if _cl_running: + @st.fragment(run_every=3) + def _cl_status_fragment(): + t = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id) + if t and t["status"] in ("queued", "running"): + lbl = "Queued…" if t["status"] == "queued" else "Generating via LLM…" + st.info(f"⏳ {lbl}") + 
else: + st.rerun() # full page rerun — reloads cover letter from DB + _cl_status_fragment() + elif _cl_task and _cl_task["status"] == "failed": + st.error(f"Generation failed: {_cl_task.get('error', 'unknown error')}") +``` + +The session-state initialiser just above (line 171–172) must load the cover letter from the DB after background completion. No change is needed — the existing code already does this: + +```python + if _cl_key not in st.session_state: + st.session_state[_cl_key] = job.get("cover_letter") or "" +``` + +This is fine — `job` is fetched fresh on each full-page rerun, so when the background thread writes to `jobs.cover_letter`, the next full rerun picks it up. + +### Step 3: Smoke-test in browser + +1. Navigate to Apply Workspace +2. Select an approved job +3. Click "Generate / Regenerate" +4. Navigate away to Home +5. Navigate back to Apply Workspace for the same job +6. Observe: button is disabled and "⏳ Generating via LLM…" shows while running; cover letter appears when done + +### Step 4: Commit + +```bash +git add app/pages/4_Apply.py +git commit -m "feat: cover letter generation runs in background, survives navigation" +``` + +--- + +## Task 5: Update 6_Interview_Prep.py to use background research + +**Files:** +- Modify: `app/pages/6_Interview_Prep.py` + +### Step 1: Add imports at the top of 6_Interview_Prep.py + +After the existing `from scripts.db import (...)` block, add: + +```python +from scripts.db import get_task_for_job +from scripts.task_runner import submit_task +``` + +So the full import block becomes: + +```python +from scripts.db import ( + DEFAULT_DB, init_db, + get_interview_jobs, get_contacts, get_research, + save_research, get_task_for_job, +) +from scripts.task_runner import submit_task +``` + +### Step 2: Replace the "no research yet" generate button block + +Find this block (around line 99–111): + +```python + if not research: + st.warning("No research brief yet for this job.") + if st.button("🔬 Generate research brief", type="primary", 
use_container_width=True): + with st.spinner("Generating… this may take 30–60 seconds"): + try: + from scripts.company_research import research_company + result = research_company(job) + save_research(DEFAULT_DB, job_id=selected_id, **result) + st.success("Done!") + st.rerun() + except Exception as e: + st.error(f"Error: {e}") + st.stop() + else: +``` + +Replace with: + +```python + _res_task = get_task_for_job(DEFAULT_DB, "company_research", selected_id) + _res_running = _res_task and _res_task["status"] in ("queued", "running") + + if not research: + if not _res_running: + st.warning("No research brief yet for this job.") + if _res_task and _res_task["status"] == "failed": + st.error(f"Last attempt failed: {_res_task.get('error', '')}") + if st.button("🔬 Generate research brief", type="primary", use_container_width=True): + submit_task(DEFAULT_DB, "company_research", selected_id) + st.rerun() + + if _res_running: + @st.fragment(run_every=3) + def _res_status_initial(): + t = get_task_for_job(DEFAULT_DB, "company_research", selected_id) + if t and t["status"] in ("queued", "running"): + lbl = "Queued…" if t["status"] == "queued" else "Generating… this may take 30–60 seconds" + st.info(f"⏳ {lbl}") + else: + st.rerun() + _res_status_initial() + + st.stop() + else: +``` + +### Step 3: Replace the "refresh" button block + +Find this block (around line 113–124): + +```python + generated_at = research.get("generated_at", "") + col_ts, col_btn = st.columns([3, 1]) + col_ts.caption(f"Research generated: {generated_at}") + if col_btn.button("🔄 Refresh", use_container_width=True): + with st.spinner("Refreshing…"): + try: + from scripts.company_research import research_company + result = research_company(job) + save_research(DEFAULT_DB, job_id=selected_id, **result) + st.rerun() + except Exception as e: + st.error(f"Error: {e}") +``` + +Replace with: + +```python + generated_at = research.get("generated_at", "") + col_ts, col_btn = st.columns([3, 1]) + 
col_ts.caption(f"Research generated: {generated_at}") + if col_btn.button("🔄 Refresh", use_container_width=True, disabled=bool(_res_running)): + submit_task(DEFAULT_DB, "company_research", selected_id) + st.rerun() + + if _res_running: + @st.fragment(run_every=3) + def _res_status_refresh(): + t = get_task_for_job(DEFAULT_DB, "company_research", selected_id) + if t and t["status"] in ("queued", "running"): + lbl = "Queued…" if t["status"] == "queued" else "Refreshing research…" + st.info(f"⏳ {lbl}") + else: + st.rerun() + _res_status_refresh() + elif _res_task and _res_task["status"] == "failed": + st.error(f"Refresh failed: {_res_task.get('error', '')}") +``` + +### Step 4: Smoke-test in browser + +1. Move a job to Phone Screen on the Interviews page +2. Navigate to Interview Prep, select that job +3. Click "Generate research brief" +4. Navigate away to Home +5. Navigate back — observe "⏳ Generating…" inline indicator +6. Wait for completion — research sections populate automatically + +### Step 5: Run full test suite one final time + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v +``` + +Expected: all tests PASS + +### Step 6: Commit + +```bash +git add app/pages/6_Interview_Prep.py +git commit -m "feat: company research generation runs in background, survives navigation" +``` + +--- + +## Summary of Changes + +| File | Change | +|------|--------| +| `scripts/db.py` | Add `CREATE_BACKGROUND_TASKS`, `init_db` call, 4 new helpers | +| `scripts/task_runner.py` | New file — `submit_task` + `_run_task` thread body | +| `app/app.py` | Add `_task_sidebar` fragment with 3s auto-refresh | +| `app/pages/4_Apply.py` | Generate button → `submit_task`; inline status fragment | +| `app/pages/6_Interview_Prep.py` | Generate/Refresh buttons → `submit_task`; inline status fragments | +| `tests/test_db.py` | 9 new tests for background_tasks helpers | +| `tests/test_task_runner.py` | New file — 6 tests for task_runner | diff --git 
a/docs/plans/2026-02-21-email-handling-design.md b/docs/plans/2026-02-21-email-handling-design.md new file mode 100644 index 0000000..cb570c8 --- /dev/null +++ b/docs/plans/2026-02-21-email-handling-design.md @@ -0,0 +1,91 @@ +# Email Handling Design + +**Date:** 2026-02-21 +**Status:** Approved + +## Problem + +IMAP sync already pulls emails for active pipeline jobs, but two gaps exist: +1. Inbound emails suggesting a stage change (e.g. "let's schedule a call") produce no signal — the recruiter's message just sits in the email log. +2. Recruiter outreach to email addresses not yet in the pipeline is invisible — those leads never enter Job Review. + +## Goals + +- Surface stage-change suggestions inline on the Interviews kanban card (suggest-only, never auto-advance). +- Capture recruiter leads from unmatched inbound email and surface them in Job Review. +- Make email sync a background task triggerable from the UI (Home page + Interviews sidebar). + +## Data Model + +**No new tables.** Two columns added to `job_contacts`: + +```sql +ALTER TABLE job_contacts ADD COLUMN stage_signal TEXT; +ALTER TABLE job_contacts ADD COLUMN suggestion_dismissed INTEGER DEFAULT 0; +``` + +- `stage_signal` — one of: `interview_scheduled`, `offer_received`, `rejected`, `positive_response`, `neutral` (or NULL if not yet classified). +- `suggestion_dismissed` — 1 when the user clicks Dismiss; prevents the banner re-appearing. + +Email leads reuse the existing `jobs` table with `source = 'email'` and `status = 'pending'`. No new columns needed. + +## Components + +### 1. Stage Signal Classification (`scripts/imap_sync.py`) + +After saving each **inbound** contact row, call `phi3:mini` via Ollama to classify the email into one of the five labels. Store the result in `stage_signal`. If classification fails, default to `NULL` (no suggestion shown). + +**Model:** `phi3:mini` via `LLMRouter.complete(model_override="phi3:mini", fallback_order=["ollama_research"])`. 
+Benchmarked at 100% accuracy / 3.0 s per email on a 12-case test suite. Runner-up Qwen2.5-3B untested but phi3-mini is the safe choice. + +### 2. Recruiter Lead Extraction (`scripts/imap_sync.py`) + +A second pass after per-job sync: scan INBOX broadly for recruitment-keyword emails that don't match any known pipeline company. For each unmatched email, call **Nemotron 1.5B** (already in use for company research) to extract `{company, title}`. If extraction returns a company name not already in the DB, insert a new job row `source='email', status='pending'`. + +**Dedup:** checked by `message_id` against all known contacts (cross-job), plus `url` uniqueness on the jobs table (the email lead URL is set to a synthetic `email://<from_domain>/<message_id_hash>` value). + +### 3. Background Task (`scripts/task_runner.py`) + +New task type: `email_sync` with `job_id = 0`. +`submit_task(db, "email_sync", 0)` → daemon thread → `sync_all()` → returns summary via task `error` field. + +Deduplication: only one `email_sync` can be queued/running at a time (existing insert_task logic handles this). + +### 4. UI — Sync Button (Home + Interviews) + +**Home.py:** New "Sync Emails" section alongside Find Jobs / Score / Notion sync. +**5_Interviews.py:** Existing sync button already present in sidebar; convert from synchronous `sync_all()` call to `submit_task()` + fragment polling. + +### 5. UI — Email Leads (Job Review) + +When `show_status == "pending"`, prepend email leads (`source = 'email'`) at the top of the list with a distinct `📧 Email Lead` badge. Actions are identical to scraped pending jobs (Approve / Reject). + +### 6. UI — Stage Suggestion Banner (Interviews Kanban) + +Inside `_render_card()`, before the advance/reject buttons, check for unseen stage signals: + +``` +💡 Email suggests: interview_scheduled +From: sarah@company.com · "Let's book a call" +[→ Move to Phone Screen] [Dismiss] +``` + +- "Move" calls `advance_to_stage()` + `submit_task("company_research")` then reruns. 
+- "Dismiss" calls `dismiss_stage_signal(contact_id)` then reruns. +- Only the most recent undismissed signal is shown per card. + +## Error Handling + +| Failure | Behaviour | +|---------|-----------| +| IMAP connection fails | Error stored in task `error` field; shown as warning in UI after sync | +| Classifier call fails | `stage_signal` left NULL; no suggestion shown; sync continues | +| Lead extractor fails | Email skipped; appended to `result["errors"]`; sync continues | +| Duplicate `email_sync` task | `insert_task` returns existing id; no new thread spawned | +| LLM extraction returns no company | Email silently skipped (not a lead) | + +## Out of Scope + +- Auto-advancing pipeline stage (suggest only). +- Sending email replies from the app (draft helper already exists). +- OAuth / token-refresh IMAP (config/email.yaml credentials only). diff --git a/docs/plans/2026-02-21-email-handling-plan.md b/docs/plans/2026-02-21-email-handling-plan.md new file mode 100644 index 0000000..ac75aa5 --- /dev/null +++ b/docs/plans/2026-02-21-email-handling-plan.md @@ -0,0 +1,1105 @@ +# Email Handling Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add stage-signal classification to inbound emails, recruiter lead capture from unmatched emails, email sync as a background task, and surface both in the UI. + +**Architecture:** Extend `imap_sync.py` with a phi3-mini classifier and Nemotron lead extractor; wire `email_sync` into `task_runner.py`; add two new DB helpers and two migration columns; update three UI pages. + +**Tech Stack:** Python, SQLite, imaplib, LLMRouter (Ollama phi3:mini + Nemotron 1.5B), Streamlit. 
+ +**Run tests:** `/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v` +**Conda prefix:** `conda run -n job-seeker` + +--- + +### Task 1: DB migrations — stage_signal + suggestion_dismissed columns + +**Files:** +- Modify: `scripts/db.py` +- Test: `tests/test_db.py` + +**Context:** `_CONTACT_MIGRATIONS` is a list of `(col, type)` tuples applied in `_migrate_db()`. Add to that list. Also add two helper functions: `get_unread_stage_signals(db_path, job_id)` returns contacts with a non-null, non-neutral stage_signal and `suggestion_dismissed = 0`; `dismiss_stage_signal(db_path, contact_id)` sets `suggestion_dismissed = 1`. Also update `add_contact()` to accept an optional `stage_signal` kwarg. + +**Step 1: Write the failing tests** + +In `tests/test_db.py`, append: + +```python +def test_stage_signal_columns_exist(tmp_path): + """init_db creates stage_signal and suggestion_dismissed columns on job_contacts.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + conn = sqlite3.connect(db_path) + cols = {row[1] for row in conn.execute("PRAGMA table_info(job_contacts)").fetchall()} + conn.close() + assert "stage_signal" in cols + assert "suggestion_dismissed" in cols + + +def test_add_contact_with_stage_signal(tmp_path): + """add_contact stores stage_signal when provided.""" + from scripts.db import init_db, insert_job, add_contact, get_contacts + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + add_contact(db_path, job_id=job_id, direction="inbound", + subject="Interview invite", stage_signal="interview_scheduled") + contacts = get_contacts(db_path, job_id=job_id) + assert contacts[0]["stage_signal"] == "interview_scheduled" + + +def test_get_unread_stage_signals(tmp_path): + """get_unread_stage_signals 
returns only non-neutral, non-dismissed signals.""" + from scripts.db import (init_db, insert_job, add_contact, + get_unread_stage_signals, dismiss_stage_signal) + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + c1 = add_contact(db_path, job_id=job_id, direction="inbound", + subject="Interview invite", stage_signal="interview_scheduled") + add_contact(db_path, job_id=job_id, direction="inbound", + subject="Auto-confirm", stage_signal="neutral") + signals = get_unread_stage_signals(db_path, job_id) + assert len(signals) == 1 + assert signals[0]["stage_signal"] == "interview_scheduled" + + dismiss_stage_signal(db_path, c1) + assert get_unread_stage_signals(db_path, job_id) == [] +``` + +**Step 2: Run tests to confirm they fail** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py::test_stage_signal_columns_exist tests/test_db.py::test_add_contact_with_stage_signal tests/test_db.py::test_get_unread_stage_signals -v +``` + +Expected: 3 failures. + +**Step 3: Implement in `scripts/db.py`** + +3a. In `_CONTACT_MIGRATIONS`, add: +```python +_CONTACT_MIGRATIONS = [ + ("message_id", "TEXT"), + ("stage_signal", "TEXT"), + ("suggestion_dismissed", "INTEGER DEFAULT 0"), +] +``` + +3b. Update `add_contact()` signature and INSERT: +```python +def add_contact(db_path: Path = DEFAULT_DB, job_id: int = None, + direction: str = "inbound", subject: str = "", + from_addr: str = "", to_addr: str = "", + body: str = "", received_at: str = "", + message_id: str = "", + stage_signal: str = "") -> int: + """Log an email contact. 
Returns the new row id.""" + ts = received_at or datetime.now().isoformat()[:16] + conn = sqlite3.connect(db_path) + cur = conn.execute( + """INSERT INTO job_contacts + (job_id, direction, subject, from_addr, to_addr, body, + received_at, message_id, stage_signal) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + (job_id, direction, subject, from_addr, to_addr, body, + ts, message_id, stage_signal or None), + ) + conn.commit() + row_id = cur.lastrowid + conn.close() + return row_id +``` + +3c. Add the two new helpers after `get_contacts()`: +```python +def get_unread_stage_signals(db_path: Path = DEFAULT_DB, + job_id: int = None) -> list[dict]: + """Return inbound contacts with a non-neutral, non-dismissed stage signal.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """SELECT * FROM job_contacts + WHERE job_id = ? + AND direction = 'inbound' + AND stage_signal IS NOT NULL + AND stage_signal != 'neutral' + AND (suggestion_dismissed IS NULL OR suggestion_dismissed = 0) + ORDER BY received_at ASC""", + (job_id,), + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + +def dismiss_stage_signal(db_path: Path = DEFAULT_DB, + contact_id: int = None) -> None: + """Mark a stage signal suggestion as dismissed.""" + conn = sqlite3.connect(db_path) + conn.execute( + "UPDATE job_contacts SET suggestion_dismissed = 1 WHERE id = ?", + (contact_id,), + ) + conn.commit() + conn.close() +``` + +3d. Add `get_all_message_ids()` (needed for lead dedup in Task 3): +```python +def get_all_message_ids(db_path: Path = DEFAULT_DB) -> set[str]: + """Return all known Message-IDs across all job contacts.""" + conn = sqlite3.connect(db_path) + rows = conn.execute( + "SELECT message_id FROM job_contacts WHERE message_id IS NOT NULL AND message_id != ''" + ).fetchall() + conn.close() + return {r[0] for r in rows} +``` + +**Step 4: Run tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py -v +``` + +Expected: all pass. 
+ +**Step 5: Commit** + +```bash +git add scripts/db.py tests/test_db.py +git commit -m "feat: add stage_signal/suggestion_dismissed columns and helpers to db" +``` + +--- + +### Task 2: Stage signal classifier in imap_sync.py + +**Files:** +- Modify: `scripts/imap_sync.py` +- Test: `tests/test_imap_sync.py` (create) + +**Context:** Add a `classify_stage_signal(subject, body)` function that calls phi3:mini via LLMRouter and returns one of the 5 label strings. It must gracefully return `None` on any failure (network, timeout, model not loaded). The label parsing must strip `<think>…</think>` tags in case a thinking-capable model is used. + +**Step 1: Write the failing test** + +Create `tests/test_imap_sync.py`: + +```python +"""Tests for imap_sync helpers (no live IMAP connection required).""" +import pytest +from unittest.mock import patch + + +def test_classify_stage_signal_interview(tmp_path): + """classify_stage_signal returns interview_scheduled for a call-scheduling email.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "interview_scheduled" + result = classify_stage_signal( + "Let's schedule a call", + "Hi Alex, we'd love to book a 30-min phone screen with you.", + ) + assert result == "interview_scheduled" + + +def test_classify_stage_signal_returns_none_on_error(tmp_path): + """classify_stage_signal returns None when LLM call raises.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.side_effect = RuntimeError("model not loaded") + result = classify_stage_signal("subject", "body") + assert result is None + + +def test_classify_stage_signal_strips_think_tags(tmp_path): + """classify_stage_signal strips <think>…</think> blocks before parsing.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + 
mock_router.complete.return_value = "<think>Let me think…</think>\nrejected" + result = classify_stage_signal("Update on your application", "We went with another candidate.") + assert result == "rejected" + + +def test_normalise_company(): + """_normalise_company strips legal suffixes.""" + from scripts.imap_sync import _normalise_company + assert _normalise_company("DataStax, Inc.") == "DataStax" + assert _normalise_company("Wiz Ltd") == "Wiz" + assert _normalise_company("Crusoe Energy") == "Crusoe Energy" + + +def test_has_recruitment_keyword(): + """_has_recruitment_keyword matches known keywords.""" + from scripts.imap_sync import _has_recruitment_keyword + assert _has_recruitment_keyword("Interview Invitation — Senior TAM") + assert _has_recruitment_keyword("Your application with DataStax") + assert not _has_recruitment_keyword("Team lunch tomorrow") +``` + +**Step 2: Run to confirm failures** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py -v +``` + +Expected: ImportError or failures on `classify_stage_signal` and `_CLASSIFIER_ROUTER`. + +**Step 3: Implement in `scripts/imap_sync.py`** + +After the existing imports, add: + +```python +import re as _re + +from scripts.llm_router import LLMRouter + +_CLASSIFIER_ROUTER = LLMRouter() + +_CLASSIFY_SYSTEM = ( + "You are an email classifier. Classify the recruitment email into exactly ONE of these categories:\n" + " interview_scheduled, offer_received, rejected, positive_response, neutral\n\n" + "Rules:\n" + "- interview_scheduled: recruiter wants to book a call/interview\n" + "- offer_received: job offer is being extended\n" + "- rejected: explicitly not moving forward\n" + "- positive_response: interested/impressed but no interview booked yet\n" + "- neutral: auto-confirmation, generic update, no clear signal\n\n" + "Respond with ONLY the category name. No explanation." 
+) + +_CLASSIFY_LABELS = [ + "interview_scheduled", "offer_received", "rejected", + "positive_response", "neutral", +] + + +def classify_stage_signal(subject: str, body: str) -> Optional[str]: + """Classify an inbound email into a pipeline stage signal. + + Returns one of the 5 label strings, or None on failure. + Uses phi3:mini via Ollama (benchmarked 100% on 12-case test set). + """ + try: + prompt = f"Subject: {subject}\n\nEmail: {body[:400]}" + raw = _CLASSIFIER_ROUTER.complete( + prompt, + system=_CLASSIFY_SYSTEM, + model_override="phi3:mini", + fallback_order=["ollama_research"], + ) + # Strip <think>…</think> blocks (in case a reasoning model slips through) + text = _re.sub(r"<think>.*?</think>", "", raw, flags=_re.DOTALL) + text = text.lower().strip() + for label in _CLASSIFY_LABELS: + if text.startswith(label) or label in text: + return label + return "neutral" + except Exception: + return None +``` + +**Step 4: Run tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py -v +``` + +Expected: all 5 pass. + +**Step 5: Commit** + +```bash +git add scripts/imap_sync.py tests/test_imap_sync.py +git commit -m "feat: add classify_stage_signal to imap_sync using phi3:mini" +``` + +--- + +### Task 3: Classify inbound contacts during per-job sync + +**Files:** +- Modify: `scripts/imap_sync.py` +- Test: `tests/test_imap_sync.py` + +**Context:** Inside `sync_job_emails()`, after calling `add_contact()` for an inbound email, call `classify_stage_signal()` and — if the result is non-None and non-'neutral' — update the `stage_signal` column via a direct SQLite update (no new db.py helper needed; avoid round-tripping through `add_contact`). The `contact_id` is already returned by `add_contact()`. + +We need a tiny helper `_update_contact_signal(db_path, contact_id, signal)` locally in imap_sync.py. Do NOT add this to db.py — it's only used here. 
+ +**Step 1: Add test** + +Append to `tests/test_imap_sync.py`: + +```python +def test_sync_job_emails_classifies_inbound(tmp_path): + """sync_job_emails classifies inbound emails and stores the stage_signal.""" + from scripts.db import init_db, insert_job, get_contacts + from scripts.imap_sync import sync_job_emails + + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", + "url": "https://acme.com/jobs/1", + "source": "linkedin", "location": "Remote", + "is_remote": True, "salary": "", "description": "", + "date_found": "2026-02-21", + }) + job = {"id": job_id, "company": "Acme", "url": "https://acme.com/jobs/1"} + + # Fake IMAP connection + one inbound email + from unittest.mock import MagicMock, patch + + fake_msg_bytes = ( + b"From: recruiter@acme.com\r\n" + b"To: alex@example.com\r\n" + b"Subject: Interview Invitation\r\n" + b"Message-ID: \r\n" + b"\r\n" + b"Hi Alex, we'd like to schedule a phone screen." + ) + + conn_mock = MagicMock() + conn_mock.select.return_value = ("OK", [b"1"]) + conn_mock.search.return_value = ("OK", [b"1"]) + conn_mock.fetch.return_value = ("OK", [(b"1 (RFC822 {123})", fake_msg_bytes)]) + + with patch("scripts.imap_sync.classify_stage_signal", return_value="interview_scheduled"): + inb, out = sync_job_emails(job, conn_mock, {"lookback_days": 90}, db_path) + + assert inb == 1 + contacts = get_contacts(db_path, job_id=job_id) + assert contacts[0]["stage_signal"] == "interview_scheduled" +``` + +**Step 2: Run to confirm failure** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py::test_sync_job_emails_classifies_inbound -v +``` + +Expected: FAIL (stage_signal is None). 
+ +**Step 3: Update `sync_job_emails()` in `scripts/imap_sync.py`** + +Add the private helper just before `sync_job_emails`: + +```python +def _update_contact_signal(db_path: Path, contact_id: int, signal: str) -> None: + """Write a stage signal onto an existing contact row.""" + import sqlite3 as _sqlite3 + conn = _sqlite3.connect(db_path) + conn.execute( + "UPDATE job_contacts SET stage_signal = ? WHERE id = ?", + (signal, contact_id), + ) + conn.commit() + conn.close() +``` + +In the INBOX loop inside `sync_job_emails()`, after the `add_contact(...)` call, add: + +```python +signal = classify_stage_signal(parsed["subject"], parsed["body"]) +if signal and signal != "neutral": + _update_contact_signal(db_path, contact_id, signal) +``` + +Note: `add_contact()` already returns the `row_id` (the contact_id). Make sure to capture it: + +```python +contact_id = add_contact( + db_path, job_id=job["id"], direction="inbound", + ... +) +signal = classify_stage_signal(parsed["subject"], parsed["body"]) +if signal and signal != "neutral": + _update_contact_signal(db_path, contact_id, signal) +``` + +**Step 4: Run tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py -v +``` + +Expected: all pass. + +**Step 5: Commit** + +```bash +git add scripts/imap_sync.py tests/test_imap_sync.py +git commit -m "feat: classify stage signals for inbound emails during per-job sync" +``` + +--- + +### Task 4: Recruiter lead extractor + unmatched email handling + +**Files:** +- Modify: `scripts/imap_sync.py` +- Modify: `scripts/db.py` +- Test: `tests/test_imap_sync.py` + +**Context:** After per-job sync, do a second pass to find inbound recruitment emails NOT matched to any existing pipeline company. For each, call Nemotron to extract company + job title. If extraction succeeds and company isn't already in the DB, insert a new job (`source='email', status='pending'`). Use a synthetic URL `email://<from_domain>/<message_id_hash>` to satisfy the UNIQUE constraint on `jobs.url`. 
+ +`sync_all()` return dict gains a `new_leads` key. + +**Step 1: Add test** + +Append to `tests/test_imap_sync.py`: + +```python +def test_extract_lead_info_returns_company_and_title(): + """extract_lead_info parses LLM JSON response into (company, title).""" + from scripts.imap_sync import extract_lead_info + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = '{"company": "Wiz", "title": "Senior TAM"}' + result = extract_lead_info("Senior TAM at Wiz", "Hi Alex, we have a role…", "recruiter@wiz.com") + assert result == ("Wiz", "Senior TAM") + + +def test_extract_lead_info_returns_none_on_bad_json(): + """extract_lead_info returns (None, None) when LLM returns unparseable output.""" + from scripts.imap_sync import extract_lead_info + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "I cannot determine the company." + result = extract_lead_info("Job opportunity", "blah", "noreply@example.com") + assert result == (None, None) +``` + +**Step 2: Run to confirm failures** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py::test_extract_lead_info_returns_company_and_title tests/test_imap_sync.py::test_extract_lead_info_returns_none_on_bad_json -v +``` + +Expected: 2 failures. + +**Step 3: Implement `extract_lead_info()` in `scripts/imap_sync.py`** + +Add after `classify_stage_signal()`: + +```python +_EXTRACT_SYSTEM = ( + "Extract the hiring company name and job title from this recruitment email. " + "Respond with ONLY valid JSON in this exact format: " + '{\"company\": \"Company Name\", \"title\": \"Job Title\"}. ' + "If you cannot determine the company, respond: " + '{\"company\": null, \"title\": null}.' +) + + +def extract_lead_info(subject: str, body: str, + from_addr: str) -> tuple[Optional[str], Optional[str]]: + """Use Nemotron to extract (company, title) from an unmatched recruitment email. 
+ + Returns (company, title) or (None, None) on failure / low confidence. + """ + import json as _json + try: + prompt = ( + f"From: {from_addr}\n" + f"Subject: {subject}\n\n" + f"Email excerpt:\n{body[:600]}" + ) + raw = _CLASSIFIER_ROUTER.complete( + prompt, + system=_EXTRACT_SYSTEM, + fallback_order=["ollama_research"], + ) + # Strip <think>…</think> blocks + text = _re.sub(r"<think>.*?</think>", "", raw, flags=_re.DOTALL).strip() + # Find first JSON object in response + m = _re.search(r'\{.*\}', text, _re.DOTALL) + if not m: + return None, None + data = _json.loads(m.group()) + company = data.get("company") or None + title = data.get("title") or None + return company, title + except Exception: + return None, None +``` + +**Step 4: Implement `_scan_unmatched_leads()` in `scripts/imap_sync.py`** + +Add this function. It uses the existing IMAP connection after per-job sync: + +```python +def _scan_unmatched_leads(conn: imaplib.IMAP4, cfg: dict, + db_path: Path, + known_message_ids: set[str]) -> int: + """Scan INBOX for recruitment emails not matched to any pipeline job. + + Calls LLM to extract company/title; inserts qualifying emails as email leads. + Returns the count of new leads inserted. 
+ """ + from scripts.db import get_existing_urls, insert_job, add_contact + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + + # Broad search — subject matches common recruiter terms + broad_terms = ["interview", "opportunity", "offer", "application", "role"] + all_uids: set[bytes] = set() + for term in broad_terms: + uids = _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since) + all_uids.update(uids) + + existing_urls = get_existing_urls(db_path) + new_leads = 0 + + for uid in all_uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + mid = parsed["message_id"] + if mid in known_message_ids: + continue # already synced to some job + if not _has_recruitment_keyword(parsed["subject"]): + continue # false positive from broad search + + company, title = extract_lead_info( + parsed["subject"], parsed["body"], parsed["from_addr"] + ) + if not company: + continue + + # Build a synthetic URL for dedup + from_domain = _extract_domain(parsed["from_addr"]) or "unknown" + mid_hash = str(abs(hash(mid)))[:10] + synthetic_url = f"email://{from_domain}/{mid_hash}" + + if synthetic_url in existing_urls: + continue # already captured this lead + + job_id = insert_job(db_path, { + "title": title or "(untitled)", + "company": company, + "url": synthetic_url, + "source": "email", + "location": "", + "is_remote": 0, + "salary": "", + "description": parsed["body"][:2000], + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + add_contact(db_path, job_id=job_id, direction="inbound", + subject=parsed["subject"], + from_addr=parsed["from_addr"], + body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else "", + message_id=mid) + known_message_ids.add(mid) + existing_urls.add(synthetic_url) + new_leads += 1 + + return new_leads +``` + +**Step 5: Update `sync_all()` to call `_scan_unmatched_leads()`** + +In `sync_all()`, after the per-job loop and before 
`conn.logout()`: + +```python +from scripts.db import get_all_message_ids +known_mids = get_all_message_ids(db_path) +summary["new_leads"] = _scan_unmatched_leads(conn, cfg, db_path, known_mids) +``` + +Also add `"new_leads": 0` to the initial `summary` dict. + +**Step 6: Run tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py -v +``` + +Expected: all pass. + +**Step 7: Commit** + +```bash +git add scripts/imap_sync.py scripts/db.py tests/test_imap_sync.py +git commit -m "feat: recruiter lead extraction from unmatched inbound emails" +``` + +--- + +### Task 5: email_sync background task type + +**Files:** +- Modify: `scripts/task_runner.py` +- Test: `tests/test_task_runner.py` + +**Context:** Add `email_sync` to the `if/elif` chain in `_run_task()`. `job_id` is 0 (global task). The result summary is stored in the task's `error` field as a string (same pattern as `discovery`). If IMAP config is missing (`FileNotFoundError`), mark failed with a friendly message. 
+ +**Step 1: Add test** + +Append to `tests/test_task_runner.py`: + +```python +def test_run_task_email_sync_success(tmp_path): + """email_sync task calls sync_all and marks completed with summary.""" + db, _ = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "email_sync", 0) + + summary = {"synced": 3, "inbound": 5, "outbound": 2, "new_leads": 1, "errors": []} + with patch("scripts.imap_sync.sync_all", return_value=summary): + from scripts.task_runner import _run_task + _run_task(db, task_id, "email_sync", 0) + + task = get_task_for_job(db, "email_sync", 0) + assert task["status"] == "completed" + assert "3 jobs" in task["error"] + + +def test_run_task_email_sync_file_not_found(tmp_path): + """email_sync marks failed with helpful message when config is missing.""" + db, _ = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "email_sync", 0) + + with patch("scripts.imap_sync.sync_all", side_effect=FileNotFoundError("config/email.yaml")): + from scripts.task_runner import _run_task + _run_task(db, task_id, "email_sync", 0) + + task = get_task_for_job(db, "email_sync", 0) + assert task["status"] == "failed" + assert "email" in task["error"].lower() +``` + +**Step 2: Run to confirm failures** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_task_runner.py::test_run_task_email_sync_success tests/test_task_runner.py::test_run_task_email_sync_file_not_found -v +``` + +Expected: 2 failures. 
+ +**Step 3: Add email_sync branch to `_run_task()` in `scripts/task_runner.py`** + +Add after the `company_research` elif, before the `else`: + +```python +elif task_type == "email_sync": + try: + from scripts.imap_sync import sync_all + result = sync_all(db_path) + leads = result.get("new_leads", 0) + errs = len(result.get("errors", [])) + msg = ( + f"{result['synced']} jobs updated, " + f"+{result['inbound']} in, +{result['outbound']} out" + f"{f', {leads} new lead(s)' if leads else ''}" + f"{f', {errs} error(s)' if errs else ''}" + ) + update_task_status(db_path, task_id, "completed", error=msg) + return + except FileNotFoundError: + update_task_status(db_path, task_id, "failed", + error="Email not configured — go to Settings → Email") + return +``` + +**Step 4: Run tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_task_runner.py -v +``` + +Expected: all pass. + +**Step 5: Commit** + +```bash +git add scripts/task_runner.py tests/test_task_runner.py +git commit -m "feat: add email_sync background task type to task_runner" +``` + +--- + +### Task 6: Sync Emails button on Home page + +**Files:** +- Modify: `app/Home.py` + +**Context:** Home.py has three sections in `left / mid / right` columns (Find Jobs, Score Listings, Send to Notion). Add a fourth section. Since we can't easily add a 4th column to the same row without crowding, add it as a new row below the divider, before the Danger Zone expander. Use the same background task pattern as discovery: check for an in-flight `email_sync` task, disable button if running, poll with `@st.fragment(run_every=4)`. + +No import changes are needed: `submit_task` and `get_task_for_job` are already imported, and `get_all_message_ids` is not used on this page. + +Also update the success message to show new_leads if any. + +No tests needed for UI pages (Streamlit pages aren't unit-testable without an e2e framework). 
+ +**Step 1: Add Email Sync section to `app/Home.py`** + +After the `with right:` block and before `st.divider()` (the one before Danger Zone), add: + +```python +st.divider() + +# ── Email Sync ──────────────────────────────────────────────────────────────── +email_left, email_right = st.columns([3, 1]) + +with email_left: + st.subheader("Sync Emails") + st.caption("Pull inbound recruiter emails and match them to active applications. " + "New recruiter outreach is added to your Job Review queue.") + +with email_right: + _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0) + _email_running = _email_task and _email_task["status"] in ("queued", "running") + + if st.button("📧 Sync Emails", use_container_width=True, type="primary", + disabled=bool(_email_running)): + submit_task(DEFAULT_DB, "email_sync", 0) + st.rerun() + + if _email_running: + @st.fragment(run_every=4) + def _email_status(): + t = get_task_for_job(DEFAULT_DB, "email_sync", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Syncing emails…") + else: + st.rerun() + _email_status() + elif _email_task and _email_task["status"] == "completed": + st.success(f"✅ {_email_task.get('error', 'Done')}") + elif _email_task and _email_task["status"] == "failed": + st.error(f"Sync failed: {_email_task.get('error', '')}") +``` + +**Step 2: Manual smoke test** + +```bash +bash /devl/job-seeker/scripts/manage-ui.sh restart +``` + +Open http://localhost:8501, confirm "Sync Emails" section appears with button. + +**Step 3: Commit** + +```bash +git add app/Home.py +git commit -m "feat: add Sync Emails background task button to Home page" +``` + +--- + +### Task 7: Convert Interviews sync to background task + add stage suggestion banner + +**Files:** +- Modify: `app/pages/5_Interviews.py` + +**Context:** The sidebar sync button in 5_Interviews.py currently calls `sync_all()` synchronously inside a `with st.spinner(...)` block (lines 38–61). 
Replace it with `submit_task(DEFAULT_DB, "email_sync", 0)` + fragment polling, matching the pattern in Home.py. + +Then add the stage suggestion banner in `_render_card()`. After the interview date form (or at the top of the "if not compact:" block), call `get_unread_stage_signals()`. If any exist, show the most recent one with → Move and Dismiss buttons. + +The banner should only show for stages where a stage advancement makes sense: `applied`, `phone_screen`, `interviewing`. Not `offer` or `hired`. + +**Step 1: Update imports in `5_Interviews.py`** + +Add to the existing `from scripts.db import (...)` block: +- `get_unread_stage_signals` +- `dismiss_stage_signal` + +The `from scripts.task_runner import submit_task` line is already present; no change is needed there. + +**Step 2: Replace synchronous sync button** + +Replace the entire `with st.sidebar:` block (lines 38–61) with: + +```python +with st.sidebar: + st.markdown("### 📧 Email Sync") + _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0) + _email_running = _email_task and _email_task["status"] in ("queued", "running") + + if st.button("🔄 Sync Emails", use_container_width=True, type="primary", + disabled=bool(_email_running)): + submit_task(DEFAULT_DB, "email_sync", 0) + st.rerun() + + if _email_running: + @st.fragment(run_every=4) + def _email_sidebar_status(): + t = get_task_for_job(DEFAULT_DB, "email_sync", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Syncing…") + else: + st.rerun() + _email_sidebar_status() + elif _email_task and _email_task["status"] == "completed": + st.success(_email_task.get("error", "Done")) + elif _email_task and _email_task["status"] == "failed": + msg = _email_task.get("error", "") + if "not configured" in msg.lower(): + st.error("Email not configured. 
Go to **Settings → Email**.") + else: + st.error(f"Sync failed: {msg}") +``` + +**Step 3: Add stage suggestion banner in `_render_card()`** + +Inside `_render_card()`, at the start of the `if not compact:` block (just before `# Advance / Reject buttons`), add: + +```python +if stage in ("applied", "phone_screen", "interviewing"): + signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id) + if signals: + sig = signals[-1] # most recent + _SIGNAL_LABELS = { + "interview_scheduled": ("📞 Phone Screen", "phone_screen"), + "positive_response": ("📞 Phone Screen", "phone_screen"), + "offer_received": ("📜 Offer", "offer"), + "rejected": ("✗ Reject", None), + } + label_text, target_stage = _SIGNAL_LABELS.get(sig["stage_signal"], (None, None)) + with st.container(border=True): + st.caption( + f"💡 Email suggests: **{sig['stage_signal'].replace('_', ' ')}** \n" + f"_{sig.get('subject', '')}_ · {(sig.get('received_at') or '')[:10]}" + ) + b1, b2 = st.columns(2) + if target_stage and b1.button( + f"→ {label_text}", key=f"sig_adv_{sig['id']}", + use_container_width=True, type="primary", + ): + if target_stage == "phone_screen" and stage == "applied": + advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen") + submit_task(DEFAULT_DB, "company_research", job_id) + elif target_stage: + advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage) + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun() + elif label_text == "✗ Reject" and b1.button( + "✗ Reject", key=f"sig_rej_{sig['id']}", + use_container_width=True, + ): + reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage) + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun() + if b2.button("Dismiss", key=f"sig_dis_{sig['id']}", + use_container_width=True): + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun() +``` + +**Step 4: Manual smoke test** + +```bash +bash /devl/job-seeker/scripts/manage-ui.sh restart +``` + +Open Interviews page, confirm sidebar sync button is present and 
non-blocking. + +**Step 5: Commit** + +```bash +git add app/pages/5_Interviews.py +git commit -m "feat: non-blocking email sync + stage suggestion banner on Interviews kanban" +``` + +--- + +### Task 8: Email leads section in Job Review + +**Files:** +- Modify: `app/pages/1_Job_Review.py` +- Modify: `scripts/db.py` + +**Context:** Email leads are jobs with `source = 'email'` and `status = 'pending'`. They already appear in the `pending` list returned by `get_jobs_by_status()`. We want to visually separate them at the top when `show_status == 'pending'`. + +Add a `get_email_leads(db_path)` helper in `scripts/db.py` that returns pending email-source jobs ordered by `date_found DESC`. In the Job Review page, before the main job list loop, if `show_status == 'pending'`, pull email leads and render them in a distinct section with an `📧 Email Lead` badge. Then render the remaining (non-email) pending jobs below. + +**Step 1: Add test for new DB helper** + +Append to `tests/test_db.py`: + +```python +def test_get_email_leads(tmp_path): + """get_email_leads returns only source='email' pending jobs.""" + from scripts.db import init_db, insert_job, get_email_leads + db_path = tmp_path / "test.db" + init_db(db_path) + insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + insert_job(db_path, { + "title": "TAM", "company": "Wiz", "url": "email://wiz.com/abc123", + "source": "email", "location": "", "is_remote": 0, + "salary": "", "description": "Hi Alex…", "date_found": "2026-02-21", + }) + leads = get_email_leads(db_path) + assert len(leads) == 1 + assert leads[0]["company"] == "Wiz" + assert leads[0]["source"] == "email" +``` + +**Step 2: Run to confirm failure** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py::test_get_email_leads -v +``` + +Expected: FAIL (ImportError or function 
missing). + +**Step 3: Add `get_email_leads()` to `scripts/db.py`** + +After `get_jobs_by_status()`: + +```python +def get_email_leads(db_path: Path = DEFAULT_DB) -> list[dict]: + """Return pending jobs with source='email', newest first.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT * FROM jobs WHERE source = 'email' AND status = 'pending' " + "ORDER BY date_found DESC, id DESC" + ).fetchall() + conn.close() + return [dict(r) for r in rows] +``` + +**Step 4: Run test** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py::test_get_email_leads -v +``` + +Expected: PASS. + +**Step 5: Update `1_Job_Review.py`** + +Add to the top-level import from `scripts.db`: +- `get_email_leads` + +After `init_db(DEFAULT_DB)` and before the sidebar filters block, add: + +```python +# ── Email leads (shown only when browsing pending) ──────────────────────────── +_email_leads = get_email_leads(DEFAULT_DB) if True else [] +``` + +(We always fetch them; the section only renders when `show_status == 'pending'`.) + +After `st.divider()` (after the caption line) and before the main `for job in jobs:` loop, add: + +```python +if show_status == "pending" and _email_leads: + st.subheader(f"📧 Email Leads ({len(_email_leads)})") + st.caption( + "Inbound recruiter emails not yet matched to a scraped listing. " + "Approve to move to Job Review; Reject to dismiss." 
+ ) + for lead in _email_leads: + lead_id = lead["id"] + with st.container(border=True): + left_l, right_l = st.columns([7, 3]) + with left_l: + st.markdown(f"**{lead['title']}** — {lead['company']}") + badge_cols = st.columns(4) + badge_cols[0].caption("📧 Email Lead") + badge_cols[1].caption(f"📅 {lead.get('date_found', '')}") + if lead.get("description"): + with st.expander("📄 Email excerpt", expanded=False): + st.text(lead["description"][:500]) + with right_l: + if st.button("✅ Approve", key=f"el_approve_{lead_id}", + type="primary", use_container_width=True): + update_job_status(DEFAULT_DB, [lead_id], "approved") + st.rerun() + if st.button("❌ Reject", key=f"el_reject_{lead_id}", + use_container_width=True): + update_job_status(DEFAULT_DB, [lead_id], "rejected") + st.rerun() + st.divider() + +# Filter out email leads from the main pending list (already shown above) +if show_status == "pending": + jobs = [j for j in jobs if j.get("source") != "email"] +``` + +**Step 6: Manual smoke test** + +```bash +bash /devl/job-seeker/scripts/manage-ui.sh restart +``` + +Confirm Job Review shows "Email Leads" section when filtering for pending. + +**Step 7: Commit** + +```bash +git add scripts/db.py tests/test_db.py app/pages/1_Job_Review.py +git commit -m "feat: show email lead jobs at top of Job Review pending queue" +``` + +--- + +### Task 9: Full test run + final polish + +**Files:** +- No new files + +**Step 1: Run full test suite** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v +``` + +Expected: all pass. Fix any regressions before proceeding. 
+ +**Step 2: Verify DB exports in `scripts/db.py`** + +Confirm that `get_unread_stage_signals`, `dismiss_stage_signal`, `get_all_message_ids`, and `get_email_leads` are imported correctly wherever used: +- `5_Interviews.py` imports `get_unread_stage_signals`, `dismiss_stage_signal` +- `imap_sync.py` imports `get_all_message_ids` +- `1_Job_Review.py` imports `get_email_leads` + +Run: +```bash +conda run -n job-seeker python -c "from scripts.db import get_unread_stage_signals, dismiss_stage_signal, get_all_message_ids, get_email_leads; print('OK')" +``` + +**Step 3: Smoke-test the classifier with real Ollama** + +```bash +conda run -n job-seeker python -c " +from scripts.imap_sync import classify_stage_signal +print(classify_stage_signal('Interview Invitation', 'We would love to schedule a 30-min phone screen with you.')) +print(classify_stage_signal('Your application with DataStax', 'We have decided to move forward with other candidates.')) +print(classify_stage_signal('Application received', 'We have received your application and will be in touch.')) +" +``` + +Expected output: +``` +interview_scheduled +rejected +neutral +``` + +**Step 4: Commit** + +```bash +git add -A +git commit -m "chore: verify all email handling imports and run full test suite" +``` diff --git a/docs/plans/2026-02-22-research-workflow-design.md b/docs/plans/2026-02-22-research-workflow-design.md new file mode 100644 index 0000000..1277357 --- /dev/null +++ b/docs/plans/2026-02-22-research-workflow-design.md @@ -0,0 +1,187 @@ +# Research Workflow Redesign + +**Date:** 2026-02-22 +**Status:** Approved + +## Problem + +The current `company_research.py` produces shallow output: +- Resume context is a hardcoded 2-sentence blurb — talking points aren't grounded in Alex's actual experience +- Search coverage is limited: CEO, HQ, LinkedIn, one generic news query +- Output has 4 sections; new data categories (tech stack, funding, culture, competitors) have nowhere to go +- No skills/keyword config to 
drive experience matching against the JD + +## Approach: Query Expansion + Parallel JSON Searches + Single LLM Pass + +Run all searches (companyScraper sequential + new parallel SearXNG JSON queries), aggregate into a structured context block, pre-select resume experiences by keyword score, single LLM call produces all expanded sections. + +--- + +## Design + +### 1. Search Pipeline + +**Phase 1 — companyScraper (unchanged, sequential)** +- CEO name, HQ address, LinkedIn URL + +**Phase 1b — Parallel SearXNG JSON queries (new/expanded)** + +Six queries run concurrently via daemon threads: + +| Intent | Query pattern | +|---|---| +| Recent news/press | `"{company}" news 2025 2026` | +| Funding & investors | `"{company}" funding round investors Series valuation` | +| Tech stack | `"{company}" tech stack engineering technology platform` | +| Competitors | `"{company}" competitors alternatives vs market` | +| Culture / Glassdoor | `"{company}" glassdoor culture reviews employees` | +| CEO press (if found) | `"{ceo}" "{company}"` | + +Each returns 3–4 deduplicated snippets (title + content + URL), labeled by type. +Results are best-effort — any failed query is silently skipped. + +--- + +### 2. Resume Matching + +**`config/resume_keywords.yaml`** — three categories, tag-managed via Settings UI: + +```yaml +skills: + - Customer Success + - Technical Account Management + - Revenue Operations + - Salesforce + - Gainsight + - data analysis + - stakeholder management + +domains: + - B2B SaaS + - enterprise software + - security / compliance + - post-sale lifecycle + +keywords: + - QBR + - churn reduction + - NRR / ARR + - onboarding + - renewal + - executive sponsorship + - VOC +``` + +**Matching logic:** +1. Case-insensitive substring check of all keywords against JD text → `matched_keywords` list +2. Score each experience entry: count of matched keywords appearing in position title + responsibility bullets +3. 
Top 2 by score → included in prompt as full detail (position, company, period, all bullets) +4. Remaining entries → condensed one-liners ("Founder @ M3 Consulting, 2023–present") + +**UpGuard NDA rule** (explicit in prompt): reference as "enterprise security vendor" in general; only name UpGuard directly if the role has a strong security/compliance focus. + +--- + +### 3. LLM Context Block Structure + +``` +## Role Context +{title} at {company} + +## Job Description +{JD text, up to 2500 chars} + +## Alex's Matched Experience +[Top 2 scored experience entries — full detail] + +Also in Alex's background: [remaining entries as one-liners] + +## Matched Skills & Keywords +Skills matching this JD: {matched_keywords joined} + +## Live Company Data +- CEO: {name} +- HQ: {location} +- LinkedIn: {url} + +## News & Press +[snippets] + +## Funding & Investors +[snippets] + +## Tech Stack +[snippets] + +## Competitors +[snippets] + +## Culture & Employee Signals +[snippets] +``` + +--- + +### 4. Output Sections (7, up from 4) + +| Section header | Purpose | +|---|---| +| `## Company Overview` | What they do, business model, size/stage, market position | +| `## Leadership & Culture` | CEO background, leadership team, philosophy | +| `## Tech Stack & Product` | What they build, relevant technology, product direction | +| `## Funding & Market Position` | Stage, investors, recent rounds, competitor landscape | +| `## Recent Developments` | News, launches, pivots, exec moves | +| `## Red Flags & Watch-outs` | Culture issues, layoffs, exec departures, financial stress | +| `## Talking Points for Alex` | 5 role-matched, resume-grounded, UpGuard-aware talking points ready to speak aloud | + +Talking points prompt instructs LLM to: cite the specific matched experience by name, reference matched skills, apply UpGuard NDA rule, frame each as a ready-to-speak sentence. + +--- + +### 5. 
DB Schema Changes + +Add columns to `company_research` table: + +```sql +ALTER TABLE company_research ADD COLUMN tech_brief TEXT; +ALTER TABLE company_research ADD COLUMN funding_brief TEXT; +ALTER TABLE company_research ADD COLUMN competitors_brief TEXT; +ALTER TABLE company_research ADD COLUMN red_flags TEXT; +``` + +Existing columns (`company_brief`, `ceo_brief`, `talking_points`, `raw_output`) unchanged. + +--- + +### 6. Settings UI — Skills & Keywords Tab + +New tab in `app/pages/2_Settings.py`: +- One expander or subheader per category (Skills, Domains, Keywords) +- Tag chips rendered with `st.pills` or columns of `st.badge`-style buttons with × +- Inline text input + Add button per category +- Each add/remove saves immediately to `config/resume_keywords.yaml` + +--- + +### 7. Interview Prep UI Changes + +`app/pages/6_Interview_Prep.py` — render new sections alongside existing ones: +- Tech Stack & Product (new panel) +- Funding & Market Position (new panel) +- Red Flags & Watch-outs (new panel, visually distinct — e.g. 
orange/amber) +- Talking Points promoted to top (most useful during a live call) + +--- + +## Files Affected + +| File | Change | +|---|---| +| `scripts/company_research.py` | Parallel search queries, resume matching, expanded prompt + sections | +| `scripts/db.py` | Add 4 new columns to `company_research`; update `save_research` / `get_research` | +| `config/resume_keywords.yaml` | New file | +| `config/resume_keywords.yaml.example` | New committed template | +| `app/pages/2_Settings.py` | New Skills & Keywords tab | +| `app/pages/6_Interview_Prep.py` | Render new sections | +| `tests/test_db.py` | Tests for new columns | +| `tests/test_company_research.py` | New test file for matching logic + section parsing | diff --git a/docs/plans/2026-02-22-research-workflow-impl.md b/docs/plans/2026-02-22-research-workflow-impl.md new file mode 100644 index 0000000..1d7c84f --- /dev/null +++ b/docs/plans/2026-02-22-research-workflow-impl.md @@ -0,0 +1,869 @@ +# Research Workflow Redesign — Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Expand company research to gather richer web data (funding, tech stack, competitors, culture/Glassdoor, news), match Alex's resume experience against the JD, and produce a 7-section brief with role-grounded talking points. + +**Architecture:** Parallel SearXNG JSON queries (6 types) feed a structured context block alongside tiered resume experience (top-2 scored full, rest condensed) from `config/resume_keywords.yaml`. Single LLM call produces 7 output sections stored in expanded DB columns. + +**Tech Stack:** Python threading, requests (SearXNG JSON API at `http://localhost:8888/search?format=json`), PyYAML, SQLite ALTER TABLE migrations, Streamlit `st.pills` / column chips. 
+ +**Design doc:** `docs/plans/2026-02-22-research-workflow-design.md` + +**Run tests:** `/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v` +**Python:** `conda run -n job-seeker python + """ + + mock_resp = MagicMock() + mock_resp.text = json_ld_html + mock_resp.raise_for_status = MagicMock() + + with patch("scripts.scrape_url.requests.get", return_value=mock_resp): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + assert result.get("title") == "TAM Role" + assert result.get("company") == "TechCo" + + +def test_scrape_url_graceful_on_http_error(tmp_path): + db, job_id = _make_db(tmp_path) + import requests as req + + with patch("scripts.scrape_url.requests.get", side_effect=req.RequestException("timeout")): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + # Should return empty dict and not raise; job row still exists + assert isinstance(result, dict) + import sqlite3 + conn = sqlite3.connect(db) + row = conn.execute("SELECT id FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row is not None +``` + +**Step 2: Run tests to verify they fail** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_scrape_url.py -v +``` +Expected: FAIL — `ModuleNotFoundError: No module named 'scripts.scrape_url'` + +**Step 3: Implement `scripts/scrape_url.py`** + +```python +# scripts/scrape_url.py +""" +Scrape a job listing from its URL and update the job record. 
+ +Supports: + - LinkedIn (guest jobs API — no auth required) + - Indeed (HTML parse) + - Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py) + - Generic (JSON-LD → og:tags fallback) + +Usage (background task — called by task_runner): + from scripts.scrape_url import scrape_job_url + scrape_job_url(db_path, job_id) +""" +import json +import re +import sqlite3 +import sys +from pathlib import Path +from typing import Optional + +import requests +from bs4 import BeautifulSoup + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.db import DEFAULT_DB, update_job_fields + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) +} +_TIMEOUT = 12 + + +def _detect_board(url: str) -> str: + """Return 'linkedin', 'indeed', 'glassdoor', or 'generic'.""" + url_lower = url.lower() + if "linkedin.com" in url_lower: + return "linkedin" + if "indeed.com" in url_lower: + return "indeed" + if "glassdoor.com" in url_lower: + return "glassdoor" + return "generic" + + +def _extract_linkedin_job_id(url: str) -> Optional[str]: + """Extract numeric job ID from a LinkedIn job URL.""" + m = re.search(r"/jobs/view/(\d+)", url) + return m.group(1) if m else None + + +def canonicalize_url(url: str) -> str: + """ + Strip tracking parameters from a job URL and return a clean canonical form. + + LinkedIn: https://www.linkedin.com/jobs/view/{job_id}/?trk=... 
→ https://www.linkedin.com/jobs/view/{job_id}/ + Indeed: strips utm_* and other tracking params + Others: strips utm_source/utm_medium/utm_campaign/trk/refId/trackingId + """ + url = url.strip() + if "linkedin.com" in url.lower(): + job_id = _extract_linkedin_job_id(url) + if job_id: + return f"https://www.linkedin.com/jobs/view/{job_id}/" + # For other boards: strip common tracking params + from urllib.parse import urlparse, urlencode, parse_qsl + _STRIP_PARAMS = { + "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term", + "trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig", + "eid", "otpToken", "ssid", "fmid", + } + parsed = urlparse(url) + clean_qs = urlencode([(k, v) for k, v in parse_qsl(parsed.query) if k not in _STRIP_PARAMS]) + return parsed._replace(query=clean_qs).geturl() + + +def _scrape_linkedin(url: str) -> dict: + """Fetch via LinkedIn guest jobs API (no auth required).""" + job_id = _extract_linkedin_job_id(url) + if not job_id: + return {} + api_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}" + resp = requests.get(api_url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + def _text(selector, **kwargs): + tag = soup.find(selector, **kwargs) + return tag.get_text(strip=True) if tag else "" + + title = _text("h2", class_="top-card-layout__title") + company = _text("a", class_="topcard__org-name-link") or _text("span", class_="topcard__org-name-link") + location = _text("span", class_="topcard__flavor--bullet") + desc_div = soup.find("div", class_="show-more-less-html__markup") + description = desc_div.get_text(separator="\n", strip=True) if desc_div else "" + + return {k: v for k, v in { + "title": title, + "company": company, + "location": location, + "description": description, + "source": "linkedin", + }.items() if v} + + +def _scrape_indeed(url: str) -> dict: + """Scrape an Indeed job page.""" + resp = requests.get(url, 
headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + return _parse_json_ld_or_og(resp.text) or {} + + +def _scrape_glassdoor(url: str) -> dict: + """Re-use JobSpy's Glassdoor scraper for description fetch.""" + m = re.search(r"jl=(\d+)", url) + if not m: + return {} + try: + from jobspy.glassdoor import Glassdoor + from jobspy.glassdoor.constant import fallback_token, headers + from jobspy.model import ScraperInput, Site + from jobspy.util import create_session + + scraper = Glassdoor() + scraper.base_url = "https://www.glassdoor.com/" + scraper.session = create_session(has_retry=True) + token = scraper._get_csrf_token() + headers["gd-csrf-token"] = token if token else fallback_token + scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR]) + description = scraper._fetch_job_description(int(m.group(1))) + return {"description": description} if description else {} + except Exception: + return {} + + +def _parse_json_ld_or_og(html: str) -> dict: + """Extract job fields from JSON-LD structured data, then og: meta tags.""" + soup = BeautifulSoup(html, "html.parser") + + # Try JSON-LD first + for script in soup.find_all("script", type="application/ld+json"): + try: + data = json.loads(script.string or "") + if isinstance(data, list): + data = next((d for d in data if d.get("@type") == "JobPosting"), {}) + if data.get("@type") == "JobPosting": + org = data.get("hiringOrganization") or {} + loc = (data.get("jobLocation") or {}) + if isinstance(loc, list): + loc = loc[0] if loc else {} + addr = loc.get("address") or {} + location = ( + addr.get("addressLocality", "") or + addr.get("addressRegion", "") or + addr.get("addressCountry", "") + ) + return {k: v for k, v in { + "title": data.get("title", ""), + "company": org.get("name", ""), + "location": location, + "description": data.get("description", ""), + "salary": str(data.get("baseSalary", "")) if data.get("baseSalary") else "", + }.items() if v} + except Exception: + continue + + # Fall back to og: 
meta tags + def _meta(prop): + tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop}) + return (tag or {}).get("content", "") if tag else "" + + title = _meta("og:title") or (soup.find("title") or {}).get_text(strip=True) + description = _meta("og:description") + return {k: v for k, v in {"title": title, "description": description}.items() if v} + + +def _scrape_generic(url: str) -> dict: + resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + return _parse_json_ld_or_og(resp.text) or {} + + +def scrape_job_url(db_path: Path = DEFAULT_DB, job_id: int = None) -> dict: + """ + Fetch the job listing at the stored URL and update the job record. + + Returns the dict of fields that were scraped (may be empty on failure). + Does not raise — failures are logged and the job row is left as-is. + """ + if not job_id: + return {} + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + if not row: + return {} + + url = row["url"] or "" + if not url.startswith("http"): + return {} + + board = _detect_board(url) + try: + if board == "linkedin": + fields = _scrape_linkedin(url) + elif board == "indeed": + fields = _scrape_indeed(url) + elif board == "glassdoor": + fields = _scrape_glassdoor(url) + else: + fields = _scrape_generic(url) + except requests.RequestException as exc: + print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}") + return {} + except Exception as exc: + print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}") + return {} + + if fields: + # Never overwrite the URL or source with empty values + fields.pop("url", None) + update_job_fields(db_path, job_id, fields) + print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}") + + return fields +``` + +**Step 4: Add `scrape_url` task type to `scripts/task_runner.py`** + +In `_run_task`, 
add a new `elif` branch after `enrich_descriptions` and before the final `else`: + +```python + elif task_type == "scrape_url": + from scripts.scrape_url import scrape_job_url + fields = scrape_job_url(db_path, job_id) + title = fields.get("title") or job.get("url", "?") + company = fields.get("company", "") + msg = f"{title}" + (f" @ {company}" if company else "") + update_task_status(db_path, task_id, "completed", error=msg) + return +``` + +**Step 5: Run all tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_scrape_url.py -v +``` +Expected: all PASS + +**Step 6: Commit** + +```bash +git add scripts/scrape_url.py scripts/task_runner.py tests/test_scrape_url.py +git commit -m "feat: add scrape_url background task for URL-based job import" +``` + +--- + +## Task 3: LinkedIn Job Alert email parser + +**Files:** +- Modify: `scripts/imap_sync.py` +- Test: `tests/test_imap_sync.py` + +**Step 1: Write the failing tests** + +Add to `tests/test_imap_sync.py`: + +```python +def test_parse_linkedin_alert_extracts_jobs(): + from scripts.imap_sync import parse_linkedin_alert + body = """\ +Your job alert for customer success manager in United States +New jobs match your preferences. +Manage alerts: https://www.linkedin.com/comm/jobs/alerts?... 
+ +Customer Success Manager +Reflow +California, United States +View job: https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc%3D%3D&refId=xyz + +--------------------------------------------------------- + +Customer Engagement Manager +Bitwarden +United States + +2 school alumni +Apply with resume & profile +View job: https://www.linkedin.com/comm/jobs/view/4359824983/?trackingId=def%3D%3D + +--------------------------------------------------------- + +""" + jobs = parse_linkedin_alert(body) + assert len(jobs) == 2 + assert jobs[0]["title"] == "Customer Success Manager" + assert jobs[0]["company"] == "Reflow" + assert jobs[0]["location"] == "California, United States" + assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/4376518925/" + assert jobs[1]["title"] == "Customer Engagement Manager" + assert jobs[1]["company"] == "Bitwarden" + assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/4359824983/" + + +def test_parse_linkedin_alert_skips_blocks_without_view_job(): + from scripts.imap_sync import parse_linkedin_alert + body = """\ +Customer Success Manager +Some Company +United States + +--------------------------------------------------------- + +Valid Job Title +Valid Company +Remote +View job: https://www.linkedin.com/comm/jobs/view/1111111/?x=y + +--------------------------------------------------------- +""" + jobs = parse_linkedin_alert(body) + assert len(jobs) == 1 + assert jobs[0]["title"] == "Valid Job Title" + + +def test_parse_linkedin_alert_empty_body(): + from scripts.imap_sync import parse_linkedin_alert + assert parse_linkedin_alert("") == [] + assert parse_linkedin_alert("No jobs here.") == [] +``` + +**Step 2: Run tests to verify they fail** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py::test_parse_linkedin_alert_extracts_jobs tests/test_imap_sync.py::test_parse_linkedin_alert_skips_blocks_without_view_job tests/test_imap_sync.py::test_parse_linkedin_alert_empty_body -v +``` 
+Expected: FAIL — `ImportError: cannot import name 'parse_linkedin_alert'` + +**Step 3: Implement `parse_linkedin_alert` in `scripts/imap_sync.py`** + +Add after the existing `_has_todo_keyword` function (around line 391): + +```python +_LINKEDIN_ALERT_SENDER = "jobalerts-noreply@linkedin.com" + +# Social-proof / nav lines to skip when parsing alert blocks +_ALERT_SKIP_PHRASES = { + "alumni", "apply with", "actively hiring", "manage alerts", + "view all jobs", "your job alert", "new jobs match", + "unsubscribe", "linkedin corporation", +} + + +def parse_linkedin_alert(body: str) -> list[dict]: + """ + Parse the plain-text body of a LinkedIn Job Alert digest email. + + Returns a list of dicts: {title, company, location, url}. + URL is canonicalized to https://www.linkedin.com/jobs/view/<job_id>/ + (tracking parameters stripped). + """ + jobs = [] + # Split on separator lines (10+ dashes) + blocks = re.split(r"\n\s*-{10,}\s*\n", body) + for block in blocks: + lines = [ln.strip() for ln in block.strip().splitlines() if ln.strip()] + + # Find "View job:" URL + url = None + for line in lines: + m = re.search(r"View job:\s*(https?://\S+)", line, re.IGNORECASE) + if m: + raw_url = m.group(1) + job_id_m = re.search(r"/jobs/view/(\d+)", raw_url) + if job_id_m: + url = f"https://www.linkedin.com/jobs/view/{job_id_m.group(1)}/" + break + if not url: + continue + + # Filter noise lines + content = [ + ln for ln in lines + if not any(p in ln.lower() for p in _ALERT_SKIP_PHRASES) + and not ln.lower().startswith("view job:") + and not ln.startswith("http") + ] + if len(content) < 2: + continue + + jobs.append({ + "title": content[0], + "company": content[1], + "location": content[2] if len(content) > 2 else "", + "url": url, + }) + return jobs +``` + +**Step 4: Wire the parser into `_scan_unmatched_leads`** + +In `_scan_unmatched_leads`, inside the `for uid in all_uids:` loop, add a detection block immediately after the `if mid in known_message_ids: continue` check (before the existing
`_has_recruitment_keyword` check): + +```python + # ── LinkedIn Job Alert digest — parse each card individually ────── + if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower(): + cards = parse_linkedin_alert(parsed["body"]) + for card in cards: + if card["url"] in existing_urls: + continue + job_id = insert_job(db_path, { + "title": card["title"], + "company": card["company"], + "url": card["url"], + "source": "linkedin", + "location": card["location"], + "is_remote": 0, + "salary": "", + "description": "", + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + from scripts.task_runner import submit_task + submit_task(db_path, "scrape_url", job_id) + existing_urls.add(card["url"]) + new_leads += 1 + print(f"[imap] LinkedIn alert → {card['company']} — {card['title']}") + known_message_ids.add(mid) + continue # skip normal LLM extraction path +``` + +**Step 5: Run all imap_sync tests** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py -v +``` +Expected: all PASS (including the 3 new tests) + +**Step 6: Commit** + +```bash +git add scripts/imap_sync.py tests/test_imap_sync.py +git commit -m "feat: auto-parse LinkedIn Job Alert digest emails into pending jobs" +``` + +--- + +## Task 4: Home page — Add Job(s) by URL + +**Files:** +- Modify: `app/Home.py` + +No unit tests — this is pure Streamlit UI. Verify manually by pasting a URL and checking the DB. 
+ +**Step 1: Add `_queue_url_imports` helper and the new section to `app/Home.py`** + +Add to the imports at the top (after the existing `from scripts.db import ...` line): + +```python +from scripts.db import DEFAULT_DB, init_db, get_job_counts, purge_jobs, purge_email_data, \ + kill_stuck_tasks, get_task_for_job, get_active_tasks, insert_job, get_existing_urls +``` + +Add this helper function before the Streamlit layout code (after the `init_db` call at the top): + +```python +def _queue_url_imports(db_path: Path, urls: list[str]) -> int: + """Insert each URL as a pending manual job and queue a scrape_url task. + Returns count of newly queued jobs.""" + from datetime import datetime + from scripts.scrape_url import canonicalize_url + existing = get_existing_urls(db_path) + queued = 0 + for url in urls: + url = canonicalize_url(url.strip()) + if not url.startswith("http"): + continue + if url in existing: + continue + job_id = insert_job(db_path, { + "title": "Importing…", + "company": "", + "url": url, + "source": "manual", + "location": "", + "description": "", + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + submit_task(db_path, "scrape_url", job_id) + queued += 1 + return queued +``` + +Add a new section between the Email Sync divider and the Danger Zone expander. Replace: + +```python +st.divider() + +# ── Danger zone: purge + re-scrape ──────────────────────────────────────────── +``` + +with: + +```python +st.divider() + +# ── Add Jobs by URL ─────────────────────────────────────────────────────────── +add_left, add_right = st.columns([3, 1]) +with add_left: + st.subheader("Add Jobs by URL") + st.caption("Paste job listing URLs to import and scrape in the background. 
" + "Supports LinkedIn, Indeed, Glassdoor, and most job boards.") + +url_tab, csv_tab = st.tabs(["Paste URLs", "Upload CSV"]) + +with url_tab: + url_text = st.text_area( + "urls", + placeholder="https://www.linkedin.com/jobs/view/1234567/\nhttps://www.indeed.com/viewjob?jk=abc", + height=100, + label_visibility="collapsed", + ) + if st.button("📥 Add Jobs", key="add_urls_btn", use_container_width=True, + disabled=not (url_text or "").strip()): + _urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")] + if _urls: + _n = _queue_url_imports(DEFAULT_DB, _urls) + if _n: + st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. Check Job Review shortly.") + else: + st.info("All URLs already in the database.") + st.rerun() + +with csv_tab: + csv_file = st.file_uploader("CSV with a URL column", type=["csv"], + label_visibility="collapsed") + if csv_file: + import csv as _csv + import io as _io + reader = _csv.DictReader(_io.StringIO(csv_file.read().decode("utf-8", errors="replace"))) + _csv_urls = [] + for row in reader: + for val in row.values(): + if val and val.strip().startswith("http"): + _csv_urls.append(val.strip()) + break + if _csv_urls: + st.caption(f"Found {len(_csv_urls)} URL(s) in CSV.") + if st.button("📥 Import CSV Jobs", key="add_csv_btn", use_container_width=True): + _n = _queue_url_imports(DEFAULT_DB, _csv_urls) + st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import.") + st.rerun() + else: + st.warning("No URLs found — CSV must have a column whose values start with http.") + +# Active scrape_url tasks status +@st.fragment(run_every=3) +def _scrape_status(): + import sqlite3 as _sq + conn = _sq.connect(DEFAULT_DB) + conn.row_factory = _sq.Row + rows = conn.execute( + """SELECT bt.status, bt.error, j.title, j.company, j.url + FROM background_tasks bt + JOIN jobs j ON j.id = bt.job_id + WHERE bt.task_type = 'scrape_url' + AND bt.updated_at >= datetime('now', '-5 minutes') + ORDER BY bt.updated_at DESC 
LIMIT 20""" + ).fetchall() + conn.close() + if not rows: + return + st.caption("Recent URL imports:") + for r in rows: + if r["status"] == "running": + st.info(f"⏳ Scraping {r['url']}") + elif r["status"] == "completed": + label = f"{r['title']}" + (f" @ {r['company']}" if r['company'] else "") + st.success(f"✅ {label}") + elif r["status"] == "failed": + st.error(f"❌ {r['url']} — {r['error'] or 'scrape failed'}") + +_scrape_status() + +st.divider() + +# ── Danger zone: purge + re-scrape ──────────────────────────────────────────── +``` + +**Step 2: Check `background_tasks` schema has an `updated_at` column** + +The status fragment queries `bt.updated_at`. Verify it exists: + +```bash +conda run -n job-seeker python -c " +import sqlite3 +from scripts.db import DEFAULT_DB, init_db +init_db(DEFAULT_DB) +conn = sqlite3.connect(DEFAULT_DB) +print(conn.execute('PRAGMA table_info(background_tasks)').fetchall()) +" +``` + +If `updated_at` is missing, add a migration in `scripts/db.py`'s `_migrate_db` function: + +```python + try: + conn.execute("ALTER TABLE background_tasks ADD COLUMN updated_at TEXT DEFAULT (datetime('now'))") + except sqlite3.OperationalError: + pass +``` + +And update `update_task_status` in `db.py` to set `updated_at = datetime('now')` on every status change: + +```python +def update_task_status(db_path, task_id, status, error=None): + conn = sqlite3.connect(db_path) + conn.execute( + "UPDATE background_tasks SET status=?, error=?, updated_at=datetime('now') WHERE id=?", + (status, error, task_id), + ) + conn.commit() + conn.close() +``` + +**Step 3: Restart the UI and manually verify** + +```bash +bash /devl/job-seeker/scripts/manage-ui.sh restart +``` + +Test: +1. Paste `https://www.linkedin.com/jobs/view/4376518925/` into the text area +2. Click "📥 Add Jobs" — should show "Queued 1 job for import" +3. 
Go to Job Review → should see a pending job (Reflow - Customer Success Manager once scraped) + +**Step 4: Commit** + +```bash +git add app/Home.py +git commit -m "feat: add 'Add Jobs by URL' section to Home page with background scraping" +``` + +--- + +## Final: push to remote + +```bash +git push origin main +``` diff --git a/docs/plans/2026-02-24-job-seeker-app-generalize.md b/docs/plans/2026-02-24-job-seeker-app-generalize.md new file mode 100644 index 0000000..ee50c44 --- /dev/null +++ b/docs/plans/2026-02-24-job-seeker-app-generalize.md @@ -0,0 +1,1559 @@ +# Job Seeker App — Generalization Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Fork the personal job-seeker app into a fully generalized, Docker-Compose-based version at `/Library/Development/devl/job-seeker-app/` that any job seeker can run. + +**Architecture:** A `UserProfile` class backed by `config/user.yaml` replaces all hard-coded personal references across the codebase. A Docker Compose stack with four named profiles (`remote`, `cpu`, `single-gpu`, `dual-gpu`) controls which services start. A first-run wizard gates the app on first launch and writes `user.yaml` on completion. + +**Tech Stack:** Python 3.11, Streamlit, SQLite, Docker Compose v2, NVIDIA Container Toolkit (optional), PyYAML, Requests + +**Reference:** Design doc at `docs/plans/2026-02-24-generalize-design.md` in the personal repo. 
+ +--- + +## Task 1: Bootstrap — New Repo From Personal Source + +**Files:** +- Create: `/Library/Development/devl/job-seeker-app/` (new directory) + +**Step 1: Copy source, strip personal config** + +```bash +mkdir -p /Library/Development/devl/job-seeker-app +rsync -av --exclude='.git' \ + --exclude='staging.db' \ + --exclude='config/email.yaml' \ + --exclude='config/notion.yaml' \ + --exclude='config/tokens.yaml' \ + --exclude='aihawk/' \ + --exclude='__pycache__/' \ + --exclude='*.pyc' \ + --exclude='.streamlit.pid' \ + --exclude='.streamlit.log' \ + /devl/job-seeker/ \ + /Library/Development/devl/job-seeker-app/ +``` + +**Step 2: Init fresh git repo** + +```bash +cd /Library/Development/devl/job-seeker-app +git init +git add . +git commit -m "chore: seed from personal job-seeker (pre-generalization)" +``` + +**Step 3: Verify structure** + +```bash +ls /Library/Development/devl/job-seeker-app/ +# Expected: app/ config/ scripts/ tests/ docs/ environment.yml etc. +# NOT expected: staging.db, config/notion.yaml, config/email.yaml +``` + +--- + +## Task 2: UserProfile Class + +**Files:** +- Create: `scripts/user_profile.py` +- Create: `config/user.yaml.example` +- Create: `tests/test_user_profile.py` + +**Step 1: Write failing tests** + +```python +# tests/test_user_profile.py +import pytest +from pathlib import Path +import tempfile, yaml +from scripts.user_profile import UserProfile + +@pytest.fixture +def profile_yaml(tmp_path): + data = { + "name": "Jane Smith", + "email": "jane@example.com", + "phone": "555-1234", + "linkedin": "linkedin.com/in/janesmith", + "career_summary": "Experienced CSM with 8 years in SaaS.", + "nda_companies": ["AcmeCorp"], + "docs_dir": "~/Documents/JobSearch", + "ollama_models_dir": "~/models/ollama", + "vllm_models_dir": "~/models/vllm", + "inference_profile": "single-gpu", + "services": { + "streamlit_port": 8501, + "ollama_host": "localhost", + "ollama_port": 11434, + "ollama_ssl": False, + "ollama_ssl_verify": True, + "vllm_host": 
"localhost", + "vllm_port": 8000, + "vllm_ssl": False, + "vllm_ssl_verify": True, + "searxng_host": "localhost", + "searxng_port": 8888, + "searxng_ssl": False, + "searxng_ssl_verify": True, + } + } + p = tmp_path / "user.yaml" + p.write_text(yaml.dump(data)) + return p + +def test_loads_fields(profile_yaml): + p = UserProfile(profile_yaml) + assert p.name == "Jane Smith" + assert p.email == "jane@example.com" + assert p.nda_companies == ["AcmeCorp"] + assert p.inference_profile == "single-gpu" + +def test_service_url_http(profile_yaml): + p = UserProfile(profile_yaml) + assert p.ollama_url == "http://localhost:11434" + assert p.vllm_url == "http://localhost:8000" + assert p.searxng_url == "http://localhost:8888" + +def test_service_url_https(tmp_path): + data = yaml.safe_load(open(profile_yaml)) if False else { + "name": "X", "services": { + "ollama_host": "myserver.com", "ollama_port": 443, + "ollama_ssl": True, "ollama_ssl_verify": True, + "vllm_host": "localhost", "vllm_port": 8000, + "vllm_ssl": False, "vllm_ssl_verify": True, + "searxng_host": "localhost", "searxng_port": 8888, + "searxng_ssl": False, "searxng_ssl_verify": True, + } + } + p2 = tmp_path / "user2.yaml" + p2.write_text(yaml.dump(data)) + prof = UserProfile(p2) + assert prof.ollama_url == "https://myserver.com:443" + +def test_nda_mask(profile_yaml): + p = UserProfile(profile_yaml) + assert p.is_nda("AcmeCorp") + assert p.is_nda("acmecorp") # case-insensitive + assert not p.is_nda("Google") + +def test_missing_file_raises(): + with pytest.raises(FileNotFoundError): + UserProfile(Path("/nonexistent/user.yaml")) + +def test_exists_check(profile_yaml, tmp_path): + assert UserProfile.exists(profile_yaml) + assert not UserProfile.exists(tmp_path / "missing.yaml") + +def test_docs_dir_expanded(profile_yaml): + p = UserProfile(profile_yaml) + assert not str(p.docs_dir).startswith("~") + assert p.docs_dir.is_absolute() +``` + +**Step 2: Run tests to verify they fail** + +```bash +cd 
/Library/Development/devl/job-seeker-app +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_user_profile.py -v +# Expected: ImportError — scripts/user_profile.py does not exist yet +``` + +**Step 3: Implement UserProfile** + +```python +# scripts/user_profile.py +""" +UserProfile — wraps config/user.yaml and provides typed accessors. + +All hard-coded personal references in the app should import this instead +of reading strings directly. URL construction for services is centralised +here so port/host/SSL changes propagate everywhere automatically. +""" +from __future__ import annotations +from pathlib import Path +import yaml + +_DEFAULTS = { + "name": "", + "email": "", + "phone": "", + "linkedin": "", + "career_summary": "", + "nda_companies": [], + "docs_dir": "~/Documents/JobSearch", + "ollama_models_dir": "~/models/ollama", + "vllm_models_dir": "~/models/vllm", + "inference_profile": "remote", + "services": { + "streamlit_port": 8501, + "ollama_host": "localhost", + "ollama_port": 11434, + "ollama_ssl": False, + "ollama_ssl_verify": True, + "vllm_host": "localhost", + "vllm_port": 8000, + "vllm_ssl": False, + "vllm_ssl_verify": True, + "searxng_host": "localhost", + "searxng_port": 8888, + "searxng_ssl": False, + "searxng_ssl_verify": True, + }, +} + + +class UserProfile: + def __init__(self, path: Path): + if not path.exists(): + raise FileNotFoundError(f"user.yaml not found at {path}") + raw = yaml.safe_load(path.read_text()) or {} + data = {**_DEFAULTS, **raw} + svc_defaults = dict(_DEFAULTS["services"]) + svc_defaults.update(raw.get("services", {})) + data["services"] = svc_defaults + + self.name: str = data["name"] + self.email: str = data["email"] + self.phone: str = data["phone"] + self.linkedin: str = data["linkedin"] + self.career_summary: str = data["career_summary"] + self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]] + self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve() + self.ollama_models_dir: Path = 
Path(data["ollama_models_dir"]).expanduser().resolve() + self.vllm_models_dir: Path = Path(data["vllm_models_dir"]).expanduser().resolve() + self.inference_profile: str = data["inference_profile"] + self._svc = data["services"] + + # ── Service URLs ────────────────────────────────────────────────────────── + def _url(self, host: str, port: int, ssl: bool) -> str: + scheme = "https" if ssl else "http" + return f"{scheme}://{host}:{port}" + + @property + def ollama_url(self) -> str: + s = self._svc + return self._url(s["ollama_host"], s["ollama_port"], s["ollama_ssl"]) + + @property + def vllm_url(self) -> str: + s = self._svc + return self._url(s["vllm_host"], s["vllm_port"], s["vllm_ssl"]) + + @property + def searxng_url(self) -> str: + s = self._svc + return self._url(s["searxng_host"], s["searxng_port"], s["searxng_ssl"]) + + def ssl_verify(self, service: str) -> bool: + """Return ssl_verify flag for a named service (ollama/vllm/searxng).""" + return bool(self._svc.get(f"{service}_ssl_verify", True)) + + # ── NDA helpers ─────────────────────────────────────────────────────────── + def is_nda(self, company: str) -> bool: + return company.lower() in self.nda_companies + + def nda_label(self, company: str, score: int = 0, threshold: int = 3) -> str: + """Return masked label if company is NDA and score below threshold.""" + if self.is_nda(company) and score < threshold: + return "previous employer (NDA)" + return company + + # ── Existence check (used by app.py before load) ───────────────────────── + @staticmethod + def exists(path: Path) -> bool: + return path.exists() + + # ── llm.yaml URL generation ─────────────────────────────────────────────── + def generate_llm_urls(self) -> dict[str, str]: + """Return base_url values for each backend, derived from services config.""" + return { + "ollama": f"{self.ollama_url}/v1", + "ollama_research": f"{self.ollama_url}/v1", + "vllm": f"{self.vllm_url}/v1", + } +``` + +**Step 4: Run tests to verify they pass** + +```bash 
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_user_profile.py -v +# Expected: all PASS +``` + +**Step 5: Create config/user.yaml.example** + +```yaml +# config/user.yaml.example +# Copy to config/user.yaml and fill in your details. +# The first-run wizard will create this file automatically. + +name: "Your Name" +email: "you@example.com" +phone: "555-000-0000" +linkedin: "linkedin.com/in/yourprofile" +career_summary: > + Experienced professional with X years in [your field]. + Specialise in [key skills]. Known for [strength]. + +nda_companies: [] # e.g. ["FormerEmployer"] — masked in research briefs + +docs_dir: "~/Documents/JobSearch" +ollama_models_dir: "~/models/ollama" +vllm_models_dir: "~/models/vllm" + +inference_profile: "remote" # remote | cpu | single-gpu | dual-gpu + +services: + streamlit_port: 8501 + ollama_host: localhost + ollama_port: 11434 + ollama_ssl: false + ollama_ssl_verify: true + vllm_host: localhost + vllm_port: 8000 + vllm_ssl: false + vllm_ssl_verify: true + searxng_host: localhost + searxng_port: 8888 + searxng_ssl: false + searxng_ssl_verify: true +``` + +**Step 6: Commit** + +```bash +git add scripts/user_profile.py config/user.yaml.example tests/test_user_profile.py +git commit -m "feat: add UserProfile class with service URL generation and NDA helpers" +``` + +--- + +## Task 3: Extract Hard-Coded References — Scripts + +**Files:** +- Modify: `scripts/company_research.py` +- Modify: `scripts/generate_cover_letter.py` +- Modify: `scripts/match.py` +- Modify: `scripts/finetune_local.py` +- Modify: `scripts/prepare_training_data.py` + +**Step 1: Add UserProfile loading helper to company_research.py** + +In `scripts/company_research.py`, remove the hard-coded `_SCRAPER_DIR` path and +replace personal references. The scraper is now bundled in the Docker image, so inside the +container it is imported from `/app/scrapers/` (a repo-local `scrapers/` directory is the local-dev fallback).
+ +Replace: +```python +_SCRAPER_DIR = Path("/Library/Development/scrapers") +_SCRAPER_AVAILABLE = False + +if _SCRAPER_DIR.exists(): + sys.path.insert(0, str(_SCRAPER_DIR)) + try: + from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig + _SCRAPER_AVAILABLE = True + except (ImportError, SystemExit): + pass +``` + +With: +```python +# companyScraper is bundled into the Docker image at /app/scrapers/ +_SCRAPER_AVAILABLE = False +for _scraper_candidate in [ + Path("/app/scrapers"), # Docker container path + Path(__file__).parent.parent / "scrapers", # local dev fallback +]: + if _scraper_candidate.exists(): + sys.path.insert(0, str(_scraper_candidate)) + try: + from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig + _SCRAPER_AVAILABLE = True + except (ImportError, SystemExit): + pass + break +``` + +Replace `_searxng_running()` to use profile URL: +```python +def _searxng_running(searxng_url: str = "http://localhost:8888") -> bool: + try: + import requests + r = requests.get(f"{searxng_url}/", timeout=3) + return r.status_code == 200 + except Exception: + return False +``` + +Replace all `"Alex Rivera"` / `"Alex's"` / `_NDA_COMPANIES` references: +```python +# At top of research_company(): +from scripts.user_profile import UserProfile +from scripts.db import DEFAULT_DB +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + +# In _build_resume_context(), replace _company_label(): +def _company_label(exp: dict) -> str: + company = exp.get("company", "") + score = exp.get("score", 0) + if _profile: + return _profile.nda_label(company, score) + return company + +# Replace "## Alex's Matched Experience": +lines = [f"## {_profile.name if _profile else 'Candidate'}'s Matched Experience"] + +# In research_company() prompt, replace "Alex Rivera": +name = _profile.name if _profile else "the candidate" +summary = _profile.career_summary if 
_profile else "" +# Replace "You are preparing Alex Rivera for a job interview." with: +prompt = f"""You are preparing {name} for a job interview.\n{summary}\n...""" +``` + +**Step 2: Update generate_cover_letter.py** + +Replace: +```python +LETTERS_DIR = Path("/Library/Documents/JobSearch") +SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera...""" +``` + +With: +```python +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + +LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" +SYSTEM_CONTEXT = ( + f"You are writing cover letters for {_profile.name}. {_profile.career_summary}" + if _profile else + "You are a professional cover letter writer. Write in first person." +) +``` + +**Step 3: Update match.py** + +Replace hard-coded resume path with a config lookup: +```python +# match.py — read RESUME_PATH from config/user.yaml or fall back to auto-discovery +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + +def _find_resume(docs_dir: Path) -> Path | None: + """Find the most recently modified PDF in docs_dir matching *resume* or *cv*.""" + candidates = list(docs_dir.glob("*[Rr]esume*.pdf")) + list(docs_dir.glob("*[Cc][Vv]*.pdf")) + return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None + +RESUME_PATH = ( + _find_resume(_profile.docs_dir) if _profile else None +) or Path(__file__).parent.parent / "config" / "resume.pdf" +``` + +**Step 4: Update finetune_local.py and prepare_training_data.py** + +Replace all `/Library/` paths with profile-driven paths: +```python +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if 
UserProfile.exists(_USER_YAML) else None + +_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" +LETTERS_JSONL = _docs / "training_data" / "cover_letters.jsonl" +OUTPUT_DIR = _docs / "training_data" / "finetune_output" +GGUF_DIR = _docs / "training_data" / "gguf" +OLLAMA_NAME = f"{_profile.name.split()[0].lower()}-cover-writer" if _profile else "cover-writer" +SYSTEM_PROMPT = ( + f"You are {_profile.name}'s personal cover letter writer. " + f"{_profile.career_summary}" + if _profile else + "You are a professional cover letter writer. Write in first person." +) +``` + +**Step 5: Run existing tests to verify nothing broken** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v +# Expected: all existing tests PASS +``` + +**Step 6: Commit** + +```bash +git add scripts/ +git commit -m "feat: extract hard-coded personal references from all scripts via UserProfile" +``` + +--- + +## Task 4: Extract Hard-Coded References — App Pages + +**Files:** +- Modify: `app/Home.py` +- Modify: `app/pages/4_Apply.py` +- Modify: `app/pages/5_Interviews.py` +- Modify: `app/pages/6_Interview_Prep.py` +- Modify: `app/pages/2_Settings.py` + +**Step 1: Add profile loader utility to app pages** + +Add to the top of each modified page (after sys.path insert): +```python +from scripts.user_profile import UserProfile +from scripts.db import DEFAULT_DB + +_USER_YAML = Path(__file__).parent.parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None +_name = _profile.name if _profile else "Job Seeker" +``` + +**Step 2: Home.py** + +Replace: +```python +st.title("🔍 Alex's Job Search") +# and: +st.caption(f"Run TF-IDF match scoring against Alex's resume...") +``` +With: +```python +st.title(f"🔍 {_name}'s Job Search") +# and: +st.caption(f"Run TF-IDF match scoring against {_name}'s resume...") +``` + +**Step 3: 4_Apply.py — PDF contact block and DOCS_DIR** + +Replace: +```python +DOCS_DIR = 
Path("/Library/Documents/JobSearch") +# and the contact paragraph: +Paragraph("ALEX RIVERA", name_style) +Paragraph("alex@example.com · (555) 867-5309 · ...", contact_style) +Paragraph("Warm regards,<br/><br/>Alex Rivera", body_style) +``` +With: +```python +DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" +# and: +display_name = (_profile.name.upper() if _profile else "YOUR NAME") +contact_line = " · ".join(filter(None, [ + _profile.email if _profile else "", + _profile.phone if _profile else "", + _profile.linkedin if _profile else "", +])) +Paragraph(display_name, name_style) +Paragraph(contact_line, contact_style) +Paragraph(f"Warm regards,<br/><br/>{_profile.name if _profile else 'Your Name'}", body_style) +``` + +**Step 4: 5_Interviews.py — email assistant prompt** + +Replace hard-coded persona strings with: +```python +_persona = ( + f"{_name} is a {_profile.career_summary[:120] if _profile and _profile.career_summary else 'professional'}" +) +# Replace all occurrences of "Alex Rivera is a Customer Success..." with _persona +``` + +**Step 5: 6_Interview_Prep.py — interviewer and Q&A prompts** + +Replace all occurrences of `"Alex"` in f-strings with `_name`. + +**Step 6: 2_Settings.py — Services tab** + +Remove `PFP_DIR` and the Claude Code Wrapper / Copilot Wrapper service entries entirely. + +Replace the vLLM service entry's `model_dir` with: +```python +"model_dir": str(_profile.vllm_models_dir) if _profile else str(Path.home() / "models" / "vllm"), +``` + +Replace the SearXNG entry to use Docker Compose instead of a host path: +```python +{ + "name": "SearXNG (company scraper)", + "port": _profile._svc["searxng_port"] if _profile else 8888, + "start": ["docker", "compose", "--profile", "searxng", "up", "-d", "searxng"], + "stop": ["docker", "compose", "stop", "searxng"], + "cwd": str(Path(__file__).parent.parent.parent), + "note": "Privacy-respecting meta-search for company research", +}, +``` + +Replace all caption strings containing "Alex's" with `f"{_name}'s"`.
+ +**Step 7: Commit** + +```bash +git add app/ +git commit -m "feat: extract hard-coded personal references from all app pages via UserProfile" +``` + +--- + +## Task 5: llm.yaml URL Auto-Generation + +**Files:** +- Modify: `scripts/user_profile.py` (already has `generate_llm_urls()`) +- Modify: `app/pages/2_Settings.py` (My Profile save button) +- Create: `scripts/generate_llm_config.py` + +**Step 1: Write failing test** + +```python +# tests/test_llm_config_generation.py +from pathlib import Path +import tempfile, yaml +from scripts.user_profile import UserProfile +from scripts.generate_llm_config import apply_service_urls + +def test_urls_applied_to_llm_yaml(tmp_path): + user_yaml = tmp_path / "user.yaml" + user_yaml.write_text(yaml.dump({ + "name": "Test", + "services": { + "ollama_host": "myserver", "ollama_port": 11434, "ollama_ssl": False, + "ollama_ssl_verify": True, + "vllm_host": "localhost", "vllm_port": 8000, "vllm_ssl": False, + "vllm_ssl_verify": True, + "searxng_host": "localhost", "searxng_port": 8888, + "searxng_ssl": False, "searxng_ssl_verify": True, + } + })) + llm_yaml = tmp_path / "llm.yaml" + llm_yaml.write_text(yaml.dump({"backends": { + "ollama": {"base_url": "http://old:11434/v1", "type": "openai_compat"}, + "vllm": {"base_url": "http://old:8000/v1", "type": "openai_compat"}, + }})) + + profile = UserProfile(user_yaml) + apply_service_urls(profile, llm_yaml) + + result = yaml.safe_load(llm_yaml.read_text()) + assert result["backends"]["ollama"]["base_url"] == "http://myserver:11434/v1" + assert result["backends"]["vllm"]["base_url"] == "http://localhost:8000/v1" +``` + +**Step 2: Run to verify it fails** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_llm_config_generation.py -v +# Expected: ImportError +``` + +**Step 3: Implement generate_llm_config.py** + +```python +# scripts/generate_llm_config.py +"""Update config/llm.yaml base_url values from the user profile's services block.""" +from pathlib import Path +import 
yaml +from scripts.user_profile import UserProfile + + +def apply_service_urls(profile: UserProfile, llm_yaml_path: Path) -> None: + """Rewrite base_url for ollama, ollama_research, and vllm backends.""" + if not llm_yaml_path.exists(): + return + cfg = yaml.safe_load(llm_yaml_path.read_text()) or {} + urls = profile.generate_llm_urls() + backends = cfg.get("backends", {}) + for backend_name, url in urls.items(): + if backend_name in backends: + backends[backend_name]["base_url"] = url + cfg["backends"] = backends + llm_yaml_path.write_text(yaml.dump(cfg, default_flow_style=False, allow_unicode=True)) +``` + +**Step 4: Run test to verify it passes** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_llm_config_generation.py -v +# Expected: PASS +``` + +**Step 5: Wire into Settings My Profile save** + +In `app/pages/2_Settings.py`, after the "Save My Profile" button writes `user.yaml`, add: +```python +from scripts.generate_llm_config import apply_service_urls +apply_service_urls(UserProfile(_USER_YAML), LLM_CFG) +st.success("Profile saved and service URLs updated.") +``` + +**Step 6: Commit** + +```bash +git add scripts/generate_llm_config.py tests/test_llm_config_generation.py app/pages/2_Settings.py +git commit -m "feat: auto-generate llm.yaml base_url values from user profile services config" +``` + +--- + +## Task 6: Settings — My Profile Tab + +**Files:** +- Modify: `app/pages/2_Settings.py` + +**Step 1: Add My Profile tab to the tab list** + +Replace the existing `st.tabs(...)` call to add the new tab first: +```python +tab_profile, tab_search, tab_llm, tab_notion, tab_services, tab_resume, tab_email, tab_skills = st.tabs( + ["👤 My Profile", "🔎 Search", "🤖 LLM Backends", "📚 Notion", + "🔌 Services", "📝 Resume Profile", "📧 Email", "🏷️ Skills"] +) +``` + +**Step 2: Implement the My Profile tab** + +```python +USER_CFG = CONFIG_DIR / "user.yaml" + +with tab_profile: + from scripts.user_profile import UserProfile, _DEFAULTS + import yaml as _yaml + 
+ st.caption("Your identity and service configuration. Saved values drive all LLM prompts, PDF headers, and service connections.") + + _u = _yaml.safe_load(USER_CFG.read_text()) or {} if USER_CFG.exists() else {} + _svc = {**_DEFAULTS["services"], **_u.get("services", {})} + + with st.expander("👤 Identity", expanded=True): + c1, c2 = st.columns(2) + u_name = c1.text_input("Full Name", _u.get("name", "")) + u_email = c1.text_input("Email", _u.get("email", "")) + u_phone = c2.text_input("Phone", _u.get("phone", "")) + u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", "")) + u_summary = st.text_area("Career Summary (used in LLM prompts)", + _u.get("career_summary", ""), height=100) + + with st.expander("🔒 Sensitive Employers (NDA)"): + st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.") + nda_list = list(_u.get("nda_companies", [])) + nda_cols = st.columns(max(len(nda_list), 1)) + _to_remove = None + for i, company in enumerate(nda_list): + if nda_cols[i % len(nda_cols)].button(f"× {company}", key=f"rm_nda_{company}"): + _to_remove = company + if _to_remove: + nda_list.remove(_to_remove) + nc, nb = st.columns([4, 1]) + new_nda = nc.text_input("Add employer", key="new_nda", label_visibility="collapsed", placeholder="Employer name…") + if nb.button("＋ Add", key="add_nda") and new_nda.strip(): + nda_list.append(new_nda.strip()) + + with st.expander("📁 File Paths"): + u_docs = st.text_input("Documents directory", _u.get("docs_dir", "~/Documents/JobSearch")) + u_ollama = st.text_input("Ollama models directory", _u.get("ollama_models_dir", "~/models/ollama")) + u_vllm = st.text_input("vLLM models directory", _u.get("vllm_models_dir", "~/models/vllm")) + + with st.expander("⚙️ Inference Profile"): + profiles = ["remote", "cpu", "single-gpu", "dual-gpu"] + u_profile = st.selectbox("Active profile", profiles, + index=profiles.index(_u.get("inference_profile", "remote"))) + + with st.expander("🔌 Service Ports & Hosts"): + 
st.caption("Advanced — change only if services run on non-default ports or remote hosts.") + sc1, sc2, sc3 = st.columns(3) + with sc1: + st.markdown("**Ollama**") + svc_ollama_host = st.text_input("Host##ollama", _svc["ollama_host"], key="svc_ollama_host") + svc_ollama_port = st.number_input("Port##ollama", value=_svc["ollama_port"], key="svc_ollama_port") + svc_ollama_ssl = st.checkbox("SSL##ollama", _svc["ollama_ssl"], key="svc_ollama_ssl") + svc_ollama_verify = st.checkbox("Verify cert##ollama", _svc["ollama_ssl_verify"], key="svc_ollama_verify") + with sc2: + st.markdown("**vLLM**") + svc_vllm_host = st.text_input("Host##vllm", _svc["vllm_host"], key="svc_vllm_host") + svc_vllm_port = st.number_input("Port##vllm", value=_svc["vllm_port"], key="svc_vllm_port") + svc_vllm_ssl = st.checkbox("SSL##vllm", _svc["vllm_ssl"], key="svc_vllm_ssl") + svc_vllm_verify = st.checkbox("Verify cert##vllm", _svc["vllm_ssl_verify"], key="svc_vllm_verify") + with sc3: + st.markdown("**SearXNG**") + svc_sxng_host = st.text_input("Host##sxng", _svc["searxng_host"], key="svc_sxng_host") + svc_sxng_port = st.number_input("Port##sxng", value=_svc["searxng_port"], key="svc_sxng_port") + svc_sxng_ssl = st.checkbox("SSL##sxng", _svc["searxng_ssl"], key="svc_sxng_ssl") + svc_sxng_verify = st.checkbox("Verify cert##sxng", _svc["searxng_ssl_verify"], key="svc_sxng_verify") + + if st.button("💾 Save Profile", type="primary", key="save_user_profile"): + new_data = { + "name": u_name, "email": u_email, "phone": u_phone, + "linkedin": u_linkedin, "career_summary": u_summary, + "nda_companies": nda_list, + "docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm, + "inference_profile": u_profile, + "services": { + "streamlit_port": _svc["streamlit_port"], + "ollama_host": svc_ollama_host, "ollama_port": int(svc_ollama_port), + "ollama_ssl": svc_ollama_ssl, "ollama_ssl_verify": svc_ollama_verify, + "vllm_host": svc_vllm_host, "vllm_port": int(svc_vllm_port), + "vllm_ssl": 
svc_vllm_ssl, "vllm_ssl_verify": svc_vllm_verify, + "searxng_host": svc_sxng_host, "searxng_port": int(svc_sxng_port), + "searxng_ssl": svc_sxng_ssl, "searxng_ssl_verify": svc_sxng_verify, + } + } + save_yaml(USER_CFG, new_data) + from scripts.user_profile import UserProfile + from scripts.generate_llm_config import apply_service_urls + apply_service_urls(UserProfile(USER_CFG), LLM_CFG) + st.success("Profile saved and service URLs updated.") +``` + +**Step 3: Commit** + +```bash +git add app/pages/2_Settings.py +git commit -m "feat: add My Profile tab to Settings with full user.yaml editing + URL auto-generation" +``` + +--- + +## Task 7: First-Run Wizard + +**Files:** +- Create: `app/pages/0_Setup.py` +- Modify: `app/app.py` + +**Step 1: Create the wizard page** + +```python +# app/pages/0_Setup.py +""" +First-run setup wizard — shown by app.py when config/user.yaml is absent. +Five steps: hardware detection → identity → NDA companies → inference/keys → Notion. +Writes config/user.yaml (and optionally config/notion.yaml) on completion. 
+""" +import subprocess +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +import yaml + +CONFIG_DIR = Path(__file__).parent.parent.parent / "config" +USER_CFG = CONFIG_DIR / "user.yaml" +NOTION_CFG = CONFIG_DIR / "notion.yaml" +LLM_CFG = CONFIG_DIR / "llm.yaml" + +PROFILES = ["remote", "cpu", "single-gpu", "dual-gpu"] + +def _detect_gpus() -> list[str]: + """Return list of GPU names via nvidia-smi, or [] if none.""" + try: + out = subprocess.check_output( + ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], + text=True, timeout=5 + ) + return [l.strip() for l in out.strip().splitlines() if l.strip()] + except Exception: + return [] + +def _suggest_profile(gpus: list[str]) -> str: + if len(gpus) >= 2: + return "dual-gpu" + if len(gpus) == 1: + return "single-gpu" + return "remote" + +# ── Wizard state ────────────────────────────────────────────────────────────── +if "wizard_step" not in st.session_state: + st.session_state.wizard_step = 1 +if "wizard_data" not in st.session_state: + st.session_state.wizard_data = {} + +step = st.session_state.wizard_step +data = st.session_state.wizard_data + +st.title("👋 Welcome to Job Seeker") +st.caption("Let's get you set up. This takes about 2 minutes.") +st.progress(step / 5, text=f"Step {step} of 5") +st.divider() + +# ── Step 1: Hardware detection ──────────────────────────────────────────────── +if step == 1: + st.subheader("Step 1 — Hardware Detection") + gpus = _detect_gpus() + suggested = _suggest_profile(gpus) + + if gpus: + st.success(f"Found {len(gpus)} GPU(s): {', '.join(gpus)}") + else: + st.info("No NVIDIA GPUs detected. Remote or CPU mode recommended.") + + profile = st.selectbox( + "Inference mode", + PROFILES, + index=PROFILES.index(suggested), + help="This controls which Docker services start. 
You can change it later in Settings → My Profile.", + ) + if profile in ("single-gpu", "dual-gpu") and not gpus: + st.warning("No GPUs detected — GPU profiles require NVIDIA Container Toolkit. See the README for install instructions.") + + if st.button("Next →", type="primary"): + data["inference_profile"] = profile + data["gpus_detected"] = gpus + st.session_state.wizard_step = 2 + st.rerun() + +# ── Step 2: Identity ────────────────────────────────────────────────────────── +elif step == 2: + st.subheader("Step 2 — Your Identity") + st.caption("Used in cover letter PDFs, LLM prompts, and the app header.") + c1, c2 = st.columns(2) + name = c1.text_input("Full Name *", data.get("name", "")) + email = c1.text_input("Email *", data.get("email", "")) + phone = c2.text_input("Phone", data.get("phone", "")) + linkedin = c2.text_input("LinkedIn URL", data.get("linkedin", "")) + summary = st.text_area( + "Career Summary *", + data.get("career_summary", ""), + height=120, + placeholder="Experienced professional with X years in [field]. Specialise in [skills].", + help="This paragraph is injected into cover letter and research prompts as your professional context.", + ) + + col_back, col_next = st.columns([1, 4]) + if col_back.button("← Back"): + st.session_state.wizard_step = 1 + st.rerun() + if col_next.button("Next →", type="primary"): + if not name or not email or not summary: + st.error("Name, email, and career summary are required.") + else: + data.update({"name": name, "email": email, "phone": phone, + "linkedin": linkedin, "career_summary": summary}) + st.session_state.wizard_step = 3 + st.rerun() + +# ── Step 3: NDA Companies ───────────────────────────────────────────────────── +elif step == 3: + st.subheader("Step 3 — Sensitive Employers (Optional)") + st.caption( + "Previous employers listed here will appear as 'previous employer (NDA)' in " + "research briefs and talking points. Skip if not applicable." 
+ ) + nda_list = list(data.get("nda_companies", [])) + if nda_list: + cols = st.columns(min(len(nda_list), 5)) + to_remove = None + for i, c in enumerate(nda_list): + if cols[i % 5].button(f"× {c}", key=f"rm_{c}"): + to_remove = c + if to_remove: + nda_list.remove(to_remove) + data["nda_companies"] = nda_list + st.rerun() + nc, nb = st.columns([4, 1]) + new_c = nc.text_input("Add employer", key="new_nda_wiz", label_visibility="collapsed", placeholder="Employer name…") + if nb.button("＋ Add") and new_c.strip(): + nda_list.append(new_c.strip()) + data["nda_companies"] = nda_list + st.rerun() + + col_back, col_skip, col_next = st.columns([1, 1, 3]) + if col_back.button("← Back"): + st.session_state.wizard_step = 2 + st.rerun() + if col_skip.button("Skip"): + data.setdefault("nda_companies", []) + st.session_state.wizard_step = 4 + st.rerun() + if col_next.button("Next →", type="primary"): + data["nda_companies"] = nda_list + st.session_state.wizard_step = 4 + st.rerun() + +# ── Step 4: Inference & API Keys ────────────────────────────────────────────── +elif step == 4: + profile = data.get("inference_profile", "remote") + st.subheader("Step 4 — Inference & API Keys") + + if profile == "remote": + st.info("Remote mode: LLM calls go to external APIs. At least one key is needed.") + anthropic_key = st.text_input("Anthropic API Key", type="password", + placeholder="sk-ant-…") + openai_url = st.text_input("OpenAI-compatible endpoint (optional)", + placeholder="https://api.together.xyz/v1") + openai_key = st.text_input("Endpoint API Key (optional)", type="password") if openai_url else "" + data.update({"anthropic_key": anthropic_key, "openai_url": openai_url, "openai_key": openai_key}) + else: + st.info(f"Local mode ({profile}): Ollama handles cover letters. 
Configure model below.") + ollama_model = st.text_input("Cover letter model name", + data.get("ollama_model", "llama3.2:3b"), + help="This model will be pulled by Ollama on first start.") + data["ollama_model"] = ollama_model + + st.divider() + with st.expander("Advanced — Service Ports & Hosts"): + st.caption("Change only if services run on non-default ports or remote hosts.") + svc = data.get("services", {}) + for svc_name, default_host, default_port in [ + ("ollama", "localhost", 11434), + ("vllm", "localhost", 8000), + ("searxng","localhost", 8888), + ]: + c1, c2, c3, c4 = st.columns([2, 1, 0.5, 0.5]) + svc[f"{svc_name}_host"] = c1.text_input(f"{svc_name} host", svc.get(f"{svc_name}_host", default_host), key=f"adv_{svc_name}_host") + svc[f"{svc_name}_port"] = c2.number_input(f"port", value=svc.get(f"{svc_name}_port", default_port), key=f"adv_{svc_name}_port") + svc[f"{svc_name}_ssl"] = c3.checkbox("SSL", svc.get(f"{svc_name}_ssl", False), key=f"adv_{svc_name}_ssl") + svc[f"{svc_name}_ssl_verify"] = c4.checkbox("Verify", svc.get(f"{svc_name}_ssl_verify", True), key=f"adv_{svc_name}_verify") + data["services"] = svc + + col_back, col_next = st.columns([1, 4]) + if col_back.button("← Back"): + st.session_state.wizard_step = 3 + st.rerun() + if col_next.button("Next →", type="primary"): + st.session_state.wizard_step = 5 + st.rerun() + +# ── Step 5: Notion (optional) ───────────────────────────────────────────────── +elif step == 5: + st.subheader("Step 5 — Notion Sync (Optional)") + st.caption("Syncs approved and applied jobs to a Notion database. 
Skip if not using Notion.") + notion_token = st.text_input("Integration Token", type="password", placeholder="secret_…") + notion_db = st.text_input("Database ID", placeholder="32-character ID from Notion URL") + + if notion_token and notion_db: + if st.button("🔌 Test connection"): + with st.spinner("Connecting…"): + try: + from notion_client import Client + db = Client(auth=notion_token).databases.retrieve(notion_db) + st.success(f"Connected: {db['title'][0]['plain_text']}") + except Exception as e: + st.error(f"Connection failed: {e}") + + col_back, col_skip, col_finish = st.columns([1, 1, 3]) + if col_back.button("← Back"): + st.session_state.wizard_step = 4 + st.rerun() + + def _finish(save_notion: bool): + # Build user.yaml + svc_defaults = { + "streamlit_port": 8501, + "ollama_host": "localhost", "ollama_port": 11434, "ollama_ssl": False, "ollama_ssl_verify": True, + "vllm_host": "localhost", "vllm_port": 8000, "vllm_ssl": False, "vllm_ssl_verify": True, + "searxng_host":"localhost", "searxng_port": 8888, "searxng_ssl":False, "searxng_ssl_verify": True, + } + svc_defaults.update(data.get("services", {})) + user_data = { + "name": data.get("name", ""), + "email": data.get("email", ""), + "phone": data.get("phone", ""), + "linkedin": data.get("linkedin", ""), + "career_summary": data.get("career_summary", ""), + "nda_companies": data.get("nda_companies", []), + "docs_dir": "~/Documents/JobSearch", + "ollama_models_dir":"~/models/ollama", + "vllm_models_dir": "~/models/vllm", + "inference_profile":data.get("inference_profile", "remote"), + "services": svc_defaults, + } + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + USER_CFG.write_text(yaml.dump(user_data, default_flow_style=False, allow_unicode=True)) + + # Update llm.yaml URLs + if LLM_CFG.exists(): + from scripts.user_profile import UserProfile + from scripts.generate_llm_config import apply_service_urls + apply_service_urls(UserProfile(USER_CFG), LLM_CFG) + + # Optionally write notion.yaml + if save_notion 
and notion_token and notion_db: + NOTION_CFG.write_text(yaml.dump({"token": notion_token, "database_id": notion_db})) + + st.session_state.wizard_step = 1 + st.session_state.wizard_data = {} + st.success("Setup complete! Redirecting…") + st.rerun() + + if col_skip.button("Skip & Finish"): + _finish(save_notion=False) + if col_finish.button("💾 Save & Finish", type="primary"): + _finish(save_notion=True) +``` + +**Step 2: Gate navigation in app.py** + +In `app/app.py`, after `init_db()`, add: +```python +from scripts.user_profile import UserProfile + +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" + +if not UserProfile.exists(_USER_YAML): + # Show wizard only — no nav, no sidebar tasks + setup_page = st.Page("pages/0_Setup.py", title="Setup", icon="👋") + st.navigation({"": [setup_page]}).run() + st.stop() +``` + +This must appear before the normal `st.navigation(pages)` call. + +**Step 3: Commit** + +```bash +git add app/pages/0_Setup.py app/app.py +git commit -m "feat: first-run setup wizard gates app until user.yaml is created" +``` + +--- + +## Task 8: Docker Compose Stack + +**Files:** +- Create: `Dockerfile` +- Create: `compose.yml` +- Create: `docker/searxng/settings.yml` +- Create: `docker/ollama/entrypoint.sh` +- Create: `.dockerignore` +- Create: `.env.example` + +**Step 1: Dockerfile** + +```dockerfile +# Dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# System deps for companyScraper (beautifulsoup4, fake-useragent, lxml) +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libffi-dev curl \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Bundle companyScraper +COPY scrapers/ /app/scrapers/ + +COPY . . + +EXPOSE 8501 + +CMD ["streamlit", "run", "app/app.py", \ + "--server.port=8501", \ + "--server.headless=true", \ + "--server.fileWatcherType=none"] +``` + +**Step 2: compose.yml** + +```yaml +# compose.yml +services: + + app: + build: . 
+ ports: + - "${STREAMLIT_PORT:-8501}:8501" + volumes: + - ./config:/app/config + - ./data:/app/data + - ${DOCS_DIR:-~/Documents/JobSearch}:/docs + environment: + - STAGING_DB=/app/data/staging.db + depends_on: + searxng: + condition: service_healthy + restart: unless-stopped + + searxng: + image: searxng/searxng:latest + ports: + - "${SEARXNG_PORT:-8888}:8080" + volumes: + - ./docker/searxng:/etc/searxng:ro + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/"] + interval: 10s + timeout: 5s + retries: 3 + restart: unless-stopped + + ollama: + image: ollama/ollama:latest + ports: + - "${OLLAMA_PORT:-11434}:11434" + volumes: + - ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama + - ./docker/ollama/entrypoint.sh:/entrypoint.sh + environment: + - OLLAMA_MODELS=/root/.ollama + entrypoint: ["/bin/bash", "/entrypoint.sh"] + profiles: [cpu, single-gpu, dual-gpu] + restart: unless-stopped + + ollama-gpu: + extends: + service: ollama + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["0"] + capabilities: [gpu] + profiles: [single-gpu, dual-gpu] + + vllm: + image: vllm/vllm-openai:latest + ports: + - "${VLLM_PORT:-8000}:8000" + volumes: + - ${VLLM_MODELS_DIR:-~/models/vllm}:/models + command: > + --model /models/${VLLM_MODEL:-Ouro-1.4B} + --trust-remote-code + --max-model-len 4096 + --gpu-memory-utilization 0.75 + --enforce-eager + --max-num-seqs 8 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["1"] + capabilities: [gpu] + profiles: [dual-gpu] + restart: unless-stopped +``` + +**Step 3: SearXNG settings.yml** + +```yaml +# docker/searxng/settings.yml +use_default_settings: true +search: + formats: + - html + - json +server: + secret_key: "change-me-in-production" + bind_address: "0.0.0.0:8080" +``` + +**Step 4: Ollama entrypoint** + +```bash +#!/usr/bin/env bash +# docker/ollama/entrypoint.sh +# Start Ollama server and pull a default model if none are present +ollama serve & 
+sleep 5 +if [ -z "$(ollama list 2>/dev/null | tail -n +2)" ]; then + MODEL="${DEFAULT_OLLAMA_MODEL:-llama3.2:3b}" + echo "No models found — pulling $MODEL..." + ollama pull "$MODEL" +fi +wait +``` + +**Step 5: .env.example** + +```bash +# .env.example — copy to .env (auto-generated by wizard, or fill manually) +STREAMLIT_PORT=8501 +OLLAMA_PORT=11434 +VLLM_PORT=8000 +SEARXNG_PORT=8888 +DOCS_DIR=~/Documents/JobSearch +OLLAMA_MODELS_DIR=~/models/ollama +VLLM_MODELS_DIR=~/models/vllm +VLLM_MODEL=Ouro-1.4B +``` + +**Step 6: .dockerignore** + +``` +.git +__pycache__ +*.pyc +staging.db +config/user.yaml +config/notion.yaml +config/email.yaml +config/tokens.yaml +.streamlit.pid +.streamlit.log +aihawk/ +docs/ +tests/ +``` + +**Step 7: Update .gitignore** + +Add to `.gitignore`: +``` +.env +config/user.yaml +data/ +``` + +**Step 8: Commit** + +```bash +git add Dockerfile compose.yml docker/ .dockerignore .env.example +git commit -m "feat: add Docker Compose stack with remote/cpu/single-gpu/dual-gpu profiles" +``` + +--- + +## Task 9: Services Tab — Compose-Driven Start/Stop + +**Files:** +- Modify: `app/pages/2_Settings.py` + +**Step 1: Replace SERVICES list with compose-driven definitions** + +```python +COMPOSE_DIR = str(Path(__file__).parent.parent.parent) +_profile_name = _profile.inference_profile if _profile else "remote" + +SERVICES = [ + { + "name": "Streamlit UI", + "port": _profile._svc["streamlit_port"] if _profile else 8501, + "start": ["docker", "compose", "--profile", _profile_name, "up", "-d", "app"], + "stop": ["docker", "compose", "stop", "app"], + "cwd": COMPOSE_DIR, + "note": "Job Seeker web interface", + }, + { + "name": "Ollama (local LLM)", + "port": _profile._svc["ollama_port"] if _profile else 11434, + "start": ["docker", "compose", "--profile", _profile_name, "up", "-d", "ollama"], + "stop": ["docker", "compose", "stop", "ollama"], + "cwd": COMPOSE_DIR, + "note": f"Local inference engine — profile: {_profile_name}", + "hidden": _profile_name == 
"remote", + }, + { + "name": "vLLM Server", + "port": _profile._svc["vllm_port"] if _profile else 8000, + "start": ["docker", "compose", "--profile", _profile_name, "up", "-d", "vllm"], + "stop": ["docker", "compose", "stop", "vllm"], + "cwd": COMPOSE_DIR, + "model_dir": str(_profile.vllm_models_dir) if _profile else str(Path.home() / "models" / "vllm"), + "note": "vLLM inference — dual-gpu profile only", + "hidden": _profile_name != "dual-gpu", + }, + { + "name": "SearXNG (company scraper)", + "port": _profile._svc["searxng_port"] if _profile else 8888, + "start": ["docker", "compose", "up", "-d", "searxng"], + "stop": ["docker", "compose", "stop", "searxng"], + "cwd": COMPOSE_DIR, + "note": "Privacy-respecting meta-search for company research", + }, +] +# Filter hidden services +SERVICES = [s for s in SERVICES if not s.get("hidden")] +``` + +**Step 2: Update health checks to use SSL** + +Replace the `_port_open()` helper: +```python +def _port_open(port: int, host: str = "127.0.0.1", + ssl: bool = False, verify: bool = True) -> bool: + try: + import requests as _r + scheme = "https" if ssl else "http" + _r.get(f"{scheme}://{host}:{port}/", timeout=1, verify=verify) + return True + except Exception: + return False +``` + +Update each service health check call to pass host/ssl/verify from the profile. + +**Step 3: Commit** + +```bash +git add app/pages/2_Settings.py +git commit -m "feat: services tab uses docker compose commands and SSL-aware health checks" +``` + +--- + +## Task 10: Fine-Tune Wizard Tab + +**Files:** +- Modify: `app/pages/2_Settings.py` + +**Step 1: Add fine-tune tab (GPU profiles only)** + +Add `tab_finetune` to the tab list (shown only when profile is single-gpu or dual-gpu). 
+ +```python +# In the tab definition, add conditionally: +_show_finetune = _profile and _profile.inference_profile in ("single-gpu", "dual-gpu") + +# Add tab: +tab_finetune = st.tabs([..., "🎯 Fine-Tune"])[last_index] if _show_finetune else None +``` + +**Step 2: Implement the fine-tune tab** + +```python +if _show_finetune and tab_finetune: + with tab_finetune: + st.subheader("Fine-Tune Your Cover Letter Model") + st.caption( + "Upload your existing cover letters to train a personalised writing model. " + "Requires a GPU. The base model is used until fine-tuning completes." + ) + + step = st.session_state.get("ft_step", 1) + + if step == 1: + st.markdown("**Step 1: Upload Cover Letters**") + uploaded = st.file_uploader( + "Upload cover letters (PDF, DOCX, or TXT)", + type=["pdf", "docx", "txt"], + accept_multiple_files=True, + ) + if uploaded and st.button("Extract Training Pairs →", type="primary"): + # Save uploads to docs_dir/training_data/uploads/ + upload_dir = (_profile.docs_dir / "training_data" / "uploads") + upload_dir.mkdir(parents=True, exist_ok=True) + for f in uploaded: + (upload_dir / f.name).write_bytes(f.read()) + st.session_state.ft_step = 2 + st.rerun() + + elif step == 2: + st.markdown("**Step 2: Preview Training Pairs**") + st.info("Run `python scripts/prepare_training_data.py` to extract pairs, then return here.") + jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl" + if jsonl_path.exists(): + import json + pairs = [json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()] + st.caption(f"{len(pairs)} training pairs extracted.") + for i, p in enumerate(pairs[:3]): + with st.expander(f"Pair {i+1}"): + st.text(p.get("input", "")[:300]) + col_back, col_next = st.columns([1, 4]) + if col_back.button("← Back"): + st.session_state.ft_step = 1; st.rerun() + if col_next.button("Start Training →", type="primary"): + st.session_state.ft_step = 3; st.rerun() + + elif step == 3: + st.markdown("**Step 3: Train**") + 
epochs = st.slider("Epochs", 3, 20, 10) + if st.button("🚀 Start Fine-Tune", type="primary"): + from scripts.task_runner import submit_task + from scripts.db import DEFAULT_DB + # finetune task type — extend task_runner for this + st.info("Fine-tune queued as a background task. Check back in 30–60 minutes.") + if col_back := st.button("← Back"): + st.session_state.ft_step = 2; st.rerun() +else: + if tab_finetune is None and _profile: + with st.expander("🎯 Fine-Tune (GPU only)"): + st.info( + f"Fine-tuning requires a GPU profile. " + f"Current profile: `{_profile.inference_profile}`. " + "Change it in My Profile to enable this tab." + ) +``` + +**Step 3: Commit** + +```bash +git add app/pages/2_Settings.py +git commit -m "feat: add fine-tune wizard tab to Settings (GPU profiles only)" +``` + +--- + +## Task 11: Final Wiring, Tests & README + +**Files:** +- Create: `README.md` +- Create: `requirements.txt` (Docker-friendly, no torch/CUDA) +- Modify: `tests/` (smoke test wizard gating) + +**Step 1: Write a smoke test for wizard gating** + +```python +# tests/test_app_gating.py +from pathlib import Path +from scripts.user_profile import UserProfile + +def test_wizard_gating_logic(tmp_path): + """app.py should show wizard when user.yaml is absent.""" + missing = tmp_path / "user.yaml" + assert not UserProfile.exists(missing) + +def test_wizard_gating_passes_after_setup(tmp_path): + import yaml + p = tmp_path / "user.yaml" + p.write_text(yaml.dump({"name": "Test User", "services": {}})) + assert UserProfile.exists(p) +``` + +**Step 2: Create requirements.txt** + +``` +streamlit>=1.45 +pyyaml>=6.0 +requests>=2.31 +reportlab>=4.0 +jobspy>=1.1 +notion-client>=2.2 +anthropic>=0.34 +openai>=1.40 +beautifulsoup4>=4.12 +fake-useragent>=1.5 +imaplib2>=3.6 +``` + +**Step 3: Create README.md** + +Document: quick start (`git clone → docker compose --profile remote up -d`), profile options, first-run wizard, and how to configure each inference mode. 
+ +**Step 4: Run full test suite** + +```bash +/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v +# Expected: all PASS +``` + +**Step 5: Final commit** + +```bash +git add README.md requirements.txt tests/ +git commit -m "feat: complete generalization — wizard, UserProfile, compose stack, all personal refs extracted" +``` + +--- + +## Execution Checklist + +- [ ] Task 1: Bootstrap new repo +- [ ] Task 2: UserProfile class + tests +- [ ] Task 3: Extract references — scripts +- [ ] Task 4: Extract references — app pages +- [ ] Task 5: llm.yaml URL auto-generation +- [ ] Task 6: My Profile tab in Settings +- [ ] Task 7: First-run wizard +- [ ] Task 8: Docker Compose stack +- [ ] Task 9: Services tab — compose-driven +- [ ] Task 10: Fine-tune wizard tab +- [ ] Task 11: Final wiring, tests, README diff --git a/docs/plans/2026-02-24-monetization-business-plan.md b/docs/plans/2026-02-24-monetization-business-plan.md new file mode 100644 index 0000000..f37c1e8 --- /dev/null +++ b/docs/plans/2026-02-24-monetization-business-plan.md @@ -0,0 +1,474 @@ +# Job Seeker Platform — Monetization Business Plan + +**Date:** 2026-02-24 +**Status:** Draft — pre-VC pitch +**Author:** Brainstorming session + +--- + +## 1. Product Overview + +An automated job discovery, resume matching, and application pipeline platform. Built originally as a personal tool for a single job seeker; architecture is already generalized — user identity, preferences, and data are fully parameterized via onboarding, not hardcoded. 
+ +### Core pipeline +``` +Job Discovery (multi-board) → Resume Matching → Job Review UI +→ Apply Workspace (cover letter + PDF) +→ Interviews Kanban (phone_screen → offer → hired) +→ Notion Sync +``` + +### Key feature surface +- Multi-board job discovery (LinkedIn, Indeed, Glassdoor, ZipRecruiter, Google, Adzuna, The Ladders) +- LinkedIn Alert email ingestion + email classifier (interview requests, rejections, surveys) +- Resume keyword matching + match scoring +- AI cover letter generation (local model, shared hosted model, or cloud LLM) +- Company research briefs (web scrape + LLM synthesis) +- Interview prep + practice Q&A +- Culture-fit survey assistant with vision/screenshot support +- Application pipeline kanban with stage tracking +- Notion sync for external tracking +- Mission alignment + accessibility preferences (personal decision-making only) +- Per-user fine-tuned cover letter model (trained on user's own writing corpus) + +--- + +## 2. Target Market + +### Primary: Individual job seekers (B2C) +- Actively searching, technically comfortable, value privacy +- Frustrated by manual tracking (spreadsheets, Notion boards) +- Want AI-assisted applications without giving their data to a third party +- Typical job search duration: 3–6 months → average subscription length ~4.5 months + +### Secondary: Career coaches (B2B, seat-based) +- Manage 10–20 active clients simultaneously +- High willingness to pay for tools that make their service more efficient +- **20× revenue multiplier** vs. solo users (base + per-seat pricing) + +### Tertiary: Outplacement firms / staffing agencies (B2B enterprise) +- Future expansion; validates product-market fit at coach tier first + +--- + +## 3. Distribution Model + +### Starting point: Local-first (self-hosted) + +Users run the application on their own machine via Docker Compose or a native installer. All job data, resume data, and preferences stay local. 
AI features are optional and configurable — users can use their own LLM backends or subscribe for hosted AI. + +**Why local-first:** +- Zero infrastructure cost per free user +- Strong privacy story (no job search data on your servers) +- Reversible — easy to add a hosted SaaS path later without a rewrite +- Aligns with the open core licensing model + +### Future path: Cloud Edition (SaaS) + +Same codebase deployed as a hosted service. Users sign up at a URL, no install required. Unlocked when revenue and user feedback validate the market. + +**Architecture readiness:** The config layer, per-user data isolation, and SQLite-per-user design already support multi-tenancy with minimal refactoring. SaaS is a deployment mode, not a rewrite. + +--- + +## 4. Licensing Strategy + +### Open Core + +| Component | License | Rationale | +|---|---|---| +| Job discovery pipeline | MIT | Community maintains scrapers (boards break constantly) | +| SQLite schema + `db.py` | MIT | Interoperability, trust | +| Application pipeline state machine | MIT | Core value is visible, auditable | +| Streamlit UI shell | MIT | Community contributions, forks welcome | +| AI cover letter generation | BSL 1.1 | Proprietary prompt engineering + model routing | +| Company research synthesis | BSL 1.1 | LLM orchestration is the moat | +| Interview prep + practice Q&A | BSL 1.1 | Premium feature | +| Survey assistant (vision) | BSL 1.1 | Premium feature | +| Email classifier | BSL 1.1 | Premium feature | +| Notion sync | BSL 1.1 | Integration layer | +| Team / multi-user features | Proprietary | Future enterprise feature | +| Analytics dashboard | Proprietary | Future feature | +| Fine-tuned model weights | Proprietary | Per-user, not redistributable | + +**Business Source License (BSL 1.1):** Code is visible and auditable on GitHub. Free for personal, non-commercial self-hosting. Commercial use or SaaS re-hosting requires a paid license. Converts to MIT after 4 years. 
Used by HashiCorp (Vault, Terraform), MariaDB, and others — well understood by the VC community. + +**Why this works here:** The value is not in the code. A competitor could clone the repo and still not have: the fine-tuned model, the user's corpus, the orchestration prompts, or the UX polish. The moat is the system, not any individual file. + +--- + +## 5. Tier Structure + +### Free — $0/mo +Self-hosted, local-only. Genuinely useful as a privacy-respecting job tracker. + +| Feature | Included | +|---|---| +| Multi-board job discovery | ✓ | +| Custom board scrapers (Adzuna, The Ladders) | ✓ | +| LinkedIn Alert email ingestion | ✓ | +| Add jobs by URL | ✓ | +| Resume keyword matching | ✓ | +| Cover letter generation (local Ollama only) | ✓ | +| Application pipeline kanban | ✓ | +| Mission alignment + accessibility preferences | ✓ | +| Search profiles | 1 | +| AI backend | User's local Ollama | +| Support | Community (GitHub Discussions) | + +**Purpose:** Acquisition engine. GitHub stars = distribution. Users who get a job on free tier refer friends. + +--- + +### Paid — $12/mo +For job seekers who want quality AI output without GPU setup or API key management. + +Includes everything in Free, plus: + +| Feature | Included | +|---|---| +| Shared hosted fine-tuned cover letter model | ✓ | +| Claude API (BYOK — bring your own key) | ✓ | +| Company research briefs | ✓ | +| Interview prep + practice Q&A | ✓ | +| Survey assistant (vision/screenshot) | ✓ | +| Search criteria LLM suggestions | ✓ | +| Email classifier | ✓ | +| Notion sync | ✓ | +| Search profiles | 5 | +| Support | Email | + +**Purpose:** Primary revenue tier. High margin, low support burden. Targets the individual job seeker who wants "it just works." + +--- + +### Premium — $29/mo +For power users and career coaches who want best-in-class output and personal model training. 
+ +Includes everything in Paid, plus: + +| Feature | Included | +|---|---| +| Claude Sonnet (your hosted key, 150 ops/mo included) | ✓ | +| Per-user fine-tuned model (trained on their corpus) | ✓ (one-time onboarding) | +| Corpus re-training | ✓ (quarterly) | +| Search profiles | Unlimited | +| Multi-user / coach mode | ✓ (+$15/seat) | +| Shared job pool across seats | ✓ | +| Priority support + onboarding call | ✓ | + +**Purpose:** Highest LTV tier. Coach accounts at 3+ seats generate $59–$239/mo each. Fine-tuned personal model is a high-perceived-value differentiator that costs ~$0.50 to produce. + +--- + +## 6. AI Inference — Claude API Cost Model + +Pricing basis: Haiku 4.5 = $0.80/MTok in · $4/MTok out | Sonnet 4.6 = $3/MTok in · $15/MTok out + +### Per-operation costs + +| Operation | Tokens In | Tokens Out | Haiku | Sonnet | +|---|---|---|---|---| +| Cover letter generation | ~2,400 | ~400 | $0.0035 | $0.013 | +| Company research brief | ~3,000 | ~800 | $0.0056 | $0.021 | +| Survey Q&A (5 questions) | ~3,000 | ~1,500 | $0.0084 | $0.031 | +| Job description enrichment | ~800 | ~300 | $0.0018 | $0.007 | +| Search criteria suggestion | ~400 | ~200 | $0.0010 | $0.004 | + +### Monthly inference cost per active user +Assumptions: 12 cover letters, 3 research briefs, 2 surveys, 40 enrichments, 2 search suggestions + +| Backend mix | Cost/user/mo | +|---|---| +| Haiku only (paid tier) | ~$0.15 | +| Sonnet only | ~$0.57 | +| Mixed: Sonnet for CL + research, Haiku for rest (premium tier) | ~$0.31 | + +### Per-user fine-tuning cost (premium, one-time) +| Provider | Cost | +|---|---| +| User's local GPU | $0 | +| RunPod A100 (~20 min) | $0.25–$0.40 | +| Together AI / Replicate | $0.50–$0.75 | +| Quarterly re-train | Same as above | + +**Amortized over 12 months:** ~$0.04–$0.06/user/mo + +--- + +## 7. Full Infrastructure Cost Model + +Local-first architecture means most compute runs on the user's machine. 
Your infra is limited to: AI inference API calls, shared model serving, fine-tune jobs, license/auth server, and storage for model artifacts. + +### Monthly infrastructure at 100K users +(4% paid conversion = 4,000 paid; 20% of paid premium = 800 premium) + +| Cost center | Detail | Monthly cost | +|---|---|---| +| Claude API inference (paid tier, Haiku) | 4,000 users × $0.15 | $600 | +| Claude API inference (premium tier, mixed) | 800 users × $0.31 | $248 | +| Shared model serving (Together AI, 3B model) | 48,000 requests/mo | $27 | +| Per-user fine-tune jobs | 800 users / 12mo × $0.50 | $33 | +| App hosting (license server, auth API, DB) | VPS + PostgreSQL | $200 | +| Model artifact storage (800 × 1.5GB on S3) | 1.2TB | $28 | +| **Total** | | **$1,136/mo** | + +--- + +## 8. Revenue Model & Unit Economics + +### Monthly revenue at scale + +| Total users | Paid (4%) | Premium (20% of paid) | Revenue/mo | Infra/mo | **Gross margin** | +|---|---|---|---|---|---| +| 10,000 | 400 | 80 | $7,120 | $196 | **97.2%** | +| 100,000 | 4,000 | 800 | $88,250 | $1,136 | **98.7%** | + +### Blended ARPU +- Across all users (including free): **~$0.71/user/mo** +- Across paying users only: **~$17.30/user/mo** +- Coach account (3 seats avg): **~$74/mo** + +### LTV per user segment +- Paid individual (4.5mo avg job search): **~$54** +- Premium individual (4.5mo avg): **~$130** +- Coach account (ongoing, low churn): **$74/mo × 18mo estimated = ~$1,330** +- **Note:** Success churn is real — users leave when they get a job. Re-subscription rate on next job search partially offsets this. + +### ARR projections + +| Scale | ARR | +|---|---| +| 10K users | **~$85K** | +| 100K users | **~$1.06M** | +| 1M users | **~$10.6M** | + +To reach $10M ARR: ~1M total users **or** meaningful coach/enterprise penetration at lower user counts. + +--- + +## 9. VC Pitch Angles + +### The thesis +> "GitHub is our distribution channel. Local-first is our privacy moat. Coaches are our revenue engine." 
+ +### Key metrics to hit before Series A +- 10K GitHub stars (validates distribution thesis) +- 500 paying users (validates willingness to pay) +- 20 coach accounts (validates B2B multiplier) +- 97%+ gross margin (already proven in model) + +### Competitive differentiation +1. **Privacy-first** — job search data never leaves your machine on free/paid tiers +2. **Fine-tuned personal model** — no other tool trains a cover letter model on your specific writing voice +3. **Full pipeline** — discovery through hired, not just one step (most competitors are point solutions) +4. **Open core** — community maintains job board scrapers, which break constantly; competitors pay engineers for this +5. **LLM-agnostic** — works with Ollama, Claude, GPT, vLLM; users aren't locked to one provider + +### Risks to address +- **Success churn** — mitigated by re-subscription on next job search, coach accounts (persistent), and potential pivot to ongoing career management +- **Job board scraping fragility** — mitigated by open core (community patches), multiple board sources, email ingestion fallback +- **LLM cost spikes** — mitigated by Haiku-first routing, local model fallback, user BYOK option +- **Copying by incumbents** — LinkedIn, Indeed have distribution but not privacy story; fine-tuned personal model is hard to replicate at their scale + +--- + +## 10. 
Roadmap + +### Phase 1 — Local-first launch (now) +- Docker Compose installer + setup wizard +- License key server (simple, hosted) +- Paid tier: shared model endpoint + Notion sync + email classifier +- Premium tier: fine-tune pipeline + Claude API routing +- Open core GitHub repo (MIT core, BSL premium) + +### Phase 2 — Coach tier validation (3–6 months post-launch) +- Multi-user mode with seat management +- Coach dashboard: shared job pool, per-candidate pipeline view +- Billing portal (Stripe) +- Outplacement firm pilot + +### Phase 3 — Cloud Edition (6–12 months, revenue-funded or post-seed) +- Hosted SaaS version at a URL (no install) +- Same codebase, cloud deployment mode +- Converts local-first users who want convenience +- Enables mobile access + +### Phase 4 — Enterprise (post-Series A) +- SSO / SAML +- Admin dashboard + analytics +- API for ATS integrations +- Custom fine-tune models for outplacement firm's brand voice + +--- + +## 11. Competitive Landscape + +### Direct competitors + +| Product | Price | Pipeline | AI CL | Privacy | Fine-tune | Open Source | +|---|---|---|---|---|---|---| +| **Job Seeker Platform** | Free–$29 | Full (discovery→hired) | Personal fine-tune | Local-first | Per-user | Core (MIT) | +| Teal | Free/$29 | Partial (tracker + resume) | Generic AI | Cloud | No | No | +| Jobscan | $49.95 | Resume scan only | No | Cloud | No | No | +| Huntr | Free/$30 | Tracker only | No | Cloud | No | No | +| Rezi | $29 | Resume/CL only | Generic AI | Cloud | No | No | +| Kickresume | $19 | Resume/CL only | Generic AI | Cloud | No | No | +| LinkedIn Premium | $40 | Job search only | No | Cloud (them) | No | No | +| AIHawk | Free | LinkedIn Easy Apply | No | Local | No | Yes (MIT) | +| Simplify | Free | Auto-fill only | No | Extension | No | No | + +### Competitive analysis + +**Teal** ($29/mo) is the closest feature competitor — job tracker + resume builder + AI cover letters. 
Key gaps: cloud-only (privacy risk), no discovery automation, generic AI (not fine-tuned to your voice), no interview prep, no email classifier. Their paid tier costs the same as our premium and delivers substantially less. + +**Jobscan** ($49.95/mo) is the premium ATS-optimization tool. Single-purpose, no pipeline, no cover letters. Overpriced for what it does. Users often use it alongside a tracker — this platform replaces both. + +**AIHawk** (open source) automates LinkedIn Easy Apply but has no pipeline, no AI beyond form filling, no cover letter gen, no tracking. It's a macro, not a platform. We already integrate with it as a downstream action. We're complementary, not competitive at the free tier. + +**LinkedIn Premium** ($40/mo) has distribution but actively works against user privacy and owns the candidate relationship. Users are the product. Our privacy story is a direct counter-positioning. + +### The whitespace + +No competitor offers all three of: **full pipeline automation + privacy-first local storage + personalized fine-tuned AI**. Every existing tool is either a point solution (just resume, just tracker, just auto-apply) or cloud-based SaaS that monetizes user data. The combination is the moat. + +### Indirect competition + +- **Spreadsheets + Notion templates** — free, flexible, no AI. The baseline we replace for free users. +- **Recruiting agencies** — human-assisted job search; we're a complement, not a replacement. +- **Career coaches** — we sell *to* them, not against them. + +--- + +## 12. Go-to-Market Strategy + +### Phase 1: Developer + privacy community launch + +**Channel:** GitHub → Hacker News → Reddit + +The open core model makes GitHub the primary distribution channel. A compelling README, one-command Docker install, and a working free tier are the launch. 
Target communities: + +- Hacker News "Show HN" — privacy-first self-hosted tools get strong traction +- r/cscareerquestions (1.2M members) — active job seekers, technically literate +- r/selfhosted (2.8M members) — prime audience for local-first tools +- r/ExperiencedDevs, r/remotework — secondary seeding + +**Goal:** 1,000 GitHub stars and 100 free installs in first 30 days. + +**Content hook:** "I built a private job search AI that runs entirely on your machine — no data leaves your computer." Privacy angle resonates deeply post-2024 data breach fatigue. + +### Phase 2: Career coaching channel + +**Channel:** LinkedIn → direct outreach → coach partnerships + +Career coaches are the highest-LTV customer and the most efficient channel to reach many job seekers at once. One coach onboarded = 10–20 active users. + +Tactics: +- Identify coaches on LinkedIn who post about job search tools +- Offer white-glove onboarding + 60-day free trial of coach seats +- Co-create content: "How I run 15 client job searches simultaneously" +- Referral program: coach gets 1 free seat per paid client referral + +**Goal:** 20 coach accounts within 90 days of paid tier launch. + +### Phase 3: Content + SEO (SaaS phase) + +Once the hosted Cloud Edition exists, invest in organic content: + +- "Best job tracker apps 2027" (comparison content — we win on privacy + AI) +- "How to write a cover letter that sounds like you, not ChatGPT" +- "Job search automation without giving LinkedIn your data" +- Tutorial videos: full setup walkthrough, fine-tuning demo + +**Goal:** 10K organic monthly visitors driving 2–5% free tier signups. + +### Phase 4: Outplacement firm partnerships (enterprise) + +Target HR consultancies and outplacement firms (Challenger, Gray & Christmas; Right Management; Lee Hecht Harrison). These firms place thousands of candidates per year and pay per-seat enterprise licenses. + +**Goal:** 3 enterprise pilots within 12 months of coach tier validation. 
+ +### Pricing strategy by channel + +| Channel | Entry offer | Conversion lever | +|---|---|---| +| GitHub / OSS | Free forever | Upgrade friction: GPU setup, no shared model | +| Direct / ProductHunt | Free 30-day paid trial | AI quality gap is immediately visible | +| Coach outreach | Free 60-day coach trial | Efficiency gain across client base | +| Enterprise | Pilot with 10 seats | ROI vs. current manual process | + +### Key metrics by phase + +| Phase | Primary metric | Target | +|---|---|---| +| Launch | GitHub stars | 1K in 30 days | +| Paid validation | Paying users | 500 in 90 days | +| Coach validation | Coach accounts | 20 in 90 days | +| SaaS launch | Cloud signups | 10K in 6 months | +| Enterprise | ARR from enterprise | $100K in 12 months | + +--- + +## 13. Pricing Sensitivity Analysis + +### Paid tier sensitivity ($8 / $12 / $15 / $20) + +Assumption: 100K total users, 4% base conversion, gross infra cost $1,136/mo + +| Price | Conversion assumption | Paying users | Revenue/mo | Gross margin | +|---|---|---|---|---| +| $8 | 5.5% (price-elastic) | 5,500 | $44,000 | 97.4% | +| **$12** | **4.0% (base)** | **4,000** | **$48,000** | **97.6%** | +| $15 | 3.2% (slight drop) | 3,200 | $48,000 | 97.6% | +| $20 | 2.5% (meaningful drop) | 2,500 | $50,000 | 97.7% | + +**Finding:** Revenue is relatively flat between $12 and $20 because conversion drops offset the price increase. $12 is the sweet spot — maximizes paying user count (more data, more referrals, more upgrade candidates) without sacrificing revenue. Going below $10 requires meaningfully higher conversion to justify. 
+ +### Premium tier sensitivity ($19 / $29 / $39 / $49) + +Assumption: 800 base premium users (20% of 4,000 paid), conversion adjusts with price + +| Price | Conversion from paid | Premium users | Revenue/mo | Fine-tune cost | Net/mo | +|---|---|---|---|---|---| +| $19 | 25% | 1,000 | $19,000 | $42 | $18,958 | +| **$29** | **20%** | **800** | **$23,200** | **$33** | **$23,167** | +| $39 | 15% | 600 | $23,400 | $25 | $23,375 | +| $49 | 10% | 400 | $19,600 | $17 | $19,583 | + +**Finding:** $29–$39 is the revenue-maximizing range. $29 wins on user volume (more fine-tune data, stronger coach acquisition funnel). $39 wins marginally on revenue but shrinks the premium base significantly. Recommend $29 at launch with the option to test $34–$39 once the fine-tuned model quality is demonstrated. + +### Coach seat sensitivity ($10 / $15 / $20 per seat) + +Assumption: 50 coach accounts, 3 seats avg, base $29 already captured above + +| Seat price | Seat revenue/mo | Total coach revenue/mo | +|---|---|---| +| $10 | $1,500 | $1,500 | +| **$15** | **$2,250** | **$2,250** | +| $20 | $3,000 | $3,000 | + +**Finding:** Seat pricing is relatively inelastic for coaches — $15–$20 is well within their cost of tools per client. $15 is conservative and easy to raise. $20 is defensible once coach ROI is documented. Consider $15 at launch, $20 after first 20 coach accounts are active. 
+ +### Blended revenue at optimized pricing (100K users) + +| Component | Users | Price | Revenue/mo | +|---|---|---|---| +| Paid tier | 4,000 | $12 | $48,000 | +| Premium individual | 720 | $29 | $20,880 | +| Premium coach base | 80 | $29 | $2,320 | +| Coach seats (80 accounts × 3 avg) | 240 seats | $15 | $3,600 | +| **Total** | | | **$74,800/mo** | +| Infrastructure | | | -$1,136/mo | +| **Net** | | | **$73,664/mo (~$884K ARR)** | + +### Sensitivity to conversion rate (at $12/$29 pricing, 100K users) + +| Free→Paid conversion | Paid→Premium conversion | Revenue/mo | ARR | +|---|---|---|---| +| 2% | 15% | $30,720 | $369K | +| 3% | 18% | $47,664 | $572K | +| **4%** | **20%** | **$65,600** | **$787K** | +| 5% | 22% | $84,480 | $1.01M | +| 6% | 25% | $104,400 | $1.25M | + +**Key insight:** Conversion rate is the highest-leverage variable. Going from 4% → 5% free-to-paid conversion (with paid-to-premium rising from 20% → 22%, as in the table above) adds ~$227K ARR at 100K users. Investment in onboarding quality and the free-tier value proposition has outsized return vs. price adjustments. diff --git a/docs/plans/email-sync-testing-checklist.md b/docs/plans/email-sync-testing-checklist.md new file mode 100644 index 0000000..b7a7f5d --- /dev/null +++ b/docs/plans/email-sync-testing-checklist.md @@ -0,0 +1,106 @@ +# Email Sync — Testing Checklist + +Generated from audit of `scripts/imap_sync.py`.
+ +## Bugs fixed (2026-02-23) + +- [x] Gmail label with spaces not quoted for IMAP SELECT → `_quote_folder()` added +- [x] `_quote_folder` didn't escape internal double-quotes → RFC 3501 escaping added +- [x] `signal is None` in `_scan_unmatched_leads` allowed classifier failures through → now skips +- [x] Email with no Message-ID re-inserted on every sync → `_parse_message` returns `None` when ID missing +- [x] `todo_attached` missing from early-return dict in `sync_all` → added +- [x] Body phrase check truncated at 800 chars (rejection footers missed) → bumped to 1500 +- [x] `_DONT_FORGET_VARIANTS` missing left single quotation mark `\u2018` → added + +--- + +## Unit tests — phrase filter + +- [ ] `_has_rejection_or_ats_signal` — rejection phrase at char 1501 (boundary) +- [ ] `_has_rejection_or_ats_signal` — right single quote `\u2019` in "don't forget" +- [ ] `_has_rejection_or_ats_signal` — left single quote `\u2018` in "don't forget" +- [ ] `_has_rejection_or_ats_signal` — ATS subject phrase only checked against subject, not body +- [ ] `_has_rejection_or_ats_signal` — spam subject prefix `@` match +- [ ] `_has_rejection_or_ats_signal` — `"UNFORTUNATELY"` (uppercase → lowercased correctly) +- [ ] `_has_rejection_or_ats_signal` — phrase in body quoted thread (beyond 1500 chars) is not blocked + +## Unit tests — folder quoting + +- [ ] `_quote_folder("TO DO JOBS")` → `'"TO DO JOBS"'` +- [ ] `_quote_folder("INBOX")` → `"INBOX"` (no spaces, no quotes added) +- [ ] `_quote_folder('My "Jobs"')` → `'"My \\"Jobs\\""'` +- [ ] `_search_folder` — folder doesn't exist → returns `[]`, no exception +- [ ] `_search_folder` — special folder `"[Gmail]/All Mail"` (brackets + slash) + +## Unit tests — message-ID dedup + +- [ ] `_get_existing_message_ids` — NULL message_id in DB excluded from set +- [ ] `_get_existing_message_ids` — empty string `""` excluded from set +- [ ] `_get_existing_message_ids` — job with no contacts returns empty set +- [ ] `_parse_message` — email with 
no Message-ID header returns `None` +- [ ] `_parse_message` — email with RFC2047-encoded subject decodes correctly +- [ ] No email is inserted twice across two sync runs (integration) + +## Unit tests — classifier & signal + +- [ ] `classify_stage_signal` — returns one of 5 labels or `None` +- [ ] `classify_stage_signal` — returns `None` on LLM error +- [ ] `classify_stage_signal` — returns `"neutral"` when no label matched in LLM output +- [ ] `classify_stage_signal` — strips `…` blocks +- [ ] `_scan_unmatched_leads` — skips when `signal is None` +- [ ] `_scan_unmatched_leads` — skips when `signal == "rejected"` +- [ ] `_scan_unmatched_leads` — proceeds when `signal == "neutral"` +- [ ] `extract_lead_info` — returns `(None, None)` on bad JSON +- [ ] `extract_lead_info` — returns `(None, None)` on LLM error + +## Integration tests — TODO label scan + +- [ ] `_scan_todo_label` — `todo_label` empty string → returns 0 +- [ ] `_scan_todo_label` — `todo_label` missing from config → returns 0 +- [ ] `_scan_todo_label` — folder doesn't exist on IMAP server → returns 0, no crash +- [ ] `_scan_todo_label` — email matches company + action keyword → contact attached +- [ ] `_scan_todo_label` — email matches company but no action keyword → skipped +- [ ] `_scan_todo_label` — email matches no company term → skipped +- [ ] `_scan_todo_label` — duplicate message-ID → not re-inserted +- [ ] `_scan_todo_label` — stage_signal set when classifier returns non-neutral +- [ ] `_scan_todo_label` — body fallback (company only in body[:300]) → still matches +- [ ] `_scan_todo_label` — email handled by `sync_job_emails` first not re-added by label scan + +## Integration tests — unmatched leads + +- [ ] `_scan_unmatched_leads` — genuine lead inserted with synthetic URL `email://domain/hash` +- [ ] `_scan_unmatched_leads` — same email not re-inserted on second sync run +- [ ] `_scan_unmatched_leads` — duplicate synthetic URL skipped +- [ ] `_scan_unmatched_leads` — `extract_lead_info` returns 
`(None, None)` → no insertion +- [ ] `_scan_unmatched_leads` — rejection phrase in body → blocked before LLM +- [ ] `_scan_unmatched_leads` — rejection phrase in quoted thread > 1500 chars → passes filter (acceptable) + +## Integration tests — full sync + +- [ ] `sync_all` with no active jobs → returns dict with all 6 keys incl. `todo_attached: 0` +- [ ] `sync_all` return dict shape identical on all code paths +- [ ] `sync_all` with `job_ids` filter → only syncs those jobs +- [ ] `sync_all` `dry_run=True` → no DB writes +- [ ] `sync_all` `on_stage` callback fires: "connecting", "job N/M", "scanning todo label", "scanning leads" +- [ ] `sync_all` IMAP connection error → caught, returned in `errors` list +- [ ] `sync_all` per-job exception → other jobs still sync + +## Config / UI + +- [ ] Settings UI field for `todo_label` (currently YAML-only) +- [ ] Warn in sync summary when `todo_label` folder not found on server +- [ ] Clear error message when `config/email.yaml` is missing +- [ ] `test_email_classify.py --verbose` shows correct blocking phrase for each BLOCK + +## Backlog — Known issues + +- [ ] **The Ladders emails confuse the classifier** — promotional/job alert emails from `@theladders.com` are matching the recruitment keyword filter and being treated as leads. Fix: add a sender-based skip rule in `_scan_unmatched_leads` for known job board senders (similar to how LinkedIn Alert emails are short-circuited before the LLM classifier). Senders to exclude: `@theladders.com`, and audit for others (Glassdoor alerts, Indeed digest, ZipRecruiter, etc.). 
+ +--- + +## Performance & edge cases + +- [ ] Email with 10 000-char body → truncated to 4000 chars, no crash +- [ ] Email with binary attachment → `_parse_message` returns valid dict, no crash +- [ ] Email with multiple `text/plain` MIME parts → first part taken +- [ ] `get_all_message_ids` with 100 000 rows → completes in < 1s diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..d381d9d --- /dev/null +++ b/environment.yml @@ -0,0 +1,68 @@ +name: job-seeker +# Recreate: conda env create -f environment.yml +# Update pinned snapshot: conda env export --no-builds > environment.yml +channels: + - conda-forge + - defaults +dependencies: + - python=3.12 + - pip + - pip: + # ── Web UI ──────────────────────────────────────────────────────────────── + - streamlit>=1.35 + - watchdog # live reload + - reportlab>=4.0 # PDF cover letter export + - pandas>=2.0 + - pyarrow # streamlit data tables + - streamlit-paste-button>=0.1.0 + + # ── Job scraping ────────────────────────────────────────────────────────── + - python-jobspy>=1.1 + - playwright # browser automation (run: playwright install chromium) + - selenium + - undetected-chromedriver + - webdriver-manager + - beautifulsoup4 + - requests + - curl_cffi # Chrome TLS fingerprint — bypasses Cloudflare on The Ladders + - fake-useragent # company scraper rotation + + # ── LLM / AI backends ───────────────────────────────────────────────────── + - openai>=1.0 # used for OpenAI-compat backends (ollama, vllm, wrappers) + - anthropic>=0.80 # direct Anthropic API fallback + - ollama # Python client for Ollama management + - langchain>=0.2 + - langchain-openai + - langchain-anthropic + - langchain-ollama + - langchain-community + - langchain-google-genai + - google-generativeai + - tiktoken + + # ── Resume matching ─────────────────────────────────────────────────────── + - scikit-learn>=1.3 + - rapidfuzz + - lib-resume-builder-aihawk + + # ── Notion integration 
──────────────────────────────────────────────────── + - notion-client>=3.0 + + # ── Document handling ───────────────────────────────────────────────────── + - pypdf + - pdfminer-six + - pyyaml>=6.0 + - python-dotenv + + # ── Utilities ───────────────────────────────────────────────────────────── + - sqlalchemy + - tqdm + - loguru + - rich + - tenacity + - httpx + + # ── Testing ─────────────────────────────────────────────────────────────── + - pytest>=9.0 + - pytest-cov + - pytest-mock diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..5ee6477 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/company_research.py b/scripts/company_research.py new file mode 100644 index 0000000..3c7069c --- /dev/null +++ b/scripts/company_research.py @@ -0,0 +1,468 @@ +# scripts/company_research.py +""" +Pre-interview company research generator. + +Three-phase approach: + 1. If SearXNG is available (port 8888), use companyScraper.py to fetch live + data: CEO name, HQ address, LinkedIn, contact info. + 1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for + recent news snippets (funding, launches, leadership changes, etc.). + 2. Feed all real data into an LLM prompt to synthesise a structured brief + covering company overview, leadership, recent developments, and talking + points tailored to Alex. + +Falls back to pure LLM knowledge when SearXNG is offline. 
+ +Usage (standalone): + conda run -n job-seeker python scripts/company_research.py --job-id 42 + conda run -n job-seeker python scripts/company_research.py --job-id 42 --no-scrape +""" +import re +import sys +from pathlib import Path +from types import SimpleNamespace + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +# ── SearXNG scraper integration ─────────────────────────────────────────────── +_SCRAPER_DIR = Path("/Library/Development/scrapers") +_SCRAPER_AVAILABLE = False + +if _SCRAPER_DIR.exists(): + sys.path.insert(0, str(_SCRAPER_DIR)) + try: + from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig + _SCRAPER_AVAILABLE = True + except (ImportError, SystemExit): + # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed + pass + + +def _searxng_running() -> bool: + """Quick check whether SearXNG is reachable.""" + try: + import requests + r = requests.get("http://localhost:8888/", timeout=3) + return r.status_code == 200 + except Exception: + return False + + +def _scrape_company(company: str) -> dict: + """ + Use companyScraper in minimal mode to pull live CEO / HQ data. + Returns a dict with keys: ceo, headquarters, linkedin (may be 'Not found'). 
+ """ + mock_args = SimpleNamespace( + mode="minimal", + verbose=False, + dry_run=False, + debug=False, + use_cache=True, + save_raw=False, + target_staff=None, + include_types=None, + exclude_types=None, + include_contact=False, + include_address=False, + include_social=True, # grab LinkedIn while we're at it + timeout=20, + input_file=None, + output_file="/dev/null", + searxng_url="http://localhost:8888/", + ) + # Override the singleton Config URL + _ScraperConfig.SEARXNG_URL = "http://localhost:8888/" + + scraper = EnhancedCompanyScraper(mock_args) + scraper.companies = [company] + + result: dict = {"ceo": "Not found", "headquarters": "Not found", "linkedin": "Not found"} + for search_type in ["ceo", "hq", "social"]: + html = scraper.search_company(company, search_type) + if search_type == "ceo": + result["ceo"] = scraper.extract_ceo(html, company) + elif search_type == "hq": + result["headquarters"] = scraper.extract_address(html, company) + elif search_type == "social": + social = scraper.extract_social(html, company) + # Pull out just the LinkedIn entry + for part in (social or "").split(";"): + if "linkedin" in part.lower(): + result["linkedin"] = part.strip() + break + + return result + + +_SEARCH_QUERIES = { + "news": '"{company}" news 2025 2026', + "funding": '"{company}" funding round investors Series valuation', + "tech": '"{company}" tech stack engineering technology platform', + "competitors": '"{company}" competitors alternatives vs market', + "culture": '"{company}" glassdoor culture reviews employees', + "accessibility": '"{company}" ADA accessibility disability inclusion accommodation ERG', + "ceo_press": '"{ceo}" "{company}"', # only used if ceo is known +} + + +def _run_search_query(query: str, results: dict, key: str) -> None: + """Thread target: run one SearXNG JSON query, store up to 4 snippets in results[key].""" + import requests + + snippets: list[str] = [] + seen: set[str] = set() + try: + resp = requests.get( + 
"http://localhost:8888/search", + params={"q": query, "format": "json", "language": "en-US"}, + timeout=12, + ) + if resp.status_code != 200: + return + for r in resp.json().get("results", [])[:4]: + url = r.get("url", "") + if url in seen: + continue + seen.add(url) + title = r.get("title", "").strip() + content = r.get("content", "").strip() + if title or content: + snippets.append(f"- **{title}**\n {content}\n <{url}>") + except Exception: + pass + results[key] = "\n\n".join(snippets) + + +def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]: + """ + Run all search queries in parallel threads. + Returns dict keyed by search type (news, funding, tech, competitors, culture, ceo_press). + Missing/failed queries produce empty strings. + """ + import threading + + results: dict[str, str] = {} + threads = [] + + keys: list[str] = [] + for key, pattern in _SEARCH_QUERIES.items(): + if key == "ceo_press" and not ceo or (ceo or "").lower() == "not found": + continue + # Use replace() not .format() — company names may contain curly braces + query = pattern.replace("{company}", company).replace("{ceo}", ceo) + t = threading.Thread( + target=_run_search_query, + args=(query, results, key), + daemon=True, + ) + threads.append(t) + keys.append(key) + t.start() + + for t, key in zip(threads, keys): + t.join(timeout=15) + # Thread may still be alive after timeout — pre-populate key so + # the results dict contract ("missing queries → empty string") holds + if t.is_alive(): + results.setdefault(key, "") + + return results + + +def _parse_sections(text: str) -> dict[str, str]: + """Split LLM markdown output on ## headers into named sections.""" + sections: dict[str, str] = {} + pattern = re.compile(r"^##\s+(.+)$", re.MULTILINE) + matches = list(pattern.finditer(text)) + for i, match in enumerate(matches): + name = match.group(1).strip() + start = match.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(text) + sections[name] = 
text[start:end].strip() + return sections + + +_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" +_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml" + +# Companies where Alex has an NDA — reference as generic label unless +# the role is security-focused (score >= 3 matching JD keywords). +_NDA_COMPANIES = {"upguard"} + + +def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]: + """Score each experience entry by keyword overlap with JD; return sorted descending.""" + jd_lower = jd.lower() + scored = [] + for exp in experiences: + text = " ".join([ + exp.get("position", ""), + exp.get("company", ""), + " ".join( + v + for resp in exp.get("key_responsibilities", []) + for v in resp.values() + ), + ]).lower() + score = sum(1 for kw in keywords if kw.lower() in text and kw.lower() in jd_lower) + scored.append({**exp, "score": score}) + return sorted(scored, key=lambda x: x["score"], reverse=True) + + +def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str: + """ + Build the resume section of the LLM context block. + Top 2 scored experiences included in full detail; rest as one-liners. + Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)' + unless the role is security-focused (score >= 3). 
+ """ + experiences = resume.get("experience_details", []) + if not experiences: + return "" + + scored = _score_experiences(experiences, keywords, jd) + top2 = scored[:2] + rest = scored[2:] + + def _company_label(exp: dict) -> str: + company = exp.get("company", "") + if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3: + return "enterprise security vendor (NDA)" + return company + + def _exp_header(exp: dict) -> str: + return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})" + + def _exp_bullets(exp: dict) -> str: + bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()] + return "\n".join(f" - {b}" for b in bullets) + + lines = ["## Alex's Matched Experience"] + for exp in top2: + lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})") + lines.append(_exp_bullets(exp)) + + if rest: + condensed = ", ".join(_exp_header(e) for e in rest) + lines.append(f"\nAlso in Alex's background: {condensed}") + + return "\n".join(lines) + + +def _load_resume_and_keywords() -> tuple[dict, list[str]]: + """Load resume YAML and keywords config. Returns (resume_dict, all_keywords_list).""" + import yaml as _yaml + + resume = {} + if _RESUME_YAML.exists(): + resume = _yaml.safe_load(_RESUME_YAML.read_text()) or {} + + keywords: list[str] = [] + if _KEYWORDS_YAML.exists(): + kw_cfg = _yaml.safe_load(_KEYWORDS_YAML.read_text()) or {} + for lst in kw_cfg.values(): + if isinstance(lst, list): + keywords.extend(lst) + + return resume, keywords + + +def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict: + """ + Generate a pre-interview research brief for a job. + + Parameters + ---------- + job : dict + Job row from the DB (needs at least 'company', 'title', 'description'). + use_scraper : bool + Whether to attempt live data via SearXNG before falling back to LLM. 
+ + Returns + ------- + dict with keys: raw_output, company_brief, ceo_brief, tech_brief, + funding_brief, competitors_brief, red_flags, talking_points + """ + from scripts.llm_router import LLMRouter + + router = LLMRouter() + research_order = router.config.get("research_fallback_order") or router.config["fallback_order"] + company = job.get("company") or "the company" + title = job.get("title") or "this role" + jd_excerpt = (job.get("description") or "")[:1500] + + resume, keywords = _load_resume_and_keywords() + matched_keywords = [kw for kw in keywords if kw.lower() in jd_excerpt.lower()] + resume_context = _build_resume_context(resume, keywords, jd_excerpt) + keywords_note = ( + f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}" + if matched_keywords else "" + ) + + def _stage(msg: str) -> None: + if on_stage: + try: + on_stage(msg) + except Exception: + pass # never let stage callbacks break the task + + # ── Phase 1: live scrape (optional) ────────────────────────────────────── + live_data: dict = {} + scrape_note = "" + _stage("Checking for live company data…") + if use_scraper and _SCRAPER_AVAILABLE and _searxng_running(): + _stage("Scraping CEO & HQ data…") + try: + live_data = _scrape_company(company) + parts = [] + if live_data.get("ceo") not in (None, "Not found"): + parts.append(f"CEO: {live_data['ceo']}") + if live_data.get("headquarters") not in (None, "Not found"): + parts.append(f"HQ: {live_data['headquarters']}") + if live_data.get("linkedin") not in (None, "Not found"): + parts.append(f"LinkedIn: {live_data['linkedin']}") + if parts: + scrape_note = ( + "\n\n**Live data retrieved via SearXNG:**\n" + + "\n".join(f"- {p}" for p in parts) + + "\n\nIncorporate these facts where relevant." 
+ ) + except BaseException as e: + scrape_note = f"\n\n_(Live scrape attempted but failed: {e})_" + + # ── Phase 1b: parallel search queries ──────────────────────────────────── + search_data: dict[str, str] = {} + _stage("Running web searches…") + if use_scraper and _searxng_running(): + _stage("Running web searches (news, funding, tech, culture)…") + try: + ceo_name = (live_data.get("ceo") or "") if live_data else "" + search_data = _fetch_search_data(company, ceo=ceo_name) + except BaseException: + pass # best-effort; never fail the whole task + + # Track whether SearXNG actually contributed usable data to this brief. + scrape_used = 1 if (live_data or any(v.strip() for v in search_data.values())) else 0 + + def _section_note(key: str, label: str) -> str: + text = search_data.get(key, "").strip() + return f"\n\n## {label} (live web search)\n\n{text}" if text else "" + + news_note = _section_note("news", "News & Press") + funding_note = _section_note("funding", "Funding & Investors") + tech_note = _section_note("tech", "Tech Stack") + competitors_note = _section_note("competitors", "Competitors") + culture_note = _section_note("culture", "Culture & Employee Signals") + accessibility_note = _section_note("accessibility", "Accessibility & Disability Inclusion") + ceo_press_note = _section_note("ceo_press", "CEO in the News") + + # ── Phase 2: LLM synthesis ──────────────────────────────────────────────── + _stage("Generating brief with LLM… (30–90 seconds)") + prompt = f"""You are preparing Alex Rivera for a job interview. 
+ +Role: **{title}** at **{company}** + +## Job Description +{jd_excerpt} +{resume_context}{keywords_note} + +## Live Company Data +{scrape_note.strip() or "_(scrape unavailable)_"} +{news_note}{funding_note}{tech_note}{competitors_note}{culture_note}{accessibility_note}{ceo_press_note} + +--- + +Produce a structured research brief using **exactly** these eight markdown section headers +(include all eight even if a section has limited data — say so honestly): + +## Company Overview +What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning. + +## Leadership & Culture +CEO background and leadership style, key execs, mission/values statements, Glassdoor themes. + +## Tech Stack & Product +Technologies, platforms, and product direction relevant to the {title} role. + +## Funding & Market Position +Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape. + +## Recent Developments +News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months. +Draw on the live snippets above; if none available, note what is publicly known. + +## Red Flags & Watch-outs +Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call. +If nothing notable, write "No significant red flags identified." + +## Inclusion & Accessibility +Assess {company}'s commitment to disability inclusion and accessibility. Cover: +- ADA accommodation language in job postings or company policy +- Disability Employee Resource Group (ERG) or affinity group +- Product or service accessibility (WCAG compliance, adaptive features, AT integrations) +- Any public disability/accessibility advocacy, partnerships, or certifications +- Glassdoor or press signals about how employees with disabilities experience the company +If no specific signals are found, say so clearly — absence of public commitment is itself signal. 
+This section is for Alex's personal decision-making only and will not appear in any application. + +## Talking Points for Alex +Five specific talking points for the phone screen. Each must: +- Reference a concrete experience from Alex's matched background by name + (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus) +- Connect to a specific signal from the JD or company context above +- Be 1–2 sentences, ready to speak aloud +- Never give generic advice + +--- +⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call. +""" + + raw = router.complete(prompt, fallback_order=research_order) + # Strip … blocks emitted by reasoning models (e.g. DeepSeek, Qwen-R) + raw = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() + sections = _parse_sections(raw) + + return { + "raw_output": raw, + "company_brief": sections.get("Company Overview", ""), + "ceo_brief": sections.get("Leadership & Culture", ""), + "tech_brief": sections.get("Tech Stack & Product", ""), + "funding_brief": sections.get("Funding & Market Position", ""), + "competitors_brief": sections.get("Funding & Market Position", ""), # competitor landscape is in the funding section + "red_flags": sections.get("Red Flags & Watch-outs", ""), + "accessibility_brief": sections.get("Inclusion & Accessibility", ""), + "talking_points": sections.get("Talking Points for Alex", ""), + "scrape_used": scrape_used, + } + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Generate company research brief") + parser.add_argument("--job-id", type=int, required=True, help="Job ID in staging.db") + parser.add_argument("--no-scrape", action="store_true", help="Skip SearXNG live scrape") + args = parser.parse_args() + + from scripts.db import DEFAULT_DB, init_db, save_research + import sqlite3 + + init_db(DEFAULT_DB) + conn = sqlite3.connect(DEFAULT_DB) + conn.row_factory = sqlite3.Row + row = 
conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone() + conn.close() + + if not row: + sys.exit(f"Job {args.job_id} not found in {DEFAULT_DB}") + + job = dict(row) + print(f"Researching: {job['title']} @ {job['company']} …\n") + if _SCRAPER_AVAILABLE and not args.no_scrape: + print(f"SearXNG available: {_searxng_running()}") + + result = research_company(job, use_scraper=not args.no_scrape) + save_research(DEFAULT_DB, job_id=args.job_id, **result) + print(result["raw_output"]) + print(f"\n[Saved to company_research for job {args.job_id}]") diff --git a/scripts/custom_boards/__init__.py b/scripts/custom_boards/__init__.py new file mode 100644 index 0000000..7b12ac1 --- /dev/null +++ b/scripts/custom_boards/__init__.py @@ -0,0 +1 @@ +# Custom job board scrapers — each module exposes scrape(profile, location, results_wanted) -> list[dict] diff --git a/scripts/custom_boards/adzuna.py b/scripts/custom_boards/adzuna.py new file mode 100644 index 0000000..fa57bdc --- /dev/null +++ b/scripts/custom_boards/adzuna.py @@ -0,0 +1,160 @@ +"""Adzuna Jobs API scraper. + +API docs: https://developer.adzuna.com/docs/search +Config: config/adzuna.yaml (gitignored — contains app_id + app_key) + +Each title in the search profile is queried as an exact phrase per location. +Returns a list of dicts compatible with scripts.db.insert_job(). +""" +from __future__ import annotations + +import time +from pathlib import Path + +import requests +import yaml + +_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "adzuna.yaml" +_BASE_URL = "https://api.adzuna.com/v1/api/jobs/us/search" + + +def _load_config() -> tuple[str, str]: + if not _CONFIG_PATH.exists(): + raise FileNotFoundError( + f"Adzuna config not found: {_CONFIG_PATH}\n" + "Copy config/adzuna.yaml.example → config/adzuna.yaml and fill in credentials." 
+ ) + cfg = yaml.safe_load(_CONFIG_PATH.read_text()) + app_id = (cfg.get("app_id") or "").strip() + app_key = (cfg.get("app_key") or "").strip() + if not app_id or not app_key: + raise ValueError( + "config/adzuna.yaml requires both 'app_id' and 'app_key'.\n" + "Find your App ID at https://developer.adzuna.com/admin/applications" + ) + return app_id, app_key + + +def _salary_str(job: dict) -> str: + lo = job.get("salary_min") + hi = job.get("salary_max") + try: + if lo and hi: + return f"${int(lo):,} – ${int(hi):,}" + if lo: + return f"${int(lo):,}+" + except (TypeError, ValueError): + pass + return "" + + +def _is_remote(location_display: str) -> bool: + return "remote" in location_display.lower() + + +def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]: + """Fetch jobs from the Adzuna API for a single location. + + Args: + profile: Search profile dict from search_profiles.yaml. + location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA"). + results_wanted: Maximum results to return across all titles. + + Returns: + List of job dicts with keys: title, company, url, source, location, + is_remote, salary, description. + """ + try: + app_id, app_key = _load_config() + except (FileNotFoundError, ValueError) as exc: + print(f" [adzuna] Skipped — {exc}") + return [] + + titles = profile.get("titles", []) + hours_old = profile.get("hours_old", 240) + max_days_old = max(1, hours_old // 24) + is_remote_search = location.lower() == "remote" + + session = requests.Session() + session.headers.update({"Accept": "application/json", "User-Agent": "Mozilla/5.0"}) + + seen_ids: set[str] = set() + results: list[dict] = [] + + for title in titles: + if len(results) >= results_wanted: + break + + page = 1 + while len(results) < results_wanted: + # Adzuna doesn't support where=remote — it treats it as a city name and + # returns 0 results. For remote searches, append "remote" to the what param. 
+ if is_remote_search: + params = { + "app_id": app_id, + "app_key": app_key, + "results_per_page": 50, + "what": f'"{title}" remote', + "sort_by": "date", + "max_days_old": max_days_old, + } + else: + params = { + "app_id": app_id, + "app_key": app_key, + "results_per_page": 50, + "what_phrase": title, + "where": location, + "sort_by": "date", + "max_days_old": max_days_old, + } + try: + resp = session.get(f"{_BASE_URL}/{page}", params=params, timeout=20) + except requests.RequestException as exc: + print(f" [adzuna] Request error ({title}): {exc}") + break + + if resp.status_code == 401: + print(" [adzuna] Auth failed — check app_id and app_key in config/adzuna.yaml") + return results + if resp.status_code != 200: + print(f" [adzuna] HTTP {resp.status_code} for '{title}' page {page}") + break + + data = resp.json() + jobs = data.get("results", []) + if not jobs: + break + + for job in jobs: + job_id = str(job.get("id", "")) + if job_id in seen_ids: + continue + seen_ids.add(job_id) + + loc_display = job.get("location", {}).get("display_name", "") + redirect_url = job.get("redirect_url", "") + if not redirect_url: + continue + + results.append({ + "title": job.get("title", ""), + "company": job.get("company", {}).get("display_name", ""), + "url": redirect_url, + "source": "adzuna", + "location": loc_display, + "is_remote": is_remote_search or _is_remote(loc_display), + "salary": _salary_str(job), + "description": job.get("description", ""), + }) + + total = data.get("count", 0) + if len(results) >= total or len(jobs) < 50: + break # last page + + page += 1 + time.sleep(0.5) # polite pacing between pages + + time.sleep(0.5) # between titles + + return results[:results_wanted] diff --git a/scripts/custom_boards/craigslist.py b/scripts/custom_boards/craigslist.py new file mode 100644 index 0000000..30226ae --- /dev/null +++ b/scripts/custom_boards/craigslist.py @@ -0,0 +1,177 @@ +"""Craigslist job scraper — RSS-based. 
+ +Uses Craigslist's native RSS feed endpoint for discovery. +Full job description is populated by the scrape_url background task. +Company name and salary (not structured in Craigslist listings) are +extracted from the description body by the enrich_craigslist task. + +Config: config/craigslist.yaml (gitignored — metro list + location map) + config/craigslist.yaml.example (committed template) + +Returns a list of dicts compatible with scripts.db.insert_job(). +""" +from __future__ import annotations + +import time +import xml.etree.ElementTree as ET +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime +from pathlib import Path +from urllib.parse import quote_plus + +import requests +import yaml + +_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "craigslist.yaml" +_DEFAULT_CATEGORY = "jjj" +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) +} +_TIMEOUT = 15 +_SLEEP = 0.5 # seconds between requests — easy to make configurable later + + +def _load_config() -> dict: + if not _CONFIG_PATH.exists(): + raise FileNotFoundError( + f"Craigslist config not found: {_CONFIG_PATH}\n" + "Copy config/craigslist.yaml.example → config/craigslist.yaml " + "and configure your target metros." + ) + cfg = yaml.safe_load(_CONFIG_PATH.read_text()) or {} + if not cfg.get("metros"): + raise ValueError( + "config/craigslist.yaml must contain at least one entry under 'metros'." 
+ ) + return cfg + + +def _rss_url(metro: str, category: str, query: str) -> str: + return ( + f"https://{metro}.craigslist.org/search/{category}" + f"?query={quote_plus(query)}&format=rss&sort=date" + ) + + +def _parse_pubdate(pubdate_str: str) -> datetime | None: + """Parse an RSS pubDate string to a timezone-aware datetime.""" + try: + return parsedate_to_datetime(pubdate_str) + except Exception: + return None + + +def _fetch_rss(url: str) -> list[dict]: + """Fetch and parse a Craigslist RSS feed. Returns list of raw item dicts.""" + resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + try: + root = ET.fromstring(resp.content) + except ET.ParseError as exc: + raise ValueError(f"Malformed RSS XML: {exc}") from exc + + items = [] + for item in root.findall(".//item"): + def _text(tag: str, _item=item) -> str: + el = _item.find(tag) + return (el.text or "").strip() if el is not None else "" + + items.append({ + "title": _text("title"), + "link": _text("link"), + "description": _text("description"), + "pubDate": _text("pubDate"), + }) + return items + + +def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]: + """Fetch jobs from Craigslist RSS for a single location. + + Args: + profile: Search profile dict from search_profiles.yaml. + location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA"). + results_wanted: Maximum results to return across all metros and titles. + + Returns: + List of job dicts with keys: title, company, url, source, location, + is_remote, salary, description. + company/salary are empty — filled later by enrich_craigslist task. 
+ """ + try: + cfg = _load_config() + except (FileNotFoundError, ValueError) as exc: + print(f" [craigslist] Skipped — {exc}") + return [] + + metros_all: list[str] = cfg.get("metros", []) + location_map: dict[str, str] = cfg.get("location_map", {}) + category: str = cfg.get("category") or _DEFAULT_CATEGORY + + is_remote_search = location.lower() == "remote" + if is_remote_search: + metros = metros_all + else: + metro = location_map.get(location) + if not metro: + print(f" [craigslist] No metro mapping for '{location}' — skipping") + return [] + metros = [metro] + + titles: list[str] = profile.get("titles", []) + hours_old: int = profile.get("hours_old", 240) + cutoff = datetime.now(tz=timezone.utc).timestamp() - (hours_old * 3600) + + seen_urls: set[str] = set() + results: list[dict] = [] + + for metro in metros: + if len(results) >= results_wanted: + break + + for title in titles: + if len(results) >= results_wanted: + break + + url = _rss_url(metro, category, title) + try: + items = _fetch_rss(url) + except requests.RequestException as exc: + print(f" [craigslist] HTTP error ({metro}/{title}): {exc}") + time.sleep(_SLEEP) + continue + except ValueError as exc: + print(f" [craigslist] Parse error ({metro}/{title}): {exc}") + time.sleep(_SLEEP) + continue + + for item in items: + if len(results) >= results_wanted: + break + + item_url = item.get("link", "") + if not item_url or item_url in seen_urls: + continue + + pub = _parse_pubdate(item.get("pubDate", "")) + if pub and pub.timestamp() < cutoff: + continue + + seen_urls.add(item_url) + results.append({ + "title": item.get("title", ""), + "company": "", + "url": item_url, + "source": "craigslist", + "location": f"{metro} (Craigslist)", + "is_remote": is_remote_search, + "salary": "", + "description": "", + }) + + time.sleep(_SLEEP) + + return results[:results_wanted] diff --git a/scripts/custom_boards/theladders.py b/scripts/custom_boards/theladders.py new file mode 100644 index 0000000..d7330af --- /dev/null 
+++ b/scripts/custom_boards/theladders.py @@ -0,0 +1,179 @@ +"""The Ladders scraper — Playwright-based (requires chromium installed). + +The Ladders is a client-side React app (no SSR __NEXT_DATA__). We use Playwright +to execute JS, wait for job cards to render, then extract from the DOM. + +Company names are hidden from guest (non-logged-in) users, but are encoded in +the job URL slug: /job/{title-slug}-{company-slug}-{location-slug}_{id} + +curl_cffi is no longer needed for this scraper; plain Playwright is sufficient. +playwright must be installed: `conda run -n job-seeker python -m playwright install chromium` + +Returns a list of dicts compatible with scripts.db.insert_job(). +""" +from __future__ import annotations + +import re +import time +from typing import Any + +_BASE = "https://www.theladders.com" +_SEARCH_PATH = "/jobs/searchjobs/{slug}" + +# Location slug in URLs for remote jobs +_REMOTE_SLUG = "virtual-travel" + + +def _company_from_url(href: str, title_slug: str) -> str: + """ + Extract company name from The Ladders job URL slug. + + URL format: /job/{title-slug}-{company-slug}-{location-slug}_{id}?ir=1 + Example: /job/customer-success-manager-gainsight-virtual-travel_85434789 + → "Gainsight" + """ + # Strip path prefix and query + slug = href.split("/job/", 1)[-1].split("?")[0] + # Strip numeric ID suffix (e.g. 
_85434789) + slug = re.sub(r"_\d+$", "", slug) + # Strip known title prefix + if slug.startswith(title_slug + "-"): + slug = slug[len(title_slug) + 1:] + # Strip common location suffixes + for loc_suffix in [f"-{_REMOTE_SLUG}", "-new-york", "-los-angeles", + "-san-francisco", "-chicago", "-austin", "-seattle", + "-boston", "-atlanta", "-remote"]: + if slug.endswith(loc_suffix): + slug = slug[: -len(loc_suffix)] + break + # Convert kebab-case → title case + return slug.replace("-", " ").title() if slug else "" + + +def _extract_jobs_js() -> str: + """JS to run in page context — extracts job data from rendered card elements.""" + return """() => { + const cards = document.querySelectorAll('[class*=job-card-container]'); + return Array.from(cards).map(card => { + const link = card.querySelector('p.job-link-wrapper a, a.clipped-text'); + const salary = card.querySelector('p.salary, .salary-info p'); + const locEl = card.querySelector('.remote-location-text, .location-info'); + const remoteEl = card.querySelector('.remote-flag-badge-remote'); + return { + title: link ? link.textContent.trim() : null, + href: link ? link.getAttribute('href') : null, + salary: salary ? salary.textContent.replace('*','').trim() : null, + location: locEl ? locEl.textContent.trim() : null, + is_remote: !!remoteEl, + }; + }).filter(j => j.title && j.href); + }""" + + +def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]: + """ + Scrape job listings from The Ladders using Playwright. + + Args: + profile: Search profile dict (uses 'titles'). + location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA"). + results_wanted: Maximum results to return across all titles. + + Returns: + List of job dicts with keys: title, company, url, source, location, + is_remote, salary, description. 
+ """ + try: + from playwright.sync_api import sync_playwright + except ImportError: + print( + " [theladders] playwright not installed.\n" + " Install: conda run -n job-seeker pip install playwright && " + "conda run -n job-seeker python -m playwright install chromium" + ) + return [] + + is_remote_search = location.lower() == "remote" + results: list[dict] = [] + seen_urls: set[str] = set() + + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + ctx = browser.new_context( + user_agent=( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) + ) + page = ctx.new_page() + + for title in profile.get("titles", []): + if len(results) >= results_wanted: + break + + slug = title.lower().replace(" ", "-").replace("/", "-") + title_slug = slug # used for company extraction from URL + + params: dict[str, str] = {} + if is_remote_search: + params["remote"] = "true" + elif location: + params["location"] = location + + url = _BASE + _SEARCH_PATH.format(slug=slug) + if params: + query = "&".join(f"{k}={v}" for k, v in params.items()) + url = f"{url}?{query}" + + try: + page.goto(url, timeout=30_000) + page.wait_for_load_state("networkidle", timeout=20_000) + except Exception as exc: + print(f" [theladders] Page load error for '{title}': {exc}") + continue + + try: + raw_jobs: list[dict[str, Any]] = page.evaluate(_extract_jobs_js()) + except Exception as exc: + print(f" [theladders] JS extract error for '{title}': {exc}") + continue + + if not raw_jobs: + print(f" [theladders] No cards found for '{title}' — selector may need updating") + continue + + for job in raw_jobs: + href = job.get("href", "") + if not href: + continue + full_url = _BASE + href if href.startswith("/") else href + if full_url in seen_urls: + continue + seen_urls.add(full_url) + + company = _company_from_url(href, title_slug) + loc_text = (job.get("location") or "").replace("Remote", "").strip(", ") + if is_remote_search or 
job.get("is_remote"): + loc_display = "Remote" + (f" — {loc_text}" if loc_text and loc_text != "US-Anywhere" else "") + else: + loc_display = loc_text or location + + results.append({ + "title": job.get("title", ""), + "company": company, + "url": full_url, + "source": "theladders", + "location": loc_display, + "is_remote": bool(job.get("is_remote") or is_remote_search), + "salary": job.get("salary") or "", + "description": "", # not available in card view; scrape_url will fill in + }) + + if len(results) >= results_wanted: + break + + time.sleep(1) # polite pacing between titles + + browser.close() + + return results[:results_wanted] diff --git a/scripts/db.py b/scripts/db.py new file mode 100644 index 0000000..b2443a1 --- /dev/null +++ b/scripts/db.py @@ -0,0 +1,728 @@ +""" +SQLite staging layer for job listings. +Jobs flow: pending → approved/rejected → applied → synced + applied → phone_screen → interviewing → offer → hired (or rejected) +""" +import sqlite3 +from datetime import datetime +from pathlib import Path +from typing import Optional + +DEFAULT_DB = Path(__file__).parent.parent / "staging.db" + +CREATE_JOBS = """ +CREATE TABLE IF NOT EXISTS jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT, + company TEXT, + url TEXT UNIQUE, + source TEXT, + location TEXT, + is_remote INTEGER DEFAULT 0, + salary TEXT, + description TEXT, + match_score REAL, + keyword_gaps TEXT, + date_found TEXT, + status TEXT DEFAULT 'pending', + notion_page_id TEXT, + cover_letter TEXT, + applied_at TEXT +); +""" + +CREATE_JOB_CONTACTS = """ +CREATE TABLE IF NOT EXISTS job_contacts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id INTEGER NOT NULL, + direction TEXT DEFAULT 'inbound', + subject TEXT, + from_addr TEXT, + to_addr TEXT, + body TEXT, + received_at TEXT, + is_response_needed INTEGER DEFAULT 0, + responded_at TEXT, + message_id TEXT, + FOREIGN KEY (job_id) REFERENCES jobs(id) +); +""" + +_CONTACT_MIGRATIONS = [ + ("message_id", "TEXT"), + ("stage_signal", 
"TEXT"), + ("suggestion_dismissed", "INTEGER DEFAULT 0"), +] + +_RESEARCH_MIGRATIONS = [ + ("tech_brief", "TEXT"), + ("funding_brief", "TEXT"), + ("competitors_brief", "TEXT"), + ("red_flags", "TEXT"), + ("scrape_used", "INTEGER"), # 1 = SearXNG contributed data, 0 = LLM-only + ("accessibility_brief", "TEXT"), # Inclusion & Accessibility section +] + +CREATE_COMPANY_RESEARCH = """ +CREATE TABLE IF NOT EXISTS company_research ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id INTEGER NOT NULL UNIQUE, + generated_at TEXT, + company_brief TEXT, + ceo_brief TEXT, + talking_points TEXT, + raw_output TEXT, + tech_brief TEXT, + funding_brief TEXT, + competitors_brief TEXT, + red_flags TEXT, + FOREIGN KEY (job_id) REFERENCES jobs(id) +); +""" + +CREATE_BACKGROUND_TASKS = """ +CREATE TABLE IF NOT EXISTS background_tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_type TEXT NOT NULL, + job_id INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'queued', + error TEXT, + created_at DATETIME DEFAULT (datetime('now')), + started_at DATETIME, + finished_at DATETIME, + stage TEXT, + updated_at DATETIME +) +""" + +CREATE_SURVEY_RESPONSES = """ +CREATE TABLE IF NOT EXISTS survey_responses ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id INTEGER NOT NULL REFERENCES jobs(id), + survey_name TEXT, + received_at DATETIME, + source TEXT, + raw_input TEXT, + image_path TEXT, + mode TEXT, + llm_output TEXT, + reported_score TEXT, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +); +""" + +_MIGRATIONS = [ + ("cover_letter", "TEXT"), + ("applied_at", "TEXT"), + ("interview_date", "TEXT"), + ("rejection_stage", "TEXT"), + ("phone_screen_at", "TEXT"), + ("interviewing_at", "TEXT"), + ("offer_at", "TEXT"), + ("hired_at", "TEXT"), + ("survey_at", "TEXT"), +] + + +def _migrate_db(db_path: Path) -> None: + """Add new columns to existing tables without breaking old data.""" + conn = sqlite3.connect(db_path) + for col, coltype in _MIGRATIONS: + try: + conn.execute(f"ALTER TABLE jobs ADD COLUMN 
{col} {coltype}") + except sqlite3.OperationalError: + pass # column already exists + for col, coltype in _CONTACT_MIGRATIONS: + try: + conn.execute(f"ALTER TABLE job_contacts ADD COLUMN {col} {coltype}") + except sqlite3.OperationalError: + pass + for col, coltype in _RESEARCH_MIGRATIONS: + try: + conn.execute(f"ALTER TABLE company_research ADD COLUMN {col} {coltype}") + except sqlite3.OperationalError: + pass + try: + conn.execute("ALTER TABLE background_tasks ADD COLUMN stage TEXT") + except sqlite3.OperationalError: + pass + try: + conn.execute("ALTER TABLE background_tasks ADD COLUMN updated_at TEXT") + except sqlite3.OperationalError: + pass + conn.commit() + conn.close() + + +def init_db(db_path: Path = DEFAULT_DB) -> None: + """Create tables if they don't exist, then run migrations.""" + conn = sqlite3.connect(db_path) + conn.execute(CREATE_JOBS) + conn.execute(CREATE_JOB_CONTACTS) + conn.execute(CREATE_COMPANY_RESEARCH) + conn.execute(CREATE_BACKGROUND_TASKS) + conn.execute(CREATE_SURVEY_RESPONSES) + conn.commit() + conn.close() + _migrate_db(db_path) + + +def insert_job(db_path: Path = DEFAULT_DB, job: dict = None) -> Optional[int]: + """Insert a job. 
Returns row id, or None if URL already exists.""" + if job is None: + return None + conn = sqlite3.connect(db_path) + try: + cursor = conn.execute( + """INSERT INTO jobs + (title, company, url, source, location, is_remote, salary, description, date_found) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + job.get("title", ""), + job.get("company", ""), + job.get("url", ""), + job.get("source", ""), + job.get("location", ""), + int(bool(job.get("is_remote", False))), + job.get("salary", ""), + job.get("description", ""), + job.get("date_found", ""), + ), + ) + conn.commit() + return cursor.lastrowid + except sqlite3.IntegrityError: + return None # duplicate URL + finally: + conn.close() + + +def get_job_by_id(db_path: Path = DEFAULT_DB, job_id: int = None) -> Optional[dict]: + """Return a single job by ID, or None if not found.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + return dict(row) if row else None + + +def get_jobs_by_status(db_path: Path = DEFAULT_DB, status: str = "pending") -> list[dict]: + """Return all jobs with the given status as a list of dicts.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + cursor = conn.execute( + "SELECT * FROM jobs WHERE status = ? 
ORDER BY date_found DESC, id DESC", + (status,), + ) + rows = [dict(row) for row in cursor.fetchall()] + conn.close() + return rows + + +def get_email_leads(db_path: Path = DEFAULT_DB) -> list[dict]: + """Return pending jobs with source='email', newest first.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT * FROM jobs WHERE source = 'email' AND status = 'pending' " + "ORDER BY date_found DESC, id DESC" + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + +def get_job_counts(db_path: Path = DEFAULT_DB) -> dict: + """Return counts per status.""" + conn = sqlite3.connect(db_path) + cursor = conn.execute( + "SELECT status, COUNT(*) as n FROM jobs GROUP BY status" + ) + counts = {row[0]: row[1] for row in cursor.fetchall()} + conn.close() + return counts + + +def update_job_status(db_path: Path = DEFAULT_DB, ids: list[int] = None, status: str = "approved") -> None: + """Batch-update status for a list of job IDs.""" + if not ids: + return + conn = sqlite3.connect(db_path) + conn.execute( + f"UPDATE jobs SET status = ? WHERE id IN ({','.join('?' * len(ids))})", + [status] + list(ids), + ) + conn.commit() + conn.close() + + +def get_existing_urls(db_path: Path = DEFAULT_DB) -> set[str]: + """Return all URLs already in staging (any status).""" + conn = sqlite3.connect(db_path) + cursor = conn.execute("SELECT url FROM jobs") + urls = {row[0] for row in cursor.fetchall()} + conn.close() + return urls + + +def write_match_scores(db_path: Path = DEFAULT_DB, job_id: int = None, + score: float = 0.0, gaps: str = "") -> None: + """Write match score and keyword gaps back to a job row.""" + conn = sqlite3.connect(db_path) + conn.execute( + "UPDATE jobs SET match_score = ?, keyword_gaps = ? 
def update_job_fields(db_path: Path = DEFAULT_DB, job_id: int = None,
                      fields: dict = None) -> None:
    """Update arbitrary job columns. Unknown keys are silently ignored.

    Args:
        db_path: SQLite database file to update.
        job_id: Row to update; ``None`` makes the call a no-op.
        fields: Mapping of column name to new value. Only keys present in
            ``_UPDATABLE_JOB_COLS`` are written.
    """
    if job_id is None or not fields:
        return
    safe = {k: v for k, v in fields.items() if k in _UPDATABLE_JOB_COLS}
    if not safe:
        return
    # Column names come only from the whitelist above, so interpolating them
    # is safe; the values themselves are always bound as parameters.
    sets = ", ".join(f"{col} = ?" for col in safe)
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            f"UPDATE jobs SET {sets} WHERE id = ?",
            (*safe.values(), job_id),
        )
        conn.commit()
    finally:
        # Release the connection even if the UPDATE raises.
        conn.close()
def purge_email_data(db_path: Path = DEFAULT_DB) -> tuple[int, int]:
    """Delete all job_contacts rows and all email-sourced jobs.

    The jobs DELETE filters on ``source='email'`` only, so email-sourced jobs
    in *every* status are removed, not just pending ones.
    NOTE(review): if only pending email leads were meant to be purged, add
    ``AND status = 'pending'`` to the second statement — confirm with callers.

    Both deletes are committed together; if either statement raises, the
    connection is closed without committing, so neither takes effect.

    Returns (contacts_deleted, jobs_deleted).
    """
    conn = sqlite3.connect(db_path)
    try:
        c1 = conn.execute("DELETE FROM job_contacts").rowcount
        c2 = conn.execute("DELETE FROM jobs WHERE source='email'").rowcount
        conn.commit()
    finally:
        conn.close()
    return c1, c2
def get_interview_jobs(db_path: Path = DEFAULT_DB) -> dict[str, list[dict]]:
    """Return jobs grouped by interview/post-apply stage.

    Returns:
        A dict with one key per stage (``applied``, ``survey``,
        ``phone_screen``, ``interviewing``, ``offer``, ``hired``,
        ``rejected``). Each value is the list of job rows in that status,
        ordered by ``applied_at`` then ``id``, both descending. Stages with
        no jobs map to an empty list.
    """
    stages = ["applied", "survey", "phone_screen", "interviewing", "offer", "hired", "rejected"]
    result: dict[str, list[dict]] = {}
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        for stage in stages:
            cursor = conn.execute(
                "SELECT * FROM jobs WHERE status = ? ORDER BY applied_at DESC, id DESC",
                (stage,),
            )
            result[stage] = [dict(row) for row in cursor.fetchall()]
    finally:
        # Release the connection even if one of the stage queries raises.
        conn.close()
    return result
def add_contact(db_path: Path = DEFAULT_DB, job_id: int = None,
                direction: str = "inbound", subject: str = "",
                from_addr: str = "", to_addr: str = "",
                body: str = "", received_at: str = "",
                message_id: str = "",
                stage_signal: str = "") -> int:
    """Log an email contact. Returns the new row id.

    Args:
        db_path: SQLite database file to write to.
        job_id: Job the contact belongs to.
        direction: Stored as-is in ``job_contacts.direction``.
        subject, from_addr, to_addr, body, message_id: Stored as-is.
        received_at: Timestamp string; when empty, the current local time
            truncated to minutes (``YYYY-MM-DDTHH:MM``) is used instead.
        stage_signal: Stored as given; an empty string is stored as NULL.
    """
    ts = received_at or datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """INSERT INTO job_contacts
               (job_id, direction, subject, from_addr, to_addr, body,
                received_at, message_id, stage_signal)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job_id, direction, subject, from_addr, to_addr, body,
             ts, message_id, stage_signal or None),
        )
        conn.commit()
        row_id = cur.lastrowid
    finally:
        # Release the connection even if the INSERT raises.
        conn.close()
    return row_id
def get_all_message_ids(db_path: Path = DEFAULT_DB) -> set[str]:
    """Return all known Message-IDs across all job contacts.

    Rows whose ``message_id`` is NULL or empty are excluded by the query.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT message_id FROM job_contacts WHERE message_id IS NOT NULL AND message_id != ''"
        ).fetchall()
    finally:
        # Release the connection even if the SELECT raises.
        conn.close()
    return {r[0] for r in rows}
def get_research(db_path: Path = DEFAULT_DB, job_id: int = None) -> Optional[dict]:
    """Return the company research record for a job, or None if absent.

    Args:
        db_path: SQLite database file to read.
        job_id: Value matched against ``company_research.job_id``.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT * FROM company_research WHERE job_id = ?", (job_id,)
        ).fetchone()
    finally:
        # Release the connection even if the SELECT raises.
        conn.close()
    return dict(row) if row else None
def get_survey_responses(db_path: Path = DEFAULT_DB, job_id: int = None) -> list[dict]:
    """Return all survey responses for a job, newest first.

    Rows are ordered by ``created_at`` descending and returned as plain
    dicts keyed by column name.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM survey_responses WHERE job_id = ? ORDER BY created_at DESC",
            (job_id,),
        ).fetchall()
    finally:
        # Release the connection even if the SELECT raises.
        conn.close()
    return [dict(r) for r in rows]
def update_task_status(db_path: Path = DEFAULT_DB, task_id: int = None,
                       status: str = "", error: Optional[str] = None) -> None:
    """Update a task's status and set the appropriate timestamp.

    * ``running`` sets ``started_at`` and ``updated_at``.
    * ``completed`` / ``failed`` set ``finished_at``, ``updated_at`` and
      ``error`` (``error`` is written even when it is ``None``).
    * Any other status only touches ``updated_at``.

    Timestamps are local time truncated to minutes (``YYYY-MM-DDTHH:MM``).
    """
    now = datetime.now().isoformat()[:16]
    # Pick the statement first so the connection handling exists only once.
    if status == "running":
        sql = "UPDATE background_tasks SET status=?, started_at=?, updated_at=? WHERE id=?"
        params = (status, now, now, task_id)
    elif status in ("completed", "failed"):
        sql = "UPDATE background_tasks SET status=?, finished_at=?, updated_at=?, error=? WHERE id=?"
        params = (status, now, now, error, task_id)
    else:
        sql = "UPDATE background_tasks SET status=?, updated_at=? WHERE id=?"
        params = (status, now, task_id)

    conn = sqlite3.connect(db_path)
    try:
        conn.execute(sql, params)
        conn.commit()
    finally:
        # Release the connection even if the UPDATE raises.
        conn.close()
def get_task_for_job(db_path: Path = DEFAULT_DB, task_type: str = "",
                     job_id: int = None) -> Optional[dict]:
    """Return the most recent task row for a (task_type, job_id) pair, or None.

    "Most recent" means the row with the highest ``id``.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            """SELECT * FROM background_tasks
               WHERE task_type=? AND job_id=?
               ORDER BY id DESC LIMIT 1""",
            (task_type, job_id),
        ).fetchone()
    finally:
        # Release the connection even if the SELECT raises.
        conn.close()
    return dict(row) if row else None
def load_config() -> tuple[dict, dict]:
    """Load the search-profile and Notion YAML configs.

    Returns:
        ``(profiles, notion_cfg)`` parsed from ``PROFILES_CFG`` and
        ``NOTION_CFG``. An empty YAML file yields an empty dict, not
        ``None``.

    Raises:
        FileNotFoundError: if either config file is missing.
    """
    # Read as UTF-8 explicitly: these files are hand-edited and may contain
    # non-ASCII text, and the platform default encoding is not always UTF-8.
    profiles = yaml.safe_load(PROFILES_CFG.read_text(encoding="utf-8")) or {}
    notion_cfg = yaml.safe_load(NOTION_CFG.read_text(encoding="utf-8")) or {}
    return profiles, notion_cfg
Returns dict with companies, industries, locations lists.""" + if not BLOCKLIST_CFG.exists(): + return {"companies": [], "industries": [], "locations": []} + raw = yaml.safe_load(BLOCKLIST_CFG.read_text()) or {} + return { + "companies": [c.lower() for c in raw.get("companies", []) if c], + "industries": [i.lower() for i in raw.get("industries", []) if i], + "locations": [loc.lower() for loc in raw.get("locations", []) if loc], + } + + +def _is_blocklisted(job_row: dict, blocklist: dict) -> bool: + """Return True if this job matches any global blocklist rule.""" + company_lower = (job_row.get("company") or "").lower() + location_lower = (job_row.get("location") or "").lower() + desc_lower = (job_row.get("description") or "").lower() + content_lower = f"{company_lower} {desc_lower}" + + if any(bl in company_lower for bl in blocklist["companies"]): + return True + if any(bl in content_lower for bl in blocklist["industries"]): + return True + if any(bl in location_lower for bl in blocklist["locations"]): + return True + return False + + +def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]: + """Return the set of all job URLs already tracked in Notion (for notion_push mode).""" + existing: set[str] = set() + has_more = True + start_cursor = None + while has_more: + kwargs: dict = {"database_id": db_id, "page_size": 100} + if start_cursor: + kwargs["start_cursor"] = start_cursor + resp = notion.databases.query(**kwargs) + for page in resp["results"]: + url = page["properties"].get(url_field, {}).get("url") + if url: + existing.add(url) + has_more = resp.get("has_more", False) + start_cursor = resp.get("next_cursor") + return existing + + +def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None: + """Create a new page in the Notion jobs database for a single listing.""" + min_amt = job.get("min_amount") + max_amt = job.get("max_amount") + if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)): + title_content = 
def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False) -> int:
    """Scrape every configured search profile and stage new listings in SQLite.

    For each profile and location this runs the JobSpy boards and any
    registered custom boards, then filters each listing through URL dedup,
    the global blocklist, the profile's exclude keywords and
    (title, company) dedup before inserting it via ``insert_job``.

    Args:
        db_path: SQLite staging database.
        notion_push: When True, also dedup against Notion and create a
            Notion page for each newly staged JobSpy listing.

    Returns:
        Number of listings newly staged during this run.
    """
    profiles_cfg, notion_cfg = load_config()
    fm = notion_cfg["field_map"]
    blocklist = load_blocklist()

    _bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if _bl_summary:
        print(f"[discover] Blocklist active: {_bl_summary}")

    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = db_existing_urls(db_path)

    import sqlite3 as _sqlite3

    def _tc_key(title, company) -> tuple[str, str]:
        """Normalise (title, company) into the dedup key; None counts as ''."""
        return (
            (title or "").lower().strip()[:80],
            (company or "").lower().strip(),
        )

    _conn = _sqlite3.connect(db_path)
    try:
        existing_tc = {
            _tc_key(r[0], r[1])
            for r in _conn.execute("SELECT title, company FROM jobs").fetchall()
        }
    finally:
        _conn.close()

    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])

    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _s(val, default="") -> str:
        """Convert a value to str, treating pandas NaN/None as default."""
        if val is None:
            return default
        s = str(val)
        return default if s in ("nan", "None", "NaN") else s

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False

        # Global blocklist — checked before anything else
        if _is_blocklisted(job_row, blocklist):
            return False

        # Custom scrapers may hand back None for missing fields.
        title = job_row.get("title") or ""
        company = job_row.get("company") or ""
        description = job_row.get("description") or ""

        title_lower = title.lower()
        desc_lower = description.lower()
        exclude_kw = job_row.get("_exclude_kw", [])
        if any(kw in title_lower or kw in desc_lower for kw in exclude_kw):
            return False

        # Same normalisation as the keys loaded from the DB above, so a
        # title with stray whitespace is still recognised as a repost.
        tc_key = _tc_key(title, company)
        if tc_key in existing_tc:
            return False

        row_id = insert_job(db_path, {
            "title": title,
            "company": company,
            "url": url,
            "source": job_row.get("source", source_label),
            "location": job_row.get("location", ""),
            "is_remote": bool(job_row.get("is_remote", False)),
            "salary": job_row.get("salary", ""),
            "description": description,
            "date_found": datetime.now().isoformat()[:10],
        })
        existing_urls.add(url)
        if row_id is None:
            # insert_job returns None when the URL already exists, so nothing was staged.
            return False
        existing_tc.add(tc_key)
        return True

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        boards = profile.get("boards", [])
        custom_boards = profile.get("custom_boards", [])
        exclude_kw = [kw.lower() for kw in profile.get("exclude_keywords", [])]
        results_per_board = profile.get("results_per_board", 25)

        for location in profile["locations"]:

            # ── JobSpy boards ──────────────────────────────────────────────────
            if boards:
                print(f" [jobspy] {location} — boards: {', '.join(boards)}")
                try:
                    jobs: pd.DataFrame = scrape_jobs(
                        site_name=boards,
                        search_term=" OR ".join(f'"{t}"' for t in profile["titles"]),
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    print(f" [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    print(f" [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()

                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = str(job.get("job_url", "") or "")
                    if not url or url in ("nan", "None"):
                        continue

                    job_dict = job.to_dict()

                    # Build salary string from JobSpy numeric fields
                    min_amt = job_dict.get("min_amount")
                    max_amt = job_dict.get("max_amount")
                    salary_str = ""
                    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
                        salary_str = f"${int(min_amt):,} – ${int(max_amt):,}"
                    elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""):
                        salary_str = str(job_dict["salary_source"])

                    # bool(NaN) is True, so a missing is_remote cell would
                    # otherwise be staged as a remote job.
                    remote_raw = job_dict.get("is_remote")
                    is_remote = False if pd.isna(remote_raw) else bool(remote_raw)

                    row = {
                        "url": url,
                        "title": _s(job_dict.get("title")),
                        "company": _s(job_dict.get("company")),
                        "source": _s(job_dict.get("site")),
                        "location": _s(job_dict.get("location")),
                        "is_remote": is_remote,
                        "salary": salary_str,
                        "description": _s(job_dict.get("description")),
                        "_exclude_kw": exclude_kw,
                    }
                    if _insert_if_new(row, _s(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f" + {row['title']} @ {row['company']} [{row['source']}]")

                print(f" [jobspy] {jobspy_new} new listings from {location}")

            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f" [{board_name}] Unknown scraper — skipping (not in CUSTOM_SCRAPERS registry)")
                    continue

                print(f" [{board_name}] {location} — fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board)
                except Exception as exc:
                    print(f" [{board_name}] ERROR: {exc}")
                    custom_jobs = []

                print(f" [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f" + {job.get('title')} @ {job.get('company')} [{board_name}]")

                print(f" [{board_name}] {board_new} new listings from {location}")

    print(f"\n[discover] Done — {new_count} new listings staged total.")
    return new_count
def enrich_glassdoor_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find Glassdoor jobs with missing descriptions and re-fetch them.

    Args:
        db_path: SQLite staging database.
        dry_run: When True, descriptions are still fetched but not saved.
        delay: Seconds to sleep after each fetch attempt; 0 disables it.

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    init_db(db_path)

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            """SELECT id, url, company, title FROM jobs
               WHERE source = 'glassdoor'
                 AND (description IS NULL OR TRIM(description) = '')
               ORDER BY id ASC"""
        ).fetchall()
    finally:
        conn.close()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}

    if not rows:
        print("[enrich] No Glassdoor jobs missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} Glassdoor job(s) missing descriptions — fetching…")

    try:
        scraper = _setup_scraper()
    except Exception as e:
        msg = f"Glassdoor scraper init failed: {e}"
        result["errors"].append(msg)
        result["failed"] = len(rows)
        print(f"[enrich] ERROR — {msg}")
        return result

    for db_id, url, company, title in rows:
        job_id = _extract_job_id(url)
        if not job_id:
            msg = f"job #{db_id}: cannot extract listing ID from URL: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue

        try:
            description = scraper._fetch_job_description(int(job_id))
            if description and description.strip():
                if not dry_run:
                    upd = sqlite3.connect(db_path)
                    try:
                        upd.execute(
                            "UPDATE jobs SET description = ? WHERE id = ?",
                            (description, db_id),
                        )
                        upd.commit()
                    finally:
                        # A failed UPDATE is reported by the except below;
                        # the connection must not stay open for the rest of the loop.
                        upd.close()
                tag = "[DRY-RUN] " if dry_run else ""
                print(f"[enrich] {tag}{company} — {title}: {len(description)} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] {company} — {title}: empty response (expired listing?)")
                result["failed"] += 1
        except Exception as e:
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

        if delay > 0:
            time.sleep(delay)

    return result
def enrich_craigslist_fields(
    db_path: Path = DEFAULT_DB,
    job_id: int = None,
) -> dict:
    """
    Use LLM to extract company name and salary from a Craigslist job description.

    Only runs when the job exists, has source='craigslist', an empty company
    and a non-empty description; otherwise returns {} without calling the LLM.
    Any extracted values are written back via ``update_job_fields``.

    Returns dict with keys 'company' and/or 'salary'; only non-empty values
    are included. Returns {} on LLM errors or an unparseable reply.
    """
    import json

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT id, description, company, source FROM jobs WHERE id=?", (job_id,)
        ).fetchone()
    finally:
        conn.close()

    if not row:
        return {}
    if row["source"] != "craigslist":
        return {}
    if row["company"]:  # already populated
        return {}
    if not (row["description"] or "").strip():
        return {}

    from scripts.llm_router import LLMRouter

    prompt = (
        "Extract the following from this job posting. "
        "Return JSON only, no commentary.\n\n"
        '{"company": "", '
        '"salary": ""}\n\n'
        f"Posting:\n{row['description'][:3000]}"
    )

    try:
        router = LLMRouter()
        raw = router.complete(prompt)
    except Exception as exc:
        print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}")
        return {}

    try:
        # Strip Markdown code fences the model may wrap around the JSON.
        clean = re.sub(r"```(?:json)?|```", "", raw).strip()
        fields = json.loads(clean)
    except (json.JSONDecodeError, ValueError, TypeError):
        # TypeError covers a non-string reply (e.g. None) from the router.
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    if not isinstance(fields, dict):
        # Valid JSON but not an object (list, string, number): nothing to read.
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    extracted = {}
    for key in ("company", "salary"):
        value = fields.get(key)
        if not value or isinstance(value, bool):
            continue
        if not isinstance(value, (str, int, float)):
            continue  # nested objects/lists are not usable field values
        # Models sometimes return the salary as a bare number.
        text = str(value).strip()
        if text:
            extracted[key] = text

    if extracted:
        from scripts.db import update_job_fields
        update_job_fields(db_path, job_id, extracted)
        print(f"[enrich_craigslist] job {job_id}: "
              f"company={extracted.get('company', '—')} "
              f"salary={extracted.get('salary', '—')}")

    return extracted
python3 +# scripts/finetune_local.py +""" +Local LoRA fine-tune on Alex's cover letter corpus. +No HuggingFace account or internet required after the base model is cached. + +Usage: + conda run -n ogma python scripts/finetune_local.py + conda run -n ogma python scripts/finetune_local.py --model unsloth/Llama-3.2-3B-Instruct + conda run -n ogma python scripts/finetune_local.py --epochs 15 --rank 16 + +After training, follow the printed instructions to load the model into Ollama. +""" +import argparse +import json +import os +import sys +from pathlib import Path + +# Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained +# pins every layer to GPU 0, avoiding the accelerate None-device bug that +# occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation. +# Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation. +os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0") + +# ── Config ──────────────────────────────────────────────────────────────────── +DEFAULT_MODEL = "unsloth/Llama-3.2-3B-Instruct" # safe on 8 GB VRAM +LETTERS_JSONL = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl") +OUTPUT_DIR = Path("/Library/Documents/JobSearch/training_data/finetune_output") +GGUF_DIR = Path("/Library/Documents/JobSearch/training_data/gguf") +OLLAMA_NAME = "alex-cover-writer" + +SYSTEM_PROMPT = ( + "You are Alex Rivera's personal cover letter writer. " + "Write professional, warm, and results-focused cover letters in Alex's voice. " + "Draw on her background in customer success, technical account management, " + "and revenue operations. Be specific and avoid generic filler." 
+) + +# ── Args ────────────────────────────────────────────────────────────────────── +parser = argparse.ArgumentParser() +parser.add_argument("--model", default=DEFAULT_MODEL, help="Base model (HF repo id or local path)") +parser.add_argument("--epochs", type=int, default=10, help="Training epochs (default: 10)") +parser.add_argument("--rank", type=int, default=16, help="LoRA rank (default: 16)") +parser.add_argument("--batch", type=int, default=2, help="Per-device batch size (default: 2)") +parser.add_argument("--no-gguf", action="store_true", help="Skip GGUF export") +parser.add_argument("--max-length", type=int, default=1024, help="Max token length (default: 1024)") +args = parser.parse_args() + +print(f"\n{'='*60}") +print(f" Alex Cover Letter Fine-Tuner") +print(f" Base model : {args.model}") +print(f" Epochs : {args.epochs}") +print(f" LoRA rank : {args.rank}") +print(f" Dataset : {LETTERS_JSONL}") +print(f"{'='*60}\n") + +# ── Load dataset ────────────────────────────────────────────────────────────── +if not LETTERS_JSONL.exists(): + sys.exit(f"ERROR: Dataset not found at {LETTERS_JSONL}\n" + "Run: conda run -n job-seeker python scripts/prepare_training_data.py") + +records = [json.loads(l) for l in LETTERS_JSONL.read_text().splitlines() if l.strip()] +print(f"Loaded {len(records)} training examples.") + +# Convert to chat format expected by SFTTrainer +def to_messages(rec: dict) -> dict: + return {"messages": [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": rec["instruction"]}, + {"role": "assistant", "content": rec["output"]}, + ]} + +chat_data = [to_messages(r) for r in records] + +# ── Load model with unsloth ──────────────────────────────────────────────────── +try: + from unsloth import FastLanguageModel + USE_UNSLOTH = True +except ImportError: + USE_UNSLOTH = False + print("WARNING: unsloth not found — falling back to standard transformers + PEFT") + print(" Install: pip install 'unsloth[cu121-torch230] @ 
git+https://github.com/unslothai/unsloth.git'") + +import torch + +if USE_UNSLOTH: + model, tokenizer = FastLanguageModel.from_pretrained( + model_name = args.model, + max_seq_length = args.max_length, + load_in_4bit = True, # QLoRA — fits 7-9B in 8 GB VRAM + dtype = None, # auto-detect + device_map = {"": 0}, # pin everything to GPU 0; avoids accelerate None-device bug + ) + model = FastLanguageModel.get_peft_model( + model, + r = args.rank, + lora_alpha = args.rank * 2, + lora_dropout = 0, # 0 = full unsloth kernel patching (faster) + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], + bias = "none", + use_gradient_checkpointing = "unsloth", + ) +else: + from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + from peft import LoraConfig, get_peft_model, TaskType + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = AutoModelForCausalLM.from_pretrained( + args.model, + quantization_config=bnb_config, + device_map="auto", + ) + lora_config = LoraConfig( + r=args.rank, + lora_alpha=args.rank * 2, + lora_dropout=0.05, + task_type=TaskType.CAUSAL_LM, + ) + model = get_peft_model(model, lora_config) + model.print_trainable_parameters() + +# ── Build HF Dataset ────────────────────────────────────────────────────────── +from datasets import Dataset + +raw = Dataset.from_list(chat_data) +split = raw.train_test_split(test_size=0.1, seed=42) +train_ds = split["train"] +eval_ds = split["test"] +print(f"Train: {len(train_ds)} Eval: {len(eval_ds)}") + +# formatting_func must ALWAYS return a list of strings. +# Unsloth tests it with a single example dict; during training it gets batches. +# Gemma 2 has no "system" role — fold it into the first user turn. 
def _apply_template(msgs):
    """Render one conversation (a list of role/content dicts) to a prompt string.

    A leading "system" turn is removed; when the turn that follows it is a
    "user" turn, the system text is prepended to that user turn separated by a
    blank line. The result is rendered with the tokenizer's chat template,
    untokenized and without a generation prompt.
    """
    turns = list(msgs)  # work on a copy; the caller's list is left untouched
    if turns and turns[0]["role"] == "system":
        system_text = turns[0]["content"]
        turns = turns[1:]
        if turns and turns[0]["role"] == "user":
            merged = f"{system_text}\n\n{turns[0]['content']}"
            turns[0] = {"role": "user", "content": merged}
    return tokenizer.apply_chat_template(turns, tokenize=False, add_generation_prompt=False)


def formatting_func(example):
    """Return a list of rendered prompt strings for a single example or a batch.

    ``example["messages"]`` is either one conversation (a list of role dicts)
    or a batch (a list of such lists); the return value is a list in both cases.
    """
    field = example["messages"]
    # A dict as first element means this is one conversation, not a batch.
    is_single = bool(field) and isinstance(field[0], dict)
    conversations = [field] if is_single else field
    return [_apply_template(conversation) for conversation in conversations]
args.no_gguf and USE_UNSLOTH: + GGUF_DIR.mkdir(parents=True, exist_ok=True) + gguf_path = GGUF_DIR / f"{OLLAMA_NAME}.gguf" + print(f"\nExporting GGUF → {gguf_path} …") + model.save_pretrained_gguf( + str(GGUF_DIR / OLLAMA_NAME), + tokenizer, + quantization_method="q4_k_m", + ) + # unsloth names the file automatically — find it + gguf_files = list(GGUF_DIR.glob("*.gguf")) + if gguf_files: + gguf_path = gguf_files[0] + print(f"GGUF written: {gguf_path}") + else: + print("GGUF export may have succeeded — check GGUF_DIR above.") +else: + gguf_path = None + +# ── Print next steps ────────────────────────────────────────────────────────── +print(f"\n{'='*60}") +print(" DONE — next steps to load into Ollama:") +print(f"{'='*60}") + +if gguf_path and gguf_path.exists(): + modelfile = OUTPUT_DIR / "Modelfile" + modelfile.write_text(f"""FROM {gguf_path} +SYSTEM \"\"\" +{SYSTEM_PROMPT} +\"\"\" +PARAMETER temperature 0.7 +PARAMETER top_p 0.9 +PARAMETER num_ctx 32768 +""") + print(f"\n1. Modelfile written to: {modelfile}") + print(f"\n2. Create the Ollama model:") + print(f" ollama create {OLLAMA_NAME} -f {modelfile}") + print(f"\n3. Test it:") + print(f" ollama run {OLLAMA_NAME} 'Write a cover letter for a Senior Customer Success Manager position at Acme Corp.'") + print(f"\n4. Update llm.yaml to use '{OLLAMA_NAME}:latest' as the ollama model,") + print(f" then pick it in Settings → LLM Backends → Ollama → Model.") +else: + print(f"\n Adapter only (no GGUF). To convert manually:") + print(f" 1. Merge adapter:") + print(f" conda run -n ogma python -c \"") + print(f" from peft import AutoPeftModelForCausalLM") + print(f" m = AutoPeftModelForCausalLM.from_pretrained('{adapter_path}')") + print(f" m.merge_and_unload().save_pretrained('{OUTPUT_DIR}/merged')\"") + print(f" 2. Convert to GGUF using textgen env's convert_hf_to_gguf.py") + print(f" 3. 
ollama create {OLLAMA_NAME} -f Modelfile") +print() diff --git a/scripts/generate_cover_letter.py b/scripts/generate_cover_letter.py new file mode 100644 index 0000000..071dd41 --- /dev/null +++ b/scripts/generate_cover_letter.py @@ -0,0 +1,224 @@ +# scripts/generate_cover_letter.py +""" +Generate a cover letter in Alex's voice using few-shot examples from her corpus. + +Usage: + conda run -n job-seeker python scripts/generate_cover_letter.py \ + --title "Director of Customer Success" \ + --company "Acme Corp" \ + --description "We are looking for..." + + Or pass a staging DB job ID: + conda run -n job-seeker python scripts/generate_cover_letter.py --job-id 42 +""" +import argparse +import re +import sys +from pathlib import Path + +LETTERS_DIR = Path("/Library/Documents/JobSearch") +LETTER_GLOB = "*Cover Letter*.md" + +# Background injected into every prompt so the model has Alex's facts +SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader. + +Background: +- 6+ years in customer success, technical account management, and CS leadership +- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95 +- Also founder of M3 Consulting, a CS advisory practice for SaaS startups +- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere +- Based in San Francisco Bay Area; open to remote/hybrid +- Pronouns: any + +Voice guidelines: +- Warm, confident, and specific — never generic +- Opens with "I'm delighted/thrilled to apply for [role] at [company]." +- 3–4 focused paragraphs, ~250–350 words total +- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric) +- Para 3: genuine connection to THIS company's mission/product +- Closes with "Thank you for considering my application." 
+ warm sign-off +- Never use: "I am writing to express my interest", "passionate about making a difference", + "I look forward to hearing from you", or any hollow filler phrases +""" + + +# ── Mission-alignment detection ─────────────────────────────────────────────── +# When a company/JD signals one of these preferred industries, the cover letter +# prompt injects a hint so Para 3 can reflect genuine personal connection. +# This does NOT disclose any personal disability or family information. + +_MISSION_SIGNALS: dict[str, list[str]] = { + "music": [ + "music", "spotify", "tidal", "soundcloud", "bandcamp", "apple music", + "distrokid", "cd baby", "landr", "beatport", "reverb", "vinyl", + "streaming", "artist", "label", "live nation", "ticketmaster", "aeg", + "songkick", "concert", "venue", "festival", "audio", "podcast", + "studio", "record", "musician", "playlist", + ], + "animal_welfare": [ + "animal", "shelter", "rescue", "humane society", "spca", "aspca", + "veterinary", "vet ", "wildlife", "pet ", "adoption", "foster", + "dog", "cat", "feline", "canine", "sanctuary", "zoo", + ], + "education": [ + "education", "school", "learning", "student", "edtech", "classroom", + "curriculum", "tutoring", "academic", "university", "kids", "children", + "youth", "literacy", "khan academy", "duolingo", "chegg", "coursera", + "instructure", "canvas lms", "clever", "district", "teacher", + "k-12", "k12", "grade", "pedagogy", + ], +} + +_MISSION_NOTES: dict[str, str] = { + "music": ( + "This company is in the music industry, which is one of Alex's genuinely " + "ideal work environments — she has a real personal passion for the music scene. " + "Para 3 should warmly and specifically reflect this authentic alignment, not as " + "a generic fan statement, but as an honest statement of where she'd love to apply " + "her CS skills." + ), + "animal_welfare": ( + "This organization works in animal welfare/rescue — one of Alex's dream-job " + "domains and a genuine personal passion. 
Para 3 should reflect this authentic " + "connection warmly and specifically, tying her CS skills to this mission." + ), + "education": ( + "This company works in children's education or EdTech — one of Alex's ideal " + "work domains, reflecting genuine personal values around learning and young people. " + "Para 3 should reflect this authentic connection specifically and warmly." + ), +} + + +def detect_mission_alignment(company: str, description: str) -> str | None: + """Return a mission hint string if company/JD matches a preferred industry, else None.""" + text = f"{company} {description}".lower() + for industry, signals in _MISSION_SIGNALS.items(): + if any(sig in text for sig in signals): + return _MISSION_NOTES[industry] + return None + + +def load_corpus() -> list[dict]: + """Load all .md cover letters from LETTERS_DIR. Returns list of {path, company, text}.""" + corpus = [] + for path in sorted(LETTERS_DIR.glob(LETTER_GLOB)): + text = path.read_text(encoding="utf-8", errors="ignore").strip() + if not text: + continue + # Extract company from filename: "Tailscale Cover Letter.md" → "Tailscale" + company = re.sub(r"\s*Cover Letter.*", "", path.stem, flags=re.IGNORECASE).strip() + corpus.append({"path": path, "company": company, "text": text}) + return corpus + + +def find_similar_letters(job_description: str, corpus: list[dict], top_k: int = 3) -> list[dict]: + """Return the top_k letters most similar to the job description by TF-IDF cosine sim.""" + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics.pairwise import cosine_similarity + + if not corpus: + return [] + + docs = [job_description] + [c["text"] for c in corpus] + vectorizer = TfidfVectorizer(stop_words="english", max_features=500) + tfidf = vectorizer.fit_transform(docs) + sims = cosine_similarity(tfidf[0:1], tfidf[1:])[0] + + ranked = sorted(zip(sims, corpus), key=lambda x: x[0], reverse=True) + return [entry for _, entry in ranked[:top_k]] + + +def build_prompt( + 
title: str, + company: str, + description: str, + examples: list[dict], + mission_hint: str | None = None, +) -> str: + parts = [SYSTEM_CONTEXT.strip(), ""] + if examples: + parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n") + for i, ex in enumerate(examples, 1): + parts.append(f"--- Example {i} ({ex['company']}) ---") + parts.append(ex["text"]) + parts.append("") + parts.append("=== END EXAMPLES ===\n") + + if mission_hint: + parts.append(f"⭐ Mission alignment note (for Para 3): {mission_hint}\n") + + parts.append(f"Now write a new cover letter for:") + parts.append(f" Role: {title}") + parts.append(f" Company: {company}") + if description: + snippet = description[:1500].strip() + parts.append(f"\nJob description excerpt:\n{snippet}") + parts.append("\nWrite the full cover letter now:") + return "\n".join(parts) + + +def generate(title: str, company: str, description: str = "", _router=None) -> str: + """Generate a cover letter and return it as a string. + + _router is an optional pre-built LLMRouter (used in tests to avoid real LLM calls). 
def main() -> None:
    """CLI entry point: generate a cover letter and print it or save it to a file.

    Title, company and description come from the command line. With
    ``--job-id`` the job row is loaded from the staging database and fills in
    whichever of those three values were not given explicitly. Exits with
    status 1 when the job id does not exist, and with an argparse usage error
    when no title or company can be determined.
    """
    parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice")
    parser.add_argument("--title", help="Job title")
    parser.add_argument("--company", help="Company name")
    parser.add_argument("--description", default="", help="Job description text")
    parser.add_argument("--job-id", type=int, help="Load job from staging.db by ID")
    parser.add_argument("--output", help="Write output to this file path")
    args = parser.parse_args()

    title, company, description = args.title, args.company, args.description

    if args.job_id is not None:
        # Run as `python scripts/generate_cover_letter.py`, sys.path[0] is the
        # scripts/ directory, so the `scripts` package is not importable until
        # the repository root is added.
        repo_root = str(Path(__file__).resolve().parent.parent)
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)
        from scripts.db import DEFAULT_DB
        import sqlite3

        conn = sqlite3.connect(DEFAULT_DB)
        try:
            conn.row_factory = sqlite3.Row
            row = conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone()
        finally:
            # Release the handle even if the query fails (e.g. schema missing).
            conn.close()
        if not row:
            print(f"No job with id={args.job_id} in staging.db", file=sys.stderr)
            sys.exit(1)
        job = dict(row)
        # Explicit CLI values win; NULL columns fall back to empty strings.
        title = title or job.get("title") or ""
        company = company or job.get("company") or ""
        description = description or job.get("description") or ""

    if not title or not company:
        parser.error("--title and --company are required (or use --job-id)")

    letter = generate(title, company, description)

    if args.output:
        # Letters routinely contain non-ASCII punctuation; do not depend on the
        # platform default encoding.
        Path(args.output).write_text(letter, encoding="utf-8")
        print(f"Saved to {args.output}", file=sys.stderr)
    else:
        print(letter)
Classify the recruitment email into exactly ONE of these categories:\n" + " interview_scheduled, offer_received, rejected, positive_response, survey_received, neutral\n\n" + "Rules:\n" + "- interview_scheduled: recruiter wants to book a call/interview\n" + "- offer_received: job offer is being extended\n" + "- rejected: explicitly not moving forward\n" + "- positive_response: interested/impressed but no interview booked yet\n" + "- survey_received: link or request to complete a survey, assessment, or questionnaire\n" + "- neutral: auto-confirmation, generic update, no clear signal\n\n" + "Respond with ONLY the category name. No explanation." +) + +_CLASSIFY_LABELS = [ + "interview_scheduled", "offer_received", "rejected", + "positive_response", "survey_received", "neutral", +] + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "email.yaml" + +# ── Recruitment keyword filter ──────────────────────────────────────────────── +# An email must match at least one of these in its subject line to be imported. 
+RECRUITMENT_KEYWORDS = { + # Application lifecycle + "interview", "application", "applicant", "apply", "applied", + "position", "opportunity", "role", "opening", "vacancy", + "offer", "offer letter", "schedule", "scheduling", + "screening", "screen", "phone screen", "video call", + "assessment", "hiring", "hired", "recruiter", "recruitment", + "talent", "candidate", "recruiting", "next steps", "follow up", "follow-up", + "onboarding", "start date", "background check", "reference", + "congratulations", "unfortunately", "decision", "update", + # Job board / ATS notifications + "viewed your profile", "interested in your background", + "job alert", "new job", "job match", "job opportunity", + "your application", "application received", "application status", + "application update", "we received", "thank you for applying", + "thanks for applying", "moved forward", "moving forward", + "not moving forward", "decided to", "other candidates", + "keep your resume", "keep you in mind", + # Recruiter outreach + "reaching out", "i came across", "your experience", + "connect with you", "exciting opportunity", "great fit", + "perfect fit", "right fit", "strong fit", "ideal candidate", +} + +# ── Rejection / ATS-confirm phrase filter ───────────────────────────────────── +# Checked against subject + first 800 chars of body BEFORE calling any LLM. +# Covers the cases phi3:mini consistently mis-classifies as "neutral". 
+_REJECTION_PHRASES = [ + # Explicit rejection — safe to check subject + body + "not moving forward", "decided not to move forward", + "not selected", "not be moving forward", "will not be moving forward", + "unfortunately", "regret to inform", "regret to let you know", + "decided to go with other", "decided to pursue other", + "other candidates", "other applicants", "position has been filled", + "filled the position", "no longer moving forward", + "we have decided", "we've decided", "after careful consideration", + "at this time we", "at this point we", + "we will not", "we won't be", "we are not able", + "wish you the best", "best of luck in your", + "keep your resume on file", +] + +# ATS-confirm phrases — checked against SUBJECT ONLY. +# Do NOT check these in the body: recruiters often quote ATS thread history, +# so "thank you for applying" can appear in a genuine follow-up body. +_ATS_CONFIRM_SUBJECTS = [ + "application received", "application confirmation", + "thanks for applying", "thank you for applying", + "thank you for your application", + "we received your application", + "application has been received", + "has received your application", + "successfully submitted", + "your application for", + "you applied to", +] + +# Phrases that immediately identify a non-recruitment email (retail, spam, etc.) 
+_SPAM_PHRASES = [ + # Retail / commerce offers + "special offer", "private offer", "exclusive offer", "limited time offer", + "limited-time offer", "sent you a special offer", "sent you an offer", + "holiday offer", "seasonal offer", "membership offer", + "round trip from $", "bonus points", + "% off", "% discount", "save up to", "free shipping", + "unsubscribe", "view in browser", "view this email in", + "update your preferences", "email preferences", + # LinkedIn apply confirmations & digests (not new inbound leads) + "your application was sent to", + "your application was viewed by", + "application updates this week", + "don't forget to complete your application", + "view your application updates", + "you have new application updates", + # Indeed apply confirmations + "indeed application:", + # DocuSign / e-signature + "requests you to sign", + "has sent you a reminder", + "please sign", + # Security / MFA codes + "security code for your application", + "verification code", +] + +# Subject prefixes that identify non-job emails +_SPAM_SUBJECT_PREFIXES = [ + "@", # "@user sent you a special offer" — Depop / social commerce + "re: fw:", # forwarded chains unlikely to be first-contact recruitment + "accepted:", # Google Calendar accepted invite + "notification:", # Google Calendar notification + "[meeting reminder]", # Google Calendar meeting reminder + "updated invitation:", # Google Calendar update + "[updated]", # Google Calendar update + "reminder:", # Generic reminder (AAA digital interview reminders, etc.) 
def _has_rejection_or_ats_signal(subject: str, body: str) -> bool:
    """Return True if the email is a rejection, ATS auto-confirmation, or non-recruitment spam.

    Checks run cheapest-first and stop at the first hit:
      1. subject starts with a known non-job prefix,
      2. subject contains an ATS-confirmation phrase,
      3. subject + first 1500 body characters contain a rejection or spam phrase,
      4. same window contains a "don't forget…" variant (any apostrophe style).
    """
    subj = subject.lower().strip()

    # 1. Prefix check (Depop "@user", calendar notifications, …)
    if subj.startswith(tuple(_SPAM_SUBJECT_PREFIXES)):
        return True

    # 2. ATS confirmations are matched against the subject only
    for marker in _ATS_CONFIRM_SUBJECTS:
        if marker in subj:
            return True

    # 3. Rejection and spam phrases: subject plus the opening of the body
    window = subj + " " + body[:1500].lower()
    for phrase_list in (_REJECTION_PHRASES, _SPAM_PHRASES):
        for phrase in phrase_list:
            if phrase in window:
                return True

    # 4. Apostrophe-variant check over the unstripped subject + body opening
    unstripped = (subject + " " + body[:1500]).lower()
    for variant in _DONT_FORGET_VARIANTS:
        if variant in unstripped:
            return True
    return False
+_JOB_BOARD_SLDS = { + "linkedin", "indeed", "glassdoor", "ziprecruiter", "monster", + "careerbuilder", "dice", "simplyhired", "wellfound", "angellist", + "greenhouse", "lever", "workday", "taleo", "icims", "smartrecruiters", + "bamboohr", "ashby", "rippling", "jobvite", "workable", "gusto", + "paylocity", "paycom", "adp", "breezy", "recruitee", "jazz", +} + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _decode_str(value: Optional[str]) -> str: + """Decode an RFC2047-encoded header value to a plain Python string.""" + if not value: + return "" + parts = _raw_decode_header(value) + result = [] + for part, encoding in parts: + if isinstance(part, bytes): + result.append(part.decode(encoding or "utf-8", errors="replace")) + else: + result.append(str(part)) + return " ".join(result).strip() + + +def _extract_domain(url_or_email: str) -> str: + """ + Pull the bare domain from a URL (https://company.com/jobs/...) or + an email address (recruiter@company.com). Returns '' if none found. + """ + url_or_email = url_or_email.strip() + if "@" in url_or_email: + return url_or_email.split("@")[-1].split(">")[0].strip().lower() + try: + parsed = urlparse(url_or_email) + host = parsed.netloc or parsed.path + # strip www. + return re.sub(r"^www\.", "", host).lower() + except Exception: + return "" + + +def _normalise_company(company: str) -> str: + """Strip legal suffixes and extra whitespace from a company name.""" + return _LEGAL_SUFFIXES.sub("", company).strip() + + +def _company_search_terms(company: str, job_url: str = "") -> list[str]: + """ + Return a list of strings that must appear (case-insensitively) in the + email's from-address or subject for it to be considered a match. + + We are deliberately conservative: + - Use the full normalised company name (not just the first word) + - Also include the company domain derived from the job URL, but ONLY + when the domain belongs to the actual company (not a job board). 
+ LinkedIn jobs link to linkedin.com — if we used "linkedin" as a term + we'd match every LinkedIn notification email against every LinkedIn job. + """ + terms = [] + clean = _normalise_company(company) + if len(clean) >= 3: + terms.append(clean.lower()) + + domain = _extract_domain(job_url) + if domain and len(domain) > 4: + sld = domain.split(".")[0] + if len(sld) >= 3 and sld not in terms and sld not in _JOB_BOARD_SLDS: + terms.append(sld) + + return terms + + +def _has_recruitment_keyword(subject: str) -> bool: + """Return True if the subject contains at least one recruitment keyword.""" + subject_lower = subject.lower() + return any(kw in subject_lower for kw in RECRUITMENT_KEYWORDS) + + +def _email_is_relevant(from_addr: str, subject: str, search_terms: list[str]) -> bool: + """ + Two-gate filter: + Gate 1 — from-address OR subject must contain an exact company term + Gate 2 — subject must contain a recruitment keyword + + Both gates must pass. This prevents importing unrelated emails that + happen to mention a company name in passing. + """ + combined = (from_addr + " " + subject).lower() + + gate1 = any(term in combined for term in search_terms) + gate2 = _has_recruitment_keyword(subject) + + return gate1 and gate2 + + +def _get_existing_message_ids(job_id: int, db_path: Path) -> set[str]: + contacts = get_contacts(db_path, job_id=job_id) + return {c.get("message_id", "") for c in contacts if c.get("message_id")} + + +def classify_stage_signal(subject: str, body: str) -> Optional[str]: + """Classify an inbound email into a pipeline stage signal. + + Returns one of the 5 label strings, or None on failure. + Uses phi3:mini via Ollama (benchmarked 100% on 12-case test set). 
def extract_lead_info(subject: str, body: str,
                      from_addr: str) -> tuple[Optional[str], Optional[str]]:
    """Use LLM to extract (company, title) from an unmatched recruitment email.

    The prompt contains the sender, the subject and the first 600 characters
    of the body. Any ``<think>…</think>`` block emitted by a reasoning model
    is removed before the first ``{...}`` span in the reply is parsed as JSON.

    Returns (company, title) or (None, None) on failure / low confidence.
    Each element is a non-empty stripped string or None; non-string values and
    the literal strings "null" / "none" are treated as missing.
    """
    import json as _json

    def _clean(value) -> Optional[str]:
        # The model is asked for strings or null; anything else is noise.
        if not isinstance(value, str):
            return None
        value = value.strip()
        if not value or value.lower() in ("null", "none"):
            return None
        return value

    try:
        prompt = (
            f"From: {from_addr}\n"
            f"Subject: {subject}\n\n"
            f"Email excerpt:\n{body[:600]}"
        )
        raw = _CLASSIFIER_ROUTER.complete(
            prompt,
            system=_EXTRACT_SYSTEM,
            fallback_order=["ollama_research"],
        )
        # Strip <think> blocks (in case a reasoning model slips through).
        # The bare pattern ".*?" only matches empty strings and removed nothing.
        text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
        m = re.search(r'\{.*\}', text, re.DOTALL)
        if not m:
            return None, None
        data = _json.loads(m.group())
        if not isinstance(data, dict):
            return None, None
        return _clean(data.get("company")), _clean(data.get("title"))
    except Exception:
        # Best-effort by design: any router or parsing failure means "no lead".
        return None, None
+ URL is canonicalized to https://www.linkedin.com/jobs/view// + (tracking parameters stripped). + """ + jobs = [] + # Split on separator lines (10+ dashes) + blocks = re.split(r"\n\s*-{10,}\s*\n", body) + for block in blocks: + lines = [ln.strip() for ln in block.strip().splitlines() if ln.strip()] + + # Find "View job:" URL + url = None + for line in lines: + m = re.search(r"View job:\s*(https?://\S+)", line, re.IGNORECASE) + if m: + raw_url = m.group(1) + job_id_m = re.search(r"/jobs/view/(\d+)", raw_url) + if job_id_m: + url = f"https://www.linkedin.com/jobs/view/{job_id_m.group(1)}/" + break + if not url: + continue + + # Filter noise lines + content = [ + ln for ln in lines + if not any(p in ln.lower() for p in _ALERT_SKIP_PHRASES) + and not ln.lower().startswith("view job:") + and not ln.startswith("http") + ] + if len(content) < 2: + continue + + jobs.append({ + "title": content[0], + "company": content[1], + "location": content[2] if len(content) > 2 else "", + "url": url, + }) + return jobs + + +def _scan_todo_label(conn: imaplib.IMAP4, cfg: dict, db_path: Path, + active_jobs: list[dict], + known_message_ids: set) -> int: + """Scan the configured Gmail label for action emails, matching them to pipeline jobs. + + Two gates per email: + 1. Company name appears in from-address or subject (same as sync_job_emails) + 2. Subject contains a TODO-label action keyword + + Returns count of new contacts attached. + """ + label = cfg.get("todo_label", "").strip() + if not label: + return 0 + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + + # Search the label folder for any emails (no keyword pre-filter — it's curated) + uids = _search_folder(conn, label, "ALL", since) + if not uids: + return 0 + + # Build a lookup: search_term → [job, ...] 
for all active jobs + term_to_jobs: dict[str, list[dict]] = {} + for job in active_jobs: + for term in _company_search_terms(job.get("company", ""), job.get("url", "")): + term_to_jobs.setdefault(term, []).append(job) + + added = 0 + for uid in uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + mid = parsed["message_id"] + if mid in known_message_ids: + continue + + # Gate 1: company name match — from_addr + subject + first 300 chars of body + # Body fallback catches ATS emails (e.g. noreply@greenhouse.io) where the + # company name only appears in the email body, not the sender or subject. + combined = ( + parsed["from_addr"] + " " + + parsed["subject"] + " " + + parsed["body"][:300] + ).lower() + matched_jobs = [] + for term, jobs in term_to_jobs.items(): + if term in combined: + matched_jobs.extend(jobs) + # Deduplicate by job id + seen_ids: set[int] = set() + matched_jobs = [j for j in matched_jobs if not (j["id"] in seen_ids or seen_ids.add(j["id"]))] # type: ignore[func-returns-value] + if not matched_jobs: + continue + + # Gate 2: action keyword in subject + if not _has_todo_keyword(parsed["subject"]): + continue + + for job in matched_jobs: + contact_id = add_contact( + db_path, job_id=job["id"], direction="inbound", + subject=parsed["subject"], + from_addr=parsed["from_addr"], + to_addr=parsed["to_addr"], + body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else since, + message_id=mid, + ) + signal = classify_stage_signal(parsed["subject"], parsed["body"]) + if signal and signal != "neutral": + _update_contact_signal(db_path, contact_id, signal) + + known_message_ids.add(mid) + added += 1 + print(f"[imap] TODO label → {matched_jobs[0].get('company')} — {parsed['subject'][:60]}") + + return added + + +def _scan_unmatched_leads(conn: imaplib.IMAP4, cfg: dict, + db_path: Path, + known_message_ids: set) -> int: + """Scan INBOX for recruitment emails not matched to any pipeline job. 
+ + Calls LLM to extract company/title; inserts qualifying emails as pending jobs. + Returns the count of new leads inserted. + """ + from scripts.db import get_existing_urls, insert_job, add_contact as _add_contact + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + + broad_terms = ["interview", "opportunity", "offer letter", "job offer", "application", "recruiting"] + all_uids: set = set() + for term in broad_terms: + uids = _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since) + all_uids.update(uids) + + existing_urls = get_existing_urls(db_path) + new_leads = 0 + + for uid in all_uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + mid = parsed["message_id"] + if mid in known_message_ids: + continue + + # ── LinkedIn Job Alert digest — parse each card individually ────── + if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower(): + cards = parse_linkedin_alert(parsed["body"]) + for card in cards: + if card["url"] in existing_urls: + continue + job_id = insert_job(db_path, { + "title": card["title"], + "company": card["company"], + "url": card["url"], + "source": "linkedin", + "location": card["location"], + "is_remote": 0, + "salary": "", + "description": "", + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + from scripts.task_runner import submit_task + submit_task(db_path, "scrape_url", job_id) + existing_urls.add(card["url"]) + new_leads += 1 + print(f"[imap] LinkedIn alert → {card['company']} — {card['title']}") + known_message_ids.add(mid) + continue # skip normal LLM extraction path + + if not _has_recruitment_keyword(parsed["subject"]): + continue + + # Fast phrase-based rejection / ATS-confirm filter (catches what phi3 misses) + if _has_rejection_or_ats_signal(parsed["subject"], parsed["body"]): + continue + + # LLM classification as secondary gate — skip on rejection or classifier failure + signal = 
classify_stage_signal(parsed["subject"], parsed["body"]) + if signal is None or signal == "rejected": + continue + + company, title = extract_lead_info( + parsed["subject"], parsed["body"], parsed["from_addr"] + ) + if not company: + continue + + from_domain = _extract_domain(parsed["from_addr"]) or "unknown" + mid_hash = str(abs(hash(mid)))[:10] + synthetic_url = f"email://{from_domain}/{mid_hash}" + + if synthetic_url in existing_urls: + continue + + job_id = insert_job(db_path, { + "title": title or "(untitled)", + "company": company, + "url": synthetic_url, + "source": "email", + "location": "", + "is_remote": 0, + "salary": "", + "description": parsed["body"][:2000], + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + _add_contact(db_path, job_id=job_id, direction="inbound", + subject=parsed["subject"], + from_addr=parsed["from_addr"], + body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else "", + message_id=mid) + known_message_ids.add(mid) + existing_urls.add(synthetic_url) + new_leads += 1 + + return new_leads + + +# ── IMAP connection ─────────────────────────────────────────────────────────── + +def load_config() -> dict: + if not CONFIG_PATH.exists(): + raise FileNotFoundError( + f"Email config not found: {CONFIG_PATH}\n" + f"Copy config/email.yaml.example → config/email.yaml and fill it in." 
+ ) + return yaml.safe_load(CONFIG_PATH.read_text()) or {} + + +def connect(cfg: dict) -> imaplib.IMAP4: + host = cfg.get("host", "imap.gmail.com") + port = int(cfg.get("port", 993)) + use_ssl = cfg.get("use_ssl", True) + conn = (imaplib.IMAP4_SSL if use_ssl else imaplib.IMAP4)(host, port) + conn.login(cfg["username"], cfg["password"]) + return conn + + +def _detect_sent_folder(conn: imaplib.IMAP4) -> str: + """Try to auto-detect the Sent folder name.""" + candidates = ["[Gmail]/Sent Mail", "Sent", "Sent Items", "Sent Messages", "INBOX.Sent"] + try: + _, folder_list = conn.list() + flat = " ".join(f.decode() for f in (folder_list or [])) + for candidate in candidates: + if candidate.lower() in flat.lower(): + return candidate + except Exception: + pass + return "Sent" + + +def _quote_folder(name: str) -> str: + """Quote an IMAP folder name if it contains spaces. + Escapes internal backslashes and double-quotes per RFC 3501. + e.g. 'TO DO JOBS' → '"TO DO JOBS"', 'My "Jobs"' → '"My \\"Jobs\\""' + """ + if " " in name: + escaped = name.replace("\\", "\\\\").replace('"', '\\"') + return f'"{escaped}"' + return name + + +def _search_folder(conn: imaplib.IMAP4, folder: str, criteria: str, + since: str) -> list[bytes]: + """SELECT a folder and return matching UID list (empty on any error).""" + try: + conn.select(_quote_folder(folder), readonly=True) + _, data = conn.search(None, f'(SINCE "{since}" {criteria})') + return data[0].split() if data and data[0] else [] + except Exception: + return [] + + +def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]: + """Fetch and parse one message. 
Returns None on failure.""" + try: + _, data = conn.fetch(uid, "(RFC822)") + if not data or not data[0]: + return None + msg = email.message_from_bytes(data[0][1]) + + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + try: + body = part.get_payload(decode=True).decode("utf-8", errors="replace") + except Exception: + pass + break + else: + try: + body = msg.get_payload(decode=True).decode("utf-8", errors="replace") + except Exception: + pass + + mid = msg.get("Message-ID", "").strip() + if not mid: + return None # No Message-ID → can't dedup; skip to avoid repeat inserts + + return { + "message_id": mid, + "subject": _decode_str(msg.get("Subject")), + "from_addr": _decode_str(msg.get("From")), + "to_addr": _decode_str(msg.get("To")), + "date": _decode_str(msg.get("Date")), + "body": body[:4000], + } + except Exception: + return None + + +# ── Per-job sync ────────────────────────────────────────────────────────────── + +def _update_contact_signal(db_path: Path, contact_id: int, signal: str) -> None: + """Write a stage signal onto an existing contact row.""" + import sqlite3 as _sqlite3 + conn = _sqlite3.connect(db_path) + conn.execute( + "UPDATE job_contacts SET stage_signal = ? WHERE id = ?", + (signal, contact_id), + ) + conn.commit() + conn.close() + + +def sync_job_emails(job: dict, conn: imaplib.IMAP4, cfg: dict, + db_path: Path, dry_run: bool = False) -> tuple[int, int]: + """ + Sync recruitment emails for one job. + Returns (inbound_added, outbound_added). 
+ """ + company = (job.get("company") or "").strip() + if not company: + return 0, 0 + + search_terms = _company_search_terms(company, job.get("url", "")) + if not search_terms: + return 0, 0 + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + existing_ids = _get_existing_message_ids(job["id"], db_path) + + inbound = outbound = 0 + + for term in search_terms: + # ── INBOX — inbound ─────────────────────────────────────────────── + uids = _search_folder( + conn, "INBOX", + f'(OR FROM "{term}" SUBJECT "{term}")', + since, + ) + for uid in uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + if parsed["message_id"] in existing_ids: + continue + if not _email_is_relevant(parsed["from_addr"], parsed["subject"], search_terms): + continue + + if not dry_run: + contact_id = add_contact( + db_path, job_id=job["id"], direction="inbound", + subject=parsed["subject"], from_addr=parsed["from_addr"], + to_addr=parsed["to_addr"], body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else since, + message_id=parsed["message_id"], + ) + signal = classify_stage_signal(parsed["subject"], parsed["body"]) + if signal and signal != "neutral": + _update_contact_signal(db_path, contact_id, signal) + existing_ids.add(parsed["message_id"]) + inbound += 1 + + # ── Sent — outbound ─────────────────────────────────────────────── + sent_folder = cfg.get("sent_folder") or _detect_sent_folder(conn) + uids = _search_folder( + conn, sent_folder, + f'(OR TO "{term}" SUBJECT "{term}")', + since, + ) + for uid in uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + if parsed["message_id"] in existing_ids: + continue + if not _email_is_relevant(parsed["to_addr"], parsed["subject"], search_terms): + continue + + if not dry_run: + add_contact( + db_path, job_id=job["id"], direction="outbound", + subject=parsed["subject"], from_addr=parsed["from_addr"], + 
to_addr=parsed["to_addr"], body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else since, + message_id=parsed["message_id"], + ) + existing_ids.add(parsed["message_id"]) + outbound += 1 + + return inbound, outbound + + +# ── Main entry ──────────────────────────────────────────────────────────────── + +def sync_all(db_path: Path = DEFAULT_DB, + dry_run: bool = False, + job_ids: Optional[list[int]] = None, + on_stage=None) -> dict: + """ + Sync emails for all active pipeline jobs (or a specific subset). + + Returns a summary dict: + {"synced": N, "inbound": N, "outbound": N, "errors": [...]} + """ + def _stage(msg: str) -> None: + if on_stage: + on_stage(msg) + + cfg = load_config() + init_db(db_path) + + jobs_by_stage = get_interview_jobs(db_path) + active_stages = ["applied", "phone_screen", "interviewing", "offer", "hired"] + all_active = [j for stage in active_stages for j in jobs_by_stage.get(stage, [])] + + if job_ids: + all_active = [j for j in all_active if j["id"] in job_ids] + + if not all_active: + return {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "todo_attached": 0, "errors": []} + + _stage("connecting") + print(f"[imap] Connecting to {cfg.get('host', 'imap.gmail.com')} …") + conn = connect(cfg) + summary = {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "errors": []} + + try: + for i, job in enumerate(all_active, 1): + _stage(f"job {i}/{len(all_active)}") + try: + inb, out = sync_job_emails(job, conn, cfg, db_path, dry_run=dry_run) + label = "DRY-RUN " if dry_run else "" + print(f"[imap] {label}{job.get('company'):30s} +{inb} in +{out} out") + if inb + out > 0: + summary["synced"] += 1 + summary["inbound"] += inb + summary["outbound"] += out + except Exception as e: + msg = f"{job.get('company')}: {e}" + summary["errors"].append(msg) + print(f"[imap] ERROR — {msg}") + + _stage("scanning todo label") + from scripts.db import get_all_message_ids + known_mids = get_all_message_ids(db_path) + 
summary["todo_attached"] = _scan_todo_label(conn, cfg, db_path, all_active, known_mids) + + _stage("scanning leads") + summary["new_leads"] = _scan_unmatched_leads(conn, cfg, db_path, known_mids) + finally: + try: + conn.logout() + except Exception: + pass + + return summary + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Sync IMAP emails to job contacts") + parser.add_argument("--job-id", type=int, nargs="+", help="Sync only these job IDs") + parser.add_argument("--dry-run", action="store_true", help="Show matches without saving") + args = parser.parse_args() + + result = sync_all( + dry_run=args.dry_run, + job_ids=args.job_id, + ) + print(f"\n[imap] Done — {result['synced']} jobs updated, " + f"{result['inbound']} inbound, {result['outbound']} outbound" + + (f", {len(result['errors'])} errors" if result["errors"] else "")) diff --git a/scripts/llm_router.py b/scripts/llm_router.py new file mode 100644 index 0000000..d4eb237 --- /dev/null +++ b/scripts/llm_router.py @@ -0,0 +1,170 @@ +""" +LLM abstraction layer with priority fallback chain. +Reads config/llm.yaml. Tries backends in order; falls back on any error. +""" +import os +import yaml +import requests +from pathlib import Path +from openai import OpenAI + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml" + + +class LLMRouter: + def __init__(self, config_path: Path = CONFIG_PATH): + with open(config_path) as f: + self.config = yaml.safe_load(f) + + def _is_reachable(self, base_url: str) -> bool: + """Quick health-check ping. 
Returns True if backend is up.""" + health_url = base_url.rstrip("/").removesuffix("/v1") + "/health" + try: + resp = requests.get(health_url, timeout=2) + return resp.status_code < 500 + except Exception: + return False + + def _resolve_model(self, client: OpenAI, model: str) -> str: + """Resolve __auto__ to the first model served by vLLM.""" + if model != "__auto__": + return model + models = client.models.list() + return models.data[0].id + + def complete(self, prompt: str, system: str | None = None, + model_override: str | None = None, + fallback_order: list[str] | None = None, + images: list[str] | None = None) -> str: + """ + Generate a completion. Tries each backend in fallback_order. + + model_override: when set, replaces the configured model for + openai_compat backends (e.g. pass a research-specific ollama model). + fallback_order: when set, overrides config fallback_order for this + call (e.g. pass config["research_fallback_order"] for research tasks). + images: optional list of base64-encoded PNG/JPG strings. When provided, + backends without supports_images=true are skipped. vision_service backends + are only tried when images is provided. + Raises RuntimeError if all backends are exhausted. 
+ """ + order = fallback_order if fallback_order is not None else self.config["fallback_order"] + for name in order: + backend = self.config["backends"][name] + + if not backend.get("enabled", True): + print(f"[LLMRouter] {name}: disabled, skipping") + continue + + supports_images = backend.get("supports_images", False) + is_vision_service = backend["type"] == "vision_service" + + # vision_service only used when images provided + if is_vision_service and not images: + print(f"[LLMRouter] {name}: vision_service skipped (no images)") + continue + + # non-vision backends skipped when images provided and they don't support it + if images and not supports_images and not is_vision_service: + print(f"[LLMRouter] {name}: no image support, skipping") + continue + + if is_vision_service: + if not self._is_reachable(backend["base_url"]): + print(f"[LLMRouter] {name}: unreachable, skipping") + continue + try: + resp = requests.post( + backend["base_url"].rstrip("/") + "/analyze", + json={ + "prompt": prompt, + "image_base64": images[0] if images else "", + }, + timeout=60, + ) + resp.raise_for_status() + print(f"[LLMRouter] Used backend: {name} (vision_service)") + return resp.json()["text"] + except Exception as e: + print(f"[LLMRouter] {name}: error — {e}, trying next") + continue + + elif backend["type"] == "openai_compat": + if not self._is_reachable(backend["base_url"]): + print(f"[LLMRouter] {name}: unreachable, skipping") + continue + try: + client = OpenAI( + base_url=backend["base_url"], + api_key=backend.get("api_key") or "any", + ) + raw_model = model_override or backend["model"] + model = self._resolve_model(client, raw_model) + messages = [] + if system: + messages.append({"role": "system", "content": system}) + if images and supports_images: + content = [{"type": "text", "text": prompt}] + for img in images: + content.append({ + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img}"}, + }) + messages.append({"role": "user", "content": 
content}) + else: + messages.append({"role": "user", "content": prompt}) + + resp = client.chat.completions.create( + model=model, messages=messages + ) + print(f"[LLMRouter] Used backend: {name} ({model})") + return resp.choices[0].message.content + + except Exception as e: + print(f"[LLMRouter] {name}: error — {e}, trying next") + continue + + elif backend["type"] == "anthropic": + api_key = os.environ.get(backend["api_key_env"], "") + if not api_key: + print(f"[LLMRouter] {name}: {backend['api_key_env']} not set, skipping") + continue + try: + import anthropic as _anthropic + client = _anthropic.Anthropic(api_key=api_key) + if images and supports_images: + content = [] + for img in images: + content.append({ + "type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": img}, + }) + content.append({"type": "text", "text": prompt}) + else: + content = prompt + kwargs: dict = { + "model": backend["model"], + "max_tokens": 4096, + "messages": [{"role": "user", "content": content}], + } + if system: + kwargs["system"] = system + msg = client.messages.create(**kwargs) + print(f"[LLMRouter] Used backend: {name}") + return msg.content[0].text + except Exception as e: + print(f"[LLMRouter] {name}: error — {e}, trying next") + continue + + raise RuntimeError("All LLM backends exhausted") + + +# Module-level singleton for convenience +_router: LLMRouter | None = None + + +def complete(prompt: str, system: str | None = None) -> str: + global _router + if _router is None: + _router = LLMRouter() + return _router.complete(prompt, system) diff --git a/scripts/manage-ui.sh b/scripts/manage-ui.sh new file mode 100755 index 0000000..55cadd9 --- /dev/null +++ b/scripts/manage-ui.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# scripts/manage-ui.sh — manage the Streamlit job-seeker web UI +# Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs] + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +STREAMLIT_BIN="/devl/miniconda3/envs/job-seeker/bin/streamlit" +APP_ENTRY="$REPO_DIR/app/app.py" +PID_FILE="$REPO_DIR/.streamlit.pid" +LOG_FILE="$REPO_DIR/.streamlit.log" +PORT="${STREAMLIT_PORT:-8501}" + +start() { + if is_running; then + echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload." + return 0 + fi + + echo "Starting Streamlit on http://localhost:$PORT …" + "$STREAMLIT_BIN" run "$APP_ENTRY" \ + --server.port "$PORT" \ + --server.headless true \ + --server.fileWatcherType none \ + > "$LOG_FILE" 2>&1 & + echo $! > "$PID_FILE" + sleep 2 + + if is_running; then + echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE" + else + echo "Failed to start. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" + exit 1 + fi +} + +stop() { + if ! is_running; then + echo "Not running." + rm -f "$PID_FILE" + return 0 + fi + + PID=$(cat "$PID_FILE") + echo "Stopping PID $PID …" + kill "$PID" 2>/dev/null || true + sleep 1 + if kill -0 "$PID" 2>/dev/null; then + kill -9 "$PID" 2>/dev/null || true + fi + rm -f "$PID_FILE" + echo "Stopped." +} + +restart() { + stop + sleep 1 + start +} + +status() { + if is_running; then + echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT" + else + echo "Not running." 
+ fi +} + +logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -50 "$LOG_FILE" + else + echo "No log file found at $LOG_FILE" + fi +} + +is_running() { + if [[ -f "$PID_FILE" ]]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +CMD="${1:-help}" +case "$CMD" in + start) start ;; + stop) stop ;; + restart) restart ;; + status) status ;; + logs) logs ;; + *) + echo "Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs]" + echo "" + echo " start Start the Streamlit UI (default port: $PORT)" + echo " stop Stop the running UI" + echo " restart Stop then start" + echo " status Show whether it's running" + echo " logs Tail the last 50 lines of the log" + echo "" + echo " STREAMLIT_PORT=8502 bash scripts/manage-ui.sh start (custom port)" + ;; +esac diff --git a/scripts/manage-vision.sh b/scripts/manage-vision.sh new file mode 100755 index 0000000..43b089c --- /dev/null +++ b/scripts/manage-vision.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# scripts/manage-vision.sh — manage the moondream2 vision service +# Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs +# +# First-time setup: +# conda env create -f scripts/vision_service/environment.yml +# +# On first start, moondream2 is downloaded from HuggingFace (~1.8GB). +# Model stays resident in memory between requests. + +set -euo pipefail + +CONDA_ENV="job-seeker-vision" +UVICORN_BIN="/devl/miniconda3/envs/${CONDA_ENV}/bin/uvicorn" +PID_FILE="/tmp/vision-service.pid" +LOG_FILE="/tmp/vision-service.log" +PORT=8002 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" + +is_running() { + if [[ -f "$PID_FILE" ]]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +start() { + if is_running; then + echo "Already running (PID $(cat "$PID_FILE"))." + return 0 + fi + + if [[ ! -f "$UVICORN_BIN" ]]; then + echo "ERROR: conda env '$CONDA_ENV' not found." 
+ echo "Install with: conda env create -f scripts/vision_service/environment.yml" + exit 1 + fi + + echo "Starting vision service (moondream2) on port $PORT…" + cd "$REPO_ROOT" + PYTHONPATH="$REPO_ROOT" "$UVICORN_BIN" \ + scripts.vision_service.main:app \ + --host 0.0.0.0 \ + --port "$PORT" \ + > "$LOG_FILE" 2>&1 & + echo $! > "$PID_FILE" + sleep 2 + + if is_running; then + echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE" + echo "Health: http://localhost:$PORT/health" + else + echo "Failed to start. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" + rm -f "$PID_FILE" + exit 1 + fi +} + +stop() { + if ! is_running; then + echo "Not running." + rm -f "$PID_FILE" + return 0 + fi + PID=$(cat "$PID_FILE") + echo "Stopping PID $PID…" + kill "$PID" 2>/dev/null || true + sleep 2 + if kill -0 "$PID" 2>/dev/null; then + kill -9 "$PID" 2>/dev/null || true + fi + rm -f "$PID_FILE" + echo "Stopped." +} + +restart() { stop; sleep 1; start; } + +status() { + if is_running; then + echo "Running (PID $(cat "$PID_FILE")) — http://localhost:$PORT" + curl -s "http://localhost:$PORT/health" | python3 -m json.tool 2>/dev/null || true + else + echo "Not running." + fi +} + +logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -50 "$LOG_FILE" + else + echo "No log file at $LOG_FILE" + fi +} + +CMD="${1:-help}" +case "$CMD" in + start) start ;; + stop) stop ;; + restart) restart ;; + status) status ;; + logs) logs ;; + *) + echo "Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs" + echo "" + echo " Manages the moondream2 vision service on port $PORT." 
+ echo " First-time setup: conda env create -f scripts/vision_service/environment.yml" + ;; +esac diff --git a/scripts/manage-vllm.sh b/scripts/manage-vllm.sh new file mode 100755 index 0000000..8386e20 --- /dev/null +++ b/scripts/manage-vllm.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash +# scripts/manage-vllm.sh — manage the vLLM inference server +# Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list] + +set -euo pipefail + +VLLM_BIN="/devl/miniconda3/envs/vllm/bin/python" +MODEL_DIR="/Library/Assets/LLM/vllm/models" +PID_FILE="/tmp/vllm-server.pid" +LOG_FILE="/tmp/vllm-server.log" +MODEL_FILE="/tmp/vllm-server.model" +PORT=8000 +GPU=1 + +_list_model_names() { + if [[ -d "$MODEL_DIR" ]]; then + find "$MODEL_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort + fi +} + +is_running() { + if [[ -f "$PID_FILE" ]]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +start() { + local model_name="${1:-}" + + if [[ -z "$model_name" ]]; then + model_name=$(_list_model_names | head -1) + if [[ -z "$model_name" ]]; then + echo "No models found in $MODEL_DIR" + exit 1 + fi + fi + + local model_path + if [[ "$model_name" == /* ]]; then + model_path="$model_name" + model_name=$(basename "$model_path") + else + model_path="$MODEL_DIR/$model_name" + fi + + if [[ ! -d "$model_path" ]]; then + echo "Model not found: $model_path" + exit 1 + fi + + if is_running; then + echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload." + return 0 + fi + + echo "Starting vLLM with model: $model_name (GPU $GPU, port $PORT)…" + echo "$model_name" > "$MODEL_FILE" + + # Ouro LoopLM uses total_ut_steps=4 which multiplies KV cache by 4x vs a standard + # transformer. On 8 GiB GPUs: 1.4B models support ~4096 tokens; 2.6B only ~928. 
+ CUDA_VISIBLE_DEVICES="$GPU" "$VLLM_BIN" -m vllm.entrypoints.openai.api_server \ + --model "$model_path" \ + --trust-remote-code \ + --max-model-len 3072 \ + --gpu-memory-utilization 0.75 \ + --enforce-eager \ + --max-num-seqs 8 \ + --port "$PORT" \ + > "$LOG_FILE" 2>&1 & + echo $! > "$PID_FILE" + sleep 3 + + if is_running; then + echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE" + else + echo "Failed to start. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" + rm -f "$PID_FILE" "$MODEL_FILE" + exit 1 + fi +} + +stop() { + if ! is_running; then + echo "Not running." + rm -f "$PID_FILE" + return 0 + fi + + PID=$(cat "$PID_FILE") + echo "Stopping PID $PID …" + kill "$PID" 2>/dev/null || true + sleep 2 + if kill -0 "$PID" 2>/dev/null; then + kill -9 "$PID" 2>/dev/null || true + fi + rm -f "$PID_FILE" "$MODEL_FILE" + echo "Stopped." +} + +restart() { + local model_name="${1:-}" + stop + sleep 1 + start "$model_name" +} + +status() { + if is_running; then + local model="" + if [[ -f "$MODEL_FILE" ]]; then + model=" — model: $(cat "$MODEL_FILE")" + fi + echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT$model" + else + echo "Not running." 
+ fi +} + +logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -50 "$LOG_FILE" + else + echo "No log file found at $LOG_FILE" + fi +} + +list() { + echo "Available models in $MODEL_DIR:" + _list_model_names | while read -r name; do + echo " - $name" + done +} + +CMD="${1:-help}" +case "$CMD" in + start) start "${2:-}" ;; + stop) stop ;; + restart) restart "${2:-}" ;; + status) status ;; + logs) logs ;; + list) list ;; + *) + echo "Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]" + echo "" + echo " start [model] Start vLLM with the specified model (default: first in $MODEL_DIR)" + echo " stop Stop the running vLLM server" + echo " restart [model] Stop then start (pass a new model name to swap)" + echo " status Show whether it's running and which model is loaded" + echo " logs Tail the last 50 lines of the log" + echo " list List available models" + echo "" + echo " GPU: $GPU (CUDA_VISIBLE_DEVICES)" + echo " Port: $PORT" + ;; +esac diff --git a/scripts/match.py b/scripts/match.py new file mode 100644 index 0000000..af1d000 --- /dev/null +++ b/scripts/match.py @@ -0,0 +1,156 @@ +""" +Resume match scoring. + +Two modes: + 1. SQLite batch — score all unscored pending/approved jobs in staging.db + Usage: python scripts/match.py + + 2. 
def extract_page_id(url_or_id: str) -> str:
    """Extract the 32-character Notion page ID from a URL or raw ID.

    Accepts a bare 32-hex ID, a dashed UUID (8-4-4-4-12), or a URL that
    contains either form. Matching is case-insensitive and the result is
    returned lower-cased without dashes.

    The ID is located in the text as given, before any dashes are removed,
    and must not be adjacent to other hex characters. Removing dashes first
    would let hex letters at the end of a title slug fuse with the ID
    (``Sales-Lead-<id>`` would yield ``ead`` plus a truncated ID).

    Args:
        url_or_id: Notion page URL, dashed UUID, or bare ID.

    Returns:
        The 32-character hex ID, or the stripped input when no ID is found.
    """
    text = url_or_id.strip()
    hex_run = r"[0-9a-f]"
    bounded_id = re.compile(
        rf"(?<!{hex_run})"
        rf"(?:{hex_run}{{32}}"
        rf"|{hex_run}{{8}}-{hex_run}{{4}}-{hex_run}{{4}}-{hex_run}{{4}}-{hex_run}{{12}})"
        rf"(?!{hex_run})",
        re.IGNORECASE,
    )
    match = bounded_id.search(text)
    if match is None:
        # Lenient fallback for irregularly dashed IDs: drop all dashes and
        # take the first run of 32 hex characters.
        match = re.search(rf"{hex_run}{{32}}", text.replace("-", ""), re.IGNORECASE)
    if match is None:
        return text
    return match.group(0).replace("-", "").lower()
def score_pending_jobs(db_path: Path = None) -> int:
    """
    Score every unscored job in SQLite against the resume.

    Selects rows from ``jobs`` whose ``match_score`` is NULL and whose
    ``description`` is non-empty and not the literal string 'nan',
    regardless of job status. Each description is scored with
    ``match_score`` and the result is written back through
    ``write_match_scores``. A failure on one job is reported and does not
    stop the batch.

    Args:
        db_path: SQLite database path; defaults to ``scripts.db.DEFAULT_DB``.

    Returns:
        The number of jobs successfully scored.
    """
    import sqlite3
    from contextlib import closing

    from scripts.db import DEFAULT_DB, write_match_scores

    if db_path is None:
        db_path = DEFAULT_DB

    # closing() guarantees the connection is released even if the query
    # raises (for example when the schema lacks an expected column).
    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, title, company, description FROM jobs "
            "WHERE match_score IS NULL "
            "AND description IS NOT NULL AND description != '' AND description != 'nan'"
        ).fetchall()

    if not rows:
        print("[match] No unscored jobs with descriptions found.")
        return 0

    # Parse the resume once; it is the same for every job.
    resume_text = read_resume_text()
    scored = 0
    for row in rows:
        job_id, title, company, description = row["id"], row["title"], row["company"], row["description"]
        try:
            score, gaps = match_score(resume_text, description)
            write_match_scores(db_path, job_id, score, ", ".join(gaps))
            print(f"[match] {title} @ {company}: {score}/100  gaps: {', '.join(gaps) or 'none'}")
            scored += 1
        except Exception as e:
            # Best-effort batch: report the failure and continue with the rest.
            print(f"[match] Error scoring job {job_id}: {e}")

    print(f"[match] Done — {scored} jobs scored.")
    return scored
def strip_greeting(text: str) -> str:
    """Return the letter without its salutation.

    The first line that starts with "dear " (case-insensitive, ignoring
    surrounding whitespace) is treated as the greeting. That line,
    everything before it, and any blank lines directly after it are
    dropped. When no greeting is found the whole text is returned. The
    result is stripped of leading and trailing whitespace either way.
    """
    rows = text.splitlines()
    greeting_at = next(
        (idx for idx, row in enumerate(rows) if row.strip().lower().startswith("dear ")),
        None,
    )
    if greeting_at is None:
        return text.strip()

    # Advance past the greeting and the blank lines that follow it.
    body_start = greeting_at + 1
    while body_start < len(rows) and not rows[body_start].strip():
        body_start += 1
    return "\n".join(rows[body_start:]).strip()
def write_jsonl(records: list[dict], output_path: Path) -> None:
    """Write *records* to *output_path* as UTF-8 JSON Lines.

    Missing parent directories are created. Each record is serialised on
    its own line with non-ASCII characters kept as-is. An existing file is
    overwritten.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
        )
in records: + print(f" {r['source_file']!r:55s} → {r['instruction'][:70]}") + return + + output_path = Path(args.output) + write_jsonl(records, output_path) + print(f"Wrote {len(records)} training records to {output_path}") + print() + print("Next step for LoRA fine-tuning:") + print(" 1. Download base model: huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct") + print(" 2. Fine-tune with TRL: see docs/plans/lora-finetune.md (to be created)") + print(" 3. Or use HuggingFace Jobs: bash scripts/manage-ui.sh — hugging-face-model-trainer skill") + + +if __name__ == "__main__": + main() diff --git a/scripts/scrape_url.py b/scripts/scrape_url.py new file mode 100644 index 0000000..e577fe6 --- /dev/null +++ b/scripts/scrape_url.py @@ -0,0 +1,228 @@ +# scripts/scrape_url.py +""" +Scrape a job listing from its URL and update the job record. + +Supports: + - LinkedIn (guest jobs API — no auth required) + - Indeed (HTML parse) + - Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py) + - Generic (JSON-LD → og:tags fallback) + +Usage (background task — called by task_runner): + from scripts.scrape_url import scrape_job_url + scrape_job_url(db_path, job_id) +""" +import json +import re +import sqlite3 +import sys +from pathlib import Path +from typing import Optional +from urllib.parse import urlparse, urlencode, parse_qsl + +import requests +from bs4 import BeautifulSoup + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.db import DEFAULT_DB, update_job_fields + +_STRIP_PARAMS = { + "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term", + "trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig", + "eid", "otpToken", "ssid", "fmid", +} + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) +} +_TIMEOUT = 12 + + +def _detect_board(url: str) -> str: + """Return 'linkedin', 'indeed', 'glassdoor', or 
'generic'.""" + url_lower = url.lower() + if "linkedin.com" in url_lower: + return "linkedin" + if "indeed.com" in url_lower: + return "indeed" + if "glassdoor.com" in url_lower: + return "glassdoor" + return "generic" + + +def _extract_linkedin_job_id(url: str) -> Optional[str]: + """Extract numeric job ID from a LinkedIn job URL.""" + m = re.search(r"/jobs/view/(\d+)", url) + return m.group(1) if m else None + + +def canonicalize_url(url: str) -> str: + """ + Strip tracking parameters from a job URL and return a clean canonical form. + + LinkedIn: https://www.linkedin.com/jobs/view//?trk=... → https://www.linkedin.com/jobs/view// + Others: strips utm_source/utm_medium/utm_campaign/trk/refId/trackingId + """ + url = url.strip() + if "linkedin.com" in url.lower(): + job_id = _extract_linkedin_job_id(url) + if job_id: + return f"https://www.linkedin.com/jobs/view/{job_id}/" + parsed = urlparse(url) + clean_qs = urlencode([(k, v) for k, v in parse_qsl(parsed.query) if k not in _STRIP_PARAMS]) + return parsed._replace(query=clean_qs).geturl() + + +def _scrape_linkedin(url: str) -> dict: + """Fetch via LinkedIn guest jobs API (no auth required).""" + job_id = _extract_linkedin_job_id(url) + if not job_id: + return {} + api_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}" + resp = requests.get(api_url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + def _text(selector, **kwargs): + tag = soup.find(selector, **kwargs) + return tag.get_text(strip=True) if tag else "" + + title = _text("h2", class_="top-card-layout__title") + company = _text("a", class_="topcard__org-name-link") or _text("span", class_="topcard__org-name-link") + location = _text("span", class_="topcard__flavor--bullet") + desc_div = soup.find("div", class_="show-more-less-html__markup") + description = desc_div.get_text(separator="\n", strip=True) if desc_div else "" + + return {k: v for k, v in { + "title": title, 
def _parse_json_ld_or_og(html: str) -> dict:
    """Extract job fields from JSON-LD structured data, then og: meta tags.

    Every ``<script type="application/ld+json">`` block is searched for a
    ``JobPosting`` node. The node may be the top-level object, an item of a
    top-level list, or an item of an ``@graph`` list, and its ``@type`` may
    be a string or a list. ``hiringOrganization``, ``jobLocation`` and
    ``address`` are accepted as either objects or plain strings.

    If no JobPosting is found, the title falls back to ``og:title`` (then
    the ``<title>`` tag) and the description to ``og:description``.

    Returns:
        A dict containing only the non-empty fields among ``title``,
        ``company``, ``location``, ``description`` and ``salary``.
    """
    soup = BeautifulSoup(html, "html.parser")

    def _is_job_posting(node) -> bool:
        if not isinstance(node, dict):
            return False
        kind = node.get("@type")
        return kind == "JobPosting" or (isinstance(kind, list) and "JobPosting" in kind)

    def _find_job_posting(data):
        # Unwrap an {"@graph": [...]} container when the object itself is
        # not the posting.
        if isinstance(data, dict) and not _is_job_posting(data):
            data = data.get("@graph") or []
        if isinstance(data, list):
            return next((node for node in data if _is_job_posting(node)), None)
        return data if _is_job_posting(data) else None

    def _name_of(value) -> str:
        # Values may be an object with a "name" or a bare string.
        if isinstance(value, dict):
            value = value.get("name", "")
        return value if isinstance(value, str) else ""

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (ValueError, TypeError):
            # Empty or malformed JSON-LD block: try the next one.
            continue

        posting = _find_job_posting(data)
        if posting is None:
            continue

        loc = posting.get("jobLocation") or {}
        if isinstance(loc, list):
            loc = loc[0] if loc else {}
        addr = (loc.get("address") or {}) if isinstance(loc, dict) else loc
        if isinstance(addr, dict):
            parts = (
                _name_of(addr.get(key))
                for key in ("addressLocality", "addressRegion", "addressCountry")
            )
            location = next((part for part in parts if part), "")
        else:
            location = _name_of(addr)

        return {k: v for k, v in {
            "title": posting.get("title", ""),
            "company": _name_of(posting.get("hiringOrganization")),
            "location": location,
            "description": posting.get("description", ""),
            "salary": str(posting.get("baseSalary", "")) if posting.get("baseSalary") else "",
        }.items() if v}

    def _meta(prop):
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        return tag.get("content", "") if tag else ""

    title_tag = soup.find("title")
    title = _meta("og:title") or (title_tag.get_text(strip=True) if title_tag else "")
    description = _meta("og:description")
    return {k: v for k, v in {"title": title, "description": description}.items() if v}
+ """ + if job_id is None: + return {} + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + if not row: + return {} + + url = row["url"] or "" + if not url.startswith("http"): + return {} + + board = _detect_board(url) + try: + if board == "linkedin": + fields = _scrape_linkedin(url) + elif board == "indeed": + fields = _scrape_indeed(url) + elif board == "glassdoor": + fields = _scrape_glassdoor(url) + else: + fields = _scrape_generic(url) + except requests.RequestException as exc: + print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}") + return {} + except Exception as exc: + print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}") + return {} + + if fields: + fields.pop("url", None) + update_job_fields(db_path, job_id, fields) + print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}") + + return fields diff --git a/scripts/sync.py b/scripts/sync.py new file mode 100644 index 0000000..ddb5634 --- /dev/null +++ b/scripts/sync.py @@ -0,0 +1,97 @@ +# scripts/sync.py +""" +Push approved jobs from SQLite staging to Notion. + +Usage: + conda run -n job-seeker python scripts/sync.py +""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import yaml +from datetime import datetime + +from notion_client import Client + +from scripts.db import DEFAULT_DB, get_jobs_by_status, update_job_status + +CONFIG_DIR = Path(__file__).parent.parent / "config" + + +def load_notion_config() -> dict: + return yaml.safe_load((CONFIG_DIR / "notion.yaml").read_text()) + + +def _build_properties(job: dict, fm: dict, include_optional: bool = True) -> dict: + """Build the Notion properties dict for a job. 
Optional fields (match_score, + keyword_gaps) are included by default but can be dropped for DBs that don't + have those columns yet.""" + props = { + fm["title_field"]: {"title": [{"text": {"content": job.get("salary") or job.get("title", "")}}]}, + fm["job_title"]: {"rich_text": [{"text": {"content": job.get("title", "")}}]}, + fm["company"]: {"rich_text": [{"text": {"content": job.get("company", "")}}]}, + fm["url"]: {"url": job.get("url") or None}, + fm["source"]: {"multi_select": [{"name": job.get("source", "unknown").title()}]}, + fm["status"]: {"select": {"name": fm["status_new"]}}, + fm["remote"]: {"checkbox": bool(job.get("is_remote", 0))}, + fm["date_found"]: {"date": {"start": job.get("date_found", datetime.now().isoformat()[:10])}}, + } + if include_optional: + score = job.get("match_score") + if score is not None and fm.get("match_score"): + props[fm["match_score"]] = {"number": score} + gaps = job.get("keyword_gaps") + if gaps and fm.get("keyword_gaps"): + props[fm["keyword_gaps"]] = {"rich_text": [{"text": {"content": gaps}}]} + return props + + +def sync_to_notion(db_path: Path = DEFAULT_DB) -> int: + """Push all approved and applied jobs to Notion. Returns count synced.""" + cfg = load_notion_config() + notion = Client(auth=cfg["token"]) + db_id = cfg["database_id"] + fm = cfg["field_map"] + + approved = get_jobs_by_status(db_path, "approved") + applied = get_jobs_by_status(db_path, "applied") + pending_sync = approved + applied + if not pending_sync: + print("[sync] No approved/applied jobs to sync.") + return 0 + + synced_ids = [] + for job in pending_sync: + try: + notion.pages.create( + parent={"database_id": db_id}, + properties=_build_properties(job, fm, include_optional=True), + ) + synced_ids.append(job["id"]) + print(f"[sync] + {job.get('title')} @ {job.get('company')}") + except Exception as e: + err = str(e) + # Notion returns 400 validation_error when a property column doesn't exist yet. 
def submit_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                job_id: int = None) -> tuple[int, bool]:
    """Queue a background task and start its worker thread.

    ``insert_task`` decides whether the task is new. Only a new task gets a
    daemon thread running ``_run_task``; otherwise the ID it reports is
    handed back unchanged.

    Returns:
        ``(task_id, True)`` when a new task was queued and a thread started,
        ``(existing_id, False)`` when an identical task is already in flight.
    """
    task_id, is_new = insert_task(db_path, task_type, job_id)
    if not is_new:
        return task_id, is_new

    worker = threading.Thread(
        target=_run_task,
        args=(db_path, task_id, task_type, job_id),
        daemon=True,
    )
    worker.start()
    return task_id, is_new
from scripts.scrape_url import scrape_job_url + fields = scrape_job_url(db_path, job_id) + title = fields.get("title") or job.get("url", "?") + company = fields.get("company", "") + msg = f"{title}" + (f" @ {company}" if company else "") + update_task_status(db_path, task_id, "completed", error=msg) + # Auto-enrich company/salary for Craigslist jobs + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + job_row = conn.execute( + "SELECT source, company FROM jobs WHERE id=?", (job_id,) + ).fetchone() + conn.close() + if job_row and job_row["source"] == "craigslist" and not job_row["company"]: + submit_task(db_path, "enrich_craigslist", job_id) + return + + elif task_type == "enrich_craigslist": + from scripts.enrich_descriptions import enrich_craigslist_fields + extracted = enrich_craigslist_fields(db_path, job_id) + company = extracted.get("company", "") + msg = f"company={company}" if company else "no company found" + update_task_status(db_path, task_id, "completed", error=msg) + return + + elif task_type == "email_sync": + try: + from scripts.imap_sync import sync_all + result = sync_all(db_path, + on_stage=lambda s: update_task_stage(db_path, task_id, s)) + leads = result.get("new_leads", 0) + todo = result.get("todo_attached", 0) + errs = len(result.get("errors", [])) + msg = ( + f"{result['synced']} jobs updated, " + f"+{result['inbound']} in, +{result['outbound']} out" + + (f", {leads} new lead(s)" if leads else "") + + (f", {todo} todo attached" if todo else "") + + (f", {errs} error(s)" if errs else "") + ) + update_task_status(db_path, task_id, "completed", error=msg) + return + except FileNotFoundError: + update_task_status(db_path, task_id, "failed", + error="Email not configured — go to Settings → Email") + return + + else: + raise ValueError(f"Unknown task_type: {task_type!r}") + + update_task_status(db_path, task_id, "completed") + + except BaseException as exc: + # BaseException catches SystemExit (from companyScraper sys.exit calls) + 
def _classify(subject: str, body: str, model_override: str, fallback_order: list) -> str:
    """Classify one email with the given model and return a label.

    The prompt contains the subject and the first 600 characters of the
    body. Any ``<think>…</think>`` reasoning block is stripped from the
    model's reply before it is matched against ``_CLASSIFY_LABELS``.

    A label the reply starts with wins. Otherwise the first label (in
    ``_CLASSIFY_LABELS`` order) found anywhere in the reply is used.

    Returns:
        The matched label, ``"? (<reply prefix>)"`` when no label matches,
        or ``"ERR: <message>"`` (message truncated to 20 characters) when
        the call fails. This function never raises for ordinary errors so a
        single failing model does not abort the comparison table.
    """
    try:
        prompt = f"Subject: {subject}\n\nEmail: {body[:600]}"
        raw = _ROUTER.complete(
            prompt,
            system=_CLASSIFY_SYSTEM,
            model_override=model_override,
            fallback_order=fallback_order,
        )
        # Drop reasoning blocks so labels mentioned while "thinking" do not
        # leak into the result.
        text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).lower().strip()

        for label in _CLASSIFY_LABELS:
            if text.startswith(label):
                return label
        for label in _CLASSIFY_LABELS:
            if label in text:
                return label
        return f"? ({text[:30]})"
    except Exception as e:
        return f"ERR: {e!s:.20}"
parsed["body"] + + has_rk = _has_recruitment_keyword(subj) + phrase_block = _has_rejection_or_ats_signal(subj, body) + + if args.dry_run: + rk_mark = "✓" if has_rk else "✗" + pb_mark = "BLOCK" if phrase_block else "pass" + line = f"{_short(subj):<56} {rk_mark:3} {pb_mark:7}" + if phrase_block and args.verbose: + reason = _explain_block(subj, body) + line += f" [{reason}]" + print(line) + continue + + if phrase_block or not has_rk: + skipped += 1 + rk_mark = "✓" if has_rk else "✗" + pb_mark = "BLOCK" if phrase_block else "pass" + print(f"{_short(subj):<56} {rk_mark:3} {pb_mark:7} {'—':<20} {'—':<20} {'—':<20}") + continue + + passed += 1 + results = {} + for name, (model, fallback) in MODELS.items(): + results[name] = _classify(subj, body, model, fallback) + + pb_mark = "pass" + print(f"{_short(subj):<56} {'✓':3} {pb_mark:7} " + f"{results['phi3']:<20} {results['llama3']:<20} {results['vllm']:<20}") + + if not args.dry_run: + print(f"\nPhrase-blocked or no-keyword: {skipped} | Reached LLMs: {passed}") + + try: + conn.logout() + except Exception: + pass + + +if __name__ == "__main__": + main() diff --git a/scripts/vision_service/environment.yml b/scripts/vision_service/environment.yml new file mode 100644 index 0000000..bbbe697 --- /dev/null +++ b/scripts/vision_service/environment.yml @@ -0,0 +1,17 @@ +name: job-seeker-vision +channels: + - conda-forge + - defaults +dependencies: + - python=3.11 + - pip + - pip: + - torch>=2.0.0 + - torchvision>=0.15.0 + - transformers>=4.40.0 + - accelerate>=0.26.0 + - bitsandbytes>=0.43.0 + - einops>=0.7.0 + - Pillow>=10.0.0 + - fastapi>=0.110.0 + - "uvicorn[standard]>=0.27.0" diff --git a/scripts/vision_service/main.py b/scripts/vision_service/main.py new file mode 100644 index 0000000..0cdbf3d --- /dev/null +++ b/scripts/vision_service/main.py @@ -0,0 +1,98 @@ +""" +Vision service — moondream2 inference for survey screenshot analysis. 
def _load_model() -> None:
    """Load moondream2 and its tokenizer into the module-level state.

    Does nothing when the model is already loaded. On CUDA the model is
    loaded with 4-bit quantization and ``device_map="auto"``; otherwise it
    is loaded normally and moved to the CPU.

    The model and tokenizer are built in local variables and published only
    when both are ready, tokenizer first, so a request that sees ``_model``
    set never finds ``_tokenizer`` still ``None``. ``_loading`` is cleared
    in a ``finally`` block so a failed load does not leave ``/health``
    reporting "loading" forever.

    NOTE(review): two requests arriving before the first load completes
    will both load the model. Add a lock if that matters in practice.
    """
    global _model, _tokenizer, _device, _loading
    if _model is not None:
        return

    _loading = True
    try:
        print("[vision] Loading moondream2…")
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_id = "vikhyatk/moondream2"
        revision = "2025-01-09"
        device = "cuda" if torch.cuda.is_available() else "cpu"

        if device == "cuda":
            from transformers import BitsAndBytesConfig
            bnb = BitsAndBytesConfig(load_in_4bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_id, revision=revision,
                quantization_config=bnb,
                trust_remote_code=True,
                device_map="auto",
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_id, revision=revision,
                trust_remote_code=True,
            )
            model.to(device)

        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

        # Publish only after everything loaded; _model last because it is
        # the "already loaded" sentinel checked at the top.
        _device = device
        _tokenizer = tokenizer
        _model = model
    finally:
        _loading = False

    print(f"[vision] moondream2 ready on {_device}")
response_model=AnalyzeResponse) +def analyze(req: AnalyzeRequest): + from PIL import Image + import torch + + _load_model() + + try: + image_data = base64.b64decode(req.image_base64) + image = Image.open(io.BytesIO(image_data)).convert("RGB") + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid image: {e}") + + with torch.no_grad(): + enc_image = _model.encode_image(image) + answer = _model.answer_question(enc_image, req.prompt, _tokenizer) + + return AnalyzeResponse(text=answer) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_company_research.py b/tests/test_company_research.py new file mode 100644 index 0000000..ea696dd --- /dev/null +++ b/tests/test_company_research.py @@ -0,0 +1,84 @@ +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.company_research import _score_experiences, _build_resume_context, _load_resume_and_keywords + + +RESUME = { + "experience_details": [ + { + "position": "Lead Technical Account Manager", + "company": "UpGuard", + "employment_period": "10/2022 - 05/2023", + "key_responsibilities": [ + {"r1": "Managed enterprise security accounts worth $2M ARR"}, + {"r2": "Led QBR cadence with C-suite stakeholders"}, + ], + }, + { + "position": "Founder and Principal Consultant", + "company": "M3 Consulting Services", + "employment_period": "07/2023 - Present", + "key_responsibilities": [ + {"r1": "Revenue operations consulting for SaaS clients"}, + {"r2": "Built customer success frameworks"}, + ], + }, + { + "position": "Customer Success Manager", + "company": "Generic Co", + "employment_period": "01/2020 - 09/2022", + "key_responsibilities": [ + {"r1": "Managed SMB portfolio"}, + ], + }, + ] +} + +KEYWORDS = ["ARR", "QBR", "enterprise", "security", "stakeholder"] +JD = "Looking for a TAM with enterprise ARR experience and QBR facilitation skills." 
+ + +def test_score_experiences_returns_sorted(): + """UpGuard entry should score highest — most keywords present in text and JD.""" + scored = _score_experiences(RESUME["experience_details"], KEYWORDS, JD) + assert scored[0]["company"] == "UpGuard" + + +def test_score_experiences_adds_score_key(): + """Each returned entry has a 'score' integer key.""" + scored = _score_experiences(RESUME["experience_details"], KEYWORDS, JD) + for e in scored: + assert isinstance(e["score"], int) + + +def test_build_resume_context_top2_in_full(): + """Top 2 experiences appear with full bullet detail.""" + ctx = _build_resume_context(RESUME, KEYWORDS, JD) + assert "Lead Technical Account Manager" in ctx + assert "Managed enterprise security accounts" in ctx + assert "Founder and Principal Consultant" in ctx + + +def test_build_resume_context_rest_condensed(): + """Remaining experiences appear as condensed one-liners, not full bullets.""" + ctx = _build_resume_context(RESUME, KEYWORDS, JD) + assert "Also in Alex" in ctx + assert "Generic Co" in ctx + # Generic Co bullets should NOT appear in full + assert "Managed SMB portfolio" not in ctx + + +def test_upguard_nda_low_score(): + """UpGuard name replaced with 'enterprise security vendor' when score < 3.""" + ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops") + assert "enterprise security vendor" in ctx + + +def test_load_resume_and_keywords_returns_lists(): + """_load_resume_and_keywords returns a tuple of (dict, list[str]).""" + resume, keywords = _load_resume_and_keywords() + assert isinstance(resume, dict) + assert isinstance(keywords, list) + assert all(isinstance(k, str) for k in keywords) diff --git a/tests/test_cover_letter.py b/tests/test_cover_letter.py new file mode 100644 index 0000000..558d261 --- /dev/null +++ b/tests/test_cover_letter.py @@ -0,0 +1,120 @@ +# tests/test_cover_letter.py +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock + + +# ── 
prepare_training_data tests ────────────────────────────────────────────── + +def test_extract_role_from_text(): + """extract_role_from_text pulls the role title from the opening sentence.""" + from scripts.prepare_training_data import extract_role_from_text + + text = "Dear Tailscale Hiring Team,\n\nI'm delighted to apply for the Customer Support Manager position at Tailscale." + assert extract_role_from_text(text) == "Customer Support Manager" + + +def test_extract_role_handles_missing(): + """extract_role_from_text returns empty string if no role found.""" + from scripts.prepare_training_data import extract_role_from_text + + assert extract_role_from_text("Dear Team,\n\nHello there.") == "" + + +def test_extract_company_from_filename(): + """extract_company_from_filename strips 'Cover Letter' suffix.""" + from scripts.prepare_training_data import extract_company_from_filename + + assert extract_company_from_filename("Tailscale Cover Letter") == "Tailscale" + assert extract_company_from_filename("Dagster Labs Cover Letter.md") == "Dagster Labs" + + +def test_strip_greeting(): + """strip_greeting removes the 'Dear X,' line and returns the body.""" + from scripts.prepare_training_data import strip_greeting + + text = "Dear Hiring Team,\n\nI'm delighted to apply for the CSM role.\n\nBest regards,\nAlex" + result = strip_greeting(text) + assert result.startswith("I'm delighted") + assert "Dear" not in result + + +def test_build_records_from_tmp_corpus(tmp_path): + """build_records parses a small corpus directory into training records.""" + from scripts.prepare_training_data import build_records + + letter = tmp_path / "Acme Corp Cover Letter.md" + letter.write_text( + "Dear Acme Hiring Team,\n\n" + "I'm delighted to apply for the Director of Customer Success position at Acme Corp. 
" + "With six years of experience, I bring strong skills.\n\n" + "Best regards,\nAlex Rivera" + ) + + records = build_records(tmp_path) + assert len(records) == 1 + assert "Acme Corp" in records[0]["instruction"] + assert "Director of Customer Success" in records[0]["instruction"] + assert records[0]["output"].startswith("I'm delighted") + + +def test_build_records_skips_empty_files(tmp_path): + """build_records ignores empty or very short files.""" + from scripts.prepare_training_data import build_records + + (tmp_path / "Empty Cover Letter.md").write_text("") + (tmp_path / "Tiny Cover Letter.md").write_text("Hi") + + records = build_records(tmp_path) + assert len(records) == 0 + + +# ── generate_cover_letter tests ─────────────────────────────────────────────── + +def test_find_similar_letters_returns_top_k(): + """find_similar_letters returns at most top_k entries.""" + from scripts.generate_cover_letter import find_similar_letters + + corpus = [ + {"company": "Acme", "text": "customer success technical account management SaaS"}, + {"company": "Beta", "text": "software engineering backend python"}, + {"company": "Gamma", "text": "customer onboarding enterprise NPS"}, + {"company": "Delta", "text": "customer success manager renewal QBR"}, + ] + results = find_similar_letters("customer success manager enterprise SaaS", corpus, top_k=2) + assert len(results) == 2 + # Should prefer customer success companies over software engineering + companies = [r["company"] for r in results] + assert "Beta" not in companies + + +def test_load_corpus_returns_list(): + """load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash).""" + from scripts.generate_cover_letter import load_corpus, LETTERS_DIR + + if LETTERS_DIR.exists(): + corpus = load_corpus() + assert isinstance(corpus, list) + if corpus: + assert "company" in corpus[0] + assert "text" in corpus[0] + else: + pytest.skip("LETTERS_DIR not present in this environment") + + +def 
test_generate_calls_llm_router(): + """generate() calls the router's complete() and returns its output.""" + from scripts.generate_cover_letter import generate + + fake_corpus = [ + {"company": "Acme", "text": "I'm delighted to apply for the CSM role at Acme."}, + ] + mock_router = MagicMock() + mock_router.complete.return_value = "Dear Hiring Team,\n\nI'm delighted to apply.\n\nWarm regards,\nAlex Rivera" + + with patch("scripts.generate_cover_letter.load_corpus", return_value=fake_corpus): + result = generate("Customer Success Manager", "TestCo", "Looking for a CSM", + _router=mock_router) + + mock_router.complete.assert_called_once() + assert "Alex Rivera" in result diff --git a/tests/test_craigslist.py b/tests/test_craigslist.py new file mode 100644 index 0000000..1fccaf4 --- /dev/null +++ b/tests/test_craigslist.py @@ -0,0 +1,211 @@ +"""Tests for Craigslist RSS scraper.""" +from datetime import datetime, timezone, timedelta +from email.utils import format_datetime +from unittest.mock import patch, MagicMock +import xml.etree.ElementTree as ET + +import pytest +import requests + + +# ── RSS fixture helpers ──────────────────────────────────────────────────────── + +def _make_rss(items: list[dict]) -> bytes: + """Build minimal Craigslist-style RSS XML from a list of item dicts.""" + channel = ET.Element("channel") + for item_data in items: + item = ET.SubElement(channel, "item") + for tag, value in item_data.items(): + el = ET.SubElement(item, tag) + el.text = value + rss = ET.Element("rss") + rss.append(channel) + return ET.tostring(rss, encoding="utf-8", xml_declaration=True) + + +def _pubdate(hours_ago: float = 1.0) -> str: + """Return an RFC 2822 pubDate string for N hours ago.""" + dt = datetime.now(tz=timezone.utc) - timedelta(hours=hours_ago) + return format_datetime(dt) + + +def _mock_resp(content: bytes, status_code: int = 200) -> MagicMock: + mock = MagicMock() + mock.status_code = status_code + mock.content = content + mock.raise_for_status = 
MagicMock() + if status_code >= 400: + mock.raise_for_status.side_effect = requests.HTTPError(f"HTTP {status_code}") + return mock + + +# ── Fixtures ────────────────────────────────────────────────────────────────── + +_SAMPLE_RSS = _make_rss([{ + "title": "Customer Success Manager", + "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1234567890.html", + "description": "Great CSM role at Acme Corp. Salary $120k.", + "pubDate": _pubdate(1), +}]) + +_TWO_ITEM_RSS = _make_rss([ + { + "title": "Customer Success Manager", + "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1111111111.html", + "description": "CSM role 1.", + "pubDate": _pubdate(1), + }, + { + "title": "Account Manager", + "link": "https://sfbay.craigslist.org/jjj/d/am-role/2222222222.html", + "description": "AM role.", + "pubDate": _pubdate(2), + }, +]) + +_OLD_ITEM_RSS = _make_rss([{ + "title": "Old Job", + "link": "https://sfbay.craigslist.org/jjj/d/old-job/9999999999.html", + "description": "Very old posting.", + "pubDate": _pubdate(hours_ago=500), +}]) + +_TWO_METRO_CONFIG = { + "metros": ["sfbay", "newyork"], + "location_map": { + "San Francisco Bay Area, CA": "sfbay", + "New York, NY": "newyork", + }, + "category": "jjj", +} + +_SINGLE_METRO_CONFIG = { + "metros": ["sfbay"], + "location_map": {"San Francisco Bay Area, CA": "sfbay"}, +} + +_PROFILE = {"titles": ["Customer Success Manager"], "hours_old": 240} + + +# ── Tests ───────────────────────────────────────────────────────────────────── + +def test_scrape_returns_empty_on_missing_config(): + """Missing craigslist.yaml → returns [] without raising.""" + from scripts.custom_boards import craigslist + with patch("scripts.custom_boards.craigslist._load_config", + side_effect=FileNotFoundError("config not found")): + result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA") + assert result == [] + + +def test_scrape_remote_hits_all_metros(): + """location='Remote' triggers one RSS fetch per configured metro.""" + with 
patch("scripts.custom_boards.craigslist._load_config", + return_value=_TWO_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + return_value=_mock_resp(_SAMPLE_RSS)) as mock_get: + from scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "Remote") + + assert mock_get.call_count == 2 + fetched_urls = [call.args[0] for call in mock_get.call_args_list] + assert any("sfbay" in u for u in fetched_urls) + assert any("newyork" in u for u in fetched_urls) + assert all(r["is_remote"] for r in result) + + +def test_scrape_location_map_resolves(): + """Known location string maps to exactly one metro.""" + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_TWO_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + return_value=_mock_resp(_SAMPLE_RSS)) as mock_get: + from scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA") + + assert mock_get.call_count == 1 + assert "sfbay" in mock_get.call_args.args[0] + assert len(result) == 1 + assert result[0]["is_remote"] is False + + +def test_scrape_location_not_in_map_returns_empty(): + """Location not in location_map → [] without raising.""" + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_SINGLE_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get") as mock_get: + from scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "Portland, OR") + + assert result == [] + mock_get.assert_not_called() + + +def test_hours_old_filter(): + """Items older than hours_old are excluded.""" + profile = {"titles": ["Customer Success Manager"], "hours_old": 48} + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_SINGLE_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + return_value=_mock_resp(_OLD_ITEM_RSS)): + from scripts.custom_boards import craigslist + result = 
craigslist.scrape(profile, "San Francisco Bay Area, CA") + + assert result == [] + + +def test_dedup_within_run(): + """Same URL from two different metros is only returned once.""" + same_url_rss = _make_rss([{ + "title": "CSM Role", + "link": "https://sfbay.craigslist.org/jjj/d/csm/1234.html", + "description": "Same job.", + "pubDate": _pubdate(1), + }]) + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_TWO_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + return_value=_mock_resp(same_url_rss)): + from scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "Remote") + + urls = [r["url"] for r in result] + assert len(urls) == len(set(urls)) + + +def test_http_error_graceful(): + """HTTP error → [] without raising.""" + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_SINGLE_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + side_effect=requests.RequestException("timeout")): + from scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA") + + assert result == [] + + +def test_malformed_xml_graceful(): + """Malformed RSS XML → [] without raising.""" + bad_resp = MagicMock() + bad_resp.content = b"this is not xml <<<<" + bad_resp.raise_for_status = MagicMock() + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_SINGLE_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + return_value=bad_resp): + from scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA") + assert result == [] + + +def test_results_wanted_cap(): + """Never returns more than results_wanted items.""" + with patch("scripts.custom_boards.craigslist._load_config", + return_value=_TWO_METRO_CONFIG): + with patch("scripts.custom_boards.craigslist.requests.get", + return_value=_mock_resp(_TWO_ITEM_RSS)): + from 
scripts.custom_boards import craigslist + result = craigslist.scrape(_PROFILE, "Remote", results_wanted=1) + + assert len(result) <= 1 diff --git a/tests/test_db.py b/tests/test_db.py new file mode 100644 index 0000000..95e7ca7 --- /dev/null +++ b/tests/test_db.py @@ -0,0 +1,560 @@ +import pytest +import sqlite3 +from pathlib import Path +from unittest.mock import patch + + +def test_init_db_creates_jobs_table(tmp_path): + """init_db creates a jobs table with correct schema.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + conn = sqlite3.connect(db_path) + cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'") + assert cursor.fetchone() is not None + conn.close() + + +def test_insert_job_returns_id(tmp_path): + """insert_job inserts a row and returns its id.""" + from scripts.db import init_db, insert_job + db_path = tmp_path / "test.db" + init_db(db_path) + job = { + "title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "$100k", "description": "Great role", "date_found": "2026-02-20", + } + row_id = insert_job(db_path, job) + assert isinstance(row_id, int) + assert row_id > 0 + + +def test_insert_job_skips_duplicate_url(tmp_path): + """insert_job returns None if URL already exists.""" + from scripts.db import init_db, insert_job + db_path = tmp_path / "test.db" + init_db(db_path) + job = {"title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20"} + insert_job(db_path, job) + result = insert_job(db_path, job) + assert result is None + + +def test_get_jobs_by_status(tmp_path): + """get_jobs_by_status returns only jobs with matching status.""" + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + db_path = tmp_path / "test.db" + 
init_db(db_path) + job = {"title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20"} + row_id = insert_job(db_path, job) + update_job_status(db_path, [row_id], "approved") + approved = get_jobs_by_status(db_path, "approved") + pending = get_jobs_by_status(db_path, "pending") + assert len(approved) == 1 + assert len(pending) == 0 + + +def test_update_job_status_batch(tmp_path): + """update_job_status updates multiple rows at once.""" + from scripts.db import init_db, insert_job, update_job_status, get_jobs_by_status + db_path = tmp_path / "test.db" + init_db(db_path) + ids = [] + for i in range(3): + job = {"title": f"Job {i}", "company": "Co", "url": f"https://example.com/{i}", + "source": "indeed", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20"} + ids.append(insert_job(db_path, job)) + update_job_status(db_path, ids, "rejected") + assert len(get_jobs_by_status(db_path, "rejected")) == 3 + + +def test_migrate_db_adds_columns_to_existing_db(tmp_path): + """_migrate_db adds cover_letter and applied_at to a db created without them.""" + import sqlite3 + from scripts.db import _migrate_db + db_path = tmp_path / "legacy.db" + # Create old-style table without the new columns + conn = sqlite3.connect(db_path) + conn.execute("""CREATE TABLE jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT, company TEXT, url TEXT UNIQUE, status TEXT DEFAULT 'pending' + )""") + conn.commit() + conn.close() + _migrate_db(db_path) + conn = sqlite3.connect(db_path) + cols = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()} + conn.close() + assert "cover_letter" in cols + assert "applied_at" in cols + + +def test_update_cover_letter(tmp_path): + """update_cover_letter persists text to the DB.""" + from scripts.db import init_db, insert_job, update_cover_letter, 
get_jobs_by_status + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + update_cover_letter(db_path, job_id, "Dear Hiring Manager,\nGreat role!") + rows = get_jobs_by_status(db_path, "pending") + assert rows[0]["cover_letter"] == "Dear Hiring Manager,\nGreat role!" + + +def test_mark_applied_sets_status_and_date(tmp_path): + """mark_applied sets status='applied' and populates applied_at.""" + from scripts.db import init_db, insert_job, mark_applied, get_jobs_by_status + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + mark_applied(db_path, [job_id]) + applied = get_jobs_by_status(db_path, "applied") + assert len(applied) == 1 + assert applied[0]["status"] == "applied" + assert applied[0]["applied_at"] is not None + + +# ── background_tasks tests ──────────────────────────────────────────────────── + +def test_init_db_creates_background_tasks_table(tmp_path): + """init_db creates a background_tasks table.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + import sqlite3 + conn = sqlite3.connect(db_path) + cur = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='background_tasks'" + ) + assert cur.fetchone() is not None + conn.close() + + +def test_insert_task_returns_id_and_true(tmp_path): + """insert_task returns (task_id, True) for a new task.""" + from scripts.db import init_db, insert_job, insert_task + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": 
"https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, is_new = insert_task(db_path, "cover_letter", job_id) + assert isinstance(task_id, int) and task_id > 0 + assert is_new is True + + +def test_insert_task_deduplicates_active_task(tmp_path): + """insert_task returns (existing_id, False) if a queued/running task already exists.""" + from scripts.db import init_db, insert_job, insert_task + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + first_id, _ = insert_task(db_path, "cover_letter", job_id) + second_id, is_new = insert_task(db_path, "cover_letter", job_id) + assert second_id == first_id + assert is_new is False + + +def test_insert_task_allows_different_types_same_job(tmp_path): + """insert_task allows cover_letter and company_research for the same job concurrently.""" + from scripts.db import init_db, insert_job, insert_task + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + _, cl_new = insert_task(db_path, "cover_letter", job_id) + _, res_new = insert_task(db_path, "company_research", job_id) + assert cl_new is True + assert res_new is True + + +def test_update_task_status_running(tmp_path): + """update_task_status('running') sets started_at.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status + import sqlite3 + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + 
"source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, task_id, "running") + conn = sqlite3.connect(db_path) + row = conn.execute("SELECT status, started_at FROM background_tasks WHERE id=?", (task_id,)).fetchone() + conn.close() + assert row[0] == "running" + assert row[1] is not None + + +def test_update_task_status_completed(tmp_path): + """update_task_status('completed') sets finished_at.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status + import sqlite3 + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, task_id, "completed") + conn = sqlite3.connect(db_path) + row = conn.execute("SELECT status, finished_at FROM background_tasks WHERE id=?", (task_id,)).fetchone() + conn.close() + assert row[0] == "completed" + assert row[1] is not None + + +def test_update_task_status_failed_stores_error(tmp_path): + """update_task_status('failed') stores error message and sets finished_at.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status + import sqlite3 + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + task_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, task_id, "failed", error="LLM timeout") + conn = sqlite3.connect(db_path) + row = conn.execute("SELECT status, error, 
finished_at FROM background_tasks WHERE id=?", (task_id,)).fetchone() + conn.close() + assert row[0] == "failed" + assert row[1] == "LLM timeout" + assert row[2] is not None + + +def test_get_active_tasks_returns_only_active(tmp_path): + """get_active_tasks returns only queued/running tasks with job info joined.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status, get_active_tasks + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + active_id, _ = insert_task(db_path, "cover_letter", job_id) + done_id, _ = insert_task(db_path, "company_research", job_id) + update_task_status(db_path, done_id, "completed") + + tasks = get_active_tasks(db_path) + assert len(tasks) == 1 + assert tasks[0]["id"] == active_id + assert tasks[0]["company"] == "Acme" + assert tasks[0]["title"] == "CSM" + + +def test_get_task_for_job_returns_latest(tmp_path): + """get_task_for_job returns the most recent task for the given type+job.""" + from scripts.db import init_db, insert_job, insert_task, update_task_status, get_task_for_job + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + first_id, _ = insert_task(db_path, "cover_letter", job_id) + update_task_status(db_path, first_id, "completed") + second_id, _ = insert_task(db_path, "cover_letter", job_id) # allowed since first is done + + task = get_task_for_job(db_path, "cover_letter", job_id) + assert task is not None + assert task["id"] == second_id + + +def test_get_task_for_job_returns_none_when_absent(tmp_path): + """get_task_for_job returns None when no 
task exists for that job+type.""" + from scripts.db import init_db, insert_job, get_task_for_job + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + assert get_task_for_job(db_path, "cover_letter", job_id) is None + + +# ── company_research new-column tests ───────────────────────────────────────── + +def test_company_research_has_new_columns(tmp_path): + """init_db creates company_research with the four extended columns.""" + from scripts.db import init_db + db = tmp_path / "test.db" + init_db(db) + conn = sqlite3.connect(db) + cols = [r[1] for r in conn.execute("PRAGMA table_info(company_research)").fetchall()] + conn.close() + assert "tech_brief" in cols + assert "funding_brief" in cols + assert "competitors_brief" in cols + assert "red_flags" in cols + +def test_save_and_get_research_new_fields(tmp_path): + """save_research persists and get_research returns the four new brief fields.""" + from scripts.db import init_db, insert_job, save_research, get_research + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "TAM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + + save_research(db, job_id=job_id, + company_brief="overview", ceo_brief="ceo", + talking_points="points", raw_output="raw", + tech_brief="tech stack", funding_brief="series B", + competitors_brief="vs competitors", red_flags="none") + r = get_research(db, job_id=job_id) + assert r["tech_brief"] == "tech stack" + assert r["funding_brief"] == "series B" + assert r["competitors_brief"] == "vs competitors" + assert r["red_flags"] == "none" + + +# ── stage_signal / suggestion_dismissed tests 
───────────────────────────────── + +def test_stage_signal_columns_exist(tmp_path): + """init_db creates stage_signal and suggestion_dismissed columns on job_contacts.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + conn = sqlite3.connect(db_path) + cols = {row[1] for row in conn.execute("PRAGMA table_info(job_contacts)").fetchall()} + conn.close() + assert "stage_signal" in cols + assert "suggestion_dismissed" in cols + + +def test_add_contact_with_stage_signal(tmp_path): + """add_contact stores stage_signal when provided.""" + from scripts.db import init_db, insert_job, add_contact, get_contacts + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + add_contact(db_path, job_id=job_id, direction="inbound", + subject="Interview invite", stage_signal="interview_scheduled") + contacts = get_contacts(db_path, job_id=job_id) + assert contacts[0]["stage_signal"] == "interview_scheduled" + + +def test_get_unread_stage_signals(tmp_path): + """get_unread_stage_signals returns only non-neutral, non-dismissed signals.""" + from scripts.db import (init_db, insert_job, add_contact, + get_unread_stage_signals, dismiss_stage_signal) + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + c1 = add_contact(db_path, job_id=job_id, direction="inbound", + subject="Interview invite", stage_signal="interview_scheduled") + add_contact(db_path, job_id=job_id, direction="inbound", + subject="Auto-confirm", stage_signal="neutral") + signals = get_unread_stage_signals(db_path, job_id) + assert 
len(signals) == 1 + assert signals[0]["stage_signal"] == "interview_scheduled" + + dismiss_stage_signal(db_path, c1) + assert get_unread_stage_signals(db_path, job_id) == [] + + +def test_get_email_leads(tmp_path): + """get_email_leads returns only source='email' pending jobs.""" + from scripts.db import init_db, insert_job, get_email_leads + db_path = tmp_path / "test.db" + init_db(db_path) + insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + insert_job(db_path, { + "title": "TAM", "company": "Wiz", "url": "email://wiz.com/abc123", + "source": "email", "location": "", "is_remote": 0, + "salary": "", "description": "Hi Alex…", "date_found": "2026-02-21", + }) + leads = get_email_leads(db_path) + assert len(leads) == 1 + assert leads[0]["company"] == "Wiz" + assert leads[0]["source"] == "email" + + +def test_get_all_message_ids(tmp_path): + """get_all_message_ids returns all message IDs across jobs.""" + from scripts.db import init_db, insert_job, add_contact, get_all_message_ids + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-21", + }) + add_contact(db_path, job_id=job_id, message_id="") + add_contact(db_path, job_id=job_id, message_id="") + mids = get_all_message_ids(db_path) + assert "" in mids + assert "" in mids + + +# ── survey_responses tests ──────────────────────────────────────────────────── + +def test_survey_responses_table_created(tmp_path): + """init_db creates survey_responses table.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + import sqlite3 + conn = sqlite3.connect(db_path) + cur = conn.execute( + "SELECT name FROM 
sqlite_master WHERE type='table' AND name='survey_responses'" + ) + assert cur.fetchone() is not None + conn.close() + + +def test_survey_at_column_exists(tmp_path): + """jobs table has survey_at column after init_db.""" + from scripts.db import init_db + db_path = tmp_path / "test.db" + init_db(db_path) + import sqlite3 + conn = sqlite3.connect(db_path) + cols = [row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()] + assert "survey_at" in cols + conn.close() + + +def test_insert_and_get_survey_response(tmp_path): + """insert_survey_response inserts a row; get_survey_responses returns it.""" + from scripts.db import init_db, insert_job, insert_survey_response, get_survey_responses + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-23", + }) + row_id = insert_survey_response( + db_path, job_id=job_id, survey_name="Culture Fit", + source="text_paste", raw_input="Q1: A B C", mode="quick", + llm_output="1. 
B — collaborative", reported_score="82%", + ) + assert isinstance(row_id, int) + responses = get_survey_responses(db_path, job_id=job_id) + assert len(responses) == 1 + assert responses[0]["survey_name"] == "Culture Fit" + assert responses[0]["reported_score"] == "82%" + + +def test_get_interview_jobs_includes_survey(tmp_path): + """get_interview_jobs returns survey-stage jobs.""" + from scripts.db import init_db, insert_job, update_job_status, get_interview_jobs + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/2", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-23", + }) + update_job_status(db_path, [job_id], "survey") + result = get_interview_jobs(db_path) + assert any(j["id"] == job_id for j in result.get("survey", [])) + + +def test_advance_to_survey_sets_survey_at(tmp_path): + """advance_to_stage('survey') sets survey_at timestamp.""" + from scripts.db import init_db, insert_job, update_job_status, advance_to_stage, get_job_by_id + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/3", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-23", + }) + update_job_status(db_path, [job_id], "applied") + advance_to_stage(db_path, job_id=job_id, stage="survey") + job = get_job_by_id(db_path, job_id=job_id) + assert job["status"] == "survey" + assert job["survey_at"] is not None + + +def test_update_job_fields(tmp_path): + from scripts.db import init_db, insert_job, update_job_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "Importing…", "company": "", "url": "https://example.com/job/1", + "source": "manual", "location": "", "description": "", "date_found": "2026-02-24", + }) + update_job_fields(db, 
job_id, { + "title": "Customer Success Manager", + "company": "Acme Corp", + "location": "San Francisco, CA", + "description": "Great role.", + "salary": "$120k", + "is_remote": 1, + }) + import sqlite3 + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()) + conn.close() + assert row["title"] == "Customer Success Manager" + assert row["company"] == "Acme Corp" + assert row["description"] == "Great role." + assert row["is_remote"] == 1 + + +def test_update_job_fields_ignores_unknown_columns(tmp_path): + from scripts.db import init_db, insert_job, update_job_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "Importing…", "company": "", "url": "https://example.com/job/2", + "source": "manual", "location": "", "description": "", "date_found": "2026-02-24", + }) + # Should not raise even with an unknown column + update_job_fields(db, job_id, {"title": "Real Title", "nonexistent_col": "ignored"}) + import sqlite3 + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()) + conn.close() + assert row["title"] == "Real Title" diff --git a/tests/test_discover.py b/tests/test_discover.py new file mode 100644 index 0000000..4cc0fee --- /dev/null +++ b/tests/test_discover.py @@ -0,0 +1,185 @@ +# tests/test_discover.py +import pytest +from unittest.mock import patch, MagicMock +import pandas as pd +from pathlib import Path + +SAMPLE_JOB = { + "title": "Customer Success Manager", + "company": "Acme Corp", + "location": "Remote", + "is_remote": True, + "job_url": "https://linkedin.com/jobs/view/123456", + "site": "linkedin", + "min_amount": 90000, + "max_amount": 120000, + "salary_source": "$90,000 - $120,000", + "description": "Great CS role", +} + +SAMPLE_FM = { + "title_field": "Salary", "job_title": "Job Title", "company": "Company Name", + "url": "Role Link", 
"source": "Job Source", "status": "Status of Application", + "status_new": "Application Submitted", "date_found": "Date Found", + "remote": "Remote", "match_score": "Match Score", + "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description", +} + +SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM} +SAMPLE_PROFILES_CFG = { + "profiles": [{"name": "cs", "titles": ["Customer Success Manager"], + "locations": ["Remote"], "boards": ["linkedin"], + "results_per_board": 5, "hours_old": 72}] +} + + +def make_jobs_df(jobs=None): + return pd.DataFrame(jobs or [SAMPLE_JOB]) + + +def test_discover_writes_to_sqlite(tmp_path): + """run_discovery inserts new jobs into SQLite staging db.""" + from scripts.discover import run_discovery + from scripts.db import get_jobs_by_status + + db_path = tmp_path / "test.db" + with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \ + patch("scripts.discover.Client"): + run_discovery(db_path=db_path) + + jobs = get_jobs_by_status(db_path, "pending") + assert len(jobs) == 1 + assert jobs[0]["title"] == "Customer Success Manager" + + +def test_discover_skips_duplicate_urls(tmp_path): + """run_discovery does not insert a job whose URL is already in SQLite.""" + from scripts.discover import run_discovery + from scripts.db import init_db, insert_job, get_jobs_by_status + + db_path = tmp_path / "test.db" + init_db(db_path) + insert_job(db_path, { + "title": "Old", "company": "X", "url": "https://linkedin.com/jobs/view/123456", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-01-01", + }) + + with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \ + 
patch("scripts.discover.Client"): + run_discovery(db_path=db_path) + + jobs = get_jobs_by_status(db_path, "pending") + assert len(jobs) == 1 # only the pre-existing one, not a duplicate + + +def test_discover_pushes_new_jobs(tmp_path): + """Legacy: discover still calls push_to_notion when notion_push=True.""" + from scripts.discover import run_discovery + db_path = tmp_path / "test.db" + with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \ + patch("scripts.discover.push_to_notion") as mock_push, \ + patch("scripts.discover.get_existing_urls", return_value=set()), \ + patch("scripts.discover.Client"): + run_discovery(db_path=db_path, notion_push=True) + assert mock_push.call_count == 1 + + +def test_push_to_notion_sets_status_new(): + """push_to_notion always sets Status to the configured status_new value.""" + from scripts.discover import push_to_notion + mock_notion = MagicMock() + push_to_notion(mock_notion, "fake-db-id", SAMPLE_JOB, SAMPLE_FM) + call_kwargs = mock_notion.pages.create.call_args[1] + status = call_kwargs["properties"]["Status of Application"]["select"]["name"] + assert status == "Application Submitted" + + +# ── Custom boards integration ───────────────────────────────────────────────── + +_PROFILE_WITH_CUSTOM = { + "profiles": [{ + "name": "cs", "titles": ["Customer Success Manager"], + "locations": ["Remote"], "boards": [], + "custom_boards": ["adzuna"], + "results_per_board": 5, "hours_old": 72, + }] +} + +_ADZUNA_JOB = { + "title": "Customer Success Manager", + "company": "TestCo", + "url": "https://www.adzuna.com/jobs/details/999", + "source": "adzuna", + "location": "Remote", + "is_remote": True, + "salary": "$90,000 – $120,000", + "description": "Great remote CSM role", +} + + +def test_discover_custom_board_inserts_jobs(tmp_path): + """run_discovery dispatches custom_boards scrapers and inserts returned jobs.""" + from 
scripts.discover import run_discovery + from scripts.db import get_jobs_by_status + + db_path = tmp_path / "test.db" + with patch("scripts.discover.load_config", return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \ + patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}), \ + patch("scripts.discover.Client"): + count = run_discovery(db_path=db_path) + + assert count == 1 + jobs = get_jobs_by_status(db_path, "pending") + assert jobs[0]["title"] == "Customer Success Manager" + assert jobs[0]["source"] == "adzuna" + + +def test_discover_custom_board_skips_unknown(tmp_path, capsys): + """run_discovery logs and skips an unregistered custom board name.""" + from scripts.discover import run_discovery + + profile_unknown = { + "profiles": [{ + "name": "cs", "titles": ["CSM"], "locations": ["Remote"], + "boards": [], "custom_boards": ["nonexistent_board"], + "results_per_board": 5, "hours_old": 72, + }] + } + db_path = tmp_path / "test.db" + with patch("scripts.discover.load_config", return_value=(profile_unknown, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \ + patch("scripts.discover.Client"): + run_discovery(db_path=db_path) + + captured = capsys.readouterr() + assert "nonexistent_board" in captured.out + assert "Unknown scraper" in captured.out + + +def test_discover_custom_board_deduplicates(tmp_path): + """Custom board results are deduplicated by URL against pre-existing jobs.""" + from scripts.discover import run_discovery + from scripts.db import init_db, insert_job, get_jobs_by_status + + db_path = tmp_path / "test.db" + init_db(db_path) + insert_job(db_path, { + "title": "CSM", "company": "TestCo", + "url": "https://www.adzuna.com/jobs/details/999", + "source": "adzuna", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-01-01", + }) + + with 
patch("scripts.discover.load_config", return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG)), \ + patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \ + patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}), \ + patch("scripts.discover.Client"): + count = run_discovery(db_path=db_path) + + assert count == 0 # duplicate skipped + assert len(get_jobs_by_status(db_path, "pending")) == 1 diff --git a/tests/test_enrich_descriptions.py b/tests/test_enrich_descriptions.py new file mode 100644 index 0000000..f3df6e7 --- /dev/null +++ b/tests/test_enrich_descriptions.py @@ -0,0 +1,96 @@ +# tests/test_enrich_descriptions.py +"""Tests for scripts/enrich_descriptions.py — enrich_craigslist_fields().""" +from unittest.mock import patch, MagicMock +import sqlite3 + + +def test_enrich_craigslist_fields_skips_non_craigslist(tmp_path): + """Non-craigslist source → returns {} without calling LLM.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://example.com/1", + "source": "linkedin", "location": "", "description": "Some company here.", + "date_found": "2026-02-24", + }) + with patch("scripts.llm_router.LLMRouter") as mock_llm: + result = enrich_craigslist_fields(db, job_id) + assert result == {} + mock_llm.assert_not_called() + + +def test_enrich_craigslist_fields_skips_populated_company(tmp_path): + """Company already set → returns {} without calling LLM.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "Acme Corp", "url": "https://sfbay.craigslist.org/jjj/d/1.html", + "source": "craigslist", "location": "", "description": "Join Acme Corp today.", + "date_found": "2026-02-24", + }) + with 
patch("scripts.llm_router.LLMRouter") as mock_llm: + result = enrich_craigslist_fields(db, job_id) + assert result == {} + mock_llm.assert_not_called() + + +def test_enrich_craigslist_fields_skips_empty_description(tmp_path): + """Empty description → returns {} without calling LLM.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/2.html", + "source": "craigslist", "location": "", "description": "", + "date_found": "2026-02-24", + }) + with patch("scripts.llm_router.LLMRouter") as mock_llm: + result = enrich_craigslist_fields(db, job_id) + assert result == {} + mock_llm.assert_not_called() + + +def test_enrich_craigslist_fields_extracts_and_updates(tmp_path): + """Valid LLM response → updates company/salary in DB, returns extracted dict.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/3.html", + "source": "craigslist", "location": "", "description": "Join Acme Corp. 
Pay: $120k/yr.", + "date_found": "2026-02-24", + }) + mock_router = MagicMock() + mock_router.complete.return_value = '{"company": "Acme Corp", "salary": "$120k/yr"}' + with patch("scripts.llm_router.LLMRouter", return_value=mock_router): + result = enrich_craigslist_fields(db, job_id) + assert result == {"company": "Acme Corp", "salary": "$120k/yr"} + conn = sqlite3.connect(db) + row = conn.execute("SELECT company, salary FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row[0] == "Acme Corp" + assert row[1] == "$120k/yr" + + +def test_enrich_craigslist_fields_handles_bad_llm_json(tmp_path): + """Unparseable LLM response → returns {} without raising.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/4.html", + "source": "craigslist", "location": "", "description": "Great opportunity.", + "date_found": "2026-02-24", + }) + mock_router = MagicMock() + mock_router.complete.return_value = "Sorry, I cannot extract that." 
+ with patch("scripts.llm_router.LLMRouter", return_value=mock_router): + result = enrich_craigslist_fields(db, job_id) + assert result == {} diff --git a/tests/test_imap_sync.py b/tests/test_imap_sync.py new file mode 100644 index 0000000..d6d057b --- /dev/null +++ b/tests/test_imap_sync.py @@ -0,0 +1,330 @@ +"""Tests for imap_sync helpers (no live IMAP connection required).""" +import pytest +from unittest.mock import patch, MagicMock + + +def test_classify_stage_signal_interview(): + """classify_stage_signal returns interview_scheduled for a call-scheduling email.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "interview_scheduled" + result = classify_stage_signal( + "Let's schedule a call", + "Hi Alex, we'd love to book a 30-min phone screen with you.", + ) + assert result == "interview_scheduled" + + +def test_classify_stage_signal_returns_none_on_error(): + """classify_stage_signal returns None when LLM call raises.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.side_effect = RuntimeError("model not loaded") + result = classify_stage_signal("subject", "body") + assert result is None + + +def test_classify_stage_signal_strips_think_tags(): + """classify_stage_signal strips ... 
blocks before parsing.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "Let me think...\nrejected" + result = classify_stage_signal("Update on your application", "We went with another candidate.") + assert result == "rejected" + + +def test_normalise_company(): + """_normalise_company strips legal suffixes.""" + from scripts.imap_sync import _normalise_company + assert _normalise_company("DataStax, Inc.") == "DataStax" + assert _normalise_company("Wiz Ltd") == "Wiz" + assert _normalise_company("Crusoe Energy") == "Crusoe Energy" + + +def test_company_search_terms_excludes_job_board_sld(): + """Job-board domains like linkedin.com are never used as match terms.""" + from scripts.imap_sync import _company_search_terms + # LinkedIn-sourced job: SLD "linkedin" must not appear in the terms + terms = _company_search_terms("Bamboo Health", "https://www.linkedin.com/jobs/view/123") + assert "linkedin" not in terms + assert "bamboo health" in terms + + # Company with its own domain: SLD should be included + terms = _company_search_terms("Crusoe Energy", "https://crusoe.ai/jobs/456") + assert "crusoe" in terms + + # Indeed-sourced job: "indeed" excluded + terms = _company_search_terms("DoorDash", "https://www.indeed.com/viewjob?jk=abc") + assert "indeed" not in terms + assert "doordash" in terms + + +def test_has_recruitment_keyword(): + """_has_recruitment_keyword matches known keywords.""" + from scripts.imap_sync import _has_recruitment_keyword + assert _has_recruitment_keyword("Interview Invitation — Senior TAM") + assert _has_recruitment_keyword("Your application with DataStax") + assert not _has_recruitment_keyword("Team lunch tomorrow") + + +def test_extract_lead_info_returns_company_and_title(): + """extract_lead_info parses LLM JSON response into (company, title).""" + from scripts.imap_sync import extract_lead_info + with 
patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = '{"company": "Wiz", "title": "Senior TAM"}' + result = extract_lead_info("Senior TAM at Wiz", "Hi Alex, we have a role…", "recruiter@wiz.com") + assert result == ("Wiz", "Senior TAM") + + +def test_extract_lead_info_returns_none_on_bad_json(): + """extract_lead_info returns (None, None) when LLM returns unparseable output.""" + from scripts.imap_sync import extract_lead_info + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "I cannot determine the company." + result = extract_lead_info("Job opportunity", "blah", "noreply@example.com") + assert result == (None, None) + + +def test_classify_labels_includes_survey_received(): + """_CLASSIFY_LABELS includes survey_received.""" + from scripts.imap_sync import _CLASSIFY_LABELS + assert "survey_received" in _CLASSIFY_LABELS + + +def test_classify_stage_signal_returns_survey_received(): + """classify_stage_signal returns 'survey_received' when LLM outputs that label.""" + from unittest.mock import patch + from scripts.imap_sync import classify_stage_signal + + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "survey_received" + result = classify_stage_signal("Complete our culture survey", "Please fill out this form") + assert result == "survey_received" + + +def test_sync_job_emails_classifies_inbound(tmp_path): + """sync_job_emails classifies inbound emails and stores the stage_signal.""" + from scripts.db import init_db, insert_job, get_contacts + from scripts.imap_sync import sync_job_emails + + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", + "url": "https://acme.com/jobs/1", + "source": "linkedin", "location": "Remote", + "is_remote": True, "salary": "", "description": "", + "date_found": "2026-02-21", + }) + job = {"id": job_id, 
"company": "Acme", "url": "https://acme.com/jobs/1"} + + fake_msg_bytes = ( + b"From: recruiter@acme.com\r\n" + b"To: alex@example.com\r\n" + b"Subject: Interview Invitation\r\n" + b"Message-ID: \r\n" + b"\r\n" + b"Hi Alex, we'd like to schedule a phone screen." + ) + + conn_mock = MagicMock() + conn_mock.select.return_value = ("OK", [b"1"]) + conn_mock.search.return_value = ("OK", [b"1"]) + conn_mock.fetch.return_value = ("OK", [(b"1 (RFC822 {123})", fake_msg_bytes)]) + + with patch("scripts.imap_sync.classify_stage_signal", return_value="interview_scheduled"): + inb, out = sync_job_emails(job, conn_mock, {"lookback_days": 90}, db_path) + + assert inb == 1 + contacts = get_contacts(db_path, job_id=job_id) + assert contacts[0]["stage_signal"] == "interview_scheduled" + + +def test_parse_linkedin_alert_extracts_jobs(): + from scripts.imap_sync import parse_linkedin_alert + body = """\ +Your job alert for customer success manager in United States +New jobs match your preferences. +Manage alerts: https://www.linkedin.com/comm/jobs/alerts?... 
+ +Customer Success Manager +Reflow +California, United States +View job: https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc%3D%3D&refId=xyz + +--------------------------------------------------------- + +Customer Engagement Manager +Bitwarden +United States + +2 school alumni +Apply with resume & profile +View job: https://www.linkedin.com/comm/jobs/view/4359824983/?trackingId=def%3D%3D + +--------------------------------------------------------- + +""" + jobs = parse_linkedin_alert(body) + assert len(jobs) == 2 + assert jobs[0]["title"] == "Customer Success Manager" + assert jobs[0]["company"] == "Reflow" + assert jobs[0]["location"] == "California, United States" + assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/4376518925/" + assert jobs[1]["title"] == "Customer Engagement Manager" + assert jobs[1]["company"] == "Bitwarden" + assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/4359824983/" + + +def test_parse_linkedin_alert_skips_blocks_without_view_job(): + from scripts.imap_sync import parse_linkedin_alert + body = """\ +Customer Success Manager +Some Company +United States + +--------------------------------------------------------- + +Valid Job Title +Valid Company +Remote +View job: https://www.linkedin.com/comm/jobs/view/1111111/?x=y + +--------------------------------------------------------- +""" + jobs = parse_linkedin_alert(body) + assert len(jobs) == 1 + assert jobs[0]["title"] == "Valid Job Title" + + +def test_parse_linkedin_alert_empty_body(): + from scripts.imap_sync import parse_linkedin_alert + assert parse_linkedin_alert("") == [] + assert parse_linkedin_alert("No jobs here.") == [] + + +# ── _scan_unmatched_leads integration ───────────────────────────────────────── + +_ALERT_BODY = """\ +Your job alert for customer success manager in United States +New jobs match your preferences. 
+ +Customer Success Manager +Acme Corp +California, United States +View job: https://www.linkedin.com/comm/jobs/view/9999001/?trackingId=abc + +--------------------------------------------------------- + +Director of Customer Success +Beta Inc +Remote +View job: https://www.linkedin.com/comm/jobs/view/9999002/?trackingId=def + +--------------------------------------------------------- +""" + +_ALERT_EMAIL = { + "message_id": "", + "from_addr": "jobalerts-noreply@linkedin.com", + "to_addr": "alex@example.com", + "subject": "2 new jobs for customer success manager", + "body": _ALERT_BODY, + "date": "2026-02-24 12:00:00", +} + + +def test_scan_unmatched_leads_linkedin_alert_inserts_jobs(tmp_path): + """_scan_unmatched_leads detects a LinkedIn alert and inserts each job card.""" + import sqlite3 + from unittest.mock import patch, MagicMock + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + conn_mock = MagicMock() + + with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \ + patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \ + patch("scripts.task_runner.submit_task") as mock_submit: + + from scripts.imap_sync import _scan_unmatched_leads + known_ids: set = set() + new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, known_ids) + + assert new_leads == 2 + + # Message ID added so it won't be reprocessed + assert "" in known_ids + + # Both jobs inserted with correct fields + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + jobs = conn.execute("SELECT * FROM jobs ORDER BY id").fetchall() + conn.close() + + assert len(jobs) == 2 + assert jobs[0]["title"] == "Customer Success Manager" + assert jobs[0]["company"] == "Acme Corp" + assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/9999001/" + assert jobs[0]["source"] == "linkedin" + assert jobs[1]["title"] == "Director of Customer Success" + assert jobs[1]["url"] == 
"https://www.linkedin.com/jobs/view/9999002/" + + # scrape_url task submitted for each inserted job + assert mock_submit.call_count == 2 + task_types = [call.args[1] for call in mock_submit.call_args_list] + assert task_types == ["scrape_url", "scrape_url"] + + +def test_scan_unmatched_leads_linkedin_alert_skips_duplicates(tmp_path): + """URLs already in the DB are not re-inserted.""" + from unittest.mock import patch, MagicMock + from scripts.db import init_db, insert_job + + db_path = tmp_path / "test.db" + init_db(db_path) + + # Pre-insert one of the two URLs + insert_job(db_path, { + "title": "Customer Success Manager", "company": "Acme Corp", + "url": "https://www.linkedin.com/jobs/view/9999001/", + "source": "linkedin", "location": "", "is_remote": 0, + "salary": "", "description": "", "date_found": "2026-02-24", + }) + + conn_mock = MagicMock() + + with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \ + patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \ + patch("scripts.task_runner.submit_task") as mock_submit: + + from scripts.imap_sync import _scan_unmatched_leads + new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, set()) + + # Only one new job (the duplicate was skipped) + assert new_leads == 1 + assert mock_submit.call_count == 1 + + +def test_scan_unmatched_leads_linkedin_alert_skips_llm_path(tmp_path): + """After a LinkedIn alert email, the LLM extraction path is never reached.""" + from unittest.mock import patch, MagicMock + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + conn_mock = MagicMock() + + with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \ + patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \ + patch("scripts.task_runner.submit_task"), \ + patch("scripts.imap_sync.extract_lead_info") as mock_llm: + + from scripts.imap_sync import _scan_unmatched_leads + _scan_unmatched_leads(conn_mock, 
{"lookback_days": 90}, db_path, set()) + + # LLM extraction must never be called for alert emails + mock_llm.assert_not_called() diff --git a/tests/test_llm_router.py b/tests/test_llm_router.py new file mode 100644 index 0000000..0d5a897 --- /dev/null +++ b/tests/test_llm_router.py @@ -0,0 +1,135 @@ +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path +import yaml + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml" + + +def test_config_loads(): + """Config file is valid YAML with required keys.""" + cfg = yaml.safe_load(CONFIG_PATH.read_text()) + assert "fallback_order" in cfg + assert "backends" in cfg + assert len(cfg["fallback_order"]) >= 1 + + +def test_router_uses_first_reachable_backend(): + """Router skips unreachable backends and uses the first that responds.""" + from scripts.llm_router import LLMRouter + + router = LLMRouter(CONFIG_PATH) + + mock_response = MagicMock() + mock_response.choices[0].message.content = "hello" + + with patch.object(router, "_is_reachable", side_effect=[False, True, True, True, True]), \ + patch("scripts.llm_router.OpenAI") as MockOpenAI: + instance = MockOpenAI.return_value + instance.chat.completions.create.return_value = mock_response + mock_model = MagicMock() + mock_model.id = "test-model" + instance.models.list.return_value.data = [mock_model] + + result = router.complete("say hello") + + assert result == "hello" + + +def test_router_raises_when_all_backends_fail(): + """Router raises RuntimeError when every backend is unreachable or errors.""" + from scripts.llm_router import LLMRouter + + router = LLMRouter(CONFIG_PATH) + + with patch.object(router, "_is_reachable", return_value=False): + with pytest.raises(RuntimeError, match="All LLM backends exhausted"): + router.complete("say hello") + + +def test_is_reachable_returns_false_on_connection_error(): + """_is_reachable returns False when the health endpoint is unreachable.""" + from scripts.llm_router import LLMRouter + 
import requests + + router = LLMRouter(CONFIG_PATH) + + with patch("scripts.llm_router.requests.get", side_effect=requests.ConnectionError): + result = router._is_reachable("http://localhost:9999/v1") + + assert result is False + + +def test_complete_skips_backend_without_image_support(tmp_path): + """When images= is passed, backends without supports_images are skipped.""" + import yaml + from scripts.llm_router import LLMRouter + + cfg = { + "fallback_order": ["ollama", "vision_service"], + "backends": { + "ollama": { + "type": "openai_compat", + "base_url": "http://localhost:11434/v1", + "model": "llava", + "api_key": "ollama", + "enabled": True, + "supports_images": False, + }, + "vision_service": { + "type": "vision_service", + "base_url": "http://localhost:8002", + "enabled": True, + "supports_images": True, + }, + }, + } + cfg_file = tmp_path / "llm.yaml" + cfg_file.write_text(yaml.dump(cfg)) + + from unittest.mock import patch, MagicMock + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"text": "B — collaborative"} + + with patch("scripts.llm_router.requests.get") as mock_get, \ + patch("scripts.llm_router.requests.post") as mock_post: + # health check returns ok for vision_service + mock_get.return_value = MagicMock(status_code=200) + mock_post.return_value = mock_resp + + router = LLMRouter(config_path=cfg_file) + result = router.complete("Which option?", images=["base64data"]) + + assert result == "B — collaborative" + # vision_service POST /analyze should have been called + assert mock_post.called + + +def test_complete_without_images_skips_vision_service(tmp_path): + """When images=None, vision_service backend is skipped.""" + import yaml + from scripts.llm_router import LLMRouter + from unittest.mock import patch, MagicMock + + cfg = { + "fallback_order": ["vision_service"], + "backends": { + "vision_service": { + "type": "vision_service", + "base_url": "http://localhost:8002", + "enabled": True, + 
"supports_images": True, + }, + }, + } + cfg_file = tmp_path / "llm.yaml" + cfg_file.write_text(yaml.dump(cfg)) + + router = LLMRouter(config_path=cfg_file) + with patch("scripts.llm_router.requests.post") as mock_post: + try: + router.complete("text only prompt") + except RuntimeError: + pass # all backends exhausted is expected + assert not mock_post.called diff --git a/tests/test_match.py b/tests/test_match.py new file mode 100644 index 0000000..25a823e --- /dev/null +++ b/tests/test_match.py @@ -0,0 +1,47 @@ +import pytest +from unittest.mock import patch, MagicMock + + +def test_extract_job_description_from_url(): + """extract_job_description fetches and returns visible text from a URL.""" + from scripts.match import extract_job_description + + with patch("scripts.match.requests.get") as mock_get: + mock_get.return_value.text = "

We need a CSM with Salesforce.

Customer Success Manager

+ Acme Corp + San Francisco, CA +

Exciting CSM role with great benefits.

+ """ + + mock_resp = MagicMock() + mock_resp.text = linkedin_html + mock_resp.raise_for_status = MagicMock() + + with patch("scripts.scrape_url.requests.get", return_value=mock_resp): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + assert result.get("title") == "Customer Success Manager" + assert result.get("company") == "Acme Corp" + assert "CSM role" in result.get("description", "") + + import sqlite3 + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()) + conn.close() + assert row["title"] == "Customer Success Manager" + assert row["company"] == "Acme Corp" + + +def test_scrape_url_generic_json_ld(tmp_path): + db, job_id = _make_db(tmp_path, url="https://jobs.example.com/post/42") + + json_ld_html = """ + + """ + + mock_resp = MagicMock() + mock_resp.text = json_ld_html + mock_resp.raise_for_status = MagicMock() + + with patch("scripts.scrape_url.requests.get", return_value=mock_resp): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + assert result.get("title") == "TAM Role" + assert result.get("company") == "TechCo" + + +def test_scrape_url_graceful_on_http_error(tmp_path): + db, job_id = _make_db(tmp_path) + import requests as req + + with patch("scripts.scrape_url.requests.get", side_effect=req.RequestException("timeout")): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + # Should return empty dict and not raise; job row still exists + assert isinstance(result, dict) + import sqlite3 + conn = sqlite3.connect(db) + row = conn.execute("SELECT id FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row is not None diff --git a/tests/test_sync.py b/tests/test_sync.py new file mode 100644 index 0000000..21c3eea --- /dev/null +++ b/tests/test_sync.py @@ -0,0 +1,88 @@ +# tests/test_sync.py +import pytest +from unittest.mock import patch, 
MagicMock +from pathlib import Path + + +SAMPLE_FM = { + "title_field": "Salary", "job_title": "Job Title", "company": "Company Name", + "url": "Role Link", "source": "Job Source", "status": "Status of Application", + "status_new": "Application Submitted", "date_found": "Date Found", + "remote": "Remote", "match_score": "Match Score", + "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description", +} + +SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM} + + +def test_sync_pushes_approved_jobs(tmp_path): + """sync_to_notion pushes approved jobs and marks them synced.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + + db_path = tmp_path / "test.db" + init_db(db_path) + row_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "$100k", "description": "Good role", "date_found": "2026-02-20", + }) + update_job_status(db_path, [row_id], "approved") + + mock_notion = MagicMock() + mock_notion.pages.create.return_value = {"id": "notion-page-abc"} + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client", return_value=mock_notion): + count = sync_to_notion(db_path=db_path) + + assert count == 1 + mock_notion.pages.create.assert_called_once() + synced = get_jobs_by_status(db_path, "synced") + assert len(synced) == 1 + + +def test_sync_falls_back_to_core_fields_on_validation_error(tmp_path): + """When Notion returns a validation_error (missing column), sync retries without optional fields.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + + db_path = tmp_path / "test.db" + init_db(db_path) + row_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": 
"https://example.com/2", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + update_job_status(db_path, [row_id], "approved") + + mock_notion = MagicMock() + # First call raises validation_error; second call (fallback) succeeds + mock_notion.pages.create.side_effect = [ + Exception("validation_error: Could not find property with name: Match Score"), + {"id": "notion-page-fallback"}, + ] + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client", return_value=mock_notion): + count = sync_to_notion(db_path=db_path) + + assert count == 1 + assert mock_notion.pages.create.call_count == 2 + synced = get_jobs_by_status(db_path, "synced") + assert len(synced) == 1 + + +def test_sync_returns_zero_when_nothing_approved(tmp_path): + """sync_to_notion returns 0 when there are no approved jobs.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client"): + count = sync_to_notion(db_path=db_path) + + assert count == 0 diff --git a/tests/test_task_runner.py b/tests/test_task_runner.py new file mode 100644 index 0000000..3ea5090 --- /dev/null +++ b/tests/test_task_runner.py @@ -0,0 +1,210 @@ +import threading +import time +import pytest +from pathlib import Path +from unittest.mock import patch +import sqlite3 + + +def _make_db(tmp_path): + from scripts.db import init_db, insert_job + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "Great role.", "date_found": "2026-02-20", + }) + return db, job_id + + +def test_submit_task_returns_id_and_true(tmp_path): + """submit_task returns 
(task_id, True) and spawns a thread.""" + db, job_id = _make_db(tmp_path) + with patch("scripts.task_runner._run_task"): # don't actually call LLM + from scripts.task_runner import submit_task + task_id, is_new = submit_task(db, "cover_letter", job_id) + assert isinstance(task_id, int) and task_id > 0 + assert is_new is True + + +def test_submit_task_deduplicates(tmp_path): + """submit_task returns (existing_id, False) for a duplicate in-flight task.""" + db, job_id = _make_db(tmp_path) + with patch("scripts.task_runner._run_task"): + from scripts.task_runner import submit_task + first_id, _ = submit_task(db, "cover_letter", job_id) + second_id, is_new = submit_task(db, "cover_letter", job_id) + assert second_id == first_id + assert is_new is False + + +def test_run_task_cover_letter_success(tmp_path): + """_run_task marks running→completed and saves cover letter to DB.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "cover_letter", job_id) + + with patch("scripts.generate_cover_letter.generate", return_value="Dear Hiring Manager,\nGreat fit!"): + from scripts.task_runner import _run_task + _run_task(db, task_id, "cover_letter", job_id) + + task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "completed" + assert task["error"] is None + + conn = sqlite3.connect(db) + row = conn.execute("SELECT cover_letter FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row[0] == "Dear Hiring Manager,\nGreat fit!" 
+ + +def test_run_task_company_research_success(tmp_path): + """_run_task marks running→completed and saves research to DB.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job, get_research + + task_id, _ = insert_task(db, "company_research", job_id) + fake_result = { + "raw_output": "raw", "company_brief": "brief", + "ceo_brief": "ceo", "talking_points": "points", + } + with patch("scripts.company_research.research_company", return_value=fake_result): + from scripts.task_runner import _run_task + _run_task(db, task_id, "company_research", job_id) + + task = get_task_for_job(db, "company_research", job_id) + assert task["status"] == "completed" + + research = get_research(db, job_id=job_id) + assert research["company_brief"] == "brief" + + +def test_run_task_marks_failed_on_exception(tmp_path): + """_run_task marks status=failed and stores error when generator raises.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "cover_letter", job_id) + + with patch("scripts.generate_cover_letter.generate", side_effect=RuntimeError("LLM timeout")): + from scripts.task_runner import _run_task + _run_task(db, task_id, "cover_letter", job_id) + + task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "failed" + assert "LLM timeout" in task["error"] + + +def test_run_task_discovery_success(tmp_path): + """_run_task with task_type=discovery calls run_discovery and stores count in error field.""" + from scripts.db import init_db, insert_task, get_task_for_job + db = tmp_path / "test.db" + init_db(db) + task_id, _ = insert_task(db, "discovery", 0) + + with patch("scripts.discover.run_discovery", return_value=7): + from scripts.task_runner import _run_task + _run_task(db, task_id, "discovery", 0) + + task = get_task_for_job(db, "discovery", 0) + assert task["status"] == "completed" + assert "7 new listings" in task["error"] + + +def 
test_run_task_email_sync_success(tmp_path): + """email_sync task calls sync_all and marks completed with summary.""" + db, _ = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "email_sync", 0) + + summary = {"synced": 3, "inbound": 5, "outbound": 2, "new_leads": 1, "errors": []} + with patch("scripts.imap_sync.sync_all", return_value=summary): + from scripts.task_runner import _run_task + _run_task(db, task_id, "email_sync", 0) + + task = get_task_for_job(db, "email_sync", 0) + assert task["status"] == "completed" + assert "3 jobs" in task["error"] + + +def test_run_task_email_sync_file_not_found(tmp_path): + """email_sync marks failed with helpful message when config is missing.""" + db, _ = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "email_sync", 0) + + with patch("scripts.imap_sync.sync_all", side_effect=FileNotFoundError("config/email.yaml")): + from scripts.task_runner import _run_task + _run_task(db, task_id, "email_sync", 0) + + task = get_task_for_job(db, "email_sync", 0) + assert task["status"] == "failed" + assert "email" in task["error"].lower() + + +def test_submit_task_actually_completes(tmp_path): + """Integration: submit_task spawns a thread that completes asynchronously.""" + db, job_id = _make_db(tmp_path) + from scripts.db import get_task_for_job + + with patch("scripts.generate_cover_letter.generate", return_value="Cover letter text"): + from scripts.task_runner import submit_task + task_id, _ = submit_task(db, "cover_letter", job_id) + # Wait for thread to complete (max 5s) + for _ in range(50): + task = get_task_for_job(db, "cover_letter", job_id) + if task and task["status"] in ("completed", "failed"): + break + time.sleep(0.1) + + task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "completed" + + +def test_run_task_enrich_craigslist_success(tmp_path): + """enrich_craigslist task calls 
enrich_craigslist_fields and marks completed.""" + from scripts.db import init_db, insert_job, insert_task, get_task_for_job + from unittest.mock import MagicMock + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/9.html", + "source": "craigslist", "location": "", "description": "Join Acme Corp. Pay: $100k.", + "date_found": "2026-02-24", + }) + task_id, _ = insert_task(db, "enrich_craigslist", job_id) + + with patch("scripts.enrich_descriptions.enrich_craigslist_fields", + return_value={"company": "Acme Corp", "salary": "$100k"}) as mock_enrich: + from scripts.task_runner import _run_task + _run_task(db, task_id, "enrich_craigslist", job_id) + + mock_enrich.assert_called_once_with(db, job_id) + task = get_task_for_job(db, "enrich_craigslist", job_id) + assert task["status"] == "completed" + + +def test_scrape_url_submits_enrich_craigslist_for_craigslist_job(tmp_path): + """After scrape_url completes for a craigslist job with empty company, enrich_craigslist is queued.""" + from scripts.db import init_db, insert_job, insert_task, get_task_for_job + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/10.html", + "source": "craigslist", "location": "", "description": "", + "date_found": "2026-02-24", + }) + task_id, _ = insert_task(db, "scrape_url", job_id) + + with patch("scripts.scrape_url.scrape_job_url", return_value={"title": "CSM", "company": ""}): + with patch("scripts.task_runner.submit_task", wraps=None) as mock_submit: + # Use wraps=None so we can capture calls without actually spawning threads + mock_submit.return_value = (99, True) + from scripts.task_runner import _run_task + _run_task(db, task_id, "scrape_url", job_id) + + # submit_task should have been called with enrich_craigslist + assert mock_submit.called + call_args = mock_submit.call_args + assert call_args[0][1] 
== "enrich_craigslist" + assert call_args[0][2] == job_id