# scripts/enrich_descriptions.py
"""
Post-discovery enrichment: retry job description fetches that returned
empty/null during the initial scrape (usually rate-limit 429s or expired
listings mid-batch).

By default every source is retried; pass --glassdoor-only to restrict the
run to Glassdoor listings (legacy behaviour).

Fetches descriptions one at a time with a configurable delay between
requests to stay under the sites' rate limits.

Usage:
    conda run -n job-seeker python scripts/enrich_descriptions.py
    conda run -n job-seeker python scripts/enrich_descriptions.py --dry-run
    conda run -n job-seeker python scripts/enrich_descriptions.py --delay 2.0
    conda run -n job-seeker python scripts/enrich_descriptions.py --glassdoor-only
"""
import re
import sqlite3
import sys
import time
from contextlib import closing
from pathlib import Path

# Make the repository root importable so `scripts.*` resolves when this file
# is executed directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from scripts.db import DEFAULT_DB, init_db

DELAY_SECS = 1.5  # seconds between description fetches


def _extract_job_id(url: str) -> str | None:
    """Pull the Glassdoor listing ID from a job URL (…?jl=1234567890).

    Returns the digits following ``jl=`` as a string, or None when *url* is
    empty/None or contains no such parameter.
    """
    m = re.search(r"jl=(\d+)", url or "")
    return m.group(1) if m else None


def _setup_scraper():
    """
    Create a Glassdoor scraper instance initialised just enough to call
    _fetch_job_description() — skips the full job-search setup.

    Relies on private jobspy internals (``_get_csrf_token``), so any failure
    here is left to propagate to the caller.
    """
    # Imported lazily so the module can be imported without jobspy installed.
    from jobspy.glassdoor import Glassdoor
    from jobspy.glassdoor.constant import fallback_token, headers
    from jobspy.model import ScraperInput, Site
    from jobspy.util import create_session

    scraper = Glassdoor()
    scraper.base_url = "https://www.glassdoor.com/"
    scraper.session = create_session(has_retry=True)
    token = scraper._get_csrf_token()
    # NOTE: this mutates jobspy's imported `headers` dict in place; presumably
    # the scraper reads the CSRF token from it on each request — confirm
    # against the installed jobspy version.
    headers["gd-csrf-token"] = token if token else fallback_token
    scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
    return scraper


def _clean_llm_value(value) -> str:
    """Normalise one LLM-extracted JSON value to a stripped string.

    Strings are stripped; plain numbers (e.g. a bare salary figure) are
    stringified; anything else (None, bool, list, dict) becomes "".
    """
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def enrich_glassdoor_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find Glassdoor jobs with missing descriptions and re-fetch them.

    Args:
        db_path: SQLite database file to read from and update.
        dry_run: When true, descriptions are still fetched but not saved.
        delay: Seconds to wait between consecutive fetches (<= 0 disables).

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    init_db(db_path)

    # closing() guarantees the connection is released even if the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            """SELECT id, url, company, title FROM jobs
               WHERE source = 'glassdoor'
                 AND (description IS NULL OR TRIM(description) = '')
               ORDER BY id ASC"""
        ).fetchall()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}

    if not rows:
        print("[enrich] No Glassdoor jobs missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} Glassdoor job(s) missing descriptions — fetching…")

    try:
        scraper = _setup_scraper()
    except Exception as e:
        # Without a scraper nothing can be fetched: count every row as failed.
        msg = f"Glassdoor scraper init failed: {e}"
        result["errors"].append(msg)
        result["failed"] = len(rows)
        print(f"[enrich] ERROR — {msg}")
        return result

    tag = "[DRY-RUN] " if dry_run else ""
    fetched_before = False

    for db_id, url, company, title in rows:
        job_id = _extract_job_id(url)
        if not job_id:
            msg = f"job #{db_id}: cannot extract listing ID from URL: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue

        # Throttle only *between* requests: no wait before the first fetch
        # and none after the last one.
        if fetched_before and delay > 0:
            time.sleep(delay)
        fetched_before = True

        try:
            description = scraper._fetch_job_description(int(job_id))
            if description and description.strip():
                if not dry_run:
                    # One short-lived connection per update keeps the write
                    # lock window small during a long, throttled run.
                    with closing(sqlite3.connect(db_path)) as upd:
                        upd.execute(
                            "UPDATE jobs SET description = ? WHERE id = ?",
                            (description, db_id),
                        )
                        upd.commit()
                print(f"[enrich] {tag}{company} — {title}: {len(description)} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] {company} — {title}: empty response (expired listing?)")
                result["failed"] += 1
        except Exception as e:
            # Best-effort batch: record the failure and move on to the next job.
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

    return result


def enrich_all_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find ALL jobs with missing/empty descriptions (any source) and re-fetch them.

    Uses scrape_job_url for every source — it handles LinkedIn, Indeed, Glassdoor,
    Adzuna, The Ladders, and any generic URL via JSON-LD / og: tags.

    This function performs no database writes of its own; saving the scraped
    fields is presumably done inside scrape_job_url — confirm there.

    Args:
        db_path: SQLite database file to read from.
        dry_run: When true, nothing is fetched; each candidate is only listed.
        delay: Seconds to wait between consecutive fetches (<= 0 disables).

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    from scripts.scrape_url import scrape_job_url

    init_db(db_path)

    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            """SELECT id, url, company, title, source FROM jobs
               WHERE (description IS NULL OR TRIM(description) = '')
                 AND url IS NOT NULL AND url != ''
               ORDER BY source, id ASC"""
        ).fetchall()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}

    if not rows:
        print("[enrich] No jobs with missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} job(s) missing descriptions — fetching…")

    tag = "[DRY-RUN] " if dry_run else ""
    fetched_before = False

    for db_id, url, company, title, source in rows:
        if not url.startswith("http"):
            # Report the skip instead of failing silently, mirroring the
            # Glassdoor-only path.
            msg = f"job #{db_id}: not an http(s) URL: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue

        if dry_run:
            # No request is made in dry-run mode, so there is nothing to
            # measure and no reason to throttle.
            print(f"[enrich] {tag}[{source}] {company} — {title}: 0 chars")
            result["succeeded"] += 1
            continue

        # Throttle only between real requests.
        if fetched_before and delay > 0:
            time.sleep(delay)
        fetched_before = True

        try:
            fields = scrape_job_url(db_path, db_id)
            if fields:
                desc_len = len(fields.get("description", "") or "")
                print(f"[enrich] {tag}[{source}] {company} — {title}: {desc_len} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] [{source}] {company} — {title}: no data returned")
                result["failed"] += 1
        except Exception as e:
            # Best-effort batch: record the failure and move on to the next job.
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

    return result


def enrich_craigslist_fields(
    db_path: Path = DEFAULT_DB,
    job_id: int | None = None,
) -> dict:
    """
    Use LLM to extract company name and salary from a Craigslist job description.

    Called after scrape_url populates the description for a craigslist job.
    Only runs when: source='craigslist', company='', description non-empty.

    Args:
        db_path: SQLite database file holding the ``jobs`` table.
        job_id: Primary key of the job to enrich; None matches no row.

    Returns:
        Dict containing only the non-empty extracted values under the keys
        'company' and/or 'salary'. Empty dict when the job does not qualify,
        the LLM call fails, or its response cannot be parsed.
    """
    import json

    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT id, description, company, source FROM jobs WHERE id=?",
            (job_id,),
        ).fetchone()

    if not row:
        return {}
    if row["source"] != "craigslist":
        return {}
    if row["company"]:  # already populated
        return {}
    if not (row["description"] or "").strip():
        return {}

    from scripts.llm_router import LLMRouter

    # Only the first 3000 characters of the posting are sent to the LLM.
    prompt = (
        "Extract the following from this job posting. "
        "Return JSON only, no commentary.\n\n"
        '{"company": "", '
        '"salary": ""}\n\n'
        f"Posting:\n{row['description'][:3000]}"
    )

    try:
        router = LLMRouter()
        raw = router.complete(prompt)
    except Exception as exc:
        print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}")
        return {}

    try:
        # Drop Markdown code fences the model may wrap around the JSON.
        clean = re.sub(r"```(?:json)?|```", "", raw).strip()
        fields = json.loads(clean)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers a None or
        # otherwise non-string response from the router.
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    if not isinstance(fields, dict):
        # Valid JSON but not an object (e.g. a list or bare string): there
        # are no keys to read, so treat it as unparseable.
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    # Keep only keys for which the model produced a non-empty value.
    extracted = {}
    for key in ("company", "salary"):
        value = _clean_llm_value(fields.get(key))
        if value:
            extracted[key] = value

    if extracted:
        from scripts.db import update_job_fields

        update_job_fields(db_path, job_id, extracted)
        print(f"[enrich_craigslist] job {job_id}: "
              f"company={extracted.get('company', '—')}  "
              f"salary={extracted.get('salary', '—')}")

    return extracted


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Re-fetch missing job descriptions (all sources)"
    )
    parser.add_argument("--glassdoor-only", action="store_true",
                        help="Only re-fetch Glassdoor listings (legacy behaviour)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be fetched without saving")
    parser.add_argument("--delay", type=float, default=DELAY_SECS,
                        help=f"Seconds between requests (default: {DELAY_SECS})")
    args = parser.parse_args()

    if args.glassdoor_only:
        r = enrich_glassdoor_descriptions(dry_run=args.dry_run, delay=args.delay)
    else:
        r = enrich_all_descriptions(dry_run=args.dry_run, delay=args.delay)

    print(
        f"\n[enrich] Done — {r['succeeded']} fetched, {r['failed']} failed"
        + (f", {len(r['errors'])} error(s)" if r["errors"] else "")
    )