peregrine/scripts/enrich_descriptions.py
pyr0ball f11a38eb0b chore: seed Peregrine from personal job-seeker (pre-generalization)
App: Peregrine
Company: Circuit Forge LLC
Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
2026-02-24 18:25:39 -08:00

284 lines
9.2 KiB
Python

# scripts/enrich_descriptions.py
"""
Post-discovery enrichment: retry job description fetches that returned
empty/null during the initial scrape (for Glassdoor, usually rate-limit 429s
or expired listings mid-batch).

Fetches descriptions one at a time with a configurable delay between
requests to stay under the job boards' rate limits. By default jobs from
every source are retried; pass --glassdoor-only for the legacy
Glassdoor-only behaviour.

Usage:
    conda run -n job-seeker python scripts/enrich_descriptions.py
    conda run -n job-seeker python scripts/enrich_descriptions.py --dry-run
    conda run -n job-seeker python scripts/enrich_descriptions.py --delay 2.0
    conda run -n job-seeker python scripts/enrich_descriptions.py --glassdoor-only
"""
import re
import sqlite3
import sys
import time
from pathlib import Path
# Put the directory above scripts/ (two levels up from this file) at the front
# of sys.path so the absolute `scripts.*` imports resolve when this file is
# executed directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.db import DEFAULT_DB, init_db
# Default pause between consecutive fetches; used as the default `delay`
# argument of the enrich_* functions and as the --delay CLI default.
DELAY_SECS = 1.5 # seconds between description fetches
def _extract_job_id(url: str) -> str | None:
"""Pull the Glassdoor listing ID from a job URL (…?jl=1234567890)."""
m = re.search(r"jl=(\d+)", url or "")
return m.group(1) if m else None
def _setup_scraper():
    """
    Build a jobspy Glassdoor scraper prepared only far enough to call
    _fetch_job_description() — the full job-search setup is skipped.

    Side effect: sets the "gd-csrf-token" entry of jobspy's shared
    ``headers`` dict, using the token returned by the scraper or jobspy's
    ``fallback_token`` when none is returned.
    """
    from jobspy.glassdoor import Glassdoor
    from jobspy.glassdoor.constant import fallback_token, headers
    from jobspy.model import ScraperInput, Site
    from jobspy.util import create_session

    gd = Glassdoor()
    gd.base_url = "https://www.glassdoor.com/"
    gd.session = create_session(has_retry=True)

    # The session must exist before the token is requested.
    csrf = gd._get_csrf_token()
    headers["gd-csrf-token"] = csrf or fallback_token

    gd.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
    return gd
def enrich_glassdoor_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find Glassdoor jobs with missing descriptions and re-fetch them.

    Args:
        db_path: SQLite database file containing the ``jobs`` table.
        dry_run: When true, descriptions are still fetched but nothing is
            written back to the database.
        delay: Seconds to sleep after each fetch attempt; a value <= 0
            disables the pause.

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    from contextlib import closing

    init_db(db_path)

    # closing() releases the handle even if the query raises; sqlite3's own
    # context manager only manages the transaction, it does not close.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            """SELECT id, url, company, title FROM jobs
               WHERE source = 'glassdoor'
                 AND (description IS NULL OR TRIM(description) = '')
               ORDER BY id ASC"""
        ).fetchall()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}
    if not rows:
        print("[enrich] No Glassdoor jobs missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} Glassdoor job(s) missing descriptions — fetching…")

    try:
        scraper = _setup_scraper()
    except Exception as e:
        # Without a scraper nothing can be fetched: count every row as failed.
        msg = f"Glassdoor scraper init failed: {e}"
        result["errors"].append(msg)
        result["failed"] = len(rows)
        print(f"[enrich] ERROR — {msg}")
        return result

    for db_id, url, company, title in rows:
        job_id = _extract_job_id(url)
        if not job_id:
            msg = f"job #{db_id}: cannot extract listing ID from URL: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue  # no request was made, so no rate-limit pause is needed

        try:
            description = scraper._fetch_job_description(int(job_id))
            if description and description.strip():
                if not dry_run:
                    # A short-lived connection per update keeps the database
                    # unlocked while we sleep between requests.
                    with closing(sqlite3.connect(db_path)) as upd:
                        with upd:  # commit on success, roll back on error
                            upd.execute(
                                "UPDATE jobs SET description = ? WHERE id = ?",
                                (description, db_id),
                            )
                tag = "[DRY-RUN] " if dry_run else ""
                print(f"[enrich] {tag}{company} — {title}: {len(description)} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] {company} — {title}: empty response (expired listing?)")
                result["failed"] += 1
        except Exception as e:
            # Per-job boundary: record the failure and keep going so one bad
            # listing does not abort the whole batch.
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

        if delay > 0:
            time.sleep(delay)

    return result
def enrich_all_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find ALL jobs with missing/empty descriptions (any source) and re-fetch them.

    Uses scrape_job_url for every source — it handles LinkedIn, Indeed, Glassdoor,
    Adzuna, The Ladders, and any generic URL via JSON-LD / og: tags.

    Args:
        db_path: SQLite database file containing the ``jobs`` table.
        dry_run: When true, no job is scraped; each candidate is only listed
            (and counted as succeeded) so the caller can see what would run.
        delay: Seconds to sleep after each real fetch attempt; a value <= 0
            disables the pause. No pause is taken in dry-run mode because no
            request is made.

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    from contextlib import closing

    from scripts.scrape_url import scrape_job_url

    init_db(db_path)

    # closing() releases the handle even if the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            """SELECT id, url, company, title, source FROM jobs
               WHERE (description IS NULL OR TRIM(description) = '')
                 AND url IS NOT NULL AND url != ''
               ORDER BY source, id ASC"""
        ).fetchall()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}
    if not rows:
        print("[enrich] No jobs with missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} job(s) missing descriptions — fetching…")

    tag = "[DRY-RUN] " if dry_run else ""
    for db_id, url, company, title, source in rows:
        if not url.startswith("http"):
            # Report the skip instead of failing silently, so the summary
            # explains why the failed count went up.
            msg = f"job #{db_id}: URL does not start with http: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue

        try:
            fields = {} if dry_run else scrape_job_url(db_path, db_id)
            if fields or dry_run:
                desc_len = len(fields.get("description", "") or "")
                print(f"[enrich] {tag}[{source}] {company} — {title}: {desc_len} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] [{source}] {company} — {title}: no data returned")
                result["failed"] += 1
        except Exception as e:
            # Per-job boundary: record the failure and continue the batch.
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

        # The pause exists to rate-limit requests; dry-run makes none.
        if delay > 0 and not dry_run:
            time.sleep(delay)

    return result
def enrich_craigslist_fields(
    db_path: Path = DEFAULT_DB,
    job_id: int | None = None,
) -> dict:
    """
    Use LLM to extract company name and salary from a Craigslist job description.

    Called after scrape_url populates the description for a craigslist job.
    Only runs when: source='craigslist', company='', description non-empty.

    Args:
        db_path: SQLite database file containing the ``jobs`` table.
        job_id: Primary key of the job row to enrich.

    Returns:
        Dict holding the non-empty extracted values under 'company' and/or
        'salary'. An empty dict is returned when the job does not qualify,
        the LLM call fails, or its response cannot be parsed as a JSON object.
    """
    import json
    from contextlib import closing

    # closing() releases the handle even if the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT id, description, company, source FROM jobs WHERE id=?", (job_id,)
        ).fetchone()

    if not row:
        return {}
    if row["source"] != "craigslist":
        return {}
    if row["company"]:  # already populated
        return {}
    if not (row["description"] or "").strip():
        return {}

    from scripts.llm_router import LLMRouter

    # Only the first 3000 characters of the posting are sent to the model.
    prompt = (
        "Extract the following from this job posting. "
        "Return JSON only, no commentary.\n\n"
        '{"company": "<company name or empty string>", '
        '"salary": "<salary/compensation or empty string>"}\n\n'
        f"Posting:\n{row['description'][:3000]}"
    )

    try:
        router = LLMRouter()
        raw = router.complete(prompt)
    except Exception as exc:
        # Best-effort enrichment: an LLM failure must not break the caller.
        print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}")
        return {}

    try:
        # Strip Markdown code fences the model may wrap around the JSON.
        clean = re.sub(r"```(?:json)?|```", "", raw or "").strip()
        fields = json.loads(clean)
    except (json.JSONDecodeError, ValueError, TypeError):
        # TypeError covers a non-string response object.
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    if not isinstance(fields, dict):
        # Valid JSON, but not the object we asked for (e.g. a list or string).
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    extracted = {}
    for key in ("company", "salary"):
        value = fields.get(key)
        # Accept strings and plain numbers (a model may emit a bare salary
        # figure); ignore null/empty values and nested structures.
        if not value or isinstance(value, bool) or not isinstance(value, (str, int, float)):
            continue
        text = str(value).strip()
        if text:
            extracted[key] = text

    if extracted:
        from scripts.db import update_job_fields

        update_job_fields(db_path, job_id, extracted)
        print(f"[enrich_craigslist] job {job_id}: "
              f"company={extracted.get('company', '')} "
              f"salary={extracted.get('salary', '')}")

    return extracted
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the flags, run the selected enrichment
    # pass, then print a one-line summary of the outcome.
    cli = argparse.ArgumentParser(
        description="Re-fetch missing job descriptions (all sources)"
    )
    cli.add_argument(
        "--glassdoor-only",
        action="store_true",
        help="Only re-fetch Glassdoor listings (legacy behaviour)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be fetched without saving",
    )
    cli.add_argument(
        "--delay",
        type=float,
        default=DELAY_SECS,
        help=f"Seconds between requests (default: {DELAY_SECS})",
    )
    opts = cli.parse_args()

    # Both entry points accept the same keyword arguments, so pick one and
    # call it in a single place.
    run = enrich_glassdoor_descriptions if opts.glassdoor_only else enrich_all_descriptions
    summary = run(dry_run=opts.dry_run, delay=opts.delay)

    report = f"\n[enrich] Done — {summary['succeeded']} fetched, {summary['failed']} failed"
    if summary["errors"]:
        # The error count is only mentioned when at least one was recorded.
        report += f", {len(summary['errors'])} error(s)"
    print(report)