App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
228 lines
7.9 KiB
Python
228 lines
7.9 KiB
Python
# scripts/scrape_url.py
"""
Scrape a job listing from its URL and update the job record.

Supports:
  - LinkedIn  (guest jobs API — no auth required)
  - Indeed    (HTML parse)
  - Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py)
  - Generic   (JSON-LD → og:tags fallback)

Usage (background task — called by task_runner):
    from scripts.scrape_url import scrape_job_url
    scrape_job_url(db_path, job_id)
"""
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from urllib.parse import urlparse, urlencode, parse_qsl
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from scripts.db import DEFAULT_DB, update_job_fields
|
|
|
|
_STRIP_PARAMS = {
|
|
"utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
|
|
"trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig",
|
|
"eid", "otpToken", "ssid", "fmid",
|
|
}
|
|
|
|
_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
)
|
|
}
|
|
_TIMEOUT = 12
|
|
|
|
|
|
def _detect_board(url: str) -> str:
|
|
"""Return 'linkedin', 'indeed', 'glassdoor', or 'generic'."""
|
|
url_lower = url.lower()
|
|
if "linkedin.com" in url_lower:
|
|
return "linkedin"
|
|
if "indeed.com" in url_lower:
|
|
return "indeed"
|
|
if "glassdoor.com" in url_lower:
|
|
return "glassdoor"
|
|
return "generic"
|
|
|
|
|
|
def _extract_linkedin_job_id(url: str) -> Optional[str]:
|
|
"""Extract numeric job ID from a LinkedIn job URL."""
|
|
m = re.search(r"/jobs/view/(\d+)", url)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
def canonicalize_url(url: str) -> str:
    """Strip tracking parameters from a job URL and return a canonical form.

    LinkedIn: ``https://www.linkedin.com/jobs/view/<id>/?trk=...`` becomes
    ``https://www.linkedin.com/jobs/view/<id>/``. The rewrite is applied only
    when the URL's *hostname* is linkedin.com (or a subdomain), so a link on
    another site that merely contains the text "linkedin.com" is never
    rewritten into a LinkedIn URL.

    Others: every query parameter whose name is in ``_STRIP_PARAMS`` is
    removed; the rest of the URL is kept.

    Args:
        url: Raw job URL; surrounding whitespace is ignored.

    Returns:
        The cleaned URL string.
    """
    url = url.strip()
    parsed = urlparse(url)

    try:
        # Scheme-less input has no netloc; retry as "//host/..." to find it.
        host = parsed.hostname or urlparse(f"//{url}").hostname or ""
    except ValueError:
        host = ""
    host = host.rstrip(".")

    if host == "linkedin.com" or host.endswith(".linkedin.com"):
        job_id = _extract_linkedin_job_id(url)
        if job_id:
            return f"https://www.linkedin.com/jobs/view/{job_id}/"

    # NOTE: parse_qsl() omits blank-valued parameters by default and
    # urlencode() re-encodes the survivors, so the query string may be
    # normalised beyond the removal of tracking keys.
    clean_qs = urlencode(
        [(k, v) for k, v in parse_qsl(parsed.query) if k not in _STRIP_PARAMS]
    )
    return parsed._replace(query=clean_qs).geturl()
|
|
|
|
|
|
def _scrape_linkedin(url: str) -> dict:
    """Scrape a LinkedIn posting through the guest jobs endpoint.

    The job ID is pulled from ``url`` and the ``jobs-guest`` posting page is
    requested and parsed as HTML. Returns ``{}`` when no job ID can be
    extracted; otherwise a dict holding ``source`` plus whichever of
    ``title``, ``company``, ``location`` and ``description`` were non-empty.
    HTTP errors propagate via ``raise_for_status()``.
    """
    posting_id = _extract_linkedin_job_id(url)
    if not posting_id:
        return {}

    response = requests.get(
        f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{posting_id}",
        headers=_HEADERS,
        timeout=_TIMEOUT,
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    def first_text(tag_name, css_class):
        # Stripped text of the first matching element, or "" when absent.
        node = page.find(tag_name, class_=css_class)
        if not node:
            return ""
        return node.get_text(strip=True)

    title = first_text("h2", "top-card-layout__title")
    # The organisation name is looked up as a link first, then as a span.
    company = first_text("a", "topcard__org-name-link")
    if not company:
        company = first_text("span", "topcard__org-name-link")
    location = first_text("span", "topcard__flavor--bullet")

    body = page.find("div", class_="show-more-less-html__markup")
    description = body.get_text(separator="\n", strip=True) if body else ""

    scraped = {
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "source": "linkedin",
    }
    # Drop fields that came back empty.
    result = {}
    for key, value in scraped.items():
        if value:
            result[key] = value
    return result
|
|
|
|
|
|
def _scrape_indeed(url: str) -> dict:
    """Fetch an Indeed job page and extract fields from its HTML.

    The page is downloaded with the module's headers and timeout, then handed
    to ``_parse_json_ld_or_og``. HTTP errors propagate via
    ``raise_for_status()``. Returns ``{}`` when nothing could be extracted.
    """
    response = requests.get(url, timeout=_TIMEOUT, headers=_HEADERS)
    response.raise_for_status()
    extracted = _parse_json_ld_or_og(response.text)
    return extracted if extracted else {}
|
|
|
|
|
|
def _scrape_glassdoor(url: str) -> dict:
|
|
"""Re-use JobSpy's Glassdoor scraper for description fetch."""
|
|
m = re.search(r"jl=(\d+)", url)
|
|
if not m:
|
|
return {}
|
|
try:
|
|
from jobspy.glassdoor import Glassdoor
|
|
from jobspy.glassdoor.constant import fallback_token, headers
|
|
from jobspy.model import ScraperInput, Site
|
|
from jobspy.util import create_session
|
|
|
|
scraper = Glassdoor()
|
|
scraper.base_url = "https://www.glassdoor.com/"
|
|
scraper.session = create_session(has_retry=True)
|
|
token = scraper._get_csrf_token()
|
|
headers["gd-csrf-token"] = token if token else fallback_token
|
|
scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
|
|
description = scraper._fetch_job_description(int(m.group(1)))
|
|
return {"description": description} if description else {}
|
|
except Exception:
|
|
return {}
|
|
|
|
|
|
def _parse_json_ld_or_og(html: str) -> dict:
    """Extract job fields from JSON-LD structured data, then og: meta tags.

    Each ``<script type="application/ld+json">`` element is tried in document
    order. The first one that holds a ``JobPosting`` object (directly or as
    an item of a top-level list) and yields at least one non-empty field
    wins. Values that may be either an object or plain text
    (``hiringOrganization``, ``jobLocation``, ``address``, and an
    ``addressCountry`` object) are handled in both forms instead of causing
    the whole posting to be discarded.

    If no JSON-LD posting yields data, falls back to ``og:title`` (or the
    ``<title>`` element) and ``og:description``.

    Args:
        html: Raw HTML of the job page.

    Returns:
        Dict with any of ``title``, ``company``, ``location``,
        ``description``, ``salary`` that were non-empty; possibly ``{}``.
    """
    soup = BeautifulSoup(html, "html.parser")

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except ValueError:
            # Empty or malformed JSON — try the next script element.
            continue

        if isinstance(data, list):
            data = next(
                (
                    item for item in data
                    if isinstance(item, dict) and item.get("@type") == "JobPosting"
                ),
                {},
            )
        if not isinstance(data, dict) or data.get("@type") != "JobPosting":
            continue

        # hiringOrganization: object with a name, or a bare string.
        org = data.get("hiringOrganization") or {}
        if isinstance(org, dict):
            company = org.get("name", "")
        elif isinstance(org, str):
            company = org
        else:
            company = ""

        # jobLocation: object, list of objects (first one is used), or text.
        loc = data.get("jobLocation") or {}
        if isinstance(loc, list):
            loc = loc[0] if loc else {}
        addr = (loc.get("address") or {}) if isinstance(loc, dict) else loc

        if isinstance(addr, dict):
            location = (
                addr.get("addressLocality", "")
                or addr.get("addressRegion", "")
                or addr.get("addressCountry", "")
            )
            # addressCountry may itself be an object carrying a name.
            if isinstance(location, dict):
                location = location.get("name", "")
        elif isinstance(addr, str):
            location = addr
        else:
            location = ""

        fields = {k: v for k, v in {
            "title": data.get("title", ""),
            "company": company,
            "location": location,
            "description": data.get("description", ""),
            "salary": str(data.get("baseSalary", "")) if data.get("baseSalary") else "",
        }.items() if v}
        # An empty posting should not block later scripts or the og: fallback.
        if fields:
            return fields

    def _meta(prop):
        # Accept both <meta property=...> and <meta name=...> spellings.
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        return tag.get("content", "") if tag else ""

    title_tag = soup.find("title")
    title = _meta("og:title") or (title_tag.get_text(strip=True) if title_tag else "")
    description = _meta("og:description")
    return {k: v for k, v in {"title": title, "description": description}.items() if v}
|
|
|
|
|
|
def _scrape_generic(url: str) -> dict:
    """Fetch an arbitrary job page and extract fields from its HTML.

    Downloads ``url`` with the module's headers and timeout and passes the
    body to ``_parse_json_ld_or_og``. HTTP errors propagate via
    ``raise_for_status()``. Returns ``{}`` when nothing could be extracted.
    """
    response = requests.get(url, timeout=_TIMEOUT, headers=_HEADERS)
    response.raise_for_status()
    extracted = _parse_json_ld_or_og(response.text)
    return extracted if extracted else {}
|
|
|
|
|
|
def scrape_job_url(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None) -> dict:
    """Fetch the job listing at the stored URL and update the job record.

    Looks up the ``url`` column of the ``jobs`` row with id ``job_id``, picks
    a scraper based on the job board, and writes any scraped fields back with
    ``update_job_fields``.

    Scrape failures (HTTP errors or anything else raised by a scraper) are
    logged and the job row is left as-is. Errors raised by the database
    lookup or by ``update_job_fields`` are not caught here.

    Args:
        db_path: Path to the SQLite database.
        job_id: Primary key of the job to scrape; ``None`` is a no-op.

    Returns:
        The dict of scraped fields; ``{}`` when the job is missing, its URL
        does not start with ``http``, or scraping failed.
    """
    if job_id is None:
        return {}

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        # Close even when the query raises so the connection is not leaked.
        conn.close()
    if not row:
        return {}

    url = row["url"] or ""
    if not url.startswith("http"):
        return {}

    board = _detect_board(url)
    try:
        if board == "linkedin":
            fields = _scrape_linkedin(url)
        elif board == "indeed":
            fields = _scrape_indeed(url)
        elif board == "glassdoor":
            fields = _scrape_glassdoor(url)
        else:
            fields = _scrape_generic(url)
    except requests.RequestException as exc:
        print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}")
        return {}
    except Exception as exc:
        # Top-level boundary for this background task: log and give up.
        print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}")
        return {}

    if fields:
        # The stored URL is the lookup key for this scrape; never overwrite it.
        fields.pop("url", None)
        update_job_fields(db_path, job_id, fields)
        print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}")

    return fields
|