From b3893e9ad945917adcb44757af3961fa272f0fdb Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Sun, 15 Mar 2026 04:56:26 -0700
Subject: [PATCH] feat: add Jobgether URL detection and scraper to
 scrape_url.py

---
 scripts/scrape_url.py    | 67 ++++++++++++++++++++++++++++++++++++++++
 tests/test_scrape_url.py | 33 ++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/scripts/scrape_url.py b/scripts/scrape_url.py
index e577fe6..ea55306 100644
--- a/scripts/scrape_url.py
+++ b/scripts/scrape_url.py
@@ -33,6 +33,20 @@ _STRIP_PARAMS = {
     "eid", "otpToken", "ssid", "fmid",
 }
 
+def _company_from_jobgether_url(url: str) -> str:
+    """Return the title-cased company from a Jobgether offer URL, or "" if absent.
+
+    Slug format: /offer/{24-hex-hash}-{title-slug}---{company-slug}; the last
+    "---" wins, and a trailing "/" or stray hyphens around the company are ignored.
+    """
+    _, sep, slug = urlparse(url).path.rstrip("/").rpartition("---")
+    company = slug.strip("-").replace("-", " ").title()
+    if not (sep and company) or "/" in slug:
+        print(f"[scrape_url] Jobgether URL slug: no company separator found in {url}")
+        return ""
+    return company
+
+
 _HEADERS = {
     "User-Agent": (
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
@@ -51,6 +65,8 @@ def _detect_board(url: str) -> str:
         return "indeed"
     if "glassdoor.com" in url_lower:
         return "glassdoor"
+    if "jobgether.com" in url_lower:
+        return "jobgether"
     return "generic"
 
 
@@ -136,6 +152,55 @@ def _scrape_glassdoor(url: str) -> dict:
         return {}
 
 
+def _scrape_jobgether(url: str) -> dict:
+    """Scrape a Jobgether offer page using Playwright to bypass 403.
+
+    Returns the non-empty title/company/location/description fields plus source;
+    falls back to the URL slug for company when Playwright is unavailable or fails
+    (then {} if the slug has none). Does not use requests — no raise_for_status().
+    """
+    try:
+        from playwright.sync_api import TimeoutError as PlaywrightTimeout, sync_playwright
+    except ImportError:
+        company = _company_from_jobgether_url(url)
+        if company:
+            print(f"[scrape_url] Jobgether: Playwright not installed, using slug fallback → {company}")
+        return {"company": company, "source": "jobgether"} if company else {}
+
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            try:
+                ctx = browser.new_context(user_agent=_HEADERS["User-Agent"])
+                page = ctx.new_page()
+                page.goto(url, timeout=30_000)
+                try:
+                    page.wait_for_load_state("networkidle", timeout=20_000)
+                except PlaywrightTimeout:
+                    pass  # goto() already waited for load; polling/analytics may never go idle
+                result = page.evaluate("""() => {
+                    const title = document.querySelector('h1')?.textContent?.trim() || '';
+                    const company = document.querySelector('[class*="company"], [class*="employer"], [data-testid*="company"]')
+                        ?.textContent?.trim() || '';
+                    const location = document.querySelector('[class*="location"], [data-testid*="location"]')
+                        ?.textContent?.trim() || '';
+                    const desc = document.querySelector('[class*="description"], [class*="job-desc"], article')
+                        ?.innerText?.trim() || '';
+                    return { title, company, location, description: desc };
+                }""")
+            finally:
+                browser.close()
+
+        # Fall back to slug for company if DOM extraction missed it
+        result["company"] = result.get("company") or _company_from_jobgether_url(url)
+        result["source"] = "jobgether"
+        return {k: v for k, v in result.items() if v}
+    except Exception as exc:  # best-effort scraper: any browser failure degrades to the slug
+        print(f"[scrape_url] Jobgether Playwright error for {url}: {exc}")
+        company = _company_from_jobgether_url(url)
+        return {"company": company, "source": "jobgether"} if company else {}
+
+
 def _parse_json_ld_or_og(html: str) -> dict:
     """Extract job fields from JSON-LD structured data, then og: meta tags."""
     soup = BeautifulSoup(html, "html.parser")
@@ -211,6 +276,8 @@ def scrape_job_url(db_path: Path = DEFAULT_DB, job_id: int = None) -> dict:
             fields = _scrape_indeed(url)
         elif board == "glassdoor":
             fields = _scrape_glassdoor(url)
+        elif board == "jobgether":
+            fields = _scrape_jobgether(url)
         else:
             fields = _scrape_generic(url)
     except requests.RequestException as exc:
diff --git a/tests/test_scrape_url.py b/tests/test_scrape_url.py
index 37eace4..df599ae 100644
--- a/tests/test_scrape_url.py
+++ b/tests/test_scrape_url.py
@@ -133,3 +133,36 @@ def test_scrape_url_graceful_on_http_error(tmp_path):
     row = conn.execute("SELECT id FROM jobs WHERE id=?", (job_id,)).fetchone()
     conn.close()
     assert row is not None
+
+
+def test_detect_board_jobgether():
+    from scripts.scrape_url import _detect_board as detect  # bare and www hosts must both match
+    bare = detect("https://jobgether.com/offer/69b42d9d24d79271ee0618e8-csm---resware")
+    assert (bare, detect("https://www.jobgether.com/offer/abc-role---company")) == ("jobgether", "jobgether")
+
+
+def test_jobgether_slug_company_extraction():
+    """Company is the title-cased text after "---"; "" when the separator is absent."""
+    from scripts.scrape_url import _company_from_jobgether_url as company_of
+
+    expected = {
+        "https://jobgether.com/offer/69b42d9d24d79271ee0618e8-customer-success-manager---resware": "Resware",
+        "https://jobgether.com/offer/abc123-director-of-cs---acme-corp": "Acme Corp",
+        "https://jobgether.com/offer/abc123-no-separator-here": "",
+    }
+    for url, company in expected.items():
+        assert company_of(url) == company
+
+
+def test_scrape_jobgether_no_playwright():
+    """When Playwright is unavailable, _scrape_jobgether falls back to URL slug for company."""
+    import sys
+    from unittest import mock
+    # Import before patching: patch.dict restores sys.modules on exit and would evict the module.
+    from scripts.scrape_url import _scrape_jobgether
+    url = "https://jobgether.com/offer/69b42d9d24d79271ee0618e8-customer-success-manager---resware"
+    with mock.patch.dict(sys.modules, {"playwright": None, "playwright.sync_api": None}):  # None → ImportError
+        result = _scrape_jobgether(url)
+
+    assert result.get("company") == "Resware"
+    assert result.get("source") == "jobgether"