feat: add /extract-links endpoint with URL scoring

2026-03-20 06:59:26 -07:00 · 2026-03-20 06:59:26 -07:00 · aee8038be9
commit aee8038be9
parent b718439c54
2 changed files with 120 additions and 0 deletions
--- a/dev-api.py
+++ b/dev-api.py
@ -10,6 +10,7 @@ import sys
 import re
 import json
 import threading
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from datetime import datetime
 from pathlib import Path
@ -71,6 +72,63 @@ def _startup():
        db.close()


+# ── Link extraction helpers ───────────────────────────────────────────────
+
+# Domains of applicant-tracking systems and job boards. _score_url gives a
+# URL the top score (2) when one of these appears in its hostname or path.
+_JOB_DOMAINS = frozenset({
+    'greenhouse.io', 'lever.co', 'workday.com', 'linkedin.com',
+    'ashbyhq.com', 'smartrecruiters.com', 'icims.com', 'taleo.net',
+    'jobvite.com', 'breezy.hr', 'recruitee.com', 'bamboohr.com',
+    'myworkdayjobs.com',
+})
+
+# Path components that make _score_url treat a URL as a likely job page,
+# whatever the host.
+_JOB_PATH_SEGMENTS = frozenset({'careers', 'jobs'})
+
+# Case-insensitive fragments that cause _score_url to return -1 so the URL is
+# dropped: unsubscribe links, mailto:, /track/ /open? /click? style links,
+# "pixel." hosts and .gif/.png/.jpg images.
+_FILTER_RE = re.compile(
+    r'(unsubscribe|mailto:|/track/|pixel\.|\.gif|\.png|\.jpg'
+    r'|/open\?|/click\?|list-unsubscribe)',
+    re.I,
+)
+
+# An http(s) URL running up to the first whitespace, angle bracket, quote,
+# closing parenthesis or closing square bracket.
+_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.I)
+
+
+def _score_url(url: str) -> int:
+    """Score a URL by how likely it is to point at a job posting.
+
+    Args:
+        url: Absolute http(s) URL as extracted from an email body.
+
+    Returns:
+        -1 when the URL matches ``_FILTER_RE`` or cannot be parsed (the
+        caller should drop it), 2 when it references a known job-board
+        domain or a careers/jobs path, and 1 for anything else.
+    """
+    if _FILTER_RE.search(url):
+        return -1
+    try:
+        parsed = urlparse(url)
+        hostname = (parsed.hostname or '').lower()
+    except ValueError:
+        # urlparse rejects malformed authorities such as an unbalanced '['
+        # ("Invalid IPv6 URL"); treat those as noise instead of crashing.
+        return -1
+    path = parsed.path.lower()
+    for domain in _JOB_DOMAINS:
+        # Accept the domain itself or any subdomain of it. A plain substring
+        # test would also accept look-alikes such as "notlever.co" or
+        # "lever.co.example.com".
+        if hostname == domain or hostname.endswith('.' + domain):
+            return 2
+        # Kept from the original behaviour; presumably catches redirect
+        # links that embed the real target in the path (confirm).
+        if domain in path:
+            return 2
+    for seg in _JOB_PATH_SEGMENTS:
+        # Prefix and interior matches are unchanged; the endswith check adds
+        # a trailing segment with no closing slash, e.g. "/company/jobs".
+        if (path.startswith(f'/{seg}') or f'/{seg}/' in path
+                or path.endswith(f'/{seg}')):
+            return 2
+    return 1
+
+
+def _extract_links(body: str) -> list[dict]:
+    """Extract de-duplicated URLs from an email body, best candidates first.
+
+    Args:
+        body: Raw email body (HTML); may be empty.
+
+    Returns:
+        A list of ``{'url', 'score', 'hint'}`` dicts. ``score`` comes from
+        ``_score_url`` (URLs scoring below zero are dropped) and ``hint`` is
+        the last line of the up-to-60 characters preceding the URL. Entries
+        are ordered by descending score; within one score they keep the
+        order in which they appear in ``body``.
+    """
+    if not body:
+        return []
+    seen: set[str] = set()
+    results = []
+    for m in _URL_RE.finditer(body):
+        # HTML attributes encode '&' as '&amp;'. Decode it so the query
+        # string is usable and the URL de-duplicates against an unescaped
+        # copy. Only the explicit entity is replaced: a general HTML
+        # unescape would corrupt bare parameters such as "&region=".
+        url = m.group(0).replace('&amp;', '&').rstrip('.,;)')
+        if url in seen:
+            continue
+        seen.add(url)
+        score = _score_url(url)
+        if score < 0:
+            continue
+        # Context shown next to the link: text just before it, current line.
+        start = max(0, m.start() - 60)
+        hint = body[start:m.start()].strip().split('\n')[-1].strip()
+        results.append({'url': url, 'score': score, 'hint': hint})
+    # list.sort is stable, so equal scores stay in document order.
+    results.sort(key=lambda x: -x['score'])
+    return results
+
+
 def _row_to_job(row) -> dict:
    d = dict(row)
    d["is_remote"] = bool(d.get("is_remote", 0))
@ -500,6 +558,24 @@ def add_to_digest_queue(body: DigestQueueBody):
    return {"ok": True, "created": created}


+# ── POST /api/digest-queue/{id}/extract-links ─────────────────────────────
+
+@app.post("/api/digest-queue/{digest_id}/extract-links")
+def extract_digest_links(digest_id: int):
+    """Return the ranked links found in the email behind a digest entry.
+
+    Looks up the ``job_contacts.body`` joined to the ``digest_queue`` row
+    with id ``digest_id`` and runs ``_extract_links`` over it.
+
+    Returns:
+        ``{"links": [...]}``; the list is empty when the body is NULL/empty.
+
+    Raises:
+        HTTPException: 404 when no digest entry (with a matching contact)
+            exists for ``digest_id``.
+    """
+    db = _get_db()
+    try:
+        row = db.execute(
+            """SELECT jc.body
+               FROM digest_queue dq
+               JOIN job_contacts jc ON jc.id = dq.job_contact_id
+               WHERE dq.id = ?""",
+            (digest_id,),
+        ).fetchone()
+    finally:
+        # Close the connection even if the query raises.
+        db.close()
+    if not row:
+        raise HTTPException(404, "Digest entry not found")
+    return {"links": _extract_links(row["body"] or "")}
+
+
 # ── POST /api/jobs/{id}/move ───────────────────────────────────────────────────

 STATUS_TIMESTAMP_COL = {
--- a/tests/test_dev_api_digest.py
+++ b/tests/test_dev_api_digest.py
@ -108,3 +108,47 @@ def test_digest_queue_add_duplicate(client):
 def test_digest_queue_add_missing_contact(client):
    resp = client.post("/api/digest-queue", json={"job_contact_id": 9999})
    assert resp.status_code == 404
+
+
+# ── POST /api/digest-queue/{id}/extract-links ───────────────────────────────
+
+def _add_digest_entry(tmp_db, contact_id=10):
+    """Helper: insert a digest_queue row for *contact_id* and return its id.
+
+    The connection is closed even when the INSERT or commit fails, so a
+    failing test does not leave the temporary database file open.
+    """
+    con = sqlite3.connect(tmp_db)
+    try:
+        cur = con.execute("INSERT INTO digest_queue (job_contact_id) VALUES (?)", (contact_id,))
+        con.commit()
+        return cur.lastrowid
+    finally:
+        con.close()
+
+
+def test_digest_extract_links(client, tmp_db):
+    """Job-board links in the digest body come back once each, scored 2."""
+    digest_id = _add_digest_entry(tmp_db)
+    response = client.post(f"/api/digest-queue/{digest_id}/extract-links")
+    assert response.status_code == 200
+    found = response.json()["links"]
+
+    # Each known job-board domain should yield exactly one link with score=2.
+    for domain in ("greenhouse.io", "lever.co"):
+        matching = [link for link in found if domain in link["url"]]
+        assert len(matching) == 1
+        assert matching[0]["score"] == 2
+
+
+def test_digest_extract_links_filters_trackers(client, tmp_db):
+    """Unsubscribe/tracker URLs are removed from the extracted links."""
+    entry_id = _add_digest_entry(tmp_db)
+    resp = client.post(f"/api/digest-queue/{entry_id}/extract-links")
+    assert resp.status_code == 200
+    links = resp.json()["links"]
+    urls = [link["url"] for link in links]
+    # Guard against a vacuous pass: an empty result would trivially contain
+    # no unsubscribe links.
+    assert urls
+    # Unsubscribe URL should be excluded (the filter is case-insensitive).
+    assert not any("unsubscribe" in u.lower() for u in urls)
+
+
+def test_digest_extract_links_404(client):
+    """An unknown digest id yields 404 rather than an empty link list."""
+    endpoint = "/api/digest-queue/9999/extract-links"
+    response = client.post(endpoint)
+    assert response.status_code == 404