diff --git a/dev-api.py b/dev-api.py index f37e73a..8c557bc 100644 --- a/dev-api.py +++ b/dev-api.py @@ -10,6 +10,7 @@ import sys import re import json import threading +from urllib.parse import urlparse from bs4 import BeautifulSoup from datetime import datetime from pathlib import Path @@ -71,6 +72,63 @@ def _startup(): db.close() +# ── Link extraction helpers ─────────────────────────────────────────────── + +_JOB_DOMAINS = frozenset({ + 'greenhouse.io', 'lever.co', 'workday.com', 'linkedin.com', + 'ashbyhq.com', 'smartrecruiters.com', 'icims.com', 'taleo.net', + 'jobvite.com', 'breezy.hr', 'recruitee.com', 'bamboohr.com', + 'myworkdayjobs.com', +}) + +_JOB_PATH_SEGMENTS = frozenset({'careers', 'jobs'}) + +_FILTER_RE = re.compile( + r'(unsubscribe|mailto:|/track/|pixel\.|\.gif|\.png|\.jpg' + r'|/open\?|/click\?|list-unsubscribe)', + re.I, +) + +_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.I) + + +def _score_url(url: str) -> int: + """Return 2 for likely job URLs, 1 for others, -1 to exclude.""" + if _FILTER_RE.search(url): + return -1 + parsed = urlparse(url) + hostname = (parsed.hostname or '').lower() + path = parsed.path.lower() + for domain in _JOB_DOMAINS: + if domain in hostname or domain in path: + return 2 + for seg in _JOB_PATH_SEGMENTS: + if f'/{seg}/' in path or path.startswith(f'/{seg}'): + return 2 + return 1 + + +def _extract_links(body: str) -> list[dict]: + """Extract and rank URLs from raw HTML email body.""" + if not body: + return [] + seen: set[str] = set() + results = [] + for m in _URL_RE.finditer(body): + url = m.group(0).rstrip('.,;)') + if url in seen: + continue + seen.add(url) + score = _score_url(url) + if score < 0: + continue + start = max(0, m.start() - 60) + hint = body[start:m.start()].strip().split('\n')[-1].strip() + results.append({'url': url, 'score': score, 'hint': hint}) + results.sort(key=lambda x: -x['score']) + return results + + def _row_to_job(row) -> dict: d = dict(row) d["is_remote"] = bool(d.get("is_remote", 
# ── POST /api/digest-queue/{id}/extract-links ─────────────────────────────

@app.post("/api/digest-queue/{digest_id}/extract-links")
def extract_digest_links(digest_id: int):
    """Return ranked job-posting links extracted from a queued email body.

    Looks up the digest-queue entry, joins its contact's raw email body,
    and runs ``_extract_links`` over it.

    Raises:
        HTTPException(404): the digest entry (or joined contact) is missing.
    """
    db = _get_db()
    try:
        row = db.execute(
            """SELECT jc.body
            FROM digest_queue dq
            JOIN job_contacts jc ON jc.id = dq.job_contact_id
            WHERE dq.id = ?""",
            (digest_id,),
        ).fetchone()
    finally:
        # Close even when the query raises, so a connection never leaks.
        db.close()
    if not row:
        raise HTTPException(404, "Digest entry not found")
    return {"links": _extract_links(row["body"] or "")}


# ── POST /api/digest-queue/{id}/extract-links ───────────────────────────────

def _add_digest_entry(tmp_db, contact_id=10):
    """Helper: insert a digest_queue row and return its id."""
    con = sqlite3.connect(tmp_db)
    try:
        cur = con.execute(
            "INSERT INTO digest_queue (job_contact_id) VALUES (?)",
            (contact_id,),
        )
        con.commit()
        return cur.lastrowid
    finally:
        # Close even if the INSERT fails so the temp DB isn't held open.
        con.close()


def test_digest_extract_links(client, tmp_db):
    entry_id = _add_digest_entry(tmp_db)
    resp = client.post(f"/api/digest-queue/{entry_id}/extract-links")
    assert resp.status_code == 200
    links = resp.json()["links"]

    # greenhouse.io link should be present with score=2
    gh_links = [link for link in links if "greenhouse.io" in link["url"]]
    assert len(gh_links) == 1
    assert gh_links[0]["score"] == 2

    # lever.co link should be present with score=2
    lever_links = [link for link in links if "lever.co" in link["url"]]
    assert len(lever_links) == 1
    assert lever_links[0]["score"] == 2


def test_digest_extract_links_filters_trackers(client, tmp_db):
    entry_id = _add_digest_entry(tmp_db)
    resp = client.post(f"/api/digest-queue/{entry_id}/extract-links")
    assert resp.status_code == 200
    urls = [link["url"] for link in resp.json()["links"]]
    # Unsubscribe URL should be excluded
    assert not any("unsubscribe" in u for u in urls)


def test_digest_extract_links_404(client):
    resp = client.post("/api/digest-queue/9999/extract-links")
    assert resp.status_code == 404