feat: add /extract-links endpoint with URL scoring

pyr0ball 2026-03-20 06:59:26 -07:00
parent b718439c54
commit aee8038be9
2 changed files with 120 additions and 0 deletions
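
A minimal usage sketch (hypothetical entry id and link values; assumes the
FastAPI test client used in the test suite):

    resp = client.post("/api/digest-queue/42/extract-links")
    resp.json()
    # {"links": [{"url": "https://boards.greenhouse.io/acme/jobs/1",
    #             "score": 2, "hint": "Apply here:"}]}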


@@ -10,6 +10,7 @@ import sys
import re
import json
import threading
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
@@ -71,6 +72,63 @@ def _startup():
    db.close()

# ── Link extraction helpers ───────────────────────────────────────────────
_JOB_DOMAINS = frozenset({
    'greenhouse.io', 'lever.co', 'workday.com', 'linkedin.com',
    'ashbyhq.com', 'smartrecruiters.com', 'icims.com', 'taleo.net',
    'jobvite.com', 'breezy.hr', 'recruitee.com', 'bamboohr.com',
    'myworkdayjobs.com',
})
_JOB_PATH_SEGMENTS = frozenset({'careers', 'jobs'})
_FILTER_RE = re.compile(
    r'(unsubscribe|mailto:|/track/|pixel\.|\.gif|\.png|\.jpg'
    r'|/open\?|/click\?|list-unsubscribe)',
    re.I,
)
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.I)


def _score_url(url: str) -> int:
    """Return 2 for likely job URLs, 1 for others, -1 to exclude."""
    if _FILTER_RE.search(url):
        return -1
    parsed = urlparse(url)
    hostname = (parsed.hostname or '').lower()
    path = parsed.path.lower()
    for domain in _JOB_DOMAINS:
        if domain in hostname or domain in path:
            return 2
    for seg in _JOB_PATH_SEGMENTS:
        if f'/{seg}/' in path or path.startswith(f'/{seg}'):
            return 2
    return 1
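
# Illustrative scores (hypothetical URLs, for documentation only):
#   _score_url('https://boards.greenhouse.io/acme/jobs/123')  -> 2   (ATS domain)
#   _score_url('https://example.com/careers/senior-dev')      -> 2   (careers path)
#   _score_url('https://example.com/blog/post')               -> 1   (generic link)
#   _score_url('https://example.com/unsubscribe?id=1')        -> -1  (tracker, excluded)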

def _extract_links(body: str) -> list[dict]:
    """Extract and rank URLs from raw HTML email body."""
    if not body:
        return []
    seen: set[str] = set()
    results = []
    for m in _URL_RE.finditer(body):
        url = m.group(0).rstrip('.,;)')
        if url in seen:
            continue
        seen.add(url)
        score = _score_url(url)
        if score < 0:
            continue
        start = max(0, m.start() - 60)
        hint = body[start:m.start()].strip().split('\n')[-1].strip()
        results.append({'url': url, 'score': score, 'hint': hint})
    results.sort(key=lambda x: -x['score'])
    return results
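
# Illustrative call (hypothetical input, for documentation only):
#   _extract_links('Apply here: https://jobs.lever.co/acme/123.')
#   -> [{'url': 'https://jobs.lever.co/acme/123', 'score': 2, 'hint': 'Apply here:'}]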

def _row_to_job(row) -> dict:
    d = dict(row)
    d["is_remote"] = bool(d.get("is_remote", 0))
@@ -500,6 +558,24 @@ def add_to_digest_queue(body: DigestQueueBody):
return {"ok": True, "created": created} return {"ok": True, "created": created}

# ── POST /api/digest-queue/{id}/extract-links ─────────────────────────────
@app.post("/api/digest-queue/{digest_id}/extract-links")
def extract_digest_links(digest_id: int):
    db = _get_db()
    row = db.execute(
        """SELECT jc.body
           FROM digest_queue dq
           JOIN job_contacts jc ON jc.id = dq.job_contact_id
           WHERE dq.id = ?""",
        (digest_id,),
    ).fetchone()
    db.close()
    if not row:
        raise HTTPException(404, "Digest entry not found")
    return {"links": _extract_links(row["body"] or "")}

# ── POST /api/jobs/{id}/move ───────────────────────────────────────────────────
STATUS_TIMESTAMP_COL = {


@@ -108,3 +108,47 @@ def test_digest_queue_add_duplicate(client):
def test_digest_queue_add_missing_contact(client):
    resp = client.post("/api/digest-queue", json={"job_contact_id": 9999})
    assert resp.status_code == 404

# ── POST /api/digest-queue/{id}/extract-links ───────────────────────────────
def _add_digest_entry(tmp_db, contact_id=10):
    """Helper: insert a digest_queue row and return its id."""
    con = sqlite3.connect(tmp_db)
    cur = con.execute("INSERT INTO digest_queue (job_contact_id) VALUES (?)", (contact_id,))
    entry_id = cur.lastrowid
    con.commit()
    con.close()
    return entry_id


def test_digest_extract_links(client, tmp_db):
    entry_id = _add_digest_entry(tmp_db)
    resp = client.post(f"/api/digest-queue/{entry_id}/extract-links")
    assert resp.status_code == 200
    links = resp.json()["links"]
    # greenhouse.io link should be present with score=2
    gh_links = [l for l in links if "greenhouse.io" in l["url"]]
    assert len(gh_links) == 1
    assert gh_links[0]["score"] == 2
    # lever.co link should be present with score=2
    lever_links = [l for l in links if "lever.co" in l["url"]]
    assert len(lever_links) == 1
    assert lever_links[0]["score"] == 2


def test_digest_extract_links_filters_trackers(client, tmp_db):
    entry_id = _add_digest_entry(tmp_db)
    resp = client.post(f"/api/digest-queue/{entry_id}/extract-links")
    assert resp.status_code == 200
    links = resp.json()["links"]
    urls = [l["url"] for l in links]
    # Unsubscribe URL should be excluded
    assert not any("unsubscribe" in u for u in urls)


def test_digest_extract_links_404(client):
    resp = client.post("/api/digest-queue/9999/extract-links")
    assert resp.status_code == 404