feat: add /extract-links endpoint with URL scoring
This commit is contained in:
parent
b718439c54
commit
aee8038be9
2 changed files with 120 additions and 0 deletions
76
dev-api.py
76
dev-api.py
|
|
@ -10,6 +10,7 @@ import sys
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
|
from urllib.parse import urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -71,6 +72,63 @@ def _startup():
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Link extraction helpers ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
_JOB_DOMAINS = frozenset({
|
||||||
|
'greenhouse.io', 'lever.co', 'workday.com', 'linkedin.com',
|
||||||
|
'ashbyhq.com', 'smartrecruiters.com', 'icims.com', 'taleo.net',
|
||||||
|
'jobvite.com', 'breezy.hr', 'recruitee.com', 'bamboohr.com',
|
||||||
|
'myworkdayjobs.com',
|
||||||
|
})
|
||||||
|
|
||||||
|
_JOB_PATH_SEGMENTS = frozenset({'careers', 'jobs'})
|
||||||
|
|
||||||
|
_FILTER_RE = re.compile(
|
||||||
|
r'(unsubscribe|mailto:|/track/|pixel\.|\.gif|\.png|\.jpg'
|
||||||
|
r'|/open\?|/click\?|list-unsubscribe)',
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.I)
|
||||||
|
|
||||||
|
|
||||||
|
def _score_url(url: str) -> int:
|
||||||
|
"""Return 2 for likely job URLs, 1 for others, -1 to exclude."""
|
||||||
|
if _FILTER_RE.search(url):
|
||||||
|
return -1
|
||||||
|
parsed = urlparse(url)
|
||||||
|
hostname = (parsed.hostname or '').lower()
|
||||||
|
path = parsed.path.lower()
|
||||||
|
for domain in _JOB_DOMAINS:
|
||||||
|
if domain in hostname or domain in path:
|
||||||
|
return 2
|
||||||
|
for seg in _JOB_PATH_SEGMENTS:
|
||||||
|
if f'/{seg}/' in path or path.startswith(f'/{seg}'):
|
||||||
|
return 2
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_links(body: str) -> list[dict]:
    """Extract and rank URLs found in a raw HTML email body.

    Returns a list of ``{'url', 'score', 'hint'}`` dicts, best score first.
    Tracker/asset URLs (score < 0) are dropped and each distinct URL is
    reported once; ``hint`` is the line of text immediately preceding the
    URL in the body.
    """
    if not body:
        return []

    visited: set[str] = set()
    found: list[dict] = []
    for match in _URL_RE.finditer(body):
        link = match.group(0).rstrip('.,;)')
        if link in visited:
            continue
        visited.add(link)
        rank = _score_url(link)
        if rank < 0:
            continue
        # Up to 60 chars of preceding text, trimmed to its last line,
        # serves as a human-readable hint for the link.
        window = body[max(0, match.start() - 60):match.start()]
        context = window.strip().split('\n')[-1].strip()
        found.append({'url': link, 'score': rank, 'hint': context})

    # Stable sort: higher scores first, body order preserved within a score.
    found.sort(key=lambda entry: entry['score'], reverse=True)
    return found
|
||||||
|
|
||||||
|
|
||||||
def _row_to_job(row) -> dict:
|
def _row_to_job(row) -> dict:
|
||||||
d = dict(row)
|
d = dict(row)
|
||||||
d["is_remote"] = bool(d.get("is_remote", 0))
|
d["is_remote"] = bool(d.get("is_remote", 0))
|
||||||
|
|
@ -500,6 +558,24 @@ def add_to_digest_queue(body: DigestQueueBody):
|
||||||
return {"ok": True, "created": created}
|
return {"ok": True, "created": created}
|
||||||
|
|
||||||
|
|
||||||
|
# ── POST /api/digest-queue/{id}/extract-links ─────────────────────────────


@app.post("/api/digest-queue/{digest_id}/extract-links")
def extract_digest_links(digest_id: int):
    """Extract ranked candidate links from the email behind a digest entry.

    Joins the digest_queue row to its stored job_contacts email body and
    runs ``_extract_links`` over it.

    Raises:
        HTTPException: 404 if no digest_queue row with this id exists.
    """
    db = _get_db()
    try:
        row = db.execute(
            """SELECT jc.body
            FROM digest_queue dq
            JOIN job_contacts jc ON jc.id = dq.job_contact_id
            WHERE dq.id = ?""",
            (digest_id,),
        ).fetchone()
    finally:
        # Close even when the query raises — previously the handle leaked
        # on error paths.
        db.close()
    if not row:
        raise HTTPException(404, "Digest entry not found")
    return {"links": _extract_links(row["body"] or "")}
|
||||||
|
|
||||||
|
|
||||||
# ── POST /api/jobs/{id}/move ───────────────────────────────────────────────────
|
# ── POST /api/jobs/{id}/move ───────────────────────────────────────────────────
|
||||||
|
|
||||||
STATUS_TIMESTAMP_COL = {
|
STATUS_TIMESTAMP_COL = {
|
||||||
|
|
|
||||||
|
|
@ -108,3 +108,47 @@ def test_digest_queue_add_duplicate(client):
|
||||||
def test_digest_queue_add_missing_contact(client):
|
def test_digest_queue_add_missing_contact(client):
|
||||||
resp = client.post("/api/digest-queue", json={"job_contact_id": 9999})
|
resp = client.post("/api/digest-queue", json={"job_contact_id": 9999})
|
||||||
assert resp.status_code == 404
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
# ── POST /api/digest-queue/{id}/extract-links ───────────────────────────────
|
||||||
|
|
||||||
|
def _add_digest_entry(tmp_db, contact_id=10):
|
||||||
|
"""Helper: insert a digest_queue row and return its id."""
|
||||||
|
con = sqlite3.connect(tmp_db)
|
||||||
|
cur = con.execute("INSERT INTO digest_queue (job_contact_id) VALUES (?)", (contact_id,))
|
||||||
|
entry_id = cur.lastrowid
|
||||||
|
con.commit()
|
||||||
|
con.close()
|
||||||
|
return entry_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_digest_extract_links(client, tmp_db):
    """Both ATS links in the seeded email body come back with score 2."""
    digest_id = _add_digest_entry(tmp_db)
    response = client.post(f"/api/digest-queue/{digest_id}/extract-links")
    assert response.status_code == 200
    extracted = response.json()["links"]

    # Exactly one greenhouse.io and one lever.co link, each scored 2.
    for ats_domain in ("greenhouse.io", "lever.co"):
        matches = [link for link in extracted if ats_domain in link["url"]]
        assert len(matches) == 1
        assert matches[0]["score"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_digest_extract_links_filters_trackers(client, tmp_db):
    """Unsubscribe/tracker URLs must never appear in the extracted links."""
    digest_id = _add_digest_entry(tmp_db)
    response = client.post(f"/api/digest-queue/{digest_id}/extract-links")
    assert response.status_code == 200
    extracted = response.json()["links"]
    assert all("unsubscribe" not in link["url"] for link in extracted)
|
||||||
|
|
||||||
|
|
||||||
|
def test_digest_extract_links_404(client):
    """An unknown digest id yields a 404, not an empty link list."""
    response = client.post("/api/digest-queue/9999/extract-links")
    assert response.status_code == 404
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue