feat(signals): strip HTML and normalize whitespace from email bodies
This commit is contained in:
parent
909fe60908
commit
34494db8d8
2 changed files with 30 additions and 1 deletions
15
dev-api.py
15
dev-api.py
|
|
@ -10,6 +10,7 @@ import sys
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from fastapi import FastAPI, HTTPException, Response
|
from fastapi import FastAPI, HTTPException, Response
|
||||||
|
|
@ -40,6 +41,18 @@ def _get_db():
|
||||||
return db
|
return db
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_html(text: str | None) -> str | None:
|
||||||
|
"""Strip HTML tags and normalize whitespace in email body text."""
|
||||||
|
if not text:
|
||||||
|
return text
|
||||||
|
plain = BeautifulSoup(text, 'html.parser').get_text(separator='\n')
|
||||||
|
# Strip trailing whitespace from each line
|
||||||
|
lines = [line.rstrip() for line in plain.split('\n')]
|
||||||
|
# Collapse 3+ consecutive blank lines to at most 2
|
||||||
|
cleaned = re.sub(r'\n{3,}', '\n\n', '\n'.join(lines))
|
||||||
|
return cleaned.strip() or None
|
||||||
|
|
||||||
|
|
||||||
def _row_to_job(row) -> dict:
|
def _row_to_job(row) -> dict:
|
||||||
d = dict(row)
|
d = dict(row)
|
||||||
d["is_remote"] = bool(d.get("is_remote", 0))
|
d["is_remote"] = bool(d.get("is_remote", 0))
|
||||||
|
|
@ -321,7 +334,7 @@ def list_interviews():
|
||||||
"subject": sr["subject"],
|
"subject": sr["subject"],
|
||||||
"received_at": sr["received_at"],
|
"received_at": sr["received_at"],
|
||||||
"stage_signal": sr["stage_signal"],
|
"stage_signal": sr["stage_signal"],
|
||||||
"body": sr["body"],
|
"body": _strip_html(sr["body"]),
|
||||||
"from_addr": sr["from_addr"],
|
"from_addr": sr["from_addr"],
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -200,3 +200,19 @@ def test_reclassify_signal_404_for_missing_id(client):
|
||||||
resp = client.post("/api/stage-signals/9999/reclassify",
|
resp = client.post("/api/stage-signals/9999/reclassify",
|
||||||
json={"stage_signal": "neutral"})
|
json={"stage_signal": "neutral"})
|
||||||
assert resp.status_code == 404
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_signal_body_html_is_stripped(client, tmp_db):
    """Check that /api/interviews returns signal bodies without HTML markup.

    Overwrites the body of ``job_contacts`` row 10 with an HTML document,
    then fetches the interview list and verifies that the first stage
    signal of job 1 carries the paragraph text but no ``<`` characters.

    NOTE(review): assumes the fixture database seeds row id 10 as the first
    stage signal of job 1 -- confirm against the fixture definition.
    """
    import sqlite3
    from contextlib import closing

    html_body = "<html><body><p>Hi there,</p><p>Interview confirmed.</p></body></html>"

    # closing() releases the connection even if the UPDATE raises, so a
    # failing statement cannot leave the temp database file held open.
    with closing(sqlite3.connect(tmp_db)) as con:
        con.execute(
            "UPDATE job_contacts SET body = ? WHERE id = 10",
            (html_body,),
        )
        con.commit()

    resp = client.get("/api/interviews")
    jobs = {j["id"]: j for j in resp.json()}
    body = jobs[1]["stage_signals"][0]["body"]

    assert "<" not in body
    assert "Hi there" in body
    assert "Interview confirmed" in body
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue