feat(signals): strip HTML and normalize whitespace from email bodies

This commit is contained in:
pyr0ball 2026-03-19 19:59:59 -07:00
parent 909fe60908
commit 34494db8d8
2 changed files with 30 additions and 1 deletions

View file

@ -10,6 +10,7 @@ import sys
import re import re
import json import json
import threading import threading
from bs4 import BeautifulSoup
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from fastapi import FastAPI, HTTPException, Response from fastapi import FastAPI, HTTPException, Response
@ -40,6 +41,18 @@ def _get_db():
return db return db
def _strip_html(text: str | None) -> str | None:
"""Strip HTML tags and normalize whitespace in email body text."""
if not text:
return text
plain = BeautifulSoup(text, 'html.parser').get_text(separator='\n')
# Strip trailing whitespace from each line
lines = [line.rstrip() for line in plain.split('\n')]
# Collapse 3+ consecutive blank lines to at most 2
cleaned = re.sub(r'\n{3,}', '\n\n', '\n'.join(lines))
return cleaned.strip() or None
def _row_to_job(row) -> dict: def _row_to_job(row) -> dict:
d = dict(row) d = dict(row)
d["is_remote"] = bool(d.get("is_remote", 0)) d["is_remote"] = bool(d.get("is_remote", 0))
@ -321,7 +334,7 @@ def list_interviews():
"subject": sr["subject"], "subject": sr["subject"],
"received_at": sr["received_at"], "received_at": sr["received_at"],
"stage_signal": sr["stage_signal"], "stage_signal": sr["stage_signal"],
"body": sr["body"], "body": _strip_html(sr["body"]),
"from_addr": sr["from_addr"], "from_addr": sr["from_addr"],
}) })

View file

@ -200,3 +200,19 @@ def test_reclassify_signal_404_for_missing_id(client):
resp = client.post("/api/stage-signals/9999/reclassify", resp = client.post("/api/stage-signals/9999/reclassify",
json={"stage_signal": "neutral"}) json={"stage_signal": "neutral"})
assert resp.status_code == 404 assert resp.status_code == 404
def test_signal_body_html_is_stripped(client, tmp_db):
    """HTML stored in a contact body comes back tag-free from /api/interviews.

    Overwrites the body of the ``job_contacts`` row with id 10 with a small
    HTML document, then checks that the first stage signal of job 1 in the
    API response has no ``<`` characters but still contains the visible text.
    Presumably the fixture seeds contact 10 as a stage signal of job 1 --
    verify against the ``tmp_db`` fixture.
    """
    import sqlite3
    from contextlib import closing

    html_body = "<html><body><p>Hi there,</p><p>Interview confirmed.</p></body></html>"
    # closing() releases the connection even when the UPDATE fails; the inner
    # `con` context manager commits on success and rolls back on error.
    with closing(sqlite3.connect(tmp_db)) as con, con:
        con.execute(
            "UPDATE job_contacts SET body = ? WHERE id = 10",
            (html_body,),
        )

    resp = client.get("/api/interviews")
    jobs = {j["id"]: j for j in resp.json()}
    body = jobs[1]["stage_signals"][0]["body"]
    assert "<" not in body
    assert "Hi there" in body
    assert "Interview confirmed" in body