feat(signals): strip HTML and normalize whitespace from email bodies

This commit is contained in:
pyr0ball 2026-03-19 19:59:59 -07:00
parent eff68791f4
commit d10093cfa0
2 changed files with 30 additions and 1 deletion

View file

@ -10,6 +10,7 @@ import sys
import re
import json
import threading
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from fastapi import FastAPI, HTTPException, Response
@ -40,6 +41,18 @@ def _get_db():
return db
def _strip_html(text: str | None) -> str | None:
"""Strip HTML tags and normalize whitespace in email body text."""
if not text:
return text
plain = BeautifulSoup(text, 'html.parser').get_text(separator='\n')
# Strip trailing whitespace from each line
lines = [line.rstrip() for line in plain.split('\n')]
# Collapse 3+ consecutive blank lines to at most 2
cleaned = re.sub(r'\n{3,}', '\n\n', '\n'.join(lines))
return cleaned.strip() or None
def _row_to_job(row) -> dict:
d = dict(row)
d["is_remote"] = bool(d.get("is_remote", 0))
@ -321,7 +334,7 @@ def list_interviews():
"subject": sr["subject"],
"received_at": sr["received_at"],
"stage_signal": sr["stage_signal"],
"body": sr["body"],
"body": _strip_html(sr["body"]),
"from_addr": sr["from_addr"],
})

View file

@ -200,3 +200,19 @@ def test_reclassify_signal_404_for_missing_id(client):
resp = client.post("/api/stage-signals/9999/reclassify",
json={"stage_signal": "neutral"})
assert resp.status_code == 404
def test_signal_body_html_is_stripped(client, tmp_db):
    """Signal bodies stored as HTML come back from /api/interviews as plain text.

    Overwrites the body of ``job_contacts`` row 10 with a small HTML document,
    then checks that the first stage signal of job 1 contains the paragraph
    text and no ``<`` character.
    NOTE(review): presumably the ``tmp_db`` fixture seeds contact 10 as the
    first signal of job 1 -- confirm against the fixture.
    """
    import sqlite3

    html_body = "<html><body><p>Hi there,</p><p>Interview confirmed.</p></body></html>"

    con = sqlite3.connect(tmp_db)
    try:
        con.execute(
            "UPDATE job_contacts SET body = ? WHERE id = 10",
            (html_body,),
        )
        con.commit()
    finally:
        # Close even when the UPDATE/commit raises so the temp DB handle
        # is never leaked into later tests.
        con.close()

    resp = client.get("/api/interviews")
    # Fail with a clear message instead of a TypeError/KeyError further down
    # if the endpoint itself errored.
    assert resp.status_code == 200

    jobs = {j["id"]: j for j in resp.json()}
    body = jobs[1]["stage_signals"][0]["body"]
    assert "<" not in body
    assert "Hi there" in body
    assert "Interview confirmed" in body