feat(signals): strip HTML and normalize whitespace from email bodies
This commit is contained in:
parent
eff68791f4
commit
d10093cfa0
2 changed files with 30 additions and 1 deletions
15
dev-api.py
15
dev-api.py
|
|
@ -10,6 +10,7 @@ import sys
|
|||
import re
|
||||
import json
|
||||
import threading
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from fastapi import FastAPI, HTTPException, Response
|
||||
|
|
@ -40,6 +41,18 @@ def _get_db():
|
|||
return db
|
||||
|
||||
|
||||
def _strip_html(text: str | None) -> str | None:
|
||||
"""Strip HTML tags and normalize whitespace in email body text."""
|
||||
if not text:
|
||||
return text
|
||||
plain = BeautifulSoup(text, 'html.parser').get_text(separator='\n')
|
||||
# Strip trailing whitespace from each line
|
||||
lines = [line.rstrip() for line in plain.split('\n')]
|
||||
# Collapse 3+ consecutive blank lines to at most 2
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', '\n'.join(lines))
|
||||
return cleaned.strip() or None
|
||||
|
||||
|
||||
def _row_to_job(row) -> dict:
|
||||
d = dict(row)
|
||||
d["is_remote"] = bool(d.get("is_remote", 0))
|
||||
|
|
@ -321,7 +334,7 @@ def list_interviews():
|
|||
"subject": sr["subject"],
|
||||
"received_at": sr["received_at"],
|
||||
"stage_signal": sr["stage_signal"],
|
||||
"body": sr["body"],
|
||||
"body": _strip_html(sr["body"]),
|
||||
"from_addr": sr["from_addr"],
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -200,3 +200,19 @@ def test_reclassify_signal_404_for_missing_id(client):
|
|||
resp = client.post("/api/stage-signals/9999/reclassify",
|
||||
json={"stage_signal": "neutral"})
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_signal_body_html_is_stripped(client, tmp_db):
    """Signal bodies returned by /api/interviews must not contain HTML markup."""
    import sqlite3
    from contextlib import closing

    html_body = "<html><body><p>Hi there,</p><p>Interview confirmed.</p></body></html>"
    # closing() releases the connection even if the UPDATE raises.
    # NOTE(review): assumes the fixture seeds job_contacts row 10 as the first
    # stage signal of job 1 -- confirm against the tmp_db fixture.
    with closing(sqlite3.connect(tmp_db)) as con:
        con.execute(
            "UPDATE job_contacts SET body = ? WHERE id = 10",
            (html_body,),
        )
        con.commit()

    resp = client.get("/api/interviews")
    # Fail early with a clear message instead of a KeyError/TypeError below.
    assert resp.status_code == 200
    jobs = {j["id"]: j for j in resp.json()}
    body = jobs[1]["stage_signals"][0]["body"]
    assert "<" not in body
    assert "Hi there" in body
    assert "Interview confirmed" in body
|
||||
|
|
|
|||
Loading…
Reference in a new issue