diff --git a/dev-api.py b/dev-api.py
index 2c86d05..f30f378 100644
--- a/dev-api.py
+++ b/dev-api.py
@@ -10,6 +10,7 @@ import sys
 import re
 import json
 import threading
+from bs4 import BeautifulSoup
 from datetime import datetime
 from pathlib import Path
 from fastapi import FastAPI, HTTPException, Response
@@ -40,6 +41,26 @@ def _get_db():
     return db
 
 
+def _strip_html(text: str | None) -> str | None:
+    """Convert an HTML email body to plain text with normalized whitespace.
+
+    Tags are removed with BeautifulSoup's ``html.parser`` backend and the
+    remaining text pieces are joined with newlines. Trailing whitespace is
+    stripped from every line, runs of blank lines are collapsed to a single
+    blank line, and leading/trailing whitespace of the result is removed.
+
+    Falsy input (``None`` or ``""``) is returned unchanged. Returns ``None``
+    when nothing but whitespace remains after stripping.
+    """
+    if not text:
+        return text
+    plain = BeautifulSoup(text, 'html.parser').get_text(separator='\n')
+    # Right-strip each line so whitespace-only lines count as blank below.
+    lines = [line.rstrip() for line in plain.split('\n')]
+    # 3+ consecutive newlines (2+ blank lines) become exactly one blank line.
+    cleaned = re.sub(r'\n{3,}', '\n\n', '\n'.join(lines))
+    return cleaned.strip() or None
+
+
 def _row_to_job(row) -> dict:
     d = dict(row)
     d["is_remote"] = bool(d.get("is_remote", 0))
@@ -321,7 +342,7 @@ def list_interviews():
                 "subject": sr["subject"],
                 "received_at": sr["received_at"],
                 "stage_signal": sr["stage_signal"],
-                "body": sr["body"],
+                "body": _strip_html(sr["body"]),
                 "from_addr": sr["from_addr"],
             })
 
diff --git a/tests/test_dev_api_interviews.py b/tests/test_dev_api_interviews.py
index eb30f87..06803c3 100644
--- a/tests/test_dev_api_interviews.py
+++ b/tests/test_dev_api_interviews.py
@@ -200,3 +200,24 @@ def test_reclassify_signal_404_for_missing_id(client):
     resp = client.post("/api/stage-signals/9999/reclassify",
                        json={"stage_signal": "neutral"})
     assert resp.status_code == 404
+
+
+def test_signal_body_html_is_stripped(client, tmp_db):
+    """HTML markup in a stored signal body is removed by /api/interviews."""
+    import sqlite3
+    from contextlib import closing
+
+    # closing() releases the connection even if the UPDATE raises.
+    with closing(sqlite3.connect(tmp_db)) as con:
+        con.execute(
+            "UPDATE job_contacts SET body = ? WHERE id = 10",
+            ("<p>Hi there,</p><p>Interview confirmed.</p>",),
+        )
+        con.commit()
+    resp = client.get("/api/interviews")
+    assert resp.status_code == 200
+    jobs = {j["id"]: j for j in resp.json()}
+    body = jobs[1]["stage_signals"][0]["body"]
+    assert "<" not in body
+    assert "Hi there" in body
+    assert "Interview confirmed" in body