fix: prefer HTML body in imap_sync, strip head/style/script, remove 4000-char truncation
Some checks failed
CI / test (pull_request) Failing after 25s
Some checks failed
CI / test (pull_request) Failing after 25s
- _parse_message now prefers text/html over text/plain so digest emails retain href attribute values needed for link extraction
- Strip <head>, <style>, <script> blocks before storing to remove CSS/JS garbage while keeping anchor tags intact
- Remove [:4000] truncation — digest emails need full body for URL regex
- Update test: large body should NOT be truncated (assert len == 10_000)
This commit is contained in:
parent
8c3c0340ff
commit
b9ef1f631e
2 changed files with 31 additions and 9 deletions
|
|
@ -698,21 +698,43 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]:
|
|||
return None
|
||||
msg = email.message_from_bytes(data[0][1])
|
||||
|
||||
body = ""
|
||||
# Prefer text/html (preserves href attributes for digest link extraction);
|
||||
# fall back to text/plain if no HTML part exists.
|
||||
html_body = ""
|
||||
plain_body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
ct = part.get_content_type()
|
||||
if ct == "text/html" and not html_body:
|
||||
try:
|
||||
body = part.get_payload(decode=True).decode("utf-8", errors="replace")
|
||||
html_body = part.get_payload(decode=True).decode("utf-8", errors="replace")
|
||||
except Exception:
|
||||
pass
|
||||
elif ct == "text/plain" and not plain_body:
|
||||
try:
|
||||
plain_body = part.get_payload(decode=True).decode("utf-8", errors="replace")
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
else:
|
||||
ct = msg.get_content_type()
|
||||
try:
|
||||
body = msg.get_payload(decode=True).decode("utf-8", errors="replace")
|
||||
raw = msg.get_payload(decode=True).decode("utf-8", errors="replace")
|
||||
if ct == "text/html":
|
||||
html_body = raw
|
||||
else:
|
||||
plain_body = raw
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if html_body:
|
||||
# Strip <head>…</head> (CSS, meta, title) and any stray <style> blocks.
|
||||
# Keeps <body> HTML intact so href attributes survive for digest extraction.
|
||||
body = re.sub(r"<head[\s\S]*?</head>", "", html_body, flags=re.I)
|
||||
body = re.sub(r"<style[\s\S]*?</style>", "", body, flags=re.I)
|
||||
body = re.sub(r"<script[\s\S]*?</script>", "", body, flags=re.I)
|
||||
else:
|
||||
body = plain_body
|
||||
|
||||
mid = msg.get("Message-ID", "").strip()
|
||||
if not mid:
|
||||
return None # No Message-ID → can't dedup; skip to avoid repeat inserts
|
||||
|
|
@ -723,7 +745,7 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]:
|
|||
"from_addr": _decode_str(msg.get("From")),
|
||||
"to_addr": _decode_str(msg.get("To")),
|
||||
"date": _decode_str(msg.get("Date")),
|
||||
"body": body[:4000],
|
||||
"body": body, # no truncation — digest emails need full content
|
||||
}
|
||||
except Exception:
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -1024,8 +1024,8 @@ def test_sync_all_per_job_exception_continues(tmp_path):
|
|||
|
||||
# ── Performance / edge cases ──────────────────────────────────────────────────
|
||||
|
||||
def test_parse_message_large_body_truncated():
|
||||
"""Body longer than 4000 chars is silently truncated to 4000."""
|
||||
def test_parse_message_large_body_not_truncated():
|
||||
"""Body longer than 4000 chars is stored in full (no truncation)."""
|
||||
from scripts.imap_sync import _parse_message
|
||||
|
||||
big_body = ("x" * 10_000).encode()
|
||||
|
|
@ -1037,7 +1037,7 @@ def test_parse_message_large_body_truncated():
|
|||
conn.fetch.return_value = ("OK", [(b"1 (RFC822)", raw)])
|
||||
result = _parse_message(conn, b"1")
|
||||
assert result is not None
|
||||
assert len(result["body"]) <= 4000
|
||||
assert len(result["body"]) == 10_000
|
||||
|
||||
|
||||
def test_parse_message_binary_attachment_no_crash():
|
||||
|
|
|
|||
Loading…
Reference in a new issue