From 347c171e263b51b6d2e4c8ed82c5e62187ff2602 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Fri, 20 Mar 2026 13:35:30 -0700 Subject: [PATCH] fix: prefer HTML body in imap_sync, strip head/style/script, remove 4000-char truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _parse_message now prefers text/html over text/plain so digest emails retain href attribute values needed for link extraction - Strip , ", "", body, flags=re.I) + body = re.sub(r"", "", body, flags=re.I) + else: + body = plain_body + mid = msg.get("Message-ID", "").strip() if not mid: return None # No Message-ID → can't dedup; skip to avoid repeat inserts @@ -723,7 +745,7 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]: "from_addr": _decode_str(msg.get("From")), "to_addr": _decode_str(msg.get("To")), "date": _decode_str(msg.get("Date")), - "body": body[:4000], + "body": body, # no truncation — digest emails need full content } except Exception: return None diff --git a/tests/test_imap_sync.py b/tests/test_imap_sync.py index f9cc4e5..5bdc687 100644 --- a/tests/test_imap_sync.py +++ b/tests/test_imap_sync.py @@ -1024,8 +1024,8 @@ def test_sync_all_per_job_exception_continues(tmp_path): # ── Performance / edge cases ────────────────────────────────────────────────── -def test_parse_message_large_body_truncated(): - """Body longer than 4000 chars is silently truncated to 4000.""" +def test_parse_message_large_body_not_truncated(): + """Body longer than 4000 chars is stored in full (no truncation).""" from scripts.imap_sync import _parse_message big_body = ("x" * 10_000).encode() @@ -1037,7 +1037,7 @@ def test_parse_message_large_body_truncated(): conn.fetch.return_value = ("OK", [(b"1 (RFC822)", raw)]) result = _parse_message(conn, b"1") assert result is not None - assert len(result["body"]) <= 4000 + assert len(result["body"]) == 10_000 def test_parse_message_binary_attachment_no_crash():