fix: prefer HTML body in imap_sync, strip head/style/script, remove 4000-char truncation

- _parse_message now prefers text/html over text/plain so digest emails retain the href attribute values needed for link extraction
- Strip <head>, <style>, and <script> blocks before storing to remove CSS/JS garbage while keeping anchor tags intact
- Remove the [:4000] truncation — digest emails need the full body for the URL regex
- Update test: a large body should NOT be truncated (assert len == 10_000)
2026-03-20 13:35:30 -07:00 · 2026-03-20 13:35:30 -07:00 · b9ef1f631e
commit b9ef1f631e
parent 8c3c0340ff
2 changed files with 31 additions and 9 deletions
--- a/scripts/imap_sync.py
+++ b/scripts/imap_sync.py
@ -698,21 +698,43 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]:
            return None
        msg = email.message_from_bytes(data[0][1])

-        body = ""
+        # Prefer text/html (preserves href attributes for digest link extraction);
+        # fall back to text/plain if no HTML part exists.
+        html_body = ""
+        plain_body = ""
        if msg.is_multipart():
            for part in msg.walk():
-                if part.get_content_type() == "text/plain":
+                ct = part.get_content_type()
+                if ct == "text/html" and not html_body:
                    try:
-                        body = part.get_payload(decode=True).decode("utf-8", errors="replace")
+                        html_body = part.get_payload(decode=True).decode("utf-8", errors="replace")
+                    except Exception:
+                        pass
+                elif ct == "text/plain" and not plain_body:
+                    try:
+                        plain_body = part.get_payload(decode=True).decode("utf-8", errors="replace")
                    except Exception:
                        pass
-                    break
        else:
+            ct = msg.get_content_type()
            try:
-                body = msg.get_payload(decode=True).decode("utf-8", errors="replace")
+                raw = msg.get_payload(decode=True).decode("utf-8", errors="replace")
+                if ct == "text/html":
+                    html_body = raw
+                else:
+                    plain_body = raw
            except Exception:
                pass

+        if html_body:
+            # Strip <head>…</head> (CSS, meta, title) and any stray <style>/<script> blocks.
+            # Keeps <body> HTML intact so href attributes survive for digest extraction.
+            body = re.sub(r"<head[\s\S]*?</head>", "", html_body, flags=re.I)
+            body = re.sub(r"<style[\s\S]*?</style>", "", body, flags=re.I)
+            body = re.sub(r"<script[\s\S]*?</script>", "", body, flags=re.I)
+        else:
+            body = plain_body
+
        mid = msg.get("Message-ID", "").strip()
        if not mid:
            return None  # No Message-ID → can't dedup; skip to avoid repeat inserts
@ -723,7 +745,7 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]:
            "from_addr":  _decode_str(msg.get("From")),
            "to_addr":    _decode_str(msg.get("To")),
            "date":       _decode_str(msg.get("Date")),
-            "body":       body[:4000],
+            "body":       body,  # no truncation — digest emails need full content
        }
    except Exception:
        return None
--- a/tests/test_imap_sync.py
+++ b/tests/test_imap_sync.py
@ -1024,8 +1024,8 @@ def test_sync_all_per_job_exception_continues(tmp_path):

 # ── Performance / edge cases ──────────────────────────────────────────────────

-def test_parse_message_large_body_truncated():
-    """Body longer than 4000 chars is silently truncated to 4000."""
+def test_parse_message_large_body_not_truncated():
+    """Body longer than 4000 chars is stored in full (no truncation)."""
    from scripts.imap_sync import _parse_message

    big_body = ("x" * 10_000).encode()
@ -1037,7 +1037,7 @@ def test_parse_message_large_body_truncated():
    conn.fetch.return_value = ("OK", [(b"1 (RFC822)", raw)])
    result = _parse_message(conn, b"1")
    assert result is not None
-    assert len(result["body"]) <= 4000
+    assert len(result["body"]) == 10_000


 def test_parse_message_binary_attachment_no_crash():