diff --git a/scripts/imap_sync.py b/scripts/imap_sync.py index 220a54f..e900aed 100644 --- a/scripts/imap_sync.py +++ b/scripts/imap_sync.py @@ -698,21 +698,43 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]: return None msg = email.message_from_bytes(data[0][1]) - body = "" + # Prefer text/html (preserves href attributes for digest link extraction); + # fall back to text/plain if no HTML part exists. + html_body = "" + plain_body = "" if msg.is_multipart(): for part in msg.walk(): - if part.get_content_type() == "text/plain": + ct = part.get_content_type() + if ct == "text/html" and not html_body: try: - body = part.get_payload(decode=True).decode("utf-8", errors="replace") + html_body = part.get_payload(decode=True).decode("utf-8", errors="replace") + except Exception: + pass + elif ct == "text/plain" and not plain_body: + try: + plain_body = part.get_payload(decode=True).decode("utf-8", errors="replace") except Exception: pass - break else: + ct = msg.get_content_type() try: - body = msg.get_payload(decode=True).decode("utf-8", errors="replace") + raw = msg.get_payload(decode=True).decode("utf-8", errors="replace") + if ct == "text/html": + html_body = raw + else: + plain_body = raw except Exception: pass + if html_body: + # Strip … (CSS, meta, title) and any stray ", "", body, flags=re.I) + body = re.sub(r"", "", body, flags=re.I) + else: + body = plain_body + mid = msg.get("Message-ID", "").strip() if not mid: return None # No Message-ID → can't dedup; skip to avoid repeat inserts @@ -723,7 +745,7 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]: "from_addr": _decode_str(msg.get("From")), "to_addr": _decode_str(msg.get("To")), "date": _decode_str(msg.get("Date")), - "body": body[:4000], + "body": body, # no truncation — digest emails need full content } except Exception: return None diff --git a/tests/test_imap_sync.py b/tests/test_imap_sync.py index f9cc4e5..5bdc687 100644 --- a/tests/test_imap_sync.py +++ b/tests/test_imap_sync.py @@ -1024,8 +1024,8 @@ def test_sync_all_per_job_exception_continues(tmp_path): # ── Performance / edge cases ────────────────────────────────────────────────── -def test_parse_message_large_body_truncated(): - """Body longer than 4000 chars is silently truncated to 4000.""" +def test_parse_message_large_body_not_truncated(): + """Body longer than 4000 chars is stored in full (no truncation).""" from scripts.imap_sync import _parse_message big_body = ("x" * 10_000).encode() @@ -1037,7 +1037,7 @@ def test_parse_message_large_body_truncated(): conn.fetch.return_value = ("OK", [(b"1 (RFC822)", raw)]) result = _parse_message(conn, b"1") assert result is not None - assert len(result["body"]) <= 4000 + assert len(result["body"]) == 10_000 def test_parse_message_binary_attachment_no_crash():