diff --git a/scripts/imap_sync.py b/scripts/imap_sync.py index 220a54f..e900aed 100644 --- a/scripts/imap_sync.py +++ b/scripts/imap_sync.py @@ -698,21 +698,43 @@ def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]: return None msg = email.message_from_bytes(data[0][1]) - body = "" + # Prefer text/html (preserves href attributes for digest link extraction); + # fall back to text/plain if no HTML part exists. + html_body = "" + plain_body = "" if msg.is_multipart(): for part in msg.walk(): - if part.get_content_type() == "text/plain": + ct = part.get_content_type() + if ct == "text/html" and not html_body: try: - body = part.get_payload(decode=True).decode("utf-8", errors="replace") + html_body = part.get_payload(decode=True).decode("utf-8", errors="replace") + except Exception: + pass + elif ct == "text/plain" and not plain_body: + try: + plain_body = part.get_payload(decode=True).decode("utf-8", errors="replace") except Exception: pass - break else: + ct = msg.get_content_type() try: - body = msg.get_payload(decode=True).decode("utf-8", errors="replace") + raw = msg.get_payload(decode=True).decode("utf-8", errors="replace") + if ct == "text/html": + html_body = raw + else: + plain_body = raw except Exception: pass + if html_body: + # Strip
… (CSS, meta, title) and any stray ", "", body, flags=re.I) + body = re.sub(r"