diff --git a/app/imap_fetch.py b/app/imap_fetch.py index bcafb7b..1e15119 100644 --- a/app/imap_fetch.py +++ b/app/imap_fetch.py @@ -1,6 +1,6 @@ """Avocet — IMAP fetch utilities. -Shared between app/api.py (FastAPI SSE endpoint) and app/label_tool.py (Streamlit). +Shared between app/api.py (FastAPI SSE endpoint) and the label UI. No Streamlit imports here — stdlib + imaplib only. """ from __future__ import annotations @@ -8,36 +8,11 @@ from __future__ import annotations import email as _email_lib import hashlib import imaplib -import re from datetime import datetime, timedelta from email.header import decode_header as _raw_decode -from html.parser import HTMLParser from typing import Any, Iterator - -# ── HTML → plain text ──────────────────────────────────────────────────────── - -class _TextExtractor(HTMLParser): - def __init__(self): - super().__init__() - self._parts: list[str] = [] - - def handle_data(self, data: str) -> None: - stripped = data.strip() - if stripped: - self._parts.append(stripped) - - def get_text(self) -> str: - return " ".join(self._parts) - - -def strip_html(html_str: str) -> str: - try: - ex = _TextExtractor() - ex.feed(html_str) - return ex.get_text() - except Exception: - return re.sub(r"<[^>]+>", " ", html_str).strip() +from app.utils import extract_body, strip_html # noqa: F401 (strip_html re-exported for callers) # ── IMAP decode helpers ─────────────────────────────────────────────────────── @@ -55,37 +30,6 @@ def _decode_str(value: str | None) -> str: return " ".join(out).strip() -def _extract_body(msg: Any) -> str: - if msg.is_multipart(): - html_fallback: str | None = None - for part in msg.walk(): - ct = part.get_content_type() - if ct == "text/plain": - try: - charset = part.get_content_charset() or "utf-8" - return part.get_payload(decode=True).decode(charset, errors="replace") - except Exception: - pass - elif ct == "text/html" and html_fallback is None: - try: - charset = part.get_content_charset() or "utf-8" - raw = part.get_payload(decode=True).decode(charset, errors="replace") - html_fallback = strip_html(raw) - except Exception: - pass - return html_fallback or "" - else: - try: - charset = msg.get_content_charset() or "utf-8" - raw = msg.get_payload(decode=True).decode(charset, errors="replace") - if msg.get_content_type() == "text/html": - return strip_html(raw) - return raw - except Exception: - pass - return "" - - def entry_key(e: dict) -> str: """Stable MD5 content-hash for dedup — matches label_tool.py _entry_key.""" key = (e.get("subject", "") + (e.get("body", "") or "")[:100]) @@ -193,7 +137,7 @@ def fetch_account_stream( subj = _decode_str(msg.get("Subject", "")) from_addr = _decode_str(msg.get("From", "")) date = _decode_str(msg.get("Date", "")) - body = _extract_body(msg)[:800] + body = extract_body(msg)[:800] entry = {"subject": subj, "body": body, "from_addr": from_addr, "date": date, "account": name} k = entry_key(entry) diff --git a/app/utils.py b/app/utils.py index 3a250ea..a98088e 100644 --- a/app/utils.py +++ b/app/utils.py @@ -43,7 +43,7 @@ class _TextExtractor(HTMLParser): return "\n".join(ln for ln in lines if ln) -def _strip_html(html_str: str) -> str: +def strip_html(html_str: str) -> str: """Convert HTML email body to plain text. Pure stdlib, no dependencies.""" try: extractor = _TextExtractor() @@ -53,7 +53,7 @@ def _strip_html(html_str: str) -> str: return re.sub(r"<[^>]+>", " ", html_str).strip() -def _extract_body(msg: Any) -> str: +def extract_body(msg: Any) -> str: """Return plain-text body. Strips HTML when no text/plain part exists.""" if msg.is_multipart(): html_fallback: str | None = None @@ -69,7 +69,7 @@ def _extract_body(msg: Any) -> str: try: charset = part.get_content_charset() or "utf-8" raw = part.get_payload(decode=True).decode(charset, errors="replace") - html_fallback = _strip_html(raw) + html_fallback = strip_html(raw) except Exception: pass return html_fallback or "" @@ -78,7 +78,7 @@ def _extract_body(msg: Any) -> str: charset = msg.get_content_charset() or "utf-8" raw = msg.get_payload(decode=True).decode(charset, errors="replace") if msg.get_content_type() == "text/html": - return _strip_html(raw) + return strip_html(raw) return raw except Exception: pass diff --git a/tests/test_label_tool.py b/tests/test_label_tool.py index d5d05ec..01f6f4d 100644 --- a/tests/test_label_tool.py +++ b/tests/test_label_tool.py @@ -5,83 +5,83 @@ These functions are stdlib-only and safe to test without an IMAP connection. from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText -from app.utils import _extract_body, _strip_html +from app.utils import extract_body, strip_html -# ── _strip_html ────────────────────────────────────────────────────────────── +# ── strip_html ────────────────────────────────────────────────────────────── def test_strip_html_removes_tags(): - assert _strip_html("
Hello world
") == "Hello world" + assert strip_html("Hello world
") == "Hello world" def test_strip_html_skips_script_content(): - result = _strip_html("real
") + result = strip_html("real
") assert "doEvil" not in result assert "real" in result def test_strip_html_skips_style_content(): - result = _strip_html("visible
") + result = strip_html("visible
") assert ".foo" not in result assert "visible" in result def test_strip_html_handles_br_as_newline(): - result = _strip_html("line1Hello & welcome
") + result = strip_html("Hello & welcome
") assert "&" not in result assert "Hello" in result assert "welcome" in result def test_strip_html_empty_string(): - assert _strip_html("") == "" + assert strip_html("") == "" def test_strip_html_plain_text_passthrough(): - assert _strip_html("no tags here") == "no tags here" + assert strip_html("no tags here") == "no tags here" -# ── _extract_body ──────────────────────────────────────────────────────────── +# ── extract_body ──────────────────────────────────────────────────────────── def test_extract_body_prefers_plain_over_html(): msg = MIMEMultipart("alternative") msg.attach(MIMEText("plain body", "plain")) msg.attach(MIMEText("html body", "html")) - assert _extract_body(msg) == "plain body" + assert extract_body(msg) == "plain body" def test_extract_body_falls_back_to_html_when_no_plain(): msg = MIMEMultipart("alternative") msg.attach(MIMEText("HTML only email
", "html")) - result = _extract_body(msg) + result = extract_body(msg) assert "HTML only email" in result assert "<" not in result # no raw HTML tags leaked through def test_extract_body_non_multipart_html_stripped(): msg = MIMEText("Solo HTML
", "html") - result = _extract_body(msg) + result = extract_body(msg) assert "Solo HTML" in result assert "" not in result def test_extract_body_non_multipart_plain_unchanged(): msg = MIMEText("just plain text", "plain") - assert _extract_body(msg) == "just plain text" + assert extract_body(msg) == "just plain text" def test_extract_body_empty_message(): msg = MIMEText("", "plain") - assert _extract_body(msg) == "" + assert extract_body(msg) == "" def test_extract_body_multipart_empty_returns_empty(): msg = MIMEMultipart("alternative") - assert _extract_body(msg) == "" + assert extract_body(msg) == ""