# avocet/app/utils.py

"""Shared email utility functions for Avocet.

Pure-stdlib helpers extracted from the retired label_tool.py Streamlit app.
These are reused by the FastAPI backend and the test suite.
"""
from __future__ import annotations

import re
from html.parser import HTMLParser
from typing import Any


# ── HTML → plain-text extractor ──────────────────────────────────────────────

class _TextExtractor(HTMLParser):
    """Collect the visible text of an HTML email body, keeping block-level line breaks.

    Usage: ``feed()`` the markup, ``close()`` the parser so it flushes any
    text it is still buffering, then read the result with ``get_text()``.
    """

    # Tags that begin and end their own line; a break is emitted on both edges.
    _BLOCK = frozenset(
        {"p", "div", "br", "li", "tr", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote"}
    )
    # Tags whose text content is never shown to the reader.
    _SKIP = frozenset({"script", "style", "head", "noscript"})

    def __init__(self) -> None:
        # convert_charrefs=True makes the parser hand already-unescaped text
        # (e.g. "&amp;" -> "&") to handle_data().
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []
        # Nesting depth of currently open _SKIP tags; text is kept only at 0.
        self._depth_skip = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Open a skipped region or start a new line for a block-level tag."""
        tag = tag.lower()
        if tag in self._SKIP:
            self._depth_skip += 1
        elif tag in self._BLOCK:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Close a skipped region or end the line of a block-level tag."""
        tag = tag.lower()
        if tag in self._SKIP:
            # Clamp at zero so a stray closing tag cannot push the counter negative.
            self._depth_skip = max(0, self._depth_skip - 1)
        elif tag in self._BLOCK:
            # Text that follows a closed block belongs on its own line; without
            # this break "<p>Hello</p>World" would come out as "HelloWorld".
            # Surplus breaks are harmless because get_text() drops blank lines.
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Keep text unless it sits inside a skipped tag."""
        if not self._depth_skip:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return the collected text, one stripped non-empty line per row."""
        text = "".join(self._parts)
        stripped = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in stripped if line)

def strip_html(html_str: str) -> str:
    """Convert an HTML email body to plain text. Pure stdlib, no dependencies.

    Args:
        html_str: HTML markup to convert.

    Returns:
        The visible text as produced by ``_TextExtractor``. Empty input
        yields an empty string. If the parser raises, a crude tag-stripping
        regex is used instead so that some text is still returned.
    """
    if not html_str:
        return ""
    try:
        extractor = _TextExtractor()
        extractor.feed(html_str)
        # feed() alone may hold back trailing text (for example when an
        # unterminated "&" sits near the end, as in "AT&T"), waiting for
        # more input. close() forces that buffered tail to be processed.
        extractor.close()
        return extractor.get_text()
    except Exception:
        # Deliberate best-effort: never fail on malformed markup, just drop
        # anything that looks like a tag.
        return re.sub(r"<[^>]+>", " ", html_str).strip()


def _decode_payload(part: Any) -> str | None:
    """Return the transfer-decoded payload of *part* as text, or None if unreadable."""
    try:
        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing to decode for this part.
            return None
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is unknown to Python or is not a text
            # encoding. Decode as UTF-8 rather than discarding the body.
            return payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best-effort: an unreadable part is reported as missing
        # so the caller can try another part or fall back to "".
        return None


def extract_body(msg: Any) -> str:
    """Return plain-text body. Strips HTML when no text/plain part exists.

    Args:
        msg: A message object exposing ``is_multipart()``, ``walk()``,
            ``get_content_type()``, ``get_content_charset()`` and
            ``get_payload(decode=True)``.

    Returns:
        For multipart messages, the first ``text/plain`` part that decodes;
        otherwise the first decodable ``text/html`` part converted with
        ``strip_html``. For single-part messages, the decoded payload
        (HTML-stripped when the content type is ``text/html``). An empty
        string when nothing can be decoded.
    """
    if not msg.is_multipart():
        text = _decode_payload(msg)
        if text is None:
            return ""
        if msg.get_content_type() == "text/html":
            return strip_html(text)
        return text

    html_fallback: str | None = None
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            text = _decode_payload(part)
            if text is not None:
                # Plain text wins as soon as one part decodes.
                return text
        elif content_type == "text/html" and html_fallback is None:
            html = _decode_payload(part)
            if html is not None:
                html_fallback = strip_html(html)
    return html_fallback or ""