fix(avocet): strip HTML from email bodies — stdlib HTMLParser, no deps

2026-03-03 16:28:18 -08:00 · 2026-03-03 16:28:18 -08:00 · 682a958c28
commit 682a958c28
parent 47973aeba6
2 changed files with 310 additions and 18 deletions
--- a/app/label_tool.py
+++ b/app/label_tool.py
@ -14,6 +14,7 @@ from __future__ import annotations
 import email as _email_lib
 import hashlib
 import html as _html
+from html.parser import HTMLParser
 import imaplib
 import json
 import re
@ -23,6 +24,9 @@ from email.header import decode_header as _raw_decode
 from pathlib import Path
 from typing import Any

+import os
+import subprocess
+
 import streamlit as st
 import yaml

@ -43,8 +47,9 @@ LABELS = [
    "survey_received",
    "neutral",
    "event_rescheduled",
-    "unrelated",
    "digest",
+    "new_lead",
+    "hired",
 ]

 _LABEL_META: dict[str, dict] = {
@ -55,8 +60,9 @@ _LABEL_META: dict[str, dict] = {
    "survey_received":     {"emoji": "📋", "color": "#9C27B0", "key": "5"},
    "neutral":             {"emoji": "⬜", "color": "#607D8B", "key": "6"},
    "event_rescheduled":   {"emoji": "🔄", "color": "#FF5722", "key": "7"},
-    "unrelated":           {"emoji": "🗑️", "color": "#757575", "key": "8"},
-    "digest":              {"emoji": "📰", "color": "#00BCD4", "key": "9"},
+    "digest":              {"emoji": "📰", "color": "#00BCD4", "key": "8"},
+    "new_lead":            {"emoji": "🤝", "color": "#009688", "key": "9"},
+    "hired":               {"emoji": "🎊", "color": "#FFC107", "key": "h"},
 }

 # ── HTML sanitiser ───────────────────────────────────────────────────────────
@ -78,7 +84,50 @@ def _to_html(text: str, newlines_to_br: bool = False) -> str:
    return escaped


-# ── Wide IMAP search terms (cast a net across all 9 categories) ─────────────
+# ── HTML → plain-text extractor ─────────────────────────────────────────────
+
+class _TextExtractor(HTMLParser):
+    """Extract visible text from an HTML email body, preserving line breaks.
+
+    Data inside ``script``/``style``/``head``/``noscript`` elements is
+    dropped. A line break is emitted at both the opening and the closing tag
+    of block-level elements, so text that follows a closed block
+    (``<div>a</div>b``) does not run into the text inside it.
+    """
+    # Tags that start or end a visual line.
+    _BLOCK = {"p","div","br","li","tr","h1","h2","h3","h4","h5","h6","blockquote"}
+    # Tags whose text content is never collected.
+    _SKIP  = {"script","style","head","noscript"}
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._parts: list[str] = []  # collected fragments, in document order
+        self._depth_skip = 0         # nesting depth inside _SKIP elements
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:
+        tag = tag.lower()
+        if tag in self._SKIP:
+            self._depth_skip += 1
+        elif tag in self._BLOCK:
+            self._parts.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        tag = tag.lower()
+        if tag in self._SKIP:
+            # Clamp at zero so a stray closing tag cannot drive the depth negative.
+            self._depth_skip = max(0, self._depth_skip - 1)
+        elif tag in self._BLOCK:
+            # Closing a block ends the line as well; get_text() removes the
+            # blank lines this can produce.
+            self._parts.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        if not self._depth_skip:
+            self._parts.append(data)
+
+    def get_text(self) -> str:
+        """Return the collected text with every line stripped and blank lines removed."""
+        text = "".join(self._parts)
+        lines = [ln.strip() for ln in text.splitlines()]
+        return "\n".join(ln for ln in lines if ln)
+
+
+def _strip_html(html_str: str) -> str:
+    """Convert HTML email body to plain text. Pure stdlib, no dependencies.
+
+    Falls back to a crude regex tag removal if the parser raises.
+    """
+    try:
+        extractor = _TextExtractor()
+        extractor.feed(html_str)
+        # feed() may hold back trailing data (for example text ending in an
+        # unterminated "&..." reference such as "AT&T"); close() flushes it.
+        extractor.close()
+        return extractor.get_text()
+    except Exception:
+        return re.sub(r"<[^>]+>", " ", html_str).strip()
+
+
+# ── Wide IMAP search terms (cast a net across all 10 categories) ────────────
 _WIDE_TERMS = [
    # interview_scheduled
    "interview", "phone screen", "video call", "zoom link", "schedule a call",
@ -100,6 +149,11 @@ _WIDE_TERMS = [
    # digest
    "job digest", "jobs you may like", "recommended jobs", "jobs for you",
    "new jobs", "job alert",
+    # new_lead
+    "came across your profile", "reaching out about", "great fit for a role",
+    "exciting opportunity", "love to connect",
+    # hired / onboarding
+    "welcome to the team", "start date", "onboarding", "first day", "we're excited to have you",
    # general recruitment
    "application", "recruiter", "recruiting", "hiring", "candidate",
 ]
@ -121,18 +175,32 @@ def _decode_str(value: str | None) -> str:


 def _extract_body(msg: Any) -> str:
+    """Return plain-text body. Strips HTML when no text/plain part exists.
+
+    For a multipart message the first ``text/plain`` part that decodes is
+    returned unchanged; failing that, the first ``text/html`` part that
+    decodes is returned after HTML stripping. A non-multipart message is
+    decoded directly and stripped only when its content type is
+    ``text/html``. Parts that fail to decode are skipped, and ``""`` is
+    returned when nothing usable is found.
+    """
     if msg.is_multipart():
+        html_fallback: str | None = None
         for part in msg.walk():
-            if part.get_content_type() == "text/plain":
+            ct = part.get_content_type()
+            if ct == "text/plain":
                 try:
                     charset = part.get_content_charset() or "utf-8"
                     return part.get_payload(decode=True).decode(charset, errors="replace")
                 except Exception:
+                    # Undecodable part: keep walking.
                     pass
+            # Only the first HTML part is kept, and only as a fallback.
+            elif ct == "text/html" and html_fallback is None:
+                try:
+                    charset = part.get_content_charset() or "utf-8"
+                    raw = part.get_payload(decode=True).decode(charset, errors="replace")
+                    html_fallback = _strip_html(raw)
+                except Exception:
+                    pass
+        # No text/plain part was returned: use the stripped HTML, or "" if none.
+        return html_fallback or ""
     else:
         try:
             charset = msg.get_content_charset() or "utf-8"
-            return msg.get_payload(decode=True).decode(charset, errors="replace")
+            raw = msg.get_payload(decode=True).decode(charset, errors="replace")
+            if msg.get_content_type() == "text/html":
+                return _strip_html(raw)
+            return raw
         except Exception:
             pass
     return ""
@ -436,7 +504,9 @@ with st.sidebar:

 # ── Tabs ─────────────────────────────────────────────────────────────────────

-tab_label, tab_fetch, tab_stats, tab_settings = st.tabs(["🃏 Label", "📥 Fetch", "📊 Stats", "⚙️ Settings"])
+tab_label, tab_fetch, tab_stats, tab_settings, tab_benchmark = st.tabs(
+    ["🃏 Label", "📥 Fetch", "📊 Stats", "⚙️ Settings", "🔬 Benchmark"]
+)


 # ══════════════════════════════════════════════════════════════════════════════
@ -669,19 +739,19 @@ with tab_label:
            _lbl_r = _r.get("label", "")
            _counts[_lbl_r] = _counts.get(_lbl_r, 0) + 1

-        row1_cols = st.columns(3)
-        row2_cols = st.columns(3)
-        row3_cols = st.columns(3)
+        row1_cols = st.columns(5)
+        row2_cols = st.columns(5)
        bucket_pairs = [
            (row1_cols[0], "interview_scheduled"),
            (row1_cols[1], "offer_received"),
            (row1_cols[2], "rejected"),
-            (row2_cols[0], "positive_response"),
-            (row2_cols[1], "survey_received"),
-            (row2_cols[2], "neutral"),
-            (row3_cols[0], "event_rescheduled"),
-            (row3_cols[1], "unrelated"),
-            (row3_cols[2], "digest"),
+            (row1_cols[3], "positive_response"),
+            (row1_cols[4], "survey_received"),
+            (row2_cols[0], "neutral"),
+            (row2_cols[1], "event_rescheduled"),
+            (row2_cols[2], "digest"),
+            (row2_cols[3], "new_lead"),
+            (row2_cols[4], "hired"),
        ]
        for col, lbl in bucket_pairs:
            m = _LABEL_META[lbl]
@ -720,7 +790,7 @@ with tab_label:
        nav_cols = st.columns([2, 1, 1, 1])

        remaining = len(unlabeled) - 1
-        nav_cols[0].caption(f"**{remaining}** remaining  ·  Keys: 1–9 = label, 0 = other, S = skip, U = undo")
+        nav_cols[0].caption(f"**{remaining}** remaining  ·  Keys: 1–9, H = label, 0 = other, S = skip, U = undo")

        if nav_cols[1].button("↩ Undo", disabled=not st.session_state.history, use_container_width=True):
            prev_idx, prev_label = st.session_state.history.pop()
@ -757,7 +827,7 @@ document.addEventListener('keydown', function(e) {
    const keyToLabel = {
        '1':'interview_scheduled','2':'offer_received','3':'rejected',
        '4':'positive_response','5':'survey_received','6':'neutral',
-        '7':'event_rescheduled','8':'unrelated','9':'digest'
+        '7':'event_rescheduled','8':'digest','9':'new_lead'
    };
    const label = keyToLabel[e.key];
    if (label) {
@ -772,6 +842,11 @@ document.addEventListener('keydown', function(e) {
        for (const btn of btns) {
            if (btn.innerText.includes('Other')) { btn.click(); break; }
        }
+    } else if (e.key.toLowerCase() === 'h') {
+        const btns = window.parent.document.querySelectorAll('button');
+        for (const btn of btns) {
+            if (btn.innerText.toLowerCase().includes('hired')) { btn.click(); break; }
+        }
    } else if (e.key.toLowerCase() === 's') {
        const btns = window.parent.document.querySelectorAll('button');
        for (const btn of btns) {
@ -979,3 +1054,133 @@ with tab_settings:
            if _k in ("settings_accounts", "settings_max") or _k.startswith("s_"):
                del st.session_state[_k]
        st.rerun()
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# BENCHMARK TAB
+# ══════════════════════════════════════════════════════════════════════════════
+
+with tab_benchmark:
+    # ── Model selection ───────────────────────────────────────────────────────
+    # Models offered by default.
+    _DEFAULT_MODELS = [
+        "deberta-zeroshot", "deberta-small", "gliclass-large",
+        "bart-mnli", "bge-m3-zeroshot", "deberta-small-2pass", "deberta-base-anli",
+    ]
+    # Models offered only when the "Include slow / large models" box is ticked.
+    _SLOW_MODELS = [
+        "deberta-large-ling", "mdeberta-xnli-2m", "bge-reranker",
+        "deberta-xlarge", "mdeberta-mnli", "xlm-roberta-anli",
+    ]
+
+    st.subheader("🔬 Benchmark Classifier Models")
+
+    _b_include_slow = st.checkbox("Include slow / large models", value=False, key="b_include_slow")
+    _b_all_models = _DEFAULT_MODELS + (_SLOW_MODELS if _b_include_slow else [])
+    # NOTE(review): the help text mentions --include-slow, but the command
+    # built for the Run Benchmark button never passes that flag — confirm
+    # against scripts/benchmark_classifier.py.
+    _b_selected = st.multiselect(
+        "Models to run",
+        options=_b_all_models,
+        default=_b_all_models,
+        help="Uncheck models to skip them. Slow models require --include-slow.",
+    )
+
+    _n_examples = len(st.session_state.labeled)
+    # The estimate shown is 1–2 minutes per selected model, floored at 1–2 minutes.
+    st.caption(
+        f"Scoring against `{_SCORE_FILE.name}` · **{_n_examples} labeled examples**"
+        f"  ·  Est. time: ~{max(1, len(_b_selected))} – {max(2, len(_b_selected) * 2)} min"
+    )
+
+    # Direct binary avoids conda's output interception; -u = unbuffered stdout
+    # NOTE(review): absolute, machine-specific path — presumably only valid on
+    # the author's workstation; confirm before deploying elsewhere.
+    _CLASSIFIER_PYTHON = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python"
+
+    if st.button("▶ Run Benchmark", type="primary", disabled=not _b_selected, key="b_run"):
+        # Run the benchmark script with the classifier interpreter and stream
+        # its combined stdout/stderr into the page while it runs.
+        _b_cmd = [
+            _CLASSIFIER_PYTHON, "-u",
+            str(_ROOT / "scripts" / "benchmark_classifier.py"),
+            "--score", "--score-file", str(_SCORE_FILE),
+            "--models", *_b_selected,
+        ]
+        with st.status("Running benchmark…", expanded=True) as _b_status:
+            _b_lines: list[str] = []
+            _b_area = st.empty()
+            try:
+                # The context manager closes the stdout pipe and waits for the
+                # child even if rendering raises part-way through.
+                with subprocess.Popen(
+                    _b_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                    text=True, cwd=str(_ROOT),
+                    env={**os.environ, "PYTHONUNBUFFERED": "1"},
+                ) as _b_proc:
+                    for _b_line in _b_proc.stdout:
+                        _b_lines.append(_b_line)
+                        # Only the 30 most recent lines are shown while running.
+                        _b_area.code("".join(_b_lines[-30:]), language="text")
+                _b_rc = _b_proc.returncode
+            except OSError as _b_err:
+                # Interpreter or working directory missing: report it in the
+                # output instead of crashing the page.
+                _b_lines.append(f"Could not start benchmark: {_b_err}\n")
+                _b_area.code("".join(_b_lines[-30:]), language="text")
+                _b_rc = -1
+            _b_full = "".join(_b_lines)
+            st.session_state["bench_output"] = _b_full
+            if _b_rc == 0:
+                _b_status.update(label="Benchmark complete ✓", state="complete", expanded=False)
+            else:
+                _b_status.update(label="Benchmark failed", state="error")
+
+    # ── Results display ───────────────────────────────────────────────────────
+    if "bench_output" in st.session_state:
+        _b_out = st.session_state["bench_output"]
+
+        # Parse summary table rows: name  f1  accuracy  ms
+        _b_row_re = re.compile(r"^([\w-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$")
+        _b_rows = []
+        for _b_l in _b_out.splitlines():
+            _b_m = _b_row_re.match(_b_l.strip())
+            if not _b_m:
+                continue
+            try:
+                _b_f1 = float(_b_m.group(2))
+                _b_acc = float(_b_m.group(3))
+                _b_ms = float(_b_m.group(4))
+            except ValueError:
+                # The pattern also admits tokens such as "2.1.0"; a log line
+                # shaped like a row but not numeric is skipped, not fatal.
+                continue
+            _b_rows.append({
+                "Model": _b_m.group(1),
+                "macro-F1": _b_f1,
+                "Accuracy": _b_acc,
+                "ms/email": _b_ms,
+            })
+
+        if _b_rows:
+            import pandas as _pd
+            # Best macro-F1 first.
+            _b_df = _pd.DataFrame(_b_rows).sort_values("macro-F1", ascending=False).reset_index(drop=True)
+            st.dataframe(
+                _b_df,
+                column_config={
+                    "macro-F1": st.column_config.ProgressColumn(
+                        "macro-F1", min_value=0, max_value=1, format="%.3f",
+                    ),
+                    "Accuracy": st.column_config.ProgressColumn(
+                        "Accuracy", min_value=0, max_value=1, format="%.3f",
+                    ),
+                    "ms/email": st.column_config.NumberColumn("ms/email", format="%.1f"),
+                },
+                use_container_width=True, hide_index=True,
+            )
+
+        with st.expander("Full benchmark output"):
+            st.code(_b_out, language="text")
+
+    st.divider()
+
+    # ── Tests ─────────────────────────────────────────────────────────────────
+    st.subheader("🧪 Run Tests")
+    st.caption("Runs `pytest tests/ -v` in the job-seeker env (no model downloads required).")
+
+    if st.button("▶ Run Tests", key="b_run_tests"):
+        # Run pytest from the project root and stream its output into the page.
+        _t_cmd = [
+            "/devl/miniconda3/envs/job-seeker/bin/pytest", "tests/", "-v", "--tb=short",
+        ]
+        with st.status("Running tests…", expanded=True) as _t_status:
+            _t_lines: list[str] = []
+            _t_area = st.empty()
+            try:
+                # The context manager closes the stdout pipe and waits for the
+                # child even if rendering raises part-way through.
+                with subprocess.Popen(
+                    _t_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                    text=True, cwd=str(_ROOT),
+                ) as _t_proc:
+                    for _t_line in _t_proc.stdout:
+                        _t_lines.append(_t_line)
+                        # Only the 30 most recent lines are shown while running.
+                        _t_area.code("".join(_t_lines[-30:]), language="text")
+                _t_rc = _t_proc.returncode
+            except OSError as _t_err:
+                # pytest binary or working directory missing: surface it as
+                # the status label instead of crashing the page.
+                _t_lines.append(f"Error starting pytest: {_t_err}\n")
+                _t_area.code("".join(_t_lines[-30:]), language="text")
+                _t_rc = -1
+            _t_full = "".join(_t_lines)
+            st.session_state["test_output"] = _t_full
+            # The last line mentioning a result keyword becomes the status label.
+            _t_summary = [ln for ln in _t_lines if "passed" in ln or "failed" in ln or "error" in ln.lower()]
+            _t_label = _t_summary[-1].strip() if _t_summary else "Done"
+            _t_state = "error" if _t_rc != 0 else "complete"
+            _t_status.update(label=_t_label, state=_t_state, expanded=False)
+
+    if "test_output" in st.session_state:
+        with st.expander("Full test output", expanded=True):
+            st.code(st.session_state["test_output"], language="text")
--- a/tests/test_label_tool.py
+++ b/tests/test_label_tool.py
@ -0,0 +1,87 @@
+"""Tests for label_tool HTML extraction utilities.
+
+These functions are stdlib-only and safe to test without an IMAP connection.
+"""
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+
+from app.label_tool import _extract_body, _strip_html
+
+
+# ── _strip_html ──────────────────────────────────────────────────────────────
+
+def test_strip_html_removes_tags():
+    assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"
+
+
+def test_strip_html_skips_script_content():
+    result = _strip_html("<script>doEvil()</script><p>real</p>")
+    assert "doEvil" not in result
+    assert "real" in result
+
+
+def test_strip_html_skips_style_content():
+    result = _strip_html("<style>.foo{color:red}</style><p>visible</p>")
+    assert ".foo" not in result
+    assert "visible" in result
+
+
+def test_strip_html_handles_br_as_newline():
+    # <br> must yield an actual line break, not merely keep both fragments.
+    result = _strip_html("line1<br>line2")
+    assert result == "line1\nline2"
+
+
+def test_strip_html_decodes_entities():
+    # convert_charrefs=True on HTMLParser handles &amp; etc.
+    # Pin the exact output so a dropped or doubly-escaped "&" is caught.
+    result = _strip_html("<p>Hello &amp; welcome</p>")
+    assert "&amp;" not in result
+    assert result == "Hello & welcome"
+
+
+def test_strip_html_empty_string():
+    assert _strip_html("") == ""
+
+
+def test_strip_html_plain_text_passthrough():
+    assert _strip_html("no tags here") == "no tags here"
+
+
+# ── _extract_body ────────────────────────────────────────────────────────────
+
+def test_extract_body_prefers_plain_over_html():
+    msg = MIMEMultipart("alternative")
+    msg.attach(MIMEText("plain body", "plain"))
+    msg.attach(MIMEText("<html><body>html body</body></html>", "html"))
+    assert _extract_body(msg) == "plain body"
+
+
+def test_extract_body_falls_back_to_html_when_no_plain():
+    msg = MIMEMultipart("alternative")
+    msg.attach(MIMEText("<html><body><p>HTML only email</p></body></html>", "html"))
+    result = _extract_body(msg)
+    assert "HTML only email" in result
+    assert "<" not in result  # no raw HTML tags leaked through
+
+
+def test_extract_body_non_multipart_html_stripped():
+    msg = MIMEText("<html><body><p>Solo HTML</p></body></html>", "html")
+    result = _extract_body(msg)
+    assert "Solo HTML" in result
+    assert "<html>" not in result
+
+
+def test_extract_body_non_multipart_plain_unchanged():
+    msg = MIMEText("just plain text", "plain")
+    assert _extract_body(msg) == "just plain text"
+
+
+def test_extract_body_empty_message():
+    msg = MIMEText("", "plain")
+    assert _extract_body(msg) == ""
+
+
+def test_extract_body_multipart_empty_returns_empty():
+    msg = MIMEMultipart("alternative")
+    assert _extract_body(msg) == ""