diff --git a/app/label_tool.py b/app/label_tool.py index 1340824..c86d09b 100644 --- a/app/label_tool.py +++ b/app/label_tool.py @@ -14,6 +14,7 @@ from __future__ import annotations import email as _email_lib import hashlib import html as _html +from html.parser import HTMLParser import imaplib import json import re @@ -23,6 +24,9 @@ from email.header import decode_header as _raw_decode from pathlib import Path from typing import Any +import os +import subprocess + import streamlit as st import yaml @@ -43,8 +47,9 @@ LABELS = [ "survey_received", "neutral", "event_rescheduled", - "unrelated", "digest", + "new_lead", + "hired", ] _LABEL_META: dict[str, dict] = { @@ -55,8 +60,9 @@ _LABEL_META: dict[str, dict] = { "survey_received": {"emoji": "π", "color": "#9C27B0", "key": "5"}, "neutral": {"emoji": "β¬", "color": "#607D8B", "key": "6"}, "event_rescheduled": {"emoji": "π", "color": "#FF5722", "key": "7"}, - "unrelated": {"emoji": "ποΈ", "color": "#757575", "key": "8"}, - "digest": {"emoji": "π°", "color": "#00BCD4", "key": "9"}, + "digest": {"emoji": "π°", "color": "#00BCD4", "key": "8"}, + "new_lead": {"emoji": "π€", "color": "#009688", "key": "9"}, + "hired": {"emoji": "π", "color": "#FFC107", "key": "h"}, } # ββ HTML sanitiser βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @@ -78,7 +84,50 @@ def _to_html(text: str, newlines_to_br: bool = False) -> str: return escaped -# ββ Wide IMAP search terms (cast a net across all 9 categories) βββββββββββββ +# ββ HTML β plain-text extractor βββββββββββββββββββββββββββββββββββββββββββββ + +class _TextExtractor(HTMLParser): + """Extract visible text from an HTML email body, preserving line breaks.""" + _BLOCK = {"p","div","br","li","tr","h1","h2","h3","h4","h5","h6","blockquote"} + _SKIP = {"script","style","head","noscript"} + + def __init__(self): + super().__init__(convert_charrefs=True) + self._parts: list[str] = [] + self._depth_skip = 0 + + def handle_starttag(self, tag, attrs): + tag = tag.lower() + if tag in self._SKIP: + self._depth_skip += 1 + elif tag in self._BLOCK: + self._parts.append("\n") + + def handle_endtag(self, tag): + if tag.lower() in self._SKIP: + self._depth_skip = max(0, self._depth_skip - 1) + + def handle_data(self, data): + if not self._depth_skip: + self._parts.append(data) + + def get_text(self) -> str: + text = "".join(self._parts) + lines = [ln.strip() for ln in text.splitlines()] + return "\n".join(ln for ln in lines if ln) + + +def _strip_html(html_str: str) -> str: + """Convert HTML email body to plain text. Pure stdlib, no dependencies.""" + try: + extractor = _TextExtractor() + extractor.feed(html_str) + return extractor.get_text() + except Exception: + return re.sub(r"<[^>]+>", " ", html_str).strip() + + +# ββ Wide IMAP search terms (cast a net across all 10 categories) ββββββββββββ _WIDE_TERMS = [ # interview_scheduled "interview", "phone screen", "video call", "zoom link", "schedule a call", @@ -100,6 +149,11 @@ _WIDE_TERMS = [ # digest "job digest", "jobs you may like", "recommended jobs", "jobs for you", "new jobs", "job alert", + # new_lead + "came across your profile", "reaching out about", "great fit for a role", + "exciting opportunity", "love to connect", + # hired / onboarding + "welcome to the team", "start date", "onboarding", "first day", "we're excited to have you", # general recruitment "application", "recruiter", "recruiting", "hiring", "candidate", ] @@ -121,18 +175,32 @@ def _decode_str(value: str | None) -> str: def _extract_body(msg: Any) -> str: + """Return plain-text body. Strips HTML when no text/plain part exists.""" if msg.is_multipart(): + html_fallback: str | None = None for part in msg.walk(): - if part.get_content_type() == "text/plain": + ct = part.get_content_type() + if ct == "text/plain": try: charset = part.get_content_charset() or "utf-8" return part.get_payload(decode=True).decode(charset, errors="replace") except Exception: pass + elif ct == "text/html" and html_fallback is None: + try: + charset = part.get_content_charset() or "utf-8" + raw = part.get_payload(decode=True).decode(charset, errors="replace") + html_fallback = _strip_html(raw) + except Exception: + pass + return html_fallback or "" else: try: charset = msg.get_content_charset() or "utf-8" - return msg.get_payload(decode=True).decode(charset, errors="replace") + raw = msg.get_payload(decode=True).decode(charset, errors="replace") + if msg.get_content_type() == "text/html": + return _strip_html(raw) + return raw except Exception: pass return "" @@ -436,7 +504,9 @@ with st.sidebar: # ββ Tabs βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -tab_label, tab_fetch, tab_stats, tab_settings = st.tabs(["π Label", "π₯ Fetch", "π Stats", "βοΈ Settings"]) +tab_label, tab_fetch, tab_stats, tab_settings, tab_benchmark = st.tabs( + ["π Label", "π₯ Fetch", "π Stats", "βοΈ Settings", "π¬ Benchmark"] +) # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @@ -669,19 +739,19 @@ with tab_label: _lbl_r = _r.get("label", "") _counts[_lbl_r] = _counts.get(_lbl_r, 0) + 1 - row1_cols = st.columns(3) - row2_cols = st.columns(3) - row3_cols = st.columns(3) + row1_cols = st.columns(5) + row2_cols = st.columns(5) bucket_pairs = [ (row1_cols[0], "interview_scheduled"), (row1_cols[1], "offer_received"), (row1_cols[2], "rejected"), - (row2_cols[0], "positive_response"), - (row2_cols[1], "survey_received"), - (row2_cols[2], "neutral"), - (row3_cols[0], "event_rescheduled"), - (row3_cols[1], "unrelated"), - (row3_cols[2], "digest"), + (row1_cols[3], "positive_response"), + (row1_cols[4], "survey_received"), + (row2_cols[0], "neutral"), + (row2_cols[1], "event_rescheduled"), + (row2_cols[2], "digest"), + (row2_cols[3], "new_lead"), + (row2_cols[4], "hired"), ] for col, lbl in bucket_pairs: m = _LABEL_META[lbl] @@ -720,7 +790,7 @@ with tab_label: nav_cols = st.columns([2, 1, 1, 1]) remaining = len(unlabeled) - 1 - nav_cols[0].caption(f"**{remaining}** remaining Β· Keys: 1β9 = label, 0 = other, S = skip, U = undo") + nav_cols[0].caption(f"**{remaining}** remaining Β· Keys: 1β9, H = label, 0 = other, S = skip, U = undo") if nav_cols[1].button("β© Undo", disabled=not st.session_state.history, use_container_width=True): prev_idx, prev_label = st.session_state.history.pop() @@ -757,7 +827,7 @@ document.addEventListener('keydown', function(e) { const keyToLabel = { '1':'interview_scheduled','2':'offer_received','3':'rejected', '4':'positive_response','5':'survey_received','6':'neutral', - '7':'event_rescheduled','8':'unrelated','9':'digest' + '7':'event_rescheduled','8':'digest','9':'new_lead' }; const label = keyToLabel[e.key]; if (label) { @@ -772,6 +842,11 @@ document.addEventListener('keydown', function(e) { for (const btn of btns) { if (btn.innerText.includes('Other')) { btn.click(); break; } } + } else if (e.key.toLowerCase() === 'h') { + const btns = window.parent.document.querySelectorAll('button'); + for (const btn of btns) { + if (btn.innerText.toLowerCase().includes('hired')) { btn.click(); break; } + } } else if (e.key.toLowerCase() === 's') { const btns = window.parent.document.querySelectorAll('button'); for (const btn of btns) { @@ -979,3 +1054,133 @@ with tab_settings: if _k in ("settings_accounts", "settings_max") or _k.startswith("s_"): del st.session_state[_k] st.rerun() + + +# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# BENCHMARK TAB +# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +with tab_benchmark: + # ββ Model selection βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + _DEFAULT_MODELS = [ + "deberta-zeroshot", "deberta-small", "gliclass-large", + "bart-mnli", "bge-m3-zeroshot", "deberta-small-2pass", "deberta-base-anli", + ] + _SLOW_MODELS = [ + "deberta-large-ling", "mdeberta-xnli-2m", "bge-reranker", + "deberta-xlarge", "mdeberta-mnli", "xlm-roberta-anli", + ] + + st.subheader("π¬ Benchmark Classifier Models") + + _b_include_slow = st.checkbox("Include slow / large models", value=False, key="b_include_slow") + _b_all_models = _DEFAULT_MODELS + (_SLOW_MODELS if _b_include_slow else []) + _b_selected = st.multiselect( + "Models to run", + options=_b_all_models, + default=_b_all_models, + help="Uncheck models to skip them. Slow models require --include-slow.", + ) + + _n_examples = len(st.session_state.labeled) + st.caption( + f"Scoring against `{_SCORE_FILE.name}` Β· **{_n_examples} labeled examples**" + f" Β· Est. time: ~{max(1, len(_b_selected))} β {max(2, len(_b_selected) * 2)} min" + ) + + # Direct binary avoids conda's output interception; -u = unbuffered stdout + _CLASSIFIER_PYTHON = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python" + + if st.button("βΆ Run Benchmark", type="primary", disabled=not _b_selected, key="b_run"): + _b_cmd = [ + _CLASSIFIER_PYTHON, "-u", + str(_ROOT / "scripts" / "benchmark_classifier.py"), + "--score", "--score-file", str(_SCORE_FILE), + "--models", *_b_selected, + ] + with st.status("Running benchmarkβ¦", expanded=True) as _b_status: + _b_proc = subprocess.Popen( + _b_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + text=True, cwd=str(_ROOT), + env={**os.environ, "PYTHONUNBUFFERED": "1"}, + ) + _b_lines: list[str] = [] + _b_area = st.empty() + for _b_line in _b_proc.stdout: + _b_lines.append(_b_line) + _b_area.code("".join(_b_lines[-30:]), language="text") + _b_proc.wait() + _b_full = "".join(_b_lines) + st.session_state["bench_output"] = _b_full + if _b_proc.returncode == 0: + _b_status.update(label="Benchmark complete β", state="complete", expanded=False) + else: + _b_status.update(label="Benchmark failed", state="error") + + # ββ Results display βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + if "bench_output" in st.session_state: + _b_out = st.session_state["bench_output"] + + # Parse summary table rows: name f1 accuracy ms + _b_rows = [] + for _b_l in _b_out.splitlines(): + _b_m = re.match(r"^([\w-]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$", _b_l.strip()) + if _b_m: + _b_rows.append({ + "Model": _b_m.group(1), + "macro-F1": float(_b_m.group(2)), + "Accuracy": float(_b_m.group(3)), + "ms/email": float(_b_m.group(4)), + }) + + if _b_rows: + import pandas as _pd + _b_df = _pd.DataFrame(_b_rows).sort_values("macro-F1", ascending=False).reset_index(drop=True) + st.dataframe( + _b_df, + column_config={ + "macro-F1": st.column_config.ProgressColumn( + "macro-F1", min_value=0, max_value=1, format="%.3f", + ), + "Accuracy": st.column_config.ProgressColumn( + "Accuracy", min_value=0, max_value=1, format="%.3f", + ), + "ms/email": st.column_config.NumberColumn("ms/email", format="%.1f"), + }, + use_container_width=True, hide_index=True, + ) + + with st.expander("Full benchmark output"): + st.code(_b_out, language="text") + + st.divider() + + # ββ Tests βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + st.subheader("π§ͺ Run Tests") + st.caption("Runs `pytest tests/ -v` in the job-seeker env (no model downloads required).") + + if st.button("βΆ Run Tests", key="b_run_tests"): + _t_cmd = [ + "/devl/miniconda3/envs/job-seeker/bin/pytest", "tests/", "-v", "--tb=short", + ] + with st.status("Running testsβ¦", expanded=True) as _t_status: + _t_proc = subprocess.Popen( + _t_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + text=True, cwd=str(_ROOT), + ) + _t_lines: list[str] = [] + _t_area = st.empty() + for _t_line in _t_proc.stdout: + _t_lines.append(_t_line) + _t_area.code("".join(_t_lines[-30:]), language="text") + _t_proc.wait() + _t_full = "".join(_t_lines) + st.session_state["test_output"] = _t_full + _t_summary = [l for l in _t_lines if "passed" in l or "failed" in l or "error" in l.lower()] + _t_label = _t_summary[-1].strip() if _t_summary else "Done" + _t_state = "error" if _t_proc.returncode != 0 else "complete" + _t_status.update(label=_t_label, state=_t_state, expanded=False) + + if "test_output" in st.session_state: + with st.expander("Full test output", expanded=True): + st.code(st.session_state["test_output"], language="text") diff --git a/tests/test_label_tool.py b/tests/test_label_tool.py new file mode 100644 index 0000000..7e5d257 --- /dev/null +++ b/tests/test_label_tool.py @@ -0,0 +1,87 @@ +"""Tests for label_tool HTML extraction utilities. + +These functions are stdlib-only and safe to test without an IMAP connection. +""" +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + +from app.label_tool import _extract_body, _strip_html + + +# ββ _strip_html ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def test_strip_html_removes_tags(): + assert _strip_html("
Hello world
") == "Hello world" + + +def test_strip_html_skips_script_content(): + result = _strip_html("real
") + assert "doEvil" not in result + assert "real" in result + + +def test_strip_html_skips_style_content(): + result = _strip_html("visible
") + assert ".foo" not in result + assert "visible" in result + + +def test_strip_html_handles_br_as_newline(): + result = _strip_html("line1Hello & welcome
") + assert "&" not in result + assert "Hello" in result + assert "welcome" in result + + +def test_strip_html_empty_string(): + assert _strip_html("") == "" + + +def test_strip_html_plain_text_passthrough(): + assert _strip_html("no tags here") == "no tags here" + + +# ββ _extract_body ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def test_extract_body_prefers_plain_over_html(): + msg = MIMEMultipart("alternative") + msg.attach(MIMEText("plain body", "plain")) + msg.attach(MIMEText("html body", "html")) + assert _extract_body(msg) == "plain body" + + +def test_extract_body_falls_back_to_html_when_no_plain(): + msg = MIMEMultipart("alternative") + msg.attach(MIMEText("HTML only email
", "html")) + result = _extract_body(msg) + assert "HTML only email" in result + assert "<" not in result # no raw HTML tags leaked through + + +def test_extract_body_non_multipart_html_stripped(): + msg = MIMEText("Solo HTML
", "html") + result = _extract_body(msg) + assert "Solo HTML" in result + assert "" not in result + + +def test_extract_body_non_multipart_plain_unchanged(): + msg = MIMEText("just plain text", "plain") + assert _extract_body(msg) == "just plain text" + + +def test_extract_body_empty_message(): + msg = MIMEText("", "plain") + assert _extract_body(msg) == "" + + +def test_extract_body_multipart_empty_returns_empty(): + msg = MIMEMultipart("alternative") + assert _extract_body(msg) == ""