diff --git a/app/label_tool.py b/app/label_tool.py index 83ebbe2..30f2fa9 100644 --- a/app/label_tool.py +++ b/app/label_tool.py @@ -13,8 +13,10 @@ from __future__ import annotations import email as _email_lib import hashlib +import html as _html import imaplib import json +import re import sys from datetime import datetime, timedelta from email.header import decode_header as _raw_decode @@ -40,6 +42,9 @@ LABELS = [ "positive_response", "survey_received", "neutral", + "event_rescheduled", + "unrelated", + "digest", ] _LABEL_META: dict[str, dict] = { @@ -49,9 +54,31 @@ _LABEL_META: dict[str, dict] = { "positive_response": {"emoji": "πŸ‘", "color": "#FF9800", "key": "4"}, "survey_received": {"emoji": "πŸ“‹", "color": "#9C27B0", "key": "5"}, "neutral": {"emoji": "⬜", "color": "#607D8B", "key": "6"}, + "event_rescheduled": {"emoji": "πŸ”„", "color": "#FF5722", "key": "7"}, + "unrelated": {"emoji": "πŸ—‘οΈ", "color": "#757575", "key": "8"}, + "digest": {"emoji": "πŸ“°", "color": "#00BCD4", "key": "9"}, } -# ── Wide IMAP search terms (cast a net across all 6 categories) ───────────── +# ── HTML sanitiser ─────────────────────────────────────────────────────────── +# Valid chars per XML 1.0 Β§2.2 (same set HTML5 innerHTML enforces): +# #x9 | #xA | #xD | [#x20–#xD7FF] | [#xE000–#xFFFD] | [#x10000–#x10FFFF] +# Anything outside this range causes InvalidCharacterError in the browser. +_INVALID_XML_CHARS = re.compile( + r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]" +) + +def _to_html(text: str, newlines_to_br: bool = False) -> str: + """Strip invalid XML chars, HTML-escape the result, optionally convert \\n β†’
<br>.""" + if not text: + return "" + cleaned = _INVALID_XML_CHARS.sub("", text) + escaped = _html.escape(cleaned) + if newlines_to_br: + escaped = escaped.replace("\n", "<br>
") + return escaped + + +# ── Wide IMAP search terms (cast a net across all 9 categories) ───────────── _WIDE_TERMS = [ # interview_scheduled "interview", "phone screen", "video call", "zoom link", "schedule a call", @@ -68,6 +95,11 @@ _WIDE_TERMS = [ # neutral / ATS confirms "application received", "thank you for applying", "application confirmation", "you applied", "your application for", + # event_rescheduled + "reschedule", "rescheduled", "new time", "moved to", "postponed", "new date", + # digest + "job digest", "jobs you may like", "recommended jobs", "jobs for you", + "new jobs", "job alert", # general recruitment "application", "recruiter", "recruiting", "hiring", "candidate", ] @@ -441,9 +473,9 @@ with tab_label: st.markdown( f"""
-
{from_}  ·  {date_[:16]}  ·  {acct}
-
{subj}
-
{body[:500].replace(chr(10), '<br>')}
+
{_to_html(from_)}  ·  {_to_html(date_[:16])}  ·  {_to_html(acct)}
+
{_to_html(subj)}
+
{_to_html(body[:500], newlines_to_br=True)}
""", unsafe_allow_html=True, ) @@ -470,8 +502,15 @@ with tab_label: next_idx += 1 st.session_state.idx = next_idx + # Pre-compute per-label counts once + _counts: dict[str, int] = {} + for _r in st.session_state.labeled: + _lbl_r = _r.get("label", "") + _counts[_lbl_r] = _counts.get(_lbl_r, 0) + 1 + row1_cols = st.columns(3) row2_cols = st.columns(3) + row3_cols = st.columns(3) bucket_pairs = [ (row1_cols[0], "interview_scheduled"), (row1_cols[1], "offer_received"), @@ -479,23 +518,48 @@ with tab_label: (row2_cols[0], "positive_response"), (row2_cols[1], "survey_received"), (row2_cols[2], "neutral"), + (row3_cols[0], "event_rescheduled"), + (row3_cols[1], "unrelated"), + (row3_cols[2], "digest"), ] for col, lbl in bucket_pairs: m = _LABEL_META[lbl] - counts = {l: 0 for l in LABELS} - for r in st.session_state.labeled: - counts[r.get("label", "")] = counts.get(r.get("label", ""), 0) + 1 - label_display = f"{m['emoji']} **{lbl}** [{counts[lbl]}]\n`{m['key']}`" + cnt = _counts.get(lbl, 0) + label_display = f"{m['emoji']} **{lbl}** [{cnt}]\n`{m['key']}`" if col.button(label_display, key=f"lbl_{lbl}", use_container_width=True): _do_label(lbl) st.rerun() + # ── Wildcard label ───────────────────────────────────────────────── + if "show_custom" not in st.session_state: + st.session_state.show_custom = False + + other_col, _ = st.columns([1, 2]) + if other_col.button("🏷️ Other… `0`", key="lbl_other_toggle", use_container_width=True): + st.session_state.show_custom = not st.session_state.show_custom + st.rerun() + + if st.session_state.get("show_custom"): + custom_cols = st.columns([3, 1]) + custom_val = custom_cols[0].text_input( + "Custom label:", key="custom_label_text", + placeholder="e.g. 
linkedin_outreach", + label_visibility="collapsed", + ) + if custom_cols[1].button( + "βœ“ Apply", key="apply_custom", type="primary", + disabled=not (custom_val or "").strip(), + ): + _do_label(custom_val.strip().lower().replace(" ", "_")) + st.session_state.show_custom = False + st.rerun() + # ── Navigation ──────────────────────────────────────────────────── st.markdown("") nav_cols = st.columns([2, 1, 1]) remaining = len(unlabeled) - 1 - nav_cols[0].caption(f"**{remaining}** remaining Β· Keys: 1–6 = label, S = skip, U = undo") + nav_cols[0].caption(f"**{remaining}** remaining Β· Keys: 1–9 = label, 0 = other, S = skip, U = undo") if nav_cols[1].button("↩ Undo", disabled=not st.session_state.history, use_container_width=True): prev_idx, prev_label = st.session_state.history.pop() @@ -521,7 +585,8 @@ document.addEventListener('keydown', function(e) { if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return; const keyToLabel = { '1':'interview_scheduled','2':'offer_received','3':'rejected', - '4':'positive_response','5':'survey_received','6':'neutral' + '4':'positive_response','5':'survey_received','6':'neutral', + '7':'event_rescheduled','8':'unrelated','9':'digest' }; const label = keyToLabel[e.key]; if (label) { @@ -531,6 +596,11 @@ document.addEventListener('keydown', function(e) { btn.click(); break; } } + } else if (e.key === '0') { + const btns = window.parent.document.querySelectorAll('button'); + for (const btn of btns) { + if (btn.innerText.includes('Other')) { btn.click(); break; } + } } else if (e.key.toLowerCase() === 's') { const btns = window.parent.document.querySelectorAll('button'); for (const btn of btns) { @@ -558,19 +628,25 @@ with tab_stats: if not labeled: st.info("No labeled emails yet.") else: - counts = {lbl: 0 for lbl in LABELS} + counts: dict[str, int] = {} for r in labeled: lbl = r.get("label", "") - if lbl in counts: - counts[lbl] += 1 + if lbl: + counts[lbl] = counts.get(lbl, 0) + 1 st.markdown(f"**{len(labeled)} 
labeled emails total**") - for lbl in LABELS: - m = _LABEL_META[lbl] + # Show known labels first, then any custom labels + all_display_labels = list(LABELS) + [l for l in counts if l not in LABELS] + max_count = max(counts.values()) if counts else 1 + for lbl in all_display_labels: + if lbl not in counts: + continue + m = _LABEL_META.get(lbl) + emoji = m["emoji"] if m else "🏷️" col_name, col_bar, col_n = st.columns([3, 5, 1]) - col_name.markdown(f"{m['emoji']} {lbl}") - col_bar.progress(counts[lbl] / max(counts.values()) if counts.values() else 0) + col_name.markdown(f"{emoji} {lbl}") + col_bar.progress(counts[lbl] / max_count) col_n.markdown(f"**{counts[lbl]}**") st.divider() diff --git a/scripts/classifier_adapters.py b/scripts/classifier_adapters.py index a74ea34..e6020e2 100644 --- a/scripts/classifier_adapters.py +++ b/scripts/classifier_adapters.py @@ -26,6 +26,9 @@ LABELS: list[str] = [ "positive_response", "survey_received", "neutral", + "event_rescheduled", + "unrelated", + "digest", ] # Natural-language descriptions used by the RerankerAdapter. @@ -35,7 +38,10 @@ LABEL_DESCRIPTIONS: dict[str, str] = { "rejected": "application rejected or not moving forward with candidacy", "positive_response": "positive recruiter interest or request to connect", "survey_received": "invitation to complete a culture-fit survey or assessment", - "neutral": "automated ATS confirmation or unrelated email", + "neutral": "automated ATS confirmation such as application received", + "event_rescheduled": "an interview or scheduled event moved to a new time", + "unrelated": "non-job-search email unrelated to any application or recruiter", + "digest": "job digest or multi-listing email with multiple job postings", } # Lazy import shims β€” allow tests to patch without requiring the libs installed. 
diff --git a/tests/test_classifier_adapters.py b/tests/test_classifier_adapters.py index 1e1c36a..f50ef3b 100644 --- a/tests/test_classifier_adapters.py +++ b/tests/test_classifier_adapters.py @@ -2,11 +2,14 @@ import pytest -def test_labels_constant_has_six_items(): +def test_labels_constant_has_nine_items(): from scripts.classifier_adapters import LABELS - assert len(LABELS) == 6 + assert len(LABELS) == 9 assert "interview_scheduled" in LABELS assert "neutral" in LABELS + assert "event_rescheduled" in LABELS + assert "unrelated" in LABELS + assert "digest" in LABELS def test_compute_metrics_perfect_predictions():