avocet/app/label_tool.py
pyr0ball d68754d432 feat: initial avocet repo — email classifier training tool
Scrape → Store → Process pipeline for building email classifier
benchmark data across the CircuitForge menagerie.

- app/label_tool.py — Streamlit card-stack UI, multi-account IMAP fetch,
  6-bucket labeling, undo/skip, keyboard shortcuts (1-6/S/U)
- scripts/classifier_adapters.py — ZeroShotAdapter (+ two_pass),
  GLiClassAdapter, RerankerAdapter; ABC with lazy model loading
- scripts/benchmark_classifier.py — 13-model registry, --score,
  --compare, --list-models, --export-db; uses label_tool.yaml for IMAP
- tests/ — 20 tests, all passing, zero model downloads required
- config/label_tool.yaml.example — multi-account IMAP template
- data/email_score.jsonl.example — sample labeled data for CI

Labels: interview_scheduled, offer_received, rejected,
        positive_response, survey_received, neutral
2026-02-27 14:07:38 -08:00

568 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Email Label Tool — card-stack UI for building classifier benchmark data.
Philosophy: Scrape → Store → Process
Fetch (IMAP, wide search, multi-account) → data/email_label_queue.jsonl
Label (card stack) → data/email_score.jsonl
Run:
conda run -n job-seeker streamlit run app/label_tool.py --server.port 8503
Config: config/label_tool.yaml (gitignored — see config/label_tool.yaml.example)
"""
from __future__ import annotations

import email as _email_lib
import hashlib
import html
import imaplib
import json
import sys
from datetime import datetime, timedelta
from email.header import decode_header as _raw_decode
from pathlib import Path
from typing import Any

import streamlit as st
import yaml
# ── Path setup ─────────────────────────────────────────────────────────────
# Repository root is the grandparent directory of this file; it is prepended
# to sys.path so imports resolve against the repository first.
_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(_ROOT))

_QUEUE_FILE = _ROOT.joinpath("data", "email_label_queue.jsonl")  # fetched, unlabeled
_SCORE_FILE = _ROOT.joinpath("data", "email_score.jsonl")        # labeled output
_CFG_FILE = _ROOT.joinpath("config", "label_tool.yaml")          # IMAP accounts

# ── Labels ─────────────────────────────────────────────────────────────────
# One row per bucket: (label, emoji, colour). The keyboard shortcut digit is
# the row's 1-based position.
_LABEL_ROWS = (
    ("interview_scheduled", "🗓️", "#4CAF50"),
    ("offer_received", "🎉", "#2196F3"),
    ("rejected", "", "#F44336"),
    ("positive_response", "👍", "#FF9800"),
    ("survey_received", "📋", "#9C27B0"),
    ("neutral", "", "#607D8B"),
)

LABELS = [row[0] for row in _LABEL_ROWS]

_LABEL_META: dict[str, dict] = {
    label: {"emoji": emoji, "color": color, "key": str(position)}
    for position, (label, emoji, color) in enumerate(_LABEL_ROWS, start=1)
}

# ── Wide IMAP search terms (cast a net across all 6 categories) ─────────────
# Grouped by the bucket each phrase is meant to surface; flattened below in
# this exact order because the fetcher issues one SUBJECT search per term.
_TERMS_BY_BUCKET = {
    "interview_scheduled": (
        "interview", "phone screen", "video call", "zoom link", "schedule a call",
    ),
    "offer_received": (
        "offer letter", "job offer", "offer of employment", "pleased to offer",
    ),
    "rejected": (
        "unfortunately", "not moving forward", "other candidates", "regret to inform",
        "no longer", "decided not to", "decided to go with",
    ),
    "positive_response": (
        "opportunity", "interested in your background", "reached out", "great fit",
        "exciting role", "love to connect",
    ),
    "survey_received": (
        "assessment", "questionnaire", "culture fit", "culture-fit", "online assessment",
    ),
    # neutral / ATS confirms
    "neutral": (
        "application received", "thank you for applying", "application confirmation",
        "you applied", "your application for",
    ),
    # general recruitment
    "general": (
        "application", "recruiter", "recruiting", "hiring", "candidate",
    ),
}

_WIDE_TERMS = [term for group in _TERMS_BY_BUCKET.values() for term in group]
# ── IMAP helpers ────────────────────────────────────────────────────────────
def _decode_str(value: str | None) -> str:
if not value:
return ""
parts = _raw_decode(value)
out = []
for part, enc in parts:
if isinstance(part, bytes):
out.append(part.decode(enc or "utf-8", errors="replace"))
else:
out.append(str(part))
return " ".join(out).strip()
def _extract_body(msg: Any) -> str:
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
try:
charset = part.get_content_charset() or "utf-8"
return part.get_payload(decode=True).decode(charset, errors="replace")
except Exception:
pass
else:
try:
charset = msg.get_content_charset() or "utf-8"
return msg.get_payload(decode=True).decode(charset, errors="replace")
except Exception:
pass
return ""
def _fetch_account(cfg: dict, days: int, limit: int, known_keys: set[str],
                   progress_cb=None) -> list[dict]:
    """Fetch emails from one IMAP account using wide recruitment search terms.

    One ``SUBJECT ... SINCE ...`` search is issued per entry in
    ``_WIDE_TERMS`` against the read-only INBOX; the matching message ids are
    merged in first-seen order and fetched until ``limit`` new entries have
    been collected.

    Args:
        cfg: Account settings. ``username`` and ``password`` are required;
            ``host`` (default ``imap.gmail.com``), ``port`` (default 993),
            ``use_ssl`` (default True) and ``name`` (default: the username)
            are optional.
        days: How many days back the ``SINCE`` criterion reaches.
        limit: Maximum number of new entries to return.
        known_keys: ``_entry_key`` values that are already known. Entries
            with a known key are skipped; keys of returned entries are added
            to this set in place.
        progress_cb: Optional callable ``(fraction, message)`` invoked before
            each message fetch.

    Returns:
        New entries as dicts with keys ``subject``, ``body`` (first 800
        characters), ``from_addr``, ``date`` and ``account``.

    Raises:
        KeyError: ``username`` or ``password`` is missing from ``cfg``.
        imaplib.IMAP4.error: Login failed or INBOX could not be selected.
            Connection-level errors raised by ``imaplib`` also propagate.
    """
    since = (datetime.now() - timedelta(days=days)).strftime("%d-%b-%Y")
    host = cfg.get("host", "imap.gmail.com")
    port = int(cfg.get("port", 993))
    use_ssl = cfg.get("use_ssl", True)
    username = cfg["username"]
    password = cfg["password"]
    name = cfg.get("name", username)

    conn = (imaplib.IMAP4_SSL if use_ssl else imaplib.IMAP4)(host, port)
    emails: list[dict] = []
    try:
        conn.login(username, password)
        status, _ = conn.select("INBOX", readonly=True)
        if status != "OK":
            raise imaplib.IMAP4.error(f"cannot select INBOX on {host}")

        # dict used as an insertion-ordered set of message ids
        seen_uids: dict[bytes, None] = {}
        for term in _WIDE_TERMS:
            try:
                status, data = conn.search(None, f'(SUBJECT "{term}" SINCE "{since}")')
            except imaplib.IMAP4.error:
                # A server rejecting one search term must not abort the rest.
                continue
            if status != "OK" or not data:
                continue
            for uid in (data[0] or b"").split():
                seen_uids[uid] = None

        uids = list(seen_uids)[:limit * 3]  # overfetch; filter after dedup
        for i, uid in enumerate(uids):
            if len(emails) >= limit:
                break
            if progress_cb:
                progress_cb(i / len(uids), f"{name}: {len(emails)} fetched…")
            try:
                _, raw_data = conn.fetch(uid, "(RFC822)")
                if not raw_data or not isinstance(raw_data[0], tuple):
                    continue
                msg = _email_lib.message_from_bytes(raw_data[0][1])
                entry = {
                    "subject": _decode_str(msg.get("Subject", "")),
                    "body": _extract_body(msg)[:800],
                    "from_addr": _decode_str(msg.get("From", "")),
                    "date": _decode_str(msg.get("Date", "")),
                    "account": name,
                }
                key = _entry_key(entry)
            except Exception:
                # Deliberately best-effort: one unfetchable or malformed
                # message is skipped instead of failing the whole batch.
                continue
            if key not in known_keys:
                known_keys.add(key)
                emails.append(entry)
    finally:
        # Always release the connection, including when login or select failed.
        try:
            conn.logout()
        except (imaplib.IMAP4.error, OSError):
            pass
    return emails
# ── Queue / score file helpers ───────────────────────────────────────────────
def _entry_key(e: dict) -> str:
    """Return the dedup key for a queue entry or labeled row.

    The key is the MD5 hex digest of the subject followed by the first 100
    characters of the whitespace-stripped body. Stripping keeps the key of a
    saved row (whose body is stored stripped) equal to the key of the queue
    entry it was created from. MD5 is used as a fingerprint only, not for
    security.

    Args:
        e: Mapping with optional ``subject`` and ``body`` values; a missing
            or ``None`` value counts as the empty string.

    Returns:
        A 32-character hexadecimal digest.
    """
    subject = e.get("subject") or ""
    body = (e.get("body") or "").strip()
    return hashlib.md5((subject + body[:100]).encode()).hexdigest()
def _load_jsonl(path: Path) -> list[dict]:
    """Read a JSON Lines file into a list of dicts.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped.

    Args:
        path: File to read; it does not have to exist.

    Returns:
        The parsed rows in file order, or ``[]`` when the file is missing.
    """
    if not path.exists():
        return []

    rows: list[dict] = []
    # Explicit UTF-8: the writers emit non-ASCII text verbatim
    # (ensure_ascii=False), so the platform default encoding is not safe.
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(row, dict):
                rows.append(row)
    return rows
def _save_jsonl(path: Path, rows: list[dict]) -> None:
    """Replace the contents of a JSON Lines file with ``rows``.

    The rows are written to a sibling temporary file which is then renamed
    over ``path``, so a failure part-way through leaves the previous file
    intact instead of truncated.

    Args:
        path: Destination file; missing parent directories are created.
        rows: Rows to write, one JSON object per line, as UTF-8.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    with tmp_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    tmp_path.replace(path)
def _append_jsonl(path: Path, row: dict) -> None:
    """Append one row to a JSON Lines file.

    Args:
        path: Destination file; it and any missing parent directories are
            created on first use.
        row: Row to serialise as a single JSON line, written as UTF-8 with
            non-ASCII characters kept verbatim.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so emoji and accented text do not depend on the locale.
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
# ── Config ──────────────────────────────────────────────────────────────────
def _load_config() -> list[dict]:
    """Load the IMAP account list from ``_CFG_FILE``.

    Returns:
        The mapping entries found under the top-level ``accounts`` key.
        ``[]`` is returned when the file is missing or empty, when its root
        is not a mapping, or when ``accounts`` is absent, empty or not a
        list.
    """
    if not _CFG_FILE.exists():
        return []
    cfg = yaml.safe_load(_CFG_FILE.read_text(encoding="utf-8"))
    if not isinstance(cfg, dict):
        return []
    accounts = cfg.get("accounts")
    if not isinstance(accounts, list):
        # e.g. a bare `accounts:` key parses to None
        return []
    return [acc for acc in accounts if isinstance(acc, dict)]
# ── Page setup ──────────────────────────────────────────────────────────────
# Stylesheet for the email card, the two "stack" slivers drawn under it, and
# the bucket buttons. Injected once per run through st.markdown.
_PAGE_CSS = """
<style>
/* Card stack */
.email-card {
border: 1px solid rgba(128,128,128,0.25);
border-radius: 14px;
padding: 28px 32px;
box-shadow: 0 6px 24px rgba(0,0,0,0.18);
margin-bottom: 4px;
position: relative;
}
.card-stack-hint {
height: 10px;
border-radius: 0 0 12px 12px;
border: 1px solid rgba(128,128,128,0.15);
margin: 0 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.10);
}
.card-stack-hint2 {
height: 8px;
border-radius: 0 0 10px 10px;
border: 1px solid rgba(128,128,128,0.08);
margin: 0 32px;
}
/* Subject line */
.card-subject { font-size: 1.3rem; font-weight: 700; margin-bottom: 6px; }
.card-meta { font-size: 0.82rem; opacity: 0.6; margin-bottom: 16px; }
.card-body { font-size: 0.92rem; opacity: 0.85; white-space: pre-wrap; line-height: 1.5; }
/* Bucket buttons */
div[data-testid="stButton"] > button.bucket-btn {
height: 70px;
font-size: 1.05rem;
font-weight: 600;
border-radius: 12px;
}
</style>
"""

st.set_page_config(page_title="Avocet — Email Labeler", page_icon="📬", layout="wide")
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

st.title("📬 Avocet — Email Label Tool")
st.caption("Scrape → Store → Process | card-stack edition")
# ── Session state init ───────────────────────────────────────────────────────
# Each value is seeded once per browser session; Streamlit reruns of the
# script find the key already present and leave it alone.

# queue: list[dict] — fetched emails awaiting a label
if "queue" not in st.session_state:
    st.session_state.queue = _load_jsonl(_QUEUE_FILE)

# labeled: list[dict] — rows of the score file; labeled_keys: set[str] — their keys
if "labeled" not in st.session_state:
    _scored_rows = _load_jsonl(_SCORE_FILE)
    st.session_state.labeled = _scored_rows
    st.session_state.labeled_keys = {_entry_key(row) for row in _scored_rows}

# idx: int — queue position of the card on display
if "idx" not in st.session_state:
    # Start past already-labeled entries in the queue; if everything is
    # labeled the index is parked one past the end.
    _done_keys = st.session_state.labeled_keys
    _open_positions = (
        pos
        for pos, item in enumerate(st.session_state.queue)
        if _entry_key(item) not in _done_keys
    )
    st.session_state.idx = next(_open_positions, len(st.session_state.queue))

# history: list[tuple[int, str]] — (queue_idx, label) pairs consumed by Undo
if "history" not in st.session_state:
    st.session_state.history = []
# ── Sidebar stats ────────────────────────────────────────────────────────────
# Two headline counters plus, once anything is labeled, a per-label tally.
with st.sidebar:
    labeled = st.session_state.labeled
    queue = st.session_state.queue
    unlabeled = [
        item for item in queue
        if _entry_key(item) not in st.session_state.labeled_keys
    ]

    st.metric("✅ Labeled", len(labeled))
    st.metric("📥 Queue", len(unlabeled))

    if labeled:
        st.caption("**Label distribution**")
        counts = dict.fromkeys(LABELS, 0)
        for row in labeled:
            tag = row.get("label", "")
            # Labels outside LABELS are tallied too, but never displayed.
            counts[tag] = counts.get(tag, 0) + 1
        for name in LABELS:
            meta = _LABEL_META[name]
            st.caption(f"{meta['emoji']} {name}: **{counts[name]}**")

# ── Tabs ─────────────────────────────────────────────────────────────────────
tab_label, tab_fetch, tab_stats = st.tabs(["🃏 Label", "📥 Fetch", "📊 Stats"])
# ══════════════════════════════════════════════════════════════════════════════
# FETCH TAB
# ══════════════════════════════════════════════════════════════════════════════
# Fetch tab: lists the configured IMAP accounts and, on request, pulls new
# emails from the selected ones into the on-disk queue.
with tab_fetch:
    # `or []`: tolerate a config loader that yields None for an empty
    # `accounts:` key, so the controls below can still iterate.
    accounts = _load_config() or []

    if not accounts:
        st.warning(
            "No accounts configured. Copy `config/label_tool.yaml.example` → "
            "`config/label_tool.yaml` and add your IMAP accounts.",
            icon="⚠️",
        )
    else:
        st.markdown(f"**{len(accounts)} account(s) configured:**")
        for acc in accounts:
            st.caption(f"{acc.get('name', acc.get('username'))} ({acc.get('host')})")

    col_days, col_limit = st.columns(2)
    days = col_days.number_input("Days back", min_value=7, max_value=730, value=180)
    limit = col_limit.number_input("Max emails per account", min_value=10, max_value=1000, value=150)

    all_accs = [a.get("name", a.get("username")) for a in accounts]
    selected = st.multiselect("Accounts to fetch", all_accs, default=all_accs)

    if st.button("📥 Fetch from IMAP", disabled=not accounts or not selected, type="primary"):
        # Keys already queued or labeled; shared across accounts so the same
        # email reachable through two accounts is only queued once.
        existing_keys = {_entry_key(e) for e in st.session_state.queue}
        existing_keys.update(st.session_state.labeled_keys)

        fetched_all: list[dict] = []
        failed: list[str] = []
        status = st.status("Fetching…", expanded=True)

        for acc in accounts:
            name = acc.get("name", acc.get("username"))
            if name not in selected:
                continue
            status.write(f"Connecting to **{name}**…")
            try:
                emails = _fetch_account(
                    acc, days=int(days), limit=int(limit),
                    known_keys=existing_keys,
                    progress_cb=lambda p, msg: status.write(msg),
                )
                fetched_all.extend(emails)
                status.write(f"{name}: {len(emails)} new emails")
            except Exception as e:
                # UI boundary: report the failure and carry on with the
                # remaining accounts.
                failed.append(str(name))
                status.write(f"{name}: {e}")

        if fetched_all:
            _save_jsonl(_QUEUE_FILE, st.session_state.queue + fetched_all)
            st.session_state.queue = _load_jsonl(_QUEUE_FILE)
            # Reset idx to first unlabeled
            labeled_keys = st.session_state.labeled_keys
            for i, entry in enumerate(st.session_state.queue):
                if _entry_key(entry) not in labeled_keys:
                    st.session_state.idx = i
                    break
            status.update(label=f"Done — {len(fetched_all)} new emails added to queue", state="complete")
        elif failed:
            # Nothing arrived and at least one account errored: do not claim
            # that everything was already in the queue.
            status.update(
                label=f"Fetch failed for {len(failed)} account(s) — see messages above",
                state="error",
            )
        else:
            status.update(label="No new emails found (all already in queue or score file)", state="complete")
# ══════════════════════════════════════════════════════════════════════════════
# LABEL TAB
# ══════════════════════════════════════════════════════════════════════════════
# Label tab: shows the current queue entry as a card, six bucket buttons,
# Undo/Skip navigation, and a script that maps key presses to those buttons.
with tab_label:
    queue = st.session_state.queue
    labeled_keys = st.session_state.labeled_keys

    def _next_unlabeled(start: int) -> int:
        """Return the first index >= start whose entry is unlabeled (len(queue) if none)."""
        pos = start
        while pos < len(queue) and _entry_key(queue[pos]) in labeled_keys:
            pos += 1
        return pos

    unlabeled = [e for e in queue if _entry_key(e) not in labeled_keys]

    # Advance idx past already-labeled entries
    idx = _next_unlabeled(st.session_state.idx)
    if idx >= len(queue) and unlabeled:
        # Ran off the end while skipped entries remain: wrap around to the
        # first of them instead of indexing past the queue.
        idx = _next_unlabeled(0)
    st.session_state.idx = idx

    total_in_queue = len(queue)
    n_labeled = len(st.session_state.labeled)

    if not queue:
        st.info("Queue is empty — go to **Fetch** to pull emails from IMAP.", icon="📥")
    elif not unlabeled:
        # An empty string is not a valid emoji for `icon`.
        st.success(
            f"🎉 All {n_labeled} emails labeled! Go to **Stats** to review and export.",
            icon="✅",
        )
    else:
        # Progress
        labeled_in_queue = total_in_queue - len(unlabeled)
        progress_pct = labeled_in_queue / total_in_queue if total_in_queue else 0
        st.progress(progress_pct, text=f"{labeled_in_queue} / {total_in_queue} labeled in queue")

        # Current email
        entry = queue[idx]

        # Card HTML
        subj = entry.get("subject", "(no subject)") or "(no subject)"
        from_ = entry.get("from_addr", "") or ""
        date_ = entry.get("date", "") or ""
        acct = entry.get("account", "") or ""
        body = (entry.get("body") or "").strip()

        # Email content is untrusted and is rendered with
        # unsafe_allow_html=True, so every field is escaped first. Newlines
        # become <br> after escaping so the tags themselves survive.
        safe_from = html.escape(from_)
        safe_date = html.escape(date_[:16])
        safe_acct = html.escape(acct)
        safe_subj = html.escape(subj)
        safe_body = html.escape(body[:500]).replace(chr(10), "<br>")

        card_html = (
            '<div class="email-card">\n'
            f'<div class="card-meta">{safe_from} &nbsp;·&nbsp; {safe_date} &nbsp;·&nbsp; <em>{safe_acct}</em></div>\n'
            f'<div class="card-subject">{safe_subj}</div>\n'
            f'<div class="card-body">{safe_body}</div>\n'
            "</div>"
        )
        st.markdown(card_html, unsafe_allow_html=True)

        if len(body) > 500:
            with st.expander("Show full body"):
                st.text(body)

        # Stack hint (visual depth)
        st.markdown('<div class="card-stack-hint"></div>', unsafe_allow_html=True)
        st.markdown('<div class="card-stack-hint2"></div>', unsafe_allow_html=True)
        st.markdown("")  # spacer

        # ── Bucket buttons ────────────────────────────────────────────────
        def _do_label(label: str) -> None:
            """Record `label` for the current entry and move to the next unlabeled one."""
            row = {"subject": entry.get("subject", ""), "body": body[:600], "label": label}
            st.session_state.labeled.append(row)
            labeled_keys.add(_entry_key(entry))
            _append_jsonl(_SCORE_FILE, row)
            st.session_state.history.append((idx, label))
            # Advance
            st.session_state.idx = _next_unlabeled(idx + 1)

        # Tally once per run rather than once per button.
        counts = {l: 0 for l in LABELS}
        for r in st.session_state.labeled:
            counts[r.get("label", "")] = counts.get(r.get("label", ""), 0) + 1

        row1_cols = st.columns(3)
        row2_cols = st.columns(3)
        bucket_pairs = [
            (row1_cols[0], "interview_scheduled"),
            (row1_cols[1], "offer_received"),
            (row1_cols[2], "rejected"),
            (row2_cols[0], "positive_response"),
            (row2_cols[1], "survey_received"),
            (row2_cols[2], "neutral"),
        ]
        for col, lbl in bucket_pairs:
            m = _LABEL_META[lbl]
            label_display = f"{m['emoji']} **{lbl}** [{counts[lbl]}]\n`{m['key']}`"
            if col.button(label_display, key=f"lbl_{lbl}", use_container_width=True):
                _do_label(lbl)
                st.rerun()

        # ── Navigation ────────────────────────────────────────────────────
        st.markdown("")
        nav_cols = st.columns([2, 1, 1])
        remaining = len(unlabeled) - 1
        nav_cols[0].caption(f"**{remaining}** remaining · Keys: 1–6 = label, S = skip, U = undo")

        if nav_cols[1].button("↩ Undo", disabled=not st.session_state.history, use_container_width=True):
            prev_idx, _ = st.session_state.history.pop()
            # Remove the last labeled entry
            if st.session_state.labeled:
                removed = st.session_state.labeled.pop()
                labeled_keys.discard(_entry_key(removed))
                # The saved row stores a stripped, shorter body than the queue
                # entry, so its key can differ from the one registered when
                # labeling; drop the queue entry's key as well.
                if prev_idx < len(queue):
                    labeled_keys.discard(_entry_key(queue[prev_idx]))
                _save_jsonl(_SCORE_FILE, st.session_state.labeled)
            st.session_state.idx = prev_idx
            st.rerun()

        if nav_cols[2].button("→ Skip", use_container_width=True):
            st.session_state.idx = _next_unlabeled(idx + 1)
            st.rerun()

        # Keyboard shortcut capture (JS → button click). The component runs
        # inside a zero-height iframe that never receives focus, so the
        # listener is attached to the parent document. Button text contains
        # the label exactly as written (with underscores).
        st.components.v1.html(
            """<script>
(function () {
  const host = window.parent;
  const doc = host.document;
  // Replace any handler left by a previous render instead of stacking them.
  if (host.__avocetKeyHandler) {
    doc.removeEventListener('keydown', host.__avocetKeyHandler);
  }
  const keyToText = {
    '1':'interview_scheduled','2':'offer_received','3':'rejected',
    '4':'positive_response','5':'survey_received','6':'neutral',
    's':'Skip','u':'Undo'
  };
  host.__avocetKeyHandler = function (e) {
    const tag = e.target.tagName;
    if (tag === 'INPUT' || tag === 'TEXTAREA') return;
    if (e.ctrlKey || e.metaKey || e.altKey) return;
    const needle = keyToText[e.key.toLowerCase()];
    if (!needle) return;
    for (const btn of doc.querySelectorAll('button')) {
      if (btn.innerText.includes(needle)) { btn.click(); break; }
    }
  };
  doc.addEventListener('keydown', host.__avocetKeyHandler);
})();
</script>""",
            height=0,
        )
# ══════════════════════════════════════════════════════════════════════════════
# STATS TAB
# ══════════════════════════════════════════════════════════════════════════════
# Stats tab: per-label bar chart of the labeled rows, score-file details,
# a re-sync action that reloads the score file, and a download button.
with tab_stats:
    labeled = st.session_state.labeled

    if not labeled:
        st.info("No labeled emails yet.")
    else:
        counts = {lbl: 0 for lbl in LABELS}
        for r in labeled:
            lbl = r.get("label", "")
            if lbl in counts:
                counts[lbl] += 1

        # `or 1`: if every row carries a label outside LABELS all counts are
        # zero, and dividing by the maximum would raise ZeroDivisionError.
        peak = max(counts.values()) or 1

        st.markdown(f"**{len(labeled)} labeled emails total**")
        for lbl in LABELS:
            m = _LABEL_META[lbl]
            col_name, col_bar, col_n = st.columns([3, 5, 1])
            col_name.markdown(f"{m['emoji']} {lbl}")
            col_bar.progress(counts[lbl] / peak)
            col_n.markdown(f"**{counts[lbl]}**")

    st.divider()

    st.caption(
        f"Score file: `{_SCORE_FILE.relative_to(_ROOT)}` "
        f"({_SCORE_FILE.stat().st_size if _SCORE_FILE.exists() else 0:,} bytes)"
    )

    if st.button("🔄 Re-sync from disk"):
        st.session_state.labeled = _load_jsonl(_SCORE_FILE)
        st.session_state.labeled_keys = {_entry_key(r) for r in st.session_state.labeled}
        # The undo history describes the rows that were just replaced, so it
        # no longer lines up with `labeled`; drop it.
        st.session_state.history = []
        # Entries may have become unlabeled again: restart from the first one.
        st.session_state.idx = next(
            (
                i for i, item in enumerate(st.session_state.queue)
                if _entry_key(item) not in st.session_state.labeled_keys
            ),
            len(st.session_state.queue),
        )
        st.rerun()

    if _SCORE_FILE.exists():
        st.download_button(
            "⬇️ Download email_score.jsonl",
            data=_SCORE_FILE.read_bytes(),
            file_name="email_score.jsonl",
            mime="application/jsonlines",
        )