feat: Corrections tab — SFT candidate import, review, and JSONL export #15
6 changed files with 135 additions and 1345 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -11,6 +11,8 @@ config/label_tool.yaml
|
|||
data/email_score.jsonl
|
||||
data/email_label_queue.jsonl
|
||||
data/email_compare_sample.jsonl
|
||||
data/sft_candidates.jsonl
|
||||
data/sft_approved.jsonl
|
||||
|
||||
# Conda/pip artifacts
|
||||
.env
|
||||
|
|
|
|||
1186
app/label_tool.py
1186
app/label_tool.py
File diff suppressed because it is too large
Load diff
85
app/utils.py
Normal file
85
app/utils.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
"""Shared email utility functions for Avocet.
|
||||
|
||||
Pure-stdlib helpers extracted from the retired label_tool.py Streamlit app.
|
||||
These are reused by the FastAPI backend and the test suite.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
from typing import Any
|
||||
|
||||
|
||||
# ── HTML → plain-text extractor ──────────────────────────────────────────────
|
||||
|
||||
class _TextExtractor(HTMLParser):
    """Extract visible text from an HTML email body, preserving line breaks.

    Text nested inside any ``_SKIP`` element is discarded. A newline is
    emitted both when a ``_BLOCK`` element opens and when it closes, so text
    that follows a closed block (``<div>a</div>b``) starts on its own line
    instead of being glued to the block's text. ``get_text`` removes the
    blank lines this produces.
    """

    # Tags that introduce a line break around their content.
    _BLOCK = {"p", "div", "br", "li", "tr", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote"}
    # Tags whose content is never shown to the reader.
    _SKIP = {"script", "style", "head", "noscript"}

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []
        # Nesting depth of currently open _SKIP elements; data is kept only
        # while this is zero.
        self._depth_skip = 0

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag in self._SKIP:
            self._depth_skip += 1
        elif tag in self._BLOCK:
            self._parts.append("\n")

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in self._SKIP:
            # Clamp at zero so a stray closing tag cannot unbalance the count.
            self._depth_skip = max(0, self._depth_skip - 1)
        elif tag in self._BLOCK:
            # Closing a block ends its line; without this, trailing inline
            # text would be concatenated onto the block's last word.
            self._parts.append("\n")

    def handle_data(self, data):
        if not self._depth_skip:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return the collected text, one stripped non-empty line per row."""
        text = "".join(self._parts)
        lines = [ln.strip() for ln in text.splitlines()]
        return "\n".join(ln for ln in lines if ln)
|
||||
|
||||
|
||||
def _strip_html(html_str: str) -> str:
    """Convert HTML email body to plain text. Pure stdlib, no dependencies.

    Args:
        html_str: Raw HTML markup.

    Returns:
        The text produced by ``_TextExtractor``. If parsing raises, a crude
        fallback is returned instead: every ``<...>`` tag is replaced by a
        space and the result is stripped.
    """
    try:
        extractor = _TextExtractor()
        extractor.feed(html_str)
        # close() forces the parser to flush text it is still buffering.
        # With convert_charrefs=True, trailing text containing an
        # unterminated "&" (e.g. "Q&A") is held back until close() and would
        # otherwise be silently dropped.
        extractor.close()
        return extractor.get_text()
    except Exception:
        # Deliberate best-effort: a malformed body must never abort the
        # caller, so degrade to a regex tag strip.
        return re.sub(r"<[^>]+>", " ", html_str).strip()
|
||||
|
||||
|
||||
def _decode_part(part: Any) -> str | None:
    """Decode one non-multipart message part to text, or None if it has no bytes.

    The part's declared charset is used when Python knows it; an unknown or
    invalid charset label falls back to UTF-8 instead of discarding the
    content. Undecodable bytes are replaced rather than raising.
    """
    payload = part.get_payload(decode=True)
    if not isinstance(payload, (bytes, bytearray)):
        # No decodable byte payload for this part.
        return None
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # LookupError: codec name not recognised. ValueError: malformed
        # codec name (e.g. embedded NUL). Either way, keep the text.
        return payload.decode("utf-8", errors="replace")


def _extract_body(msg: Any) -> str:
    """Return plain-text body. Strips HTML when no text/plain part exists.

    Args:
        msg: A message object exposing the ``email.message.Message`` methods
            used here (``is_multipart``, ``walk``, ``get_content_type``,
            ``get_content_charset``, ``get_payload``).

    Returns:
        For multipart messages, the first decodable ``text/plain`` part;
        otherwise the first decodable ``text/html`` part converted to text.
        For single-part messages, the decoded payload (HTML is converted to
        text). An empty string when nothing could be decoded.
    """
    if not msg.is_multipart():
        raw = _decode_part(msg)
        if raw is None:
            return ""
        if msg.get_content_type() == "text/html":
            return _strip_html(raw)
        return raw

    html_fallback: str | None = None
    for part in msg.walk():
        ct = part.get_content_type()
        if ct == "text/plain":
            text = _decode_part(part)
            if text is not None:
                # The first usable plain-text part wins outright.
                return text
        elif ct == "text/html" and html_fallback is None:
            raw = _decode_part(part)
            if raw is not None:
                # Remember only the first HTML part; keep scanning in case a
                # text/plain part appears later in the walk.
                html_fallback = _strip_html(raw)
    return html_fallback or ""
|
||||
|
|
@ -21,3 +21,8 @@ accounts:
|
|||
|
||||
# Optional: limit emails fetched per account per run (0 = unlimited)
|
||||
max_per_account: 500
|
||||
|
||||
# cf-orch SFT candidate import — path to the bench_results/ directory
|
||||
# produced by circuitforge-orch's benchmark harness.
|
||||
sft:
|
||||
bench_results_dir: /path/to/circuitforge-orch/scripts/bench_results
|
||||
|
|
|
|||
200
manage.sh
200
manage.sh
|
|
@ -21,7 +21,6 @@ DEFAULT_PORT=8503
|
|||
CONDA_BASE="${CONDA_BASE:-/devl/miniconda3}"
|
||||
ENV_UI="job-seeker"
|
||||
ENV_BM="job-seeker-classifiers"
|
||||
STREAMLIT="${CONDA_BASE}/envs/${ENV_UI}/bin/streamlit"
|
||||
PYTHON_BM="${CONDA_BASE}/envs/${ENV_BM}/bin/python"
|
||||
PYTHON_UI="${CONDA_BASE}/envs/${ENV_UI}/bin/python"
|
||||
|
||||
|
|
@ -79,13 +78,11 @@ usage() {
|
|||
echo ""
|
||||
echo " Usage: ./manage.sh <command> [args]"
|
||||
echo ""
|
||||
echo " Label tool:"
|
||||
echo -e " ${GREEN}start${NC} Start label tool UI (port collision-safe)"
|
||||
echo -e " ${GREEN}stop${NC} Stop label tool UI"
|
||||
echo -e " ${GREEN}restart${NC} Restart label tool UI"
|
||||
echo -e " ${GREEN}status${NC} Show running state and port"
|
||||
echo -e " ${GREEN}logs${NC} Tail label tool log output"
|
||||
echo -e " ${GREEN}open${NC} Open label tool in browser"
|
||||
echo " Vue UI + FastAPI:"
|
||||
echo -e " ${GREEN}start${NC} Build Vue SPA + start FastAPI on port 8503"
|
||||
echo -e " ${GREEN}stop${NC} Stop FastAPI server"
|
||||
echo -e " ${GREEN}restart${NC} Stop + rebuild + restart FastAPI server"
|
||||
echo -e " ${GREEN}open${NC} Open Vue UI in browser (http://localhost:8503)"
|
||||
echo ""
|
||||
echo " Benchmark:"
|
||||
echo -e " ${GREEN}benchmark [args]${NC} Run benchmark_classifier.py (args passed through)"
|
||||
|
|
@ -93,12 +90,6 @@ usage() {
|
|||
echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]"
|
||||
echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]"
|
||||
echo ""
|
||||
echo " Vue API:"
|
||||
echo -e " ${GREEN}start-api${NC} Build Vue SPA + start FastAPI on port 8503"
|
||||
echo -e " ${GREEN}stop-api${NC} Stop FastAPI server"
|
||||
echo -e " ${GREEN}restart-api${NC} Stop + rebuild + restart FastAPI server"
|
||||
echo -e " ${GREEN}open-api${NC} Open Vue UI in browser (http://localhost:8503)"
|
||||
echo ""
|
||||
echo " Dev:"
|
||||
echo -e " ${GREEN}test${NC} Run pytest suite"
|
||||
echo ""
|
||||
|
|
@ -121,102 +112,61 @@ shift || true
|
|||
case "$CMD" in
|
||||
|
||||
start)
|
||||
pid=$(_running_pid)
|
||||
if [[ -n "$pid" ]]; then
|
||||
port=$(_running_port)
|
||||
warn "Already running (PID ${pid}) on port ${port} → http://localhost:${port}"
|
||||
API_PID_FILE=".avocet-api.pid"
|
||||
API_PORT=8503
|
||||
if [[ -f "$API_PID_FILE" ]] && kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
|
||||
warn "API already running (PID $(<"$API_PID_FILE")) → http://localhost:${API_PORT}"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ ! -x "$STREAMLIT" ]]; then
|
||||
error "Streamlit not found at ${STREAMLIT}\nActivate env: conda run -n ${ENV_UI} ..."
|
||||
fi
|
||||
|
||||
port=$(_find_free_port "$DEFAULT_PORT")
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
info "Starting label tool on port ${port}…"
|
||||
nohup "$STREAMLIT" run app/label_tool.py \
|
||||
--server.port "$port" \
|
||||
--server.headless true \
|
||||
--server.fileWatcherType none \
|
||||
>"$LOG_FILE" 2>&1 &
|
||||
|
||||
pid=$!
|
||||
echo "$pid" > "$PID_FILE"
|
||||
echo "$port" > "$PORT_FILE"
|
||||
|
||||
# Wait briefly and confirm the process survived
|
||||
sleep 1
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
success "Avocet label tool started → http://localhost:${port} (PID ${pid})"
|
||||
success "Logs: ${LOG_FILE}"
|
||||
else
|
||||
rm -f "$PID_FILE" "$PORT_FILE"
|
||||
error "Process died immediately. Check ${LOG_FILE} for details."
|
||||
API_LOG="${LOG_DIR}/api.log"
|
||||
info "Building Vue SPA…"
|
||||
(cd web && npm run build) >> "$API_LOG" 2>&1
|
||||
info "Starting FastAPI on port ${API_PORT}…"
|
||||
nohup "$PYTHON_UI" -m uvicorn app.api:app \
|
||||
--host 0.0.0.0 --port "$API_PORT" \
|
||||
>> "$API_LOG" 2>&1 &
|
||||
echo $! > "$API_PID_FILE"
|
||||
# Poll until port is actually bound (up to 10 s), not just process alive
|
||||
for _i in $(seq 1 20); do
|
||||
sleep 0.5
|
||||
if (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
|
||||
success "Avocet started → http://localhost:${API_PORT} (PID $(<"$API_PID_FILE"))"
|
||||
break
|
||||
fi
|
||||
if ! kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
|
||||
rm -f "$API_PID_FILE"
|
||||
error "Server died during startup. Check ${API_LOG}"
|
||||
fi
|
||||
done
|
||||
if ! (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
|
||||
error "Server did not bind to port ${API_PORT} within 10 s. Check ${API_LOG}"
|
||||
fi
|
||||
;;
|
||||
|
||||
stop)
|
||||
pid=$(_running_pid)
|
||||
if [[ -z "$pid" ]]; then
|
||||
API_PID_FILE=".avocet-api.pid"
|
||||
if [[ ! -f "$API_PID_FILE" ]]; then
|
||||
warn "Not running."
|
||||
exit 0
|
||||
fi
|
||||
info "Stopping label tool (PID ${pid})…"
|
||||
kill "$pid"
|
||||
# Wait up to 5 s for clean exit
|
||||
for _ in $(seq 1 10); do
|
||||
kill -0 "$pid" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
warn "Process did not exit cleanly; sending SIGKILL…"
|
||||
kill -9 "$pid" 2>/dev/null || true
|
||||
PID="$(<"$API_PID_FILE")"
|
||||
if kill -0 "$PID" 2>/dev/null; then
|
||||
kill "$PID" && rm -f "$API_PID_FILE"
|
||||
success "Stopped (PID ${PID})."
|
||||
else
|
||||
warn "Stale PID file (process ${PID} not running). Cleaning up."
|
||||
rm -f "$API_PID_FILE"
|
||||
fi
|
||||
rm -f "$PID_FILE" "$PORT_FILE"
|
||||
success "Stopped."
|
||||
;;
|
||||
|
||||
restart)
|
||||
pid=$(_running_pid)
|
||||
if [[ -n "$pid" ]]; then
|
||||
info "Stopping existing process (PID ${pid})…"
|
||||
kill "$pid"
|
||||
for _ in $(seq 1 10); do
|
||||
kill -0 "$pid" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -0 "$pid" 2>/dev/null && kill -9 "$pid" 2>/dev/null || true
|
||||
rm -f "$PID_FILE" "$PORT_FILE"
|
||||
fi
|
||||
bash "$0" stop
|
||||
exec bash "$0" start
|
||||
;;
|
||||
|
||||
status)
|
||||
pid=$(_running_pid)
|
||||
if [[ -n "$pid" ]]; then
|
||||
port=$(_running_port)
|
||||
success "Running — PID ${pid} port ${port} → http://localhost:${port}"
|
||||
else
|
||||
warn "Not running."
|
||||
fi
|
||||
;;
|
||||
|
||||
logs)
|
||||
if [[ ! -f "$LOG_FILE" ]]; then
|
||||
warn "No log file found at ${LOG_FILE}. Has the tool been started?"
|
||||
exit 0
|
||||
fi
|
||||
info "Tailing ${LOG_FILE} (Ctrl-C to stop)"
|
||||
tail -f "$LOG_FILE"
|
||||
;;
|
||||
|
||||
open)
|
||||
port=$(_running_port)
|
||||
pid=$(_running_pid)
|
||||
[[ -z "$pid" ]] && warn "Label tool does not appear to be running. Start with: ./manage.sh start"
|
||||
URL="http://localhost:${port}"
|
||||
URL="http://localhost:8503"
|
||||
info "Opening ${URL}"
|
||||
if command -v xdg-open &>/dev/null; then
|
||||
xdg-open "$URL"
|
||||
|
|
@ -257,72 +207,6 @@ case "$CMD" in
|
|||
exec "$0" benchmark --compare "$@"
|
||||
;;
|
||||
|
||||
start-api)
|
||||
API_PID_FILE=".avocet-api.pid"
|
||||
API_PORT=8503
|
||||
if [[ -f "$API_PID_FILE" ]] && kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
|
||||
warn "API already running (PID $(<"$API_PID_FILE")) → http://localhost:${API_PORT}"
|
||||
exit 0
|
||||
fi
|
||||
mkdir -p "$LOG_DIR"
|
||||
API_LOG="${LOG_DIR}/api.log"
|
||||
info "Building Vue SPA…"
|
||||
(cd web && npm run build) >> "$API_LOG" 2>&1
|
||||
info "Starting FastAPI on port ${API_PORT}…"
|
||||
nohup "$PYTHON_UI" -m uvicorn app.api:app \
|
||||
--host 0.0.0.0 --port "$API_PORT" \
|
||||
>> "$API_LOG" 2>&1 &
|
||||
echo $! > "$API_PID_FILE"
|
||||
# Poll until port is actually bound (up to 10 s), not just process alive
|
||||
for _i in $(seq 1 20); do
|
||||
sleep 0.5
|
||||
if (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
|
||||
success "Avocet API started → http://localhost:${API_PORT} (PID $(<"$API_PID_FILE"))"
|
||||
break
|
||||
fi
|
||||
if ! kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
|
||||
rm -f "$API_PID_FILE"
|
||||
error "API died during startup. Check ${API_LOG}"
|
||||
fi
|
||||
done
|
||||
if ! (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
|
||||
error "API did not bind to port ${API_PORT} within 10 s. Check ${API_LOG}"
|
||||
fi
|
||||
;;
|
||||
|
||||
stop-api)
|
||||
API_PID_FILE=".avocet-api.pid"
|
||||
if [[ ! -f "$API_PID_FILE" ]]; then
|
||||
warn "API not running."
|
||||
exit 0
|
||||
fi
|
||||
PID="$(<"$API_PID_FILE")"
|
||||
if kill -0 "$PID" 2>/dev/null; then
|
||||
kill "$PID" && rm -f "$API_PID_FILE"
|
||||
success "API stopped (PID ${PID})."
|
||||
else
|
||||
warn "Stale PID file (process ${PID} not running). Cleaning up."
|
||||
rm -f "$API_PID_FILE"
|
||||
fi
|
||||
;;
|
||||
|
||||
restart-api)
|
||||
bash "$0" stop-api
|
||||
exec bash "$0" start-api
|
||||
;;
|
||||
|
||||
open-api)
|
||||
URL="http://localhost:8503"
|
||||
info "Opening ${URL}"
|
||||
if command -v xdg-open &>/dev/null; then
|
||||
xdg-open "$URL"
|
||||
elif command -v open &>/dev/null; then
|
||||
open "$URL"
|
||||
else
|
||||
echo "$URL"
|
||||
fi
|
||||
;;
|
||||
|
||||
help|--help|-h)
|
||||
usage
|
||||
;;
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ These functions are stdlib-only and safe to test without an IMAP connection.
|
|||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
|
||||
from app.label_tool import _extract_body, _strip_html
|
||||
from app.utils import _extract_body, _strip_html
|
||||
|
||||
|
||||
# ── _strip_html ──────────────────────────────────────────────────────────────
|
||||
|
|
|
|||
Loading…
Reference in a new issue