chore: retire Streamlit app, scaffold sft branch

- Delete app/label_tool.py (Streamlit UI retired; Vue SPA is sole UI)
- Extract _strip_html and _extract_body into app/utils.py (stdlib-only, reusable)
- Update tests/test_label_tool.py import to app.utils
- Rename start-api/stop-api/restart-api/open-api → start/stop/restart/open in manage.sh
- Remove STREAMLIT variable and all Streamlit-specific case blocks from manage.sh
- Update manage.sh usage section to reflect Vue+FastAPI-only commands
- Add data/sft_candidates.jsonl and data/sft_approved.jsonl to .gitignore
- Add sft.bench_results_dir key to config/label_tool.yaml.example
This commit is contained in:
pyr0ball 2026-04-08 06:18:12 -07:00
parent de2a2935b9
commit ae0ac19505
6 changed files with 135 additions and 1345 deletions

2
.gitignore vendored
View file

@ -11,6 +11,8 @@ config/label_tool.yaml
data/email_score.jsonl data/email_score.jsonl
data/email_label_queue.jsonl data/email_label_queue.jsonl
data/email_compare_sample.jsonl data/email_compare_sample.jsonl
data/sft_candidates.jsonl
data/sft_approved.jsonl
# Conda/pip artifacts # Conda/pip artifacts
.env .env

File diff suppressed because it is too large Load diff

85
app/utils.py Normal file
View file

@ -0,0 +1,85 @@
"""Shared email utility functions for Avocet.
Pure-stdlib helpers extracted from the retired label_tool.py Streamlit app.
These are reused by the FastAPI backend and the test suite.
"""
from __future__ import annotations
import re
from html.parser import HTMLParser
from typing import Any
# ── HTML → plain-text extractor ──────────────────────────────────────────────
class _TextExtractor(HTMLParser):
    """Extract visible text from an HTML email body, preserving line breaks.

    Usage: ``feed()`` the markup, ``close()`` the parser so any text it is
    still buffering gets flushed, then read the result with ``get_text()``.
    """

    # Tags rendered on their own line. A newline is emitted at both the
    # opening and the closing edge so neighbouring text never gets glued on.
    _BLOCK = {"p", "div", "br", "li", "tr", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote"}
    # Tags whose content is never visible to the reader.
    _SKIP = {"script", "style", "head", "noscript"}

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []
        # Nesting depth of currently open _SKIP elements (0 = text is visible).
        self._depth_skip = 0

    def handle_starttag(self, tag, attrs):
        """Track entry into skipped elements; break the line before blocks."""
        tag = tag.lower()
        if tag in self._SKIP:
            self._depth_skip += 1
        elif tag in self._BLOCK:
            self._parts.append("\n")

    def handle_endtag(self, tag):
        """Track exit from skipped elements; break the line after blocks."""
        tag = tag.lower()
        if tag in self._SKIP:
            # Clamp at zero so a stray closing tag cannot un-hide later text
            # by driving the counter negative.
            self._depth_skip = max(0, self._depth_skip - 1)
        elif tag in self._BLOCK:
            # Without this, "<div>Hello</div>World" collapses to "HelloWorld".
            # Redundant newlines are harmless: get_text() drops empty lines.
            self._parts.append("\n")

    def handle_data(self, data):
        """Collect text unless we are inside a skipped element."""
        if not self._depth_skip:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return the collected text with each line stripped and blank lines removed."""
        text = "".join(self._parts)
        lines = [ln.strip() for ln in text.splitlines()]
        return "\n".join(ln for ln in lines if ln)
def _strip_html(html_str: str) -> str:
    """Convert HTML email body to plain text. Pure stdlib, no dependencies.

    Args:
        html_str: Raw HTML markup.

    Returns:
        The visible text, one non-empty stripped line per block. If the
        parser fails for any reason, falls back to a crude regex that
        replaces every tag with a space.
    """
    try:
        extractor = _TextExtractor()
        extractor.feed(html_str)
        # With convert_charrefs=True the parser holds back trailing text that
        # contains an unterminated "&" (e.g. a body ending in "AT&T"), in case
        # more input completes a character reference. close() flushes that
        # buffer; without it the tail of the message is silently lost.
        extractor.close()
        return extractor.get_text()
    except Exception:
        # Deliberate best-effort: a parsing problem must never prevent the
        # caller from getting *some* text out of the message.
        return re.sub(r"<[^>]+>", " ", html_str).strip()
def _decode_part(part: Any) -> "str | None":
    """Decode one MIME part's payload to text; return None if it has no usable payload."""
    try:
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset() or "utf-8"
    except Exception:
        # Best-effort: a malformed part is skipped rather than aborting the
        # whole extraction.
        return None
    # get_payload(decode=True) yields None for multipart containers; anything
    # that is not bytes cannot be decoded here.
    if not isinstance(payload, (bytes, bytearray)):
        return None
    try:
        return payload.decode(charset, errors="replace")
    except LookupError:
        # The declared charset is unknown to Python (typo, vendor-specific
        # label, ...). Keep the text instead of discarding the body.
        return payload.decode("utf-8", errors="replace")


def _extract_body(msg: Any) -> str:
    """Return plain-text body. Strips HTML when no text/plain part exists.

    Args:
        msg: An ``email.message.Message``-like object.

    Returns:
        For multipart messages: the first decodable ``text/plain`` part,
        otherwise the first decodable ``text/html`` part converted to text.
        For single-part messages: the decoded payload, HTML-stripped when the
        content type is ``text/html``. An empty string when nothing usable is
        found. Payloads declaring an unknown charset are decoded as UTF-8
        (with replacement characters) rather than dropped.
    """
    if not msg.is_multipart():
        text = _decode_part(msg)
        if text is None:
            return ""
        if msg.get_content_type() == "text/html":
            return _strip_html(text)
        return text

    html_fallback: str | None = None
    for part in msg.walk():
        ct = part.get_content_type()
        if ct == "text/plain":
            text = _decode_part(part)
            if text is not None:
                return text
        elif ct == "text/html" and html_fallback is None:
            # Remember only the first HTML part; a later text/plain part
            # still takes precedence.
            raw = _decode_part(part)
            if raw is not None:
                html_fallback = _strip_html(raw)
    return html_fallback or ""

View file

@ -21,3 +21,8 @@ accounts:
# Optional: limit emails fetched per account per run (0 = unlimited) # Optional: limit emails fetched per account per run (0 = unlimited)
max_per_account: 500 max_per_account: 500
# cf-orch SFT candidate import — path to the bench_results/ directory
# produced by circuitforge-orch's benchmark harness.
sft:
bench_results_dir: /path/to/circuitforge-orch/scripts/bench_results

200
manage.sh
View file

@ -21,7 +21,6 @@ DEFAULT_PORT=8503
CONDA_BASE="${CONDA_BASE:-/devl/miniconda3}" CONDA_BASE="${CONDA_BASE:-/devl/miniconda3}"
ENV_UI="job-seeker" ENV_UI="job-seeker"
ENV_BM="job-seeker-classifiers" ENV_BM="job-seeker-classifiers"
STREAMLIT="${CONDA_BASE}/envs/${ENV_UI}/bin/streamlit"
PYTHON_BM="${CONDA_BASE}/envs/${ENV_BM}/bin/python" PYTHON_BM="${CONDA_BASE}/envs/${ENV_BM}/bin/python"
PYTHON_UI="${CONDA_BASE}/envs/${ENV_UI}/bin/python" PYTHON_UI="${CONDA_BASE}/envs/${ENV_UI}/bin/python"
@ -79,13 +78,11 @@ usage() {
echo "" echo ""
echo " Usage: ./manage.sh <command> [args]" echo " Usage: ./manage.sh <command> [args]"
echo "" echo ""
echo " Label tool:" echo " Vue UI + FastAPI:"
echo -e " ${GREEN}start${NC} Start label tool UI (port collision-safe)" echo -e " ${GREEN}start${NC} Build Vue SPA + start FastAPI on port 8503"
echo -e " ${GREEN}stop${NC} Stop label tool UI" echo -e " ${GREEN}stop${NC} Stop FastAPI server"
echo -e " ${GREEN}restart${NC} Restart label tool UI" echo -e " ${GREEN}restart${NC} Stop + rebuild + restart FastAPI server"
echo -e " ${GREEN}status${NC} Show running state and port" echo -e " ${GREEN}open${NC} Open Vue UI in browser (http://localhost:8503)"
echo -e " ${GREEN}logs${NC} Tail label tool log output"
echo -e " ${GREEN}open${NC} Open label tool in browser"
echo "" echo ""
echo " Benchmark:" echo " Benchmark:"
echo -e " ${GREEN}benchmark [args]${NC} Run benchmark_classifier.py (args passed through)" echo -e " ${GREEN}benchmark [args]${NC} Run benchmark_classifier.py (args passed through)"
@ -93,12 +90,6 @@ usage() {
echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]" echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]"
echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]" echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]"
echo "" echo ""
echo " Vue API:"
echo -e " ${GREEN}start-api${NC} Build Vue SPA + start FastAPI on port 8503"
echo -e " ${GREEN}stop-api${NC} Stop FastAPI server"
echo -e " ${GREEN}restart-api${NC} Stop + rebuild + restart FastAPI server"
echo -e " ${GREEN}open-api${NC} Open Vue UI in browser (http://localhost:8503)"
echo ""
echo " Dev:" echo " Dev:"
echo -e " ${GREEN}test${NC} Run pytest suite" echo -e " ${GREEN}test${NC} Run pytest suite"
echo "" echo ""
@ -121,102 +112,61 @@ shift || true
case "$CMD" in case "$CMD" in
start) start)
pid=$(_running_pid) API_PID_FILE=".avocet-api.pid"
if [[ -n "$pid" ]]; then API_PORT=8503
port=$(_running_port) if [[ -f "$API_PID_FILE" ]] && kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
warn "Already running (PID ${pid}) on port ${port} → http://localhost:${port}" warn "API already running (PID $(<"$API_PID_FILE")) → http://localhost:${API_PORT}"
exit 0 exit 0
fi fi
if [[ ! -x "$STREAMLIT" ]]; then
error "Streamlit not found at ${STREAMLIT}\nActivate env: conda run -n ${ENV_UI} ..."
fi
port=$(_find_free_port "$DEFAULT_PORT")
mkdir -p "$LOG_DIR" mkdir -p "$LOG_DIR"
API_LOG="${LOG_DIR}/api.log"
info "Starting label tool on port ${port}" info "Building Vue SPA…"
nohup "$STREAMLIT" run app/label_tool.py \ (cd web && npm run build) >> "$API_LOG" 2>&1
--server.port "$port" \ info "Starting FastAPI on port ${API_PORT}"
--server.headless true \ nohup "$PYTHON_UI" -m uvicorn app.api:app \
--server.fileWatcherType none \ --host 0.0.0.0 --port "$API_PORT" \
>"$LOG_FILE" 2>&1 & >> "$API_LOG" 2>&1 &
echo $! > "$API_PID_FILE"
pid=$! # Poll until port is actually bound (up to 10 s), not just process alive
echo "$pid" > "$PID_FILE" for _i in $(seq 1 20); do
echo "$port" > "$PORT_FILE" sleep 0.5
if (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
# Wait briefly and confirm the process survived success "Avocet started → http://localhost:${API_PORT} (PID $(<"$API_PID_FILE"))"
sleep 1 break
if kill -0 "$pid" 2>/dev/null; then fi
success "Avocet label tool started → http://localhost:${port} (PID ${pid})" if ! kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
success "Logs: ${LOG_FILE}" rm -f "$API_PID_FILE"
else error "Server died during startup. Check ${API_LOG}"
rm -f "$PID_FILE" "$PORT_FILE" fi
error "Process died immediately. Check ${LOG_FILE} for details." done
if ! (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
error "Server did not bind to port ${API_PORT} within 10 s. Check ${API_LOG}"
fi fi
;; ;;
stop) stop)
pid=$(_running_pid) API_PID_FILE=".avocet-api.pid"
if [[ -z "$pid" ]]; then if [[ ! -f "$API_PID_FILE" ]]; then
warn "Not running." warn "Not running."
exit 0 exit 0
fi fi
info "Stopping label tool (PID ${pid})…" PID="$(<"$API_PID_FILE")"
kill "$pid" if kill -0 "$PID" 2>/dev/null; then
# Wait up to 5 s for clean exit kill "$PID" && rm -f "$API_PID_FILE"
for _ in $(seq 1 10); do success "Stopped (PID ${PID})."
kill -0 "$pid" 2>/dev/null || break else
sleep 0.5 warn "Stale PID file (process ${PID} not running). Cleaning up."
done rm -f "$API_PID_FILE"
if kill -0 "$pid" 2>/dev/null; then
warn "Process did not exit cleanly; sending SIGKILL…"
kill -9 "$pid" 2>/dev/null || true
fi fi
rm -f "$PID_FILE" "$PORT_FILE"
success "Stopped."
;; ;;
restart) restart)
pid=$(_running_pid) bash "$0" stop
if [[ -n "$pid" ]]; then
info "Stopping existing process (PID ${pid})…"
kill "$pid"
for _ in $(seq 1 10); do
kill -0 "$pid" 2>/dev/null || break
sleep 0.5
done
kill -0 "$pid" 2>/dev/null && kill -9 "$pid" 2>/dev/null || true
rm -f "$PID_FILE" "$PORT_FILE"
fi
exec bash "$0" start exec bash "$0" start
;; ;;
status)
pid=$(_running_pid)
if [[ -n "$pid" ]]; then
port=$(_running_port)
success "Running — PID ${pid} port ${port} → http://localhost:${port}"
else
warn "Not running."
fi
;;
logs)
if [[ ! -f "$LOG_FILE" ]]; then
warn "No log file found at ${LOG_FILE}. Has the tool been started?"
exit 0
fi
info "Tailing ${LOG_FILE} (Ctrl-C to stop)"
tail -f "$LOG_FILE"
;;
open) open)
port=$(_running_port) URL="http://localhost:8503"
pid=$(_running_pid)
[[ -z "$pid" ]] && warn "Label tool does not appear to be running. Start with: ./manage.sh start"
URL="http://localhost:${port}"
info "Opening ${URL}" info "Opening ${URL}"
if command -v xdg-open &>/dev/null; then if command -v xdg-open &>/dev/null; then
xdg-open "$URL" xdg-open "$URL"
@ -257,72 +207,6 @@ case "$CMD" in
exec "$0" benchmark --compare "$@" exec "$0" benchmark --compare "$@"
;; ;;
start-api)
API_PID_FILE=".avocet-api.pid"
API_PORT=8503
if [[ -f "$API_PID_FILE" ]] && kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
warn "API already running (PID $(<"$API_PID_FILE")) → http://localhost:${API_PORT}"
exit 0
fi
mkdir -p "$LOG_DIR"
API_LOG="${LOG_DIR}/api.log"
info "Building Vue SPA…"
(cd web && npm run build) >> "$API_LOG" 2>&1
info "Starting FastAPI on port ${API_PORT}"
nohup "$PYTHON_UI" -m uvicorn app.api:app \
--host 0.0.0.0 --port "$API_PORT" \
>> "$API_LOG" 2>&1 &
echo $! > "$API_PID_FILE"
# Poll until port is actually bound (up to 10 s), not just process alive
for _i in $(seq 1 20); do
sleep 0.5
if (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
success "Avocet API started → http://localhost:${API_PORT} (PID $(<"$API_PID_FILE"))"
break
fi
if ! kill -0 "$(<"$API_PID_FILE")" 2>/dev/null; then
rm -f "$API_PID_FILE"
error "API died during startup. Check ${API_LOG}"
fi
done
if ! (echo "" >/dev/tcp/127.0.0.1/"$API_PORT") 2>/dev/null; then
error "API did not bind to port ${API_PORT} within 10 s. Check ${API_LOG}"
fi
;;
stop-api)
API_PID_FILE=".avocet-api.pid"
if [[ ! -f "$API_PID_FILE" ]]; then
warn "API not running."
exit 0
fi
PID="$(<"$API_PID_FILE")"
if kill -0 "$PID" 2>/dev/null; then
kill "$PID" && rm -f "$API_PID_FILE"
success "API stopped (PID ${PID})."
else
warn "Stale PID file (process ${PID} not running). Cleaning up."
rm -f "$API_PID_FILE"
fi
;;
restart-api)
bash "$0" stop-api
exec bash "$0" start-api
;;
open-api)
URL="http://localhost:8503"
info "Opening ${URL}"
if command -v xdg-open &>/dev/null; then
xdg-open "$URL"
elif command -v open &>/dev/null; then
open "$URL"
else
echo "$URL"
fi
;;
help|--help|-h) help|--help|-h)
usage usage
;; ;;

View file

@ -5,7 +5,7 @@ These functions are stdlib-only and safe to test without an IMAP connection.
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText from email.mime.text import MIMEText
from app.label_tool import _extract_body, _strip_html from app.utils import _extract_body, _strip_html
# ── _strip_html ────────────────────────────────────────────────────────────── # ── _strip_html ──────────────────────────────────────────────────────────────