feat: extract label queue API into app/data/label.py

This commit is contained in:
pyr0ball 2026-05-01 18:48:14 -07:00
parent 6689ff07b1
commit 167d7351e3
5 changed files with 454 additions and 178 deletions

View file

@ -142,6 +142,9 @@ def _normalize(item: dict) -> dict:
app = FastAPI(title="Avocet API") app = FastAPI(title="Avocet API")
from app.data.label import router as label_router
app.include_router(label_router, prefix="/api")
from app.sft import router as sft_router from app.sft import router as sft_router
app.include_router(sft_router, prefix="/api/sft") app.include_router(sft_router, prefix="/api/sft")
@ -162,183 +165,6 @@ app.include_router(style_router, prefix="/api/style")
_last_action: dict | None = None _last_action: dict | None = None
@app.get("/api/queue")
def get_queue(limit: int = Query(default=10, ge=1, le=50)):
items = _read_jsonl(_queue_file())
return {"items": [_normalize(x) for x in items[:limit]], "total": len(items)}
class LabelRequest(BaseModel):
id: str
label: str
@app.post("/api/label")
def post_label(req: LabelRequest):
global _last_action
items = _read_jsonl(_queue_file())
match = next((x for x in items if _normalize(x)["id"] == req.id), None)
if not match:
raise HTTPException(404, f"Item {req.id!r} not found in queue")
record = {**match, "label": req.label,
"labeled_at": datetime.now(timezone.utc).isoformat()}
_append_jsonl(_score_file(), record)
_write_jsonl(_queue_file(), [x for x in items if _normalize(x)["id"] != req.id])
_last_action = {"type": "label", "item": match, "label": req.label}
return {"ok": True}
class SkipRequest(BaseModel):
id: str
@app.post("/api/skip")
def post_skip(req: SkipRequest):
global _last_action
items = _read_jsonl(_queue_file())
match = next((x for x in items if _normalize(x)["id"] == req.id), None)
if not match:
raise HTTPException(404, f"Item {req.id!r} not found in queue")
reordered = [x for x in items if _normalize(x)["id"] != req.id] + [match]
_write_jsonl(_queue_file(), reordered)
_last_action = {"type": "skip", "item": match}
return {"ok": True}
class DiscardRequest(BaseModel):
id: str
@app.post("/api/discard")
def post_discard(req: DiscardRequest):
global _last_action
items = _read_jsonl(_queue_file())
match = next((x for x in items if _normalize(x)["id"] == req.id), None)
if not match:
raise HTTPException(404, f"Item {req.id!r} not found in queue")
record = {**match, "label": "__discarded__",
"discarded_at": datetime.now(timezone.utc).isoformat()}
_append_jsonl(_discarded_file(), record)
_write_jsonl(_queue_file(), [x for x in items if _normalize(x)["id"] != req.id])
_last_action = {"type": "discard", "item": match}
return {"ok": True}
@app.delete("/api/label/undo")
def delete_undo():
global _last_action
if not _last_action:
raise HTTPException(404, "No action to undo")
action = _last_action
item = action["item"] # always the original clean queue item
# Perform file operations FIRST — only clear _last_action on success
if action["type"] == "label":
records = _read_jsonl(_score_file())
if not records:
raise HTTPException(409, "Score file is empty — cannot undo label")
_write_jsonl(_score_file(), records[:-1])
items = _read_jsonl(_queue_file())
_write_jsonl(_queue_file(), [item] + items)
elif action["type"] == "discard":
records = _read_jsonl(_discarded_file())
if not records:
raise HTTPException(409, "Discarded file is empty — cannot undo discard")
_write_jsonl(_discarded_file(), records[:-1])
items = _read_jsonl(_queue_file())
_write_jsonl(_queue_file(), [item] + items)
elif action["type"] == "skip":
items = _read_jsonl(_queue_file())
item_id = _normalize(item)["id"]
items = [item] + [x for x in items if _normalize(x)["id"] != item_id]
_write_jsonl(_queue_file(), items)
# Clear AFTER all file operations succeed
_last_action = None
return {"undone": {"type": action["type"], "item": _normalize(item)}}
# Label metadata — 10 labels matching label_tool.py
_LABEL_META = [
{"name": "interview_scheduled", "emoji": "\U0001f4c5", "color": "#4CAF50", "key": "1"},
{"name": "offer_received", "emoji": "\U0001f389", "color": "#2196F3", "key": "2"},
{"name": "rejected", "emoji": "\u274c", "color": "#F44336", "key": "3"},
{"name": "positive_response", "emoji": "\U0001f44d", "color": "#FF9800", "key": "4"},
{"name": "survey_received", "emoji": "\U0001f4cb", "color": "#9C27B0", "key": "5"},
{"name": "neutral", "emoji": "\u2b1c", "color": "#607D8B", "key": "6"},
{"name": "event_rescheduled", "emoji": "\U0001f504", "color": "#FF5722", "key": "7"},
{"name": "digest", "emoji": "\U0001f4f0", "color": "#00BCD4", "key": "8"},
{"name": "new_lead", "emoji": "\U0001f91d", "color": "#009688", "key": "9"},
{"name": "hired", "emoji": "\U0001f38a", "color": "#FFC107", "key": "h"},
]
@app.get("/api/config/labels")
def get_labels():
return _LABEL_META
@app.get("/api/config")
def get_config():
f = _config_file()
if not f.exists():
return {"accounts": [], "max_per_account": 500}
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
return {"accounts": raw.get("accounts", []), "max_per_account": raw.get("max_per_account", 500)}
class ConfigPayload(BaseModel):
accounts: list[dict]
max_per_account: int = 500
@app.post("/api/config")
def post_config(payload: ConfigPayload):
f = _config_file()
f.parent.mkdir(parents=True, exist_ok=True)
tmp = f.with_suffix(".tmp")
tmp.write_text(yaml.dump(payload.model_dump(), allow_unicode=True, sort_keys=False),
encoding="utf-8")
tmp.rename(f)
return {"ok": True}
@app.get("/api/stats")
def get_stats():
records = _read_jsonl(_score_file())
counts: dict[str, int] = {}
for r in records:
lbl = r.get("label", "")
if lbl:
counts[lbl] = counts.get(lbl, 0) + 1
benchmark_results: dict = {}
benchmark_path = _DATA_DIR / "benchmark_results.json"
if benchmark_path.exists():
try:
benchmark_results = json.loads(benchmark_path.read_text(encoding="utf-8"))
except Exception:
pass
return {
"total": len(records),
"counts": counts,
"score_file_bytes": _score_file().stat().st_size if _score_file().exists() else 0,
"benchmark_results": benchmark_results,
}
@app.get("/api/stats/download")
def download_stats():
from fastapi.responses import FileResponse
if not _score_file().exists():
raise HTTPException(404, "No score file")
return FileResponse(
str(_score_file()),
filename="email_score.jsonl",
media_type="application/jsonlines",
headers={"Content-Disposition": 'attachment; filename="email_score.jsonl"'},
)
class AccountTestRequest(BaseModel): class AccountTestRequest(BaseModel):
account: dict account: dict
@ -575,9 +401,10 @@ def fetch_stream(
mode: str = Query(default="wide"), mode: str = Query(default="wide"),
): ):
from app.imap_fetch import fetch_account_stream from app.imap_fetch import fetch_account_stream
from app.data.label import get_config
selected_names = {n.strip() for n in accounts.split(",") if n.strip()} selected_names = {n.strip() for n in accounts.split(",") if n.strip()}
config = get_config() # reuse existing endpoint logic config = get_config() # reuse label module config logic
selected = [a for a in config["accounts"] if a.get("name") in selected_names] selected = [a for a in config["accounts"] if a.get("name") in selected_names]
def generate(): def generate():

0
app/data/__init__.py Normal file
View file

222
app/data/label.py Normal file
View file

@ -0,0 +1,222 @@
"""Avocet -- label queue API.
All label/skip/discard/undo/stats/config endpoints.
Extracted from app/api.py as part of the v2 domain split.
"""
from __future__ import annotations
import hashlib
import json
import yaml
from datetime import datetime, timezone
from pathlib import Path
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import FileResponse
from pydantic import BaseModel
from app.utils import append_jsonl, read_jsonl, write_jsonl
_ROOT = Path(__file__).parent.parent.parent
_DATA_DIR: Path = _ROOT / "data"
_CONFIG_DIR: Path | None = None
_last_action: dict | None = None
router = APIRouter()
def set_data_dir(path: Path) -> None:
global _DATA_DIR
_DATA_DIR = path
def set_config_dir(path: Path | None) -> None:
global _CONFIG_DIR
_CONFIG_DIR = path
def reset_last_action() -> None:
global _last_action
_last_action = None
def _config_file() -> Path:
if _CONFIG_DIR is not None:
return _CONFIG_DIR / "label_tool.yaml"
return _ROOT / "config" / "label_tool.yaml"
def _queue_file() -> Path:
return _DATA_DIR / "email_label_queue.jsonl"
def _score_file() -> Path:
return _DATA_DIR / "email_score.jsonl"
def _discarded_file() -> Path:
return _DATA_DIR / "discarded.jsonl"
def _item_id(item: dict) -> str:
key = (item.get("subject", "") + (item.get("body", "") or "")[:100])
return hashlib.md5(key.encode("utf-8", errors="replace")).hexdigest()
def _normalize(item: dict) -> dict:
return {
"id": item.get("id") or _item_id(item),
"subject": item.get("subject", ""),
"body": item.get("body", ""),
"from": item.get("from") or item.get("from_addr", ""),
"date": item.get("date", ""),
"source": item.get("source") or item.get("account", ""),
}
_LABEL_META = [
{"name": "interview_scheduled", "emoji": "\U0001f4c5", "color": "#4CAF50", "key": "1"},
{"name": "offer_received", "emoji": "\U0001f389", "color": "#2196F3", "key": "2"},
{"name": "rejected", "emoji": "", "color": "#F44336", "key": "3"},
{"name": "positive_response", "emoji": "\U0001f44d", "color": "#FF9800", "key": "4"},
{"name": "survey_received", "emoji": "\U0001f4cb", "color": "#9C27B0", "key": "5"},
{"name": "neutral", "emoji": "", "color": "#607D8B", "key": "6"},
{"name": "event_rescheduled", "emoji": "\U0001f504", "color": "#FF5722", "key": "7"},
{"name": "digest", "emoji": "\U0001f4f0", "color": "#00BCD4", "key": "8"},
{"name": "new_lead", "emoji": "\U0001f91d", "color": "#009688", "key": "9"},
{"name": "hired", "emoji": "\U0001f38a", "color": "#FFC107", "key": "h"},
]
@router.get("/queue")
def get_queue(limit: int = Query(default=10, ge=1, le=50)):
items = read_jsonl(_queue_file())
return {"items": [_normalize(x) for x in items[:limit]], "total": len(items)}
class LabelRequest(BaseModel):
id: str
label: str
@router.post("/label")
def post_label(req: LabelRequest):
global _last_action
items = read_jsonl(_queue_file())
match = next((x for x in items if _normalize(x)["id"] == req.id), None)
if not match:
raise HTTPException(404, f"Item {req.id!r} not found in queue")
record = {**match, "label": req.label,
"labeled_at": datetime.now(timezone.utc).isoformat()}
append_jsonl(_score_file(), record)
write_jsonl(_queue_file(), [x for x in items if _normalize(x)["id"] != req.id])
_last_action = {"type": "label", "item": match, "label": req.label}
return {"ok": True}
class SkipRequest(BaseModel):
id: str
@router.post("/skip")
def post_skip(req: SkipRequest):
global _last_action
items = read_jsonl(_queue_file())
match = next((x for x in items if _normalize(x)["id"] == req.id), None)
if not match:
raise HTTPException(404, f"Item {req.id!r} not found in queue")
reordered = [x for x in items if _normalize(x)["id"] != req.id] + [match]
write_jsonl(_queue_file(), reordered)
_last_action = {"type": "skip", "item": match}
return {"ok": True}
class DiscardRequest(BaseModel):
id: str
@router.post("/discard")
def post_discard(req: DiscardRequest):
global _last_action
items = read_jsonl(_queue_file())
match = next((x for x in items if _normalize(x)["id"] == req.id), None)
if not match:
raise HTTPException(404, f"Item {req.id!r} not found in queue")
record = {**match, "label": "__discarded__",
"discarded_at": datetime.now(timezone.utc).isoformat()}
append_jsonl(_discarded_file(), record)
write_jsonl(_queue_file(), [x for x in items if _normalize(x)["id"] != req.id])
_last_action = {"type": "discard", "item": match}
return {"ok": True}
@router.delete("/label/undo")
def delete_undo():
global _last_action
if not _last_action:
raise HTTPException(404, "No action to undo")
action = _last_action
item = action["item"]
if action["type"] == "label":
records = read_jsonl(_score_file())
if not records:
raise HTTPException(409, "Score file is empty -- cannot undo label")
write_jsonl(_score_file(), records[:-1])
items = read_jsonl(_queue_file())
write_jsonl(_queue_file(), [item] + items)
elif action["type"] == "discard":
records = read_jsonl(_discarded_file())
if not records:
raise HTTPException(409, "Discarded file is empty -- cannot undo discard")
write_jsonl(_discarded_file(), records[:-1])
items = read_jsonl(_queue_file())
write_jsonl(_queue_file(), [item] + items)
elif action["type"] == "skip":
items = read_jsonl(_queue_file())
item_id = _normalize(item)["id"]
items = [item] + [x for x in items if _normalize(x)["id"] != item_id]
write_jsonl(_queue_file(), items)
_last_action = None
return {"undone": {"type": action["type"], "item": _normalize(item)}}
@router.get("/config/labels")
def get_labels():
return _LABEL_META
@router.get("/config")
def get_config():
f = _config_file()
if not f.exists():
return {"accounts": [], "max_per_account": 500}
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
return {"accounts": raw.get("accounts", []), "max_per_account": raw.get("max_per_account", 500)}
class ConfigPayload(BaseModel):
accounts: list[dict]
max_per_account: int = 500
@router.post("/config")
def post_config(payload: ConfigPayload):
f = _config_file()
f.parent.mkdir(parents=True, exist_ok=True)
tmp = f.with_suffix(".tmp")
tmp.write_text(yaml.dump(payload.model_dump(), allow_unicode=True, sort_keys=False),
encoding="utf-8")
tmp.rename(f)
return {"ok": True}
@router.get("/stats")
def get_stats():
records = read_jsonl(_score_file())
counts: dict[str, int] = {}
for r in records:
lbl = r.get("label", "")
if lbl:
counts[lbl] = counts.get(lbl, 0) + 1
benchmark_results: dict = {}
benchmark_path = _DATA_DIR / "benchmark_results.json"
if benchmark_path.exists():
try:
benchmark_results = json.loads(benchmark_path.read_text(encoding="utf-8"))
except Exception:
pass
return {
"total": len(records),
"counts": counts,
"score_file_bytes": _score_file().stat().st_size if _score_file().exists() else 0,
"benchmark_results": benchmark_results,
}
@router.get("/stats/download")
def download_stats():
if not _score_file().exists():
raise HTTPException(404, "No score file")
return FileResponse(
str(_score_file()),
filename="email_score.jsonl",
media_type="application/jsonlines",
headers={"Content-Disposition": 'attachment; filename="email_score.jsonl"'},
)

View file

@ -7,10 +7,15 @@ from app import api as api_module # noqa: F401
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def reset_globals(tmp_path): def reset_globals(tmp_path):
from app import api from app import api
from app.data import label as label_module
api.set_data_dir(tmp_path) api.set_data_dir(tmp_path)
api.reset_last_action() api.reset_last_action()
label_module.set_data_dir(tmp_path)
label_module.set_config_dir(tmp_path)
label_module.reset_last_action()
yield yield
api.reset_last_action() api.reset_last_action()
label_module.reset_last_action()
def test_import(): def test_import():
@ -160,9 +165,12 @@ def test_config_labels_returns_metadata(client):
def config_dir(tmp_path): def config_dir(tmp_path):
"""Give the API a writable config directory.""" """Give the API a writable config directory."""
from app import api as api_module from app import api as api_module
from app.data import label as label_module
api_module.set_config_dir(tmp_path) api_module.set_config_dir(tmp_path)
label_module.set_config_dir(tmp_path)
yield tmp_path yield tmp_path
api_module.set_config_dir(None) # reset to default api_module.set_config_dir(None) # reset to default
label_module.set_config_dir(None)
@pytest.fixture @pytest.fixture

219
tests/test_data_label.py Normal file
View file

@ -0,0 +1,219 @@
"""Tests for app/data/label.py"""
import json
import pytest
import yaml
from fastapi.testclient import TestClient
@pytest.fixture(autouse=True)
def reset_globals(tmp_path):
from app.data import label as label_module
label_module.set_data_dir(tmp_path)
label_module.set_config_dir(tmp_path)
label_module.reset_last_action()
yield
label_module.reset_last_action()
@pytest.fixture
def client():
from app.api import app
return TestClient(app)
@pytest.fixture
def queue_with_items(tmp_path):
from app.data import label as label_module
items = [
{"id": f"id{i}", "subject": f"Subject {i}", "body": f"Body {i}",
"from": "test@example.com", "date": "2026-03-01", "source": "imap:test"}
for i in range(3)
]
(label_module._DATA_DIR / "email_label_queue.jsonl").write_text(
"\n".join(json.dumps(x) for x in items) + "\n")
return items
def test_queue_returns_items(client, queue_with_items):
r = client.get("/api/queue?limit=2")
assert r.status_code == 200
data = r.json()
assert len(data["items"]) == 2
assert data["total"] == 3
def test_queue_empty_when_no_file(client):
r = client.get("/api/queue")
assert r.status_code == 200
assert r.json() == {"items": [], "total": 0}
def test_label_appends_to_score(client, queue_with_items):
from app.data import label as label_module
r = client.post("/api/label", json={"id": "id0", "label": "interview_scheduled"})
assert r.status_code == 200
records = label_module.read_jsonl(label_module._score_file())
assert len(records) == 1
assert records[0]["id"] == "id0"
assert records[0]["label"] == "interview_scheduled"
assert "labeled_at" in records[0]
def test_label_removes_from_queue(client, queue_with_items):
from app.data import label as label_module
client.post("/api/label", json={"id": "id0", "label": "rejected"})
queue = label_module.read_jsonl(label_module._queue_file())
assert not any(x["id"] == "id0" for x in queue)
def test_label_unknown_id_returns_404(client, queue_with_items):
r = client.post("/api/label", json={"id": "unknown", "label": "neutral"})
assert r.status_code == 404
def test_skip_moves_to_back(client, queue_with_items):
from app.data import label as label_module
r = client.post("/api/skip", json={"id": "id0"})
assert r.status_code == 200
queue = label_module.read_jsonl(label_module._queue_file())
assert queue[-1]["id"] == "id0"
assert queue[0]["id"] == "id1"
def test_skip_unknown_id_returns_404(client, queue_with_items):
r = client.post("/api/skip", json={"id": "nope"})
assert r.status_code == 404
def test_discard_writes_to_discarded_file(client, queue_with_items):
from app.data import label as label_module
r = client.post("/api/discard", json={"id": "id1"})
assert r.status_code == 200
discarded = label_module.read_jsonl(label_module._discarded_file())
assert len(discarded) == 1
assert discarded[0]["id"] == "id1"
assert discarded[0]["label"] == "__discarded__"
def test_discard_removes_from_queue(client, queue_with_items):
from app.data import label as label_module
client.post("/api/discard", json={"id": "id1"})
queue = label_module.read_jsonl(label_module._queue_file())
assert not any(x["id"] == "id1" for x in queue)
def test_undo_label_removes_from_score(client, queue_with_items):
from app.data import label as label_module
client.post("/api/label", json={"id": "id0", "label": "neutral"})
r = client.delete("/api/label/undo")
assert r.status_code == 200
assert r.json()["undone"]["type"] == "label"
assert label_module.read_jsonl(label_module._score_file()) == []
queue = label_module.read_jsonl(label_module._queue_file())
assert queue[0]["id"] == "id0"
def test_undo_discard_removes_from_discarded(client, queue_with_items):
from app.data import label as label_module
client.post("/api/discard", json={"id": "id0"})
r = client.delete("/api/label/undo")
assert r.status_code == 200
assert label_module.read_jsonl(label_module._discarded_file()) == []
def test_undo_skip_restores_to_front(client, queue_with_items):
from app.data import label as label_module
client.post("/api/skip", json={"id": "id0"})
r = client.delete("/api/label/undo")
assert r.status_code == 200
queue = label_module.read_jsonl(label_module._queue_file())
assert queue[0]["id"] == "id0"
def test_undo_with_no_action_returns_404(client):
r = client.delete("/api/label/undo")
assert r.status_code == 404
def test_config_labels_returns_10_labels(client):
r = client.get("/api/config/labels")
assert r.status_code == 200
labels = r.json()
assert len(labels) == 10
assert labels[0]["key"] == "1"
for lbl in labels:
assert "emoji" in lbl and "color" in lbl and "name" in lbl
def test_get_config_returns_empty_when_no_file(client):
r = client.get("/api/config")
assert r.status_code == 200
data = r.json()
assert data["accounts"] == []
assert data["max_per_account"] == 500
def test_post_config_writes_yaml(client, tmp_path):
from app.data import label as label_module
label_module.set_config_dir(tmp_path)
payload = {"accounts": [{"name": "Test", "host": "imap.test.com", "port": 993,
"use_ssl": True, "username": "u@t.com", "password": "pw",
"folder": "INBOX", "days_back": 30}], "max_per_account": 200}
r = client.post("/api/config", json=payload)
assert r.status_code == 200
assert r.json()["ok"] is True
saved = yaml.safe_load((tmp_path / "label_tool.yaml").read_text())
assert saved["max_per_account"] == 200
assert saved["accounts"][0]["name"] == "Test"
def test_get_config_round_trips(client, tmp_path):
from app.data import label as label_module
label_module.set_config_dir(tmp_path)
payload = {"accounts": [{"name": "R", "host": "h", "port": 993, "use_ssl": True,
"username": "u", "password": "p", "folder": "INBOX",
"days_back": 90}], "max_per_account": 300}
client.post("/api/config", json=payload)
r = client.get("/api/config")
data = r.json()
assert data["max_per_account"] == 300
assert data["accounts"][0]["name"] == "R"
def test_stats_returns_counts(client, tmp_path):
from app.data import label as label_module
label_module.set_data_dir(tmp_path)
score_path = tmp_path / "email_score.jsonl"
records = [{"id": "a", "label": "interview_scheduled"},
{"id": "b", "label": "interview_scheduled"},
{"id": "c", "label": "rejected"}]
score_path.write_text("\n".join(json.dumps(r) for r in records) + "\n")
r = client.get("/api/stats")
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
assert data["counts"]["interview_scheduled"] == 2
assert data["counts"]["rejected"] == 1
def test_stats_empty_when_no_file(client):
r = client.get("/api/stats")
assert r.status_code == 200
data = r.json()
assert data["total"] == 0
assert data["counts"] == {}
assert data["score_file_bytes"] == 0
def test_stats_download_returns_file(client, tmp_path):
from app.data import label as label_module
label_module.set_data_dir(tmp_path)
(tmp_path / "email_score.jsonl").write_text(json.dumps({"id": "a", "label": "neutral"}) + "\n")
r = client.get("/api/stats/download")
assert r.status_code == 200
assert "jsonlines" in r.headers.get("content-type", "")
def test_stats_download_404_when_no_file(client):
r = client.get("/api/stats/download")
assert r.status_code == 404