diff --git a/app/sft.py b/app/sft.py index 929b98a..ab439f1 100644 --- a/app/sft.py +++ b/app/sft.py @@ -151,10 +151,21 @@ def get_queue(page: int = 1, per_page: int = 20): # ── POST /submit ─────────────────────────────────────────────────────────── +FailureCategory = Literal[ + "scoring_artifact", + "style_violation", + "partial_answer", + "wrong_answer", + "format_error", + "hallucination", +] + + class SubmitRequest(BaseModel): id: str action: Literal["correct", "discard", "flag"] corrected_response: str | None = None + failure_category: FailureCategory | None = None @router.post("/submit") @@ -174,7 +185,12 @@ def post_submit(req: SubmitRequest): raise HTTPException(409, f"Record is not in needs_review state (current: {record.get('status')})") if req.action == "correct": - records[idx] = {**record, "status": "approved", "corrected_response": req.corrected_response} + records[idx] = { + **record, + "status": "approved", + "corrected_response": req.corrected_response, + "failure_category": req.failure_category, + } _write_candidates(records) append_jsonl(_approved_file(), records[idx]) elif req.action == "discard": diff --git a/tests/test_sft.py b/tests/test_sft.py index ed808b1..e3c98f9 100644 --- a/tests/test_sft.py +++ b/tests/test_sft.py @@ -232,6 +232,41 @@ def test_submit_already_approved_returns_409(client, tmp_path): assert r.status_code == 409 +def test_submit_correct_stores_failure_category(client, tmp_path): + _populate_candidates(tmp_path, [_make_record("a")]) + r = client.post("/api/sft/submit", json={ + "id": "a", "action": "correct", + "corrected_response": "def add(a, b): return a + b", + "failure_category": "style_violation", + }) + assert r.status_code == 200 + from app import sft as sft_module + records = sft_module._read_candidates() + assert records[0]["failure_category"] == "style_violation" + + +def test_submit_correct_null_failure_category(client, tmp_path): + _populate_candidates(tmp_path, [_make_record("a")]) + r = client.post("/api/sft/submit", json={ + "id": "a", "action": "correct", + "corrected_response": "def add(a, b): return a + b", + }) + assert r.status_code == 200 + from app import sft as sft_module + records = sft_module._read_candidates() + assert records[0]["failure_category"] is None + + +def test_submit_invalid_failure_category_returns_422(client, tmp_path): + _populate_candidates(tmp_path, [_make_record("a")]) + r = client.post("/api/sft/submit", json={ + "id": "a", "action": "correct", + "corrected_response": "def add(a, b): return a + b", + "failure_category": "nonsense", + }) + assert r.status_code == 422 + + # ── /api/sft/undo ──────────────────────────────────────────────────────────── def test_undo_restores_discarded_to_needs_review(client, tmp_path): diff --git a/web/src/components/SftCard.test.ts b/web/src/components/SftCard.test.ts index 6834987..59b5e32 100644 --- a/web/src/components/SftCard.test.ts +++ b/web/src/components/SftCard.test.ts @@ -13,6 +13,7 @@ const LOW_QUALITY_ITEM: SftQueueItem = { model_response: 'def add(a, b): return a - b', corrected_response: null, quality_score: 0.2, failure_reason: 'pattern_match: 0/2 matched', + failure_category: null, task_id: 'code-fn', task_type: 'code', task_name: 'Code: Write a function', model_id: 'Qwen/Qwen2.5-3B', model_name: 'Qwen2.5-3B', node_id: 'heimdall', gpu_id: 0, tokens_per_sec: 38.4, @@ -68,15 +69,17 @@ describe('SftCard', () => { expect(w.emitted('correct')).toBeTruthy() }) - it('clicking Discard button emits discard', async () => { + it('clicking Discard button then confirming emits discard', async () => { const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) await w.find('[data-testid="discard-btn"]').trigger('click') + await w.find('[data-testid="confirm-pending-btn"]').trigger('click') expect(w.emitted('discard')).toBeTruthy() }) - it('clicking Flag Model button emits flag', async () => { + it('clicking Flag Model button then confirming emits flag', async () => { const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) await w.find('[data-testid="flag-btn"]').trigger('click') + await w.find('[data-testid="confirm-pending-btn"]').trigger('click') expect(w.emitted('flag')).toBeTruthy() }) @@ -95,4 +98,82 @@ describe('SftCard', () => { const w = mount(SftCard, { props: { item } }) expect(w.find('.failure-reason').exists()).toBe(false) }) + + // ── Failure category chip-group ─────────────────────────────────── + it('failure category section hidden when not correcting and no pending action', () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) + expect(w.find('[data-testid="failure-category-section"]').exists()).toBe(false) + }) + + it('failure category section shown when correcting prop is true', () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM, correcting: true } }) + expect(w.find('[data-testid="failure-category-section"]').exists()).toBe(true) + }) + + it('renders all six category chips when correcting', () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM, correcting: true } }) + const chips = w.findAll('.category-chip') + expect(chips).toHaveLength(6) + }) + + it('clicking a category chip selects it (adds active class)', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM, correcting: true } }) + const chip = w.find('[data-testid="category-chip-wrong_answer"]') + await chip.trigger('click') + expect(chip.classes()).toContain('category-chip--active') + }) + + it('clicking the active chip again deselects it', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM, correcting: true } }) + const chip = w.find('[data-testid="category-chip-hallucination"]') + await chip.trigger('click') + expect(chip.classes()).toContain('category-chip--active') + await chip.trigger('click') + expect(chip.classes()).not.toContain('category-chip--active') + }) + + it('only one chip can be active at a time', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM, correcting: true } }) + await w.find('[data-testid="category-chip-wrong_answer"]').trigger('click') + await w.find('[data-testid="category-chip-hallucination"]').trigger('click') + const active = w.findAll('.category-chip--active') + expect(active).toHaveLength(1) + expect(active[0].attributes('data-testid')).toBe('category-chip-hallucination') + }) + + it('clicking Discard shows pending action row with category section', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) + await w.find('[data-testid="discard-btn"]').trigger('click') + expect(w.find('[data-testid="failure-category-section"]').exists()).toBe(true) + expect(w.find('[data-testid="pending-action-row"]').exists()).toBe(true) + }) + + it('clicking Flag shows pending action row', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) + await w.find('[data-testid="flag-btn"]').trigger('click') + expect(w.find('[data-testid="pending-action-row"]').exists()).toBe(true) + }) + + it('confirming discard emits discard with null when no category selected', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) + await w.find('[data-testid="discard-btn"]').trigger('click') + await w.find('[data-testid="confirm-pending-btn"]').trigger('click') + expect(w.emitted('discard')).toBeTruthy() + expect(w.emitted('discard')![0]).toEqual([null]) + }) + + it('confirming discard emits discard with selected category', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) + await w.find('[data-testid="discard-btn"]').trigger('click') + await w.find('[data-testid="category-chip-scoring_artifact"]').trigger('click') + await w.find('[data-testid="confirm-pending-btn"]').trigger('click') + expect(w.emitted('discard')![0]).toEqual(['scoring_artifact']) + }) + + it('cancelling pending action hides the pending row', async () => { + const w = mount(SftCard, { props: { item: LOW_QUALITY_ITEM } }) + await w.find('[data-testid="discard-btn"]').trigger('click') + await w.find('[data-testid="cancel-pending-btn"]').trigger('click') + expect(w.find('[data-testid="pending-action-row"]').exists()).toBe(false) + }) }) diff --git a/web/src/components/SftCard.vue b/web/src/components/SftCard.vue index f0ac2d1..0a702a3 100644 --- a/web/src/components/SftCard.vue +++ b/web/src/components/SftCard.vue @@ -57,21 +57,52 @@ + +
Failure category (optional)
+