avocet/web/src/components/SftCard.test.ts
pyr0ball 9633d9a535 feat: add failure_category field to SFT corrections (#16)
Adds optional failure_category to SubmitRequest and candidate records so
reviewers can classify why a model response was wrong, not just what to do
with it. Enables the fine-tune harness to filter training data by failure
type (e.g. exclude scoring artifacts, train only on genuine wrong answers).

Taxonomy: scoring_artifact | style_violation | partial_answer |
          wrong_answer | format_error | hallucination

- app/sft.py: FailureCategory Literal type; SubmitRequest.failure_category;
  stored on candidate record in POST /submit correct branch
- tests/test_sft.py: 3 new tests (stores value, null round-trip, 422 on invalid)
- stores/sft.ts: SftFailureCategory type exported; SftQueueItem + SftLastAction
  updated; setLastAction accepts optional category param
- SftCard.vue: chip-group selector shown during correct/discard/flag flow;
  two-step confirm for discard/flag reveals chips before emitting; category
  forwarded in all emit payloads
- CorrectionsView.vue: handleCorrect/Discard/Flag accept and forward category
  to POST /api/sft/submit body and store.setLastAction
- SftCard.test.ts: 11 new tests covering chip visibility, selection,
  single-active enforcement, pending-action flow, emit payloads, cancel
2026-04-08 22:10:26 -07:00

179 lines
8 KiB
TypeScript

import { mount } from '@vue/test-utils'
import SftCard from './SftCard.vue'
import type { SftQueueItem } from '../stores/sft'
import { describe, it, expect } from 'vitest'
/**
 * Baseline queue item shared by the suite: a benchmark candidate awaiting
 * review whose quality score (0.2) falls in the range the tests below expect
 * to be styled as low quality.
 */
const LOW_QUALITY_ITEM: SftQueueItem = {
  // Identity / provenance
  id: 'abc',
  source: 'cf-orch-benchmark',
  benchmark_run_id: 'run1',
  timestamp: '2026-04-07T10:00:00Z',
  status: 'needs_review',

  // Conversation and the (deliberately wrong) model answer
  prompt_messages: [
    { role: 'system', content: 'You are a coding assistant.' },
    { role: 'user', content: 'Write a Python add function.' },
  ],
  model_response: 'def add(a, b): return a - b',
  corrected_response: null,

  // Scoring outcome
  quality_score: 0.2,
  failure_reason: 'pattern_match: 0/2 matched',
  failure_category: null,

  // Task metadata
  task_id: 'code-fn',
  task_type: 'code',
  task_name: 'Code: Write a function',

  // Model / hardware metadata
  model_id: 'Qwen/Qwen2.5-3B',
  model_name: 'Qwen2.5-3B',
  node_id: 'heimdall',
  gpu_id: 0,
  tokens_per_sec: 38.4,
}

/** Copies the baseline fixture, replacing only its id and quality score. */
const withQuality = (id: string, quality_score: number): SftQueueItem => ({
  ...LOW_QUALITY_ITEM,
  id,
  quality_score,
})

/** Same item with a score of 0.55, used for the mid-quality chip test. */
const MID_QUALITY_ITEM: SftQueueItem = withQuality('mid', 0.55)

/** Same item with a score of 0.72, used for the acceptable-quality chip test. */
const HIGH_QUALITY_ITEM: SftQueueItem = withQuality('hi', 0.72)
/**
 * Component tests for SftCard.
 *
 * Covers three areas:
 *  - static rendering of the queue item (chips, failure reason, response),
 *  - the action buttons and the events they emit,
 *  - the failure-category chip group and the two-step confirm flow used by
 *    Discard and Flag.
 */
describe('SftCard', () => {
  /** Builds the attribute selector for a `data-testid` hook. */
  const byTestId = (id: string): string => `[data-testid="${id}"]`

  /**
   * Mounts the card with the low-quality fixture as `item`; any props given
   * in `overrides` replace or extend that default.
   */
  const mountCard = (overrides: { item?: SftQueueItem; correcting?: boolean } = {}) =>
    mount(SftCard, { props: { item: LOW_QUALITY_ITEM, ...overrides } })

  // ── Static rendering ──────────────────────────────────────────────

  it('renders model name chip', () => {
    const card = mountCard()
    expect(card.text()).toContain('Qwen2.5-3B')
  })

  it('renders task type chip', () => {
    const card = mountCard()
    expect(card.text()).toContain('code')
  })

  it('renders failure reason', () => {
    const card = mountCard()
    expect(card.text()).toContain('pattern_match: 0/2 matched')
  })

  it('renders model response', () => {
    const card = mountCard()
    expect(card.text()).toContain('def add(a, b): return a - b')
  })

  it('quality chip shows numeric value for low quality', () => {
    const card = mountCard()
    expect(card.text()).toContain('0.20')
  })

  it('quality chip has low-quality class when score < 0.4', () => {
    const card = mountCard()
    expect(card.find(byTestId('quality-chip')).classes()).toContain('quality-low')
  })

  it('quality chip has mid-quality class when score is 0.4 to <0.7', () => {
    const card = mountCard({ item: MID_QUALITY_ITEM })
    expect(card.find(byTestId('quality-chip')).classes()).toContain('quality-mid')
  })

  it('quality chip has acceptable class when score >= 0.7', () => {
    const card = mountCard({ item: HIGH_QUALITY_ITEM })
    expect(card.find(byTestId('quality-chip')).classes()).toContain('quality-ok')
  })

  // ── Action buttons ────────────────────────────────────────────────

  it('clicking Correct button emits correct', async () => {
    const card = mountCard()
    await card.find(byTestId('correct-btn')).trigger('click')
    expect(card.emitted('correct')).toBeTruthy()
  })

  it('clicking Discard button then confirming emits discard', async () => {
    const card = mountCard()
    await card.find(byTestId('discard-btn')).trigger('click')
    await card.find(byTestId('confirm-pending-btn')).trigger('click')
    expect(card.emitted('discard')).toBeTruthy()
  })

  it('clicking Flag Model button then confirming emits flag', async () => {
    const card = mountCard()
    await card.find(byTestId('flag-btn')).trigger('click')
    await card.find(byTestId('confirm-pending-btn')).trigger('click')
    expect(card.emitted('flag')).toBeTruthy()
  })

  it('correction area hidden initially', () => {
    const card = mountCard()
    expect(card.find(byTestId('correction-area')).exists()).toBe(false)
  })

  it('correction area shown when correcting prop is true', () => {
    const card = mountCard({ correcting: true })
    expect(card.find(byTestId('correction-area')).exists()).toBe(true)
  })

  it('renders nothing for failure reason when null', () => {
    const item = { ...LOW_QUALITY_ITEM, failure_reason: null }
    const card = mountCard({ item })
    expect(card.find('.failure-reason').exists()).toBe(false)
  })

  // ── Failure category chip-group ───────────────────────────────────

  it('failure category section hidden when not correcting and no pending action', () => {
    const card = mountCard()
    expect(card.find(byTestId('failure-category-section')).exists()).toBe(false)
  })

  it('failure category section shown when correcting prop is true', () => {
    const card = mountCard({ correcting: true })
    expect(card.find(byTestId('failure-category-section')).exists()).toBe(true)
  })

  it('renders all six category chips when correcting', () => {
    const card = mountCard({ correcting: true })
    const chips = card.findAll('.category-chip')
    expect(chips).toHaveLength(6)
  })

  it('clicking a category chip selects it (adds active class)', async () => {
    const card = mountCard({ correcting: true })
    const chip = card.find(byTestId('category-chip-wrong_answer'))
    await chip.trigger('click')
    expect(chip.classes()).toContain('category-chip--active')
  })

  it('clicking the active chip again deselects it', async () => {
    const card = mountCard({ correcting: true })
    const chip = card.find(byTestId('category-chip-hallucination'))
    // First click selects, second click on the same chip toggles it back off.
    await chip.trigger('click')
    expect(chip.classes()).toContain('category-chip--active')
    await chip.trigger('click')
    expect(chip.classes()).not.toContain('category-chip--active')
  })

  it('only one chip can be active at a time', async () => {
    const card = mountCard({ correcting: true })
    await card.find(byTestId('category-chip-wrong_answer')).trigger('click')
    await card.find(byTestId('category-chip-hallucination')).trigger('click')
    const active = card.findAll('.category-chip--active')
    expect(active).toHaveLength(1)
    // Optional chaining keeps the index access type-safe; the length check
    // above already reports the failure if no chip is active.
    expect(active[0]?.attributes('data-testid')).toBe('category-chip-hallucination')
  })

  // ── Pending (two-step) discard / flag flow ────────────────────────

  it('clicking Discard shows pending action row with category section', async () => {
    const card = mountCard()
    await card.find(byTestId('discard-btn')).trigger('click')
    expect(card.find(byTestId('failure-category-section')).exists()).toBe(true)
    expect(card.find(byTestId('pending-action-row')).exists()).toBe(true)
  })

  it('clicking Flag shows pending action row', async () => {
    const card = mountCard()
    await card.find(byTestId('flag-btn')).trigger('click')
    expect(card.find(byTestId('pending-action-row')).exists()).toBe(true)
  })

  it('confirming discard emits discard with null when no category selected', async () => {
    const card = mountCard()
    await card.find(byTestId('discard-btn')).trigger('click')
    await card.find(byTestId('confirm-pending-btn')).trigger('click')
    expect(card.emitted('discard')).toBeTruthy()
    // `?.` instead of `!`: a missing event surfaces as an assertion failure
    // rather than a TypeError thrown before the matcher runs.
    expect(card.emitted('discard')?.[0]).toEqual([null])
  })

  it('confirming discard emits discard with selected category', async () => {
    const card = mountCard()
    await card.find(byTestId('discard-btn')).trigger('click')
    await card.find(byTestId('category-chip-scoring_artifact')).trigger('click')
    await card.find(byTestId('confirm-pending-btn')).trigger('click')
    // No truthiness guard precedes this check, so optional chaining is what
    // turns "event never emitted" into a readable matcher failure.
    expect(card.emitted('discard')?.[0]).toEqual(['scoring_artifact'])
  })

  it('cancelling pending action hides the pending row', async () => {
    const card = mountCard()
    await card.find(byTestId('discard-btn')).trigger('click')
    await card.find(byTestId('cancel-pending-btn')).trigger('click')
    expect(card.find(byTestId('pending-action-row')).exists()).toBe(false)
  })
})