feat: AudioEvent models, classify_chunk() for per-chunk request-response path
- events.py: AudioEvent dataclass + ToneEvent with affect, shift_magnitude, shift_direction, prosody_flags; make_subtext() for generic/Elcor formats
- context.py: classify_chunk(audio_b64, timestamp, prior_frames, elcor) returns list[AudioEvent]; mock mode uses the MockVoiceIO RNG, real mode raises NotImplementedError
- ToneEvent.__post_init__ pins event_type='tone' (avoids the MRO default-field ordering bug)
- Elcor mode: same classifier output, Elcor speech-prefix wording; all tiers
This commit is contained in:
parent
35fc0a088c
commit
6e17da9e93
2 changed files with 197 additions and 1 deletions
|
|
@ -14,7 +14,8 @@ from __future__ import annotations
|
|||
import os
|
||||
from typing import AsyncIterator
|
||||
|
||||
from cf_voice.io import VoiceIO, make_io
|
||||
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
|
||||
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
|
||||
from cf_voice.models import VoiceFrame
|
||||
|
||||
|
||||
|
|
@ -64,6 +65,51 @@ class ContextClassifier:
|
|||
async def stop(self) -> None:
|
||||
await self._io.stop()
|
||||
|
||||
def classify_chunk(
    self,
    audio_b64: str,
    timestamp: float = 0.0,
    prior_frames: int = 0,
    elcor: bool = False,
) -> list[AudioEvent]:
    """
    Classify a single audio chunk and return AudioEvents.

    This is the request-response path used by the cf-orch endpoint.
    The streaming path (async generator) is for continuous consumers.

    Stub: audio_b64 is ignored; returns synthetic events from the mock IO.
    Real: decode audio, run YAMNet + SER + pyannote, return events.

    Args:
        audio_b64: Audio payload for the chunk. Not read in mock mode.
        timestamp: Copied onto the synthetic frame and the returned event.
        prior_frames: When greater than zero, a random shift magnitude in
            [0.1, 0.7] is drawn (rounded to 3 decimals); otherwise the
            shift magnitude is 0.0.
        elcor: Forwarded to tone_event_from_voice_frame(). True selects the
            Mass Effect Elcor prefix wording for the event's subtext,
            False selects the generic "Tone: ..." wording. Only the subtext
            wording changes; label, confidence and shift values are drawn
            the same way either way.

    Returns:
        A single-element list holding the ToneEvent built from the
        synthetic frame.

    Raises:
        NotImplementedError: If the classifier's IO is not a MockVoiceIO.
    """
    if not isinstance(self._io, MockVoiceIO):
        raise NotImplementedError(
            "classify_chunk() requires mock mode. "
            "Real audio inference is not yet implemented."
        )

    # Generate a synthetic VoiceFrame to derive events from. This reuses the
    # mock IO's own RNG and label/speaker pools (private attributes of
    # MockVoiceIO). The draw order below is label, shift, confidence,
    # speaker; keep it stable so seeded runs stay reproducible.
    rng = self._io._rng
    label = rng.choice(self._io._labels)
    shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
    frame = VoiceFrame(
        label=label,
        confidence=rng.uniform(0.6, 0.97),
        speaker_id=rng.choice(self._io._speakers),
        shift_magnitude=round(shift, 3),
        timestamp=timestamp,
    )

    # Bridge the frame into the AudioEvent model.
    tone = tone_event_from_voice_frame(
        frame_label=frame.label,
        frame_confidence=frame.confidence,
        shift_magnitude=frame.shift_magnitude,
        timestamp=frame.timestamp,
        elcor=elcor,
    )
    return [tone]
|
||||
|
||||
def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
|
||||
"""
|
||||
Apply tone classification to a raw frame.
|
||||
|
|
|
|||
150
cf_voice/events.py
Normal file
150
cf_voice/events.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
# cf_voice/events.py — AudioEvent models from the parallel classifier
|
||||
#
|
||||
# These are the outputs of cf_voice.context (not cf_voice.io).
|
||||
# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
|
||||
# from the same audio window, running in parallel.
|
||||
#
|
||||
# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Literal
|
||||
|
||||
# Discriminator values accepted by AudioEvent.event_type.
EventType = Literal["queue", "speaker", "environ", "tone"]

# ── Queue state labels ────────────────────────────────────────────────────────
# Produced by YAMNet acoustic event classification.
QUEUE_LABELS = Literal[
    "hold_music",
    "silence",
    "ringback",
    "busy",
    "dead_air",
    "dtmf_tone",
]

# ── Speaker type labels ───────────────────────────────────────────────────────
# Produced by pyannote VAD plus a custom IVR-vs-human head.
SPEAKER_LABELS = Literal[
    "ivr_synth",
    "human_single",
    "human_multi",
    "transfer",
    "no_speaker",
]

# ── Environmental labels ──────────────────────────────────────────────────────
# A background shift is the primary AMD (answering machine detection) signal.
ENVIRON_LABELS = Literal[
    "call_center",
    "music",
    "background_shift",
    "noise_floor_change",
    "quiet",
]

# ── Tone / affect labels ──────────────────────────────────────────────────────
# Produced by the SER model (wav2vec2-based); these feed Elcor label generation.
AFFECT_LABELS = Literal[
    "neutral",
    "warm",
    "frustrated",
    "dismissive",
    "apologetic",
    "urgent",
    "condescending",
    "scripted",
    "genuine",
    "confused",
    "tired",
    "optimistic",
]
|
||||
|
||||
# Two subtext formats exist for tone events:
#   * generic (default):                      "Tone: Frustrated"
#   * Elcor (easter egg, elcor=True in request): "With barely concealed frustration:"
# This table maps an affect label to its Elcor-style speech prefix.
_ELCOR_MAP: dict[str, str] = {
    "neutral": "In a measured, neutral tone:",
    "warm": "Warmly:",
    "frustrated": "With barely concealed frustration:",
    "dismissive": "With polite dismissiveness:",
    "apologetic": "Apologetically:",
    "urgent": "With evident urgency:",
    "condescending": "With patronizing brightness:",
    "scripted": "Reading from a script:",
    "genuine": "With apparent sincerity:",
    "confused": "With evident confusion:",
    "tired": "With audible fatigue:",
    "optimistic": "With cautious optimism:",
}
|
||||
|
||||
|
||||
@dataclass
class AudioEvent:
    """
    One classified event emitted by the parallel audio classifier.

    How ``label`` should be read depends on ``event_type``, which also
    decides whether ``subtext`` carries anything.
    """

    # Timestamp supplied by whoever builds the event.
    timestamp: float
    # One of the EventType values ("queue", "speaker", "environ", "tone").
    event_type: EventType
    # Classifier label; its vocabulary depends on event_type.
    label: str
    # Classifier confidence for the label.
    confidence: float
    # Human-readable tone annotation, filled in for ToneEvent only.
    #   generic format (default):  "Tone: Frustrated"
    #   Elcor format (easter egg): "With barely concealed frustration:"
    subtext: str | None = None
|
||||
|
||||
|
||||
@dataclass
class ToneEvent(AudioEvent):
    """
    AudioEvent specialisation for tone/affect classification.

    The inherited ``subtext`` field holds the human-readable annotation; the
    caller picks its format (the elcor flag on the classify request).
    """

    # Affect label for this event.
    affect: str = "neutral"
    # Size of the tone shift; 0.0 when no shift was measured.
    shift_magnitude: float = 0.0
    # One of "warmer" | "colder" | "more_urgent" | "stable".
    shift_direction: str = "stable"
    # Per-instance list, so instances never share a mutable default.
    prosody_flags: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        # event_type is pinned to "tone" here, whatever the caller passed.
        # Redeclaring the inherited field with a default on this subclass
        # would break dataclass field ordering (a defaulted field would come
        # before the non-default ones), so it is overwritten after init.
        self.event_type = "tone"
|
||||
|
||||
|
||||
def make_subtext(affect: str, elcor: bool) -> str:
    """
    Build the subtext annotation string for a tone event.

    With ``elcor`` false the generic form is returned: "Tone: " followed by
    the affect with underscores turned into spaces and capitalised. With
    ``elcor`` true the Elcor prefix is looked up in ``_ELCOR_MAP``, falling
    back to "With <affect> tone:" for affects the map does not know.
    """
    if not elcor:
        readable = affect.replace('_', ' ')
        return f"Tone: {readable.capitalize()}"

    fallback = f"With {affect} tone:"
    return _ELCOR_MAP.get(affect, fallback)
|
||||
|
||||
|
||||
def tone_event_from_voice_frame(
    frame_label: str,
    frame_confidence: float,
    shift_magnitude: float,
    timestamp: float,
    elcor: bool = False,
) -> ToneEvent:
    """
    Build a ToneEvent from the fields of a VoiceFrame.

    Used in mock mode and as the bridge from VoiceFrame to AudioEvent.

    The frame label is translated to an affect label (unknown labels become
    "neutral"). A shift direction is then derived from the affect, unless
    the shift magnitude is below 0.15, in which case it is "stable". The
    subtext is produced by make_subtext() in the format chosen by ``elcor``.
    """
    # VoiceFrame label -> affect label
    affect_by_label: dict[str, str] = {
        "Calm and focused": "neutral",
        "Warmly impatient": "frustrated",
        "Deflecting": "dismissive",
        "Genuinely curious": "genuine",
        "Politely dismissive": "dismissive",
        "Nervous but cooperative": "apologetic",
        "Frustrated but contained": "frustrated",
        "Enthusiastic": "warm",
        "Tired and compliant": "tired",
        "Guardedly optimistic": "optimistic",
        "Apologetically firm": "apologetic",
        "Confused but engaged": "confused",
    }
    affect = affect_by_label.get(frame_label, "neutral")

    # Small shifts count as stable regardless of affect; otherwise the
    # affect family decides the direction, and anything else stays stable.
    if shift_magnitude < 0.15:
        direction = "stable"
    elif affect in ("warm", "genuine", "optimistic"):
        direction = "warmer"
    elif affect in ("dismissive", "condescending"):
        direction = "colder"
    elif affect in ("frustrated", "urgent"):
        direction = "more_urgent"
    else:
        direction = "stable"

    annotation = make_subtext(affect, elcor)
    return ToneEvent(
        timestamp=timestamp,
        event_type="tone",
        label=frame_label,
        confidence=frame_confidence,
        subtext=annotation,
        affect=affect,
        shift_magnitude=shift_magnitude,
        shift_direction=direction,
        prosody_flags=[],
    )
|
||||
Loading…
Reference in a new issue