# cf_voice/events.py — AudioEvent models from the parallel classifier # # These are the outputs of cf_voice.context (not cf_voice.io). # cf_voice.io produces transcripts; cf_voice.context produces AudioEvents # from the same audio window, running in parallel. # # Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame. from __future__ import annotations from dataclasses import dataclass, field from typing import Literal EventType = Literal["queue", "speaker", "environ", "tone"] # ── Queue state labels ──────────────────────────────────────────────────────── # Detected from YAMNet acoustic event classification QUEUE_LABELS = Literal[ "hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone" ] # ── Speaker type labels ─────────────────────────────────────────────────────── # Detected from pyannote VAD + custom IVR-vs-human head SPEAKER_LABELS = Literal[ "ivr_synth", "human_single", "human_multi", "transfer", "no_speaker" ] # ── Environmental labels ────────────────────────────────────────────────────── # Background shift is the primary AMD (answering machine detection) signal ENVIRON_LABELS = Literal[ "call_center", "music", "background_shift", "noise_floor_change", "quiet" ] # ── Tone / affect labels ────────────────────────────────────────────────────── # From SER model (wav2vec2-based); feeds Elcor label generation AFFECT_LABELS = Literal[ "neutral", "warm", "frustrated", "dismissive", "apologetic", "urgent", "condescending", "scripted", "genuine", "confused", "tired", "optimistic", ] # Generic subtext format (default, always on): "Tone: Frustrated" # Elcor format (easter egg, elcor=True in request): "With barely concealed frustration:" _ELCOR_MAP: dict[str, str] = { "neutral": "In a measured, neutral tone:", "warm": "Warmly:", "frustrated": "With barely concealed frustration:", "dismissive": "With polite dismissiveness:", "apologetic": "Apologetically:", "urgent": "With evident urgency:", "condescending": "With patronizing brightness:", "scripted": "Reading from a script:", "genuine": "With apparent sincerity:", "confused": "With evident confusion:", "tired": "With audible fatigue:", "optimistic": "With cautious optimism:", } @dataclass class AudioEvent: """ A single classified event from the parallel audio classifier. event_type determines how to interpret label and whether subtext is present. """ timestamp: float event_type: EventType label: str confidence: float # Tone annotation — present on ToneEvent only. # Generic format (default): "Tone: Frustrated" # Elcor format (easter egg): "With barely concealed frustration:" subtext: str | None = None @dataclass class ToneEvent(AudioEvent): """ Tone/affect classification event. The subtext field carries the human-readable annotation. Format is controlled by the caller (elcor flag in the classify request). """ affect: str = "neutral" shift_magnitude: float = 0.0 shift_direction: str = "stable" # "warmer" | "colder" | "more_urgent" | "stable" prosody_flags: list[str] = field(default_factory=list) def __post_init__(self) -> None: # Force event_type to "tone" regardless of what the caller passed. # Overriding a parent field with a default in a child dataclass breaks # MRO field ordering in Python, so we use __post_init__ instead. self.event_type = "tone" def make_subtext(affect: str, elcor: bool) -> str: """Generate the subtext annotation for a tone event.""" if elcor: return _ELCOR_MAP.get(affect, f"With {affect} tone:") return f"Tone: {affect.replace('_', ' ').capitalize()}" def tone_event_from_voice_frame( frame_label: str, frame_confidence: float, shift_magnitude: float, timestamp: float, elcor: bool = False, ) -> ToneEvent: """ Convert a VoiceFrame label into a ToneEvent. Used in mock mode and as the bridge from VoiceFrame to AudioEvent. """ # Map VoiceFrame labels to affect labels _label_to_affect: dict[str, str] = { "Calm and focused": "neutral", "Warmly impatient": "frustrated", "Deflecting": "dismissive", "Genuinely curious": "genuine", "Politely dismissive": "dismissive", "Nervous but cooperative": "apologetic", "Frustrated but contained": "frustrated", "Enthusiastic": "warm", "Tired and compliant": "tired", "Guardedly optimistic": "optimistic", "Apologetically firm": "apologetic", "Confused but engaged": "confused", } affect = _label_to_affect.get(frame_label, "neutral") shift_dir = ( "stable" if shift_magnitude < 0.15 else "warmer" if affect in ("warm", "genuine", "optimistic") else "colder" if affect in ("dismissive", "condescending") else "more_urgent" if affect in ("frustrated", "urgent") else "stable" ) return ToneEvent( timestamp=timestamp, event_type="tone", label=frame_label, confidence=frame_confidence, subtext=make_subtext(affect, elcor), affect=affect, shift_magnitude=shift_magnitude, shift_direction=shift_dir, prosody_flags=[], )