# cf_voice/events.py — AudioEvent models emitted by the parallel classifier
#
# These are the outputs of cf_voice.context (not cf_voice.io).
# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
# from the same audio window, running in parallel.
#
# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal

# Discriminator for AudioEvent.label interpretation.
EventType = Literal[
    "queue", "speaker", "environ", "tone", "transcript", "scene", "accent"
]

# ── Queue state labels ───────────────────────────────────────────────────────
# Detected from AST acoustic event classification
QUEUE_LABELS = Literal[
    "hold_music",
    "silence",
    "ringback",
    "busy",
    "dead_air",
    "dtmf_tone",
]

# ── Speaker type labels ──────────────────────────────────────────────────────
# Detected from pyannote VAD + custom IVR-vs-human head
SPEAKER_LABELS = Literal[
    "ivr_synth",
    "human_single",
    "human_multi",
    "transfer",
    "no_speaker",
    "background_voices",
]

# ── Environmental labels ─────────────────────────────────────────────────────
# Background shift is the primary AMD (answering machine detection) signal.
# Telephony labels + general-purpose acoustic scene labels.
ENVIRON_LABELS = Literal[
    # Telephony
    "call_center",
    "music",
    "background_shift",
    "noise_floor_change",
    "quiet",
    # Nature
    "birdsong",
    "wind",
    "rain",
    "water",
    # Urban
    "traffic",
    "crowd_chatter",
    "street_signal",
    "construction",
    # Indoor
    "hvac",
    "keyboard_typing",
    "restaurant",
]

# ── Acoustic scene labels ────────────────────────────────────────────────────
# Broad scene category — primary input to privacy risk scoring.
SCENE_LABELS = Literal[
    "indoor_quiet",
    "indoor_crowd",
    "outdoor_urban",
    "outdoor_nature",
    "vehicle",
    "public_transit",
]

# ── Accent / language labels ─────────────────────────────────────────────────
# Regional accent of primary speaker. Gated by CF_VOICE_ACCENT=1.
ACCENT_LABELS = Literal[
    "en_gb",
    "en_us",
    "en_au",
    "en_ca",
    "en_in",
    "fr",
    "es",
    "de",
    "zh",
    "ja",
    "other",
]

# ── Tone / affect labels ─────────────────────────────────────────────────────
# From SER model (wav2vec2-based); feeds Elcor label generation
AFFECT_LABELS = Literal[
    "neutral",
    "warm",
    "frustrated",
    "dismissive",
    "apologetic",
    "urgent",
    "condescending",
    "scripted",
    "genuine",
    "confused",
    "tired",
    "optimistic",
]

# Subtext annotation formats:
#   Generic (default, always on):           "Tone: Frustrated"
#   Elcor (easter egg, elcor=True request): "With barely concealed frustration:"
_ELCOR_MAP: dict[str, str] = {
    "neutral": "In a measured, neutral tone:",
    "warm": "Warmly:",
    "frustrated": "With barely concealed frustration:",
    "dismissive": "With polite dismissiveness:",
    "apologetic": "Apologetically:",
    "urgent": "With evident urgency:",
    "condescending": "With patronizing brightness:",
    "scripted": "Reading from a script:",
    "genuine": "With apparent sincerity:",
    "confused": "With evident confusion:",
    "tired": "With audible fatigue:",
    "optimistic": "With cautious optimism:",
}

# VoiceFrame display label -> AFFECT_LABELS value.
# Hoisted to module level so the mapping is built once, not rebuilt on every
# tone_event_from_voice_frame() call. Unknown labels fall back to "neutral".
_FRAME_LABEL_TO_AFFECT: dict[str, str] = {
    "Calm and focused": "neutral",
    "Warmly impatient": "frustrated",
    "Deflecting": "dismissive",
    "Genuinely curious": "genuine",
    "Politely dismissive": "dismissive",
    "Nervous but cooperative": "apologetic",
    "Frustrated but contained": "frustrated",
    "Enthusiastic": "warm",
    "Tired and compliant": "tired",
    "Guardedly optimistic": "optimistic",
    "Apologetically firm": "apologetic",
    "Confused but engaged": "confused",
}


@dataclass
class AudioEvent:
    """
    A single classified event from the parallel audio classifier.

    event_type determines how to interpret label and whether subtext is
    present. speaker_id is the ephemeral local diarization label for this
    event's speaker.
    """

    timestamp: float          # seconds; position of this event in the stream
    event_type: EventType     # discriminator for how to read `label`
    label: str                # value from the label set matching event_type
    confidence: float         # classifier confidence for `label`
    speaker_id: str = "speaker_a"  # ephemeral local diarization label
    # Tone annotation — present on ToneEvent only.
    # Generic format (default): "Tone: Frustrated"
    # Elcor format (easter egg): "With barely concealed frustration:"
    subtext: str | None = None


@dataclass
class ToneEvent(AudioEvent):
    """
    Tone/affect classification event.

    This is the SSE wire type for Linnet's annotation stream and the embed
    protocol. Field names are stable as of cf-voice v0.1.0 — see cf-core#40
    for the wire format spec.

    The subtext field carries the human-readable annotation. Format is
    controlled by the caller (elcor flag in the classify request).

    Dimensional emotion (Navigation v0.2.x — audeering model):
        valence / arousal / dominance are None when the dimensional
        classifier is not enabled (CF_VOICE_DIMENSIONAL != "1").

    Prosodic signals (Navigation v0.2.x — openSMILE):
        sarcasm_risk / flat_f0_score are None when extractor is not enabled.
    """

    affect: str = "neutral"          # one of AFFECT_LABELS
    shift_magnitude: float = 0.0     # size of affect shift vs. prior window
    shift_direction: str = "stable"  # "warmer" | "colder" | "more_urgent" | "stable"
    prosody_flags: list[str] = field(default_factory=list)
    session_id: str = ""  # caller-assigned; correlates events to a session
    # Dimensional emotion scores (audeering, optional)
    valence: float | None = None
    arousal: float | None = None
    dominance: float | None = None
    # Prosodic signals (openSMILE, optional)
    sarcasm_risk: float | None = None
    flat_f0_score: float | None = None
    # Trajectory signals (rolling buffer — activates after BASELINE_MIN frames)
    arousal_delta: float | None = None
    valence_delta: float | None = None
    trend: str | None = None  # "stable"|"escalating"|"suppressed"|…
    # Coherence signals (SER vs VAD cross-comparison)
    coherence_score: float | None = None
    suppression_flag: bool | None = None
    reframe_type: str | None = None  # "none"|"genuine"|"surface"
    affect_divergence: float | None = None

    def __post_init__(self) -> None:
        # Force event_type to "tone" regardless of what the caller passed.
        # Overriding a parent field with a default in a child dataclass breaks
        # MRO field ordering in Python, so we use __post_init__ instead.
        self.event_type = "tone"


def make_subtext(affect: str, elcor: bool) -> str:
    """
    Generate the subtext annotation for a tone event.

    Args:
        affect: one of AFFECT_LABELS (unknown values are tolerated).
        elcor: when True, use the Elcor easter-egg phrasing; otherwise the
            generic "Tone: <Affect>" format.

    Returns:
        The human-readable annotation string.
    """
    if elcor:
        # Unknown affects get a generic Elcor-style fallback.
        return _ELCOR_MAP.get(affect, f"With {affect} tone:")
    return f"Tone: {affect.replace('_', ' ').capitalize()}"


def tone_event_from_voice_frame(
    frame_label: str,
    frame_confidence: float,
    shift_magnitude: float,
    timestamp: float,
    elcor: bool = False,
    *,
    stable_threshold: float = 0.15,
) -> ToneEvent:
    """
    Convert a VoiceFrame label into a ToneEvent.

    Used in mock mode and as the bridge from VoiceFrame to AudioEvent.

    Args:
        frame_label: VoiceFrame display label (see _FRAME_LABEL_TO_AFFECT);
            unknown labels map to the "neutral" affect.
        frame_confidence: classifier confidence, copied onto the event.
        shift_magnitude: size of the affect shift vs. the prior window.
        timestamp: event position in the stream, in seconds.
        elcor: passed through to make_subtext for annotation format.
        stable_threshold: shifts below this magnitude are reported as
            "stable" (keyword-only; default 0.15 preserves prior behavior).

    Returns:
        A ToneEvent with affect, shift direction, and subtext populated.
    """
    affect = _FRAME_LABEL_TO_AFFECT.get(frame_label, "neutral")

    # Direction is derived from the affect family; small shifts — and affects
    # with no clear valence direction — are reported as "stable".
    if shift_magnitude < stable_threshold:
        shift_dir = "stable"
    elif affect in ("warm", "genuine", "optimistic"):
        shift_dir = "warmer"
    elif affect in ("dismissive", "condescending"):
        shift_dir = "colder"
    elif affect in ("frustrated", "urgent"):
        shift_dir = "more_urgent"
    else:
        shift_dir = "stable"

    return ToneEvent(
        timestamp=timestamp,
        event_type="tone",
        label=frame_label,
        confidence=frame_confidence,
        subtext=make_subtext(affect, elcor),
        affect=affect,
        shift_magnitude=shift_magnitude,
        shift_direction=shift_dir,
        prosody_flags=[],
    )