def classify_chunk(
    self,
    audio_b64: str,
    timestamp: float = 0.0,
    prior_frames: int = 0,
    elcor: bool = False,
) -> list[AudioEvent]:
    """
    Classify a single audio chunk and return AudioEvents.

    This is the request-response path used by the cf-orch endpoint.
    The streaming path (async generator) is for continuous consumers.

    Stub: audio_b64 is ignored; returns synthetic events from the mock IO.
    Real: decode audio, run YAMNet + SER + pyannote, return events.

    elcor=True switches subtext format to Mass Effect Elcor prefix style.
    Generic tone annotation is always present regardless of elcor flag.

    Args:
        audio_b64: Base64-encoded audio chunk (unused in mock mode).
        timestamp: Stream-relative timestamp stamped onto the synthetic frame
            and the resulting event.
        prior_frames: Frames already observed; a nonzero tone shift is only
            synthesized when there is at least one prior frame to shift from.
        elcor: If True, the event subtext uses the Elcor speech-prefix
            wording instead of the generic "Tone: X" form.

    Returns:
        A single-element list containing one ToneEvent.

    Raises:
        NotImplementedError: If the underlying IO is not MockVoiceIO —
            real audio inference is not implemented yet.
    """
    if not isinstance(self._io, MockVoiceIO):
        raise NotImplementedError(
            "classify_chunk() requires mock mode. "
            "Real audio inference is not yet implemented."
        )
    # Reuse the mock IO's RNG so chunk classification stays deterministic
    # with respect to any seed configured on MockVoiceIO.
    rng = self._io._rng
    # Draw order matters (label, shift, confidence, speaker): keep it stable
    # so seeded mock runs reproduce identical frames.
    label = rng.choice(self._io._labels)
    shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
    frame = VoiceFrame(
        label=label,
        confidence=rng.uniform(0.6, 0.97),
        speaker_id=rng.choice(self._io._speakers),
        shift_magnitude=round(shift, 3),
        timestamp=timestamp,
    )
    tone = tone_event_from_voice_frame(
        frame_label=frame.label,
        frame_confidence=frame.confidence,
        shift_magnitude=frame.shift_magnitude,
        timestamp=frame.timestamp,
        elcor=elcor,
    )
    return [tone]
# cf_voice/events.py — AudioEvent models from the parallel classifier
#
# These are the outputs of cf_voice.context (not cf_voice.io).
# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
# from the same audio window, running in parallel.
#
# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.
from dataclasses import dataclass, field
from typing import Literal

EventType = Literal["queue", "speaker", "environ", "tone"]

# ── Queue state labels ──────────────────────────────────────────────────────
# Detected from YAMNet acoustic event classification
QUEUE_LABELS = Literal[
    "hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone"
]

# ── Speaker type labels ─────────────────────────────────────────────────────
# Detected from pyannote VAD + custom IVR-vs-human head
SPEAKER_LABELS = Literal[
    "ivr_synth", "human_single", "human_multi", "transfer", "no_speaker"
]

# ── Environmental labels ────────────────────────────────────────────────────
# Background shift is the primary AMD (answering machine detection) signal
ENVIRON_LABELS = Literal[
    "call_center", "music", "background_shift", "noise_floor_change", "quiet"
]

# ── Tone / affect labels ────────────────────────────────────────────────────
# From SER model (wav2vec2-based); feeds Elcor label generation
AFFECT_LABELS = Literal[
    "neutral", "warm", "frustrated", "dismissive", "apologetic",
    "urgent", "condescending", "scripted", "genuine", "confused",
    "tired", "optimistic",
]

# Generic subtext format (default, always on): "Tone: Frustrated"
# Elcor format (easter egg, elcor=True in request): "With barely concealed frustration:"
_ELCOR_MAP: dict[str, str] = {
    "neutral": "In a measured, neutral tone:",
    "warm": "Warmly:",
    "frustrated": "With barely concealed frustration:",
    "dismissive": "With polite dismissiveness:",
    "apologetic": "Apologetically:",
    "urgent": "With evident urgency:",
    "condescending": "With patronizing brightness:",
    "scripted": "Reading from a script:",
    "genuine": "With apparent sincerity:",
    "confused": "With evident confusion:",
    "tired": "With audible fatigue:",
    "optimistic": "With cautious optimism:",
}

# Maps mock VoiceFrame labels to AFFECT_LABELS values. Hoisted to module
# level so it is built once, not on every tone_event_from_voice_frame call.
_FRAME_LABEL_TO_AFFECT: dict[str, str] = {
    "Calm and focused": "neutral",
    "Warmly impatient": "frustrated",
    "Deflecting": "dismissive",
    "Genuinely curious": "genuine",
    "Politely dismissive": "dismissive",
    "Nervous but cooperative": "apologetic",
    "Frustrated but contained": "frustrated",
    "Enthusiastic": "warm",
    "Tired and compliant": "tired",
    "Guardedly optimistic": "optimistic",
    "Apologetically firm": "apologetic",
    "Confused but engaged": "confused",
}


@dataclass
class AudioEvent:
    """
    A single classified event from the parallel audio classifier.

    event_type determines how to interpret label and whether subtext is present.
    """
    timestamp: float
    event_type: EventType
    label: str
    confidence: float
    # Tone annotation — present on ToneEvent only.
    # Generic format (default): "Tone: Frustrated"
    # Elcor format (easter egg): "With barely concealed frustration:"
    subtext: str | None = None


@dataclass
class ToneEvent(AudioEvent):
    """
    Tone/affect classification event.

    The subtext field carries the human-readable annotation.
    Format is controlled by the caller (elcor flag in the classify request).
    """
    affect: str = "neutral"
    shift_magnitude: float = 0.0
    shift_direction: str = "stable"  # "warmer" | "colder" | "more_urgent" | "stable"
    prosody_flags: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        # Force event_type to "tone" regardless of what the caller passed.
        # Overriding a parent field with a default in a child dataclass breaks
        # MRO field ordering in Python, so we use __post_init__ instead.
        self.event_type = "tone"


def make_subtext(affect: str, elcor: bool) -> str:
    """Generate the subtext annotation for a tone event.

    Args:
        affect: An AFFECT_LABELS value (unknown values are tolerated).
        elcor: If True, return the Elcor speech-prefix wording; otherwise
            the generic "Tone: X" form.
    """
    if elcor:
        # Unknown affects fall back to a generic Elcor-style prefix.
        return _ELCOR_MAP.get(affect, f"With {affect} tone:")
    return f"Tone: {affect.replace('_', ' ').capitalize()}"


def tone_event_from_voice_frame(
    frame_label: str,
    frame_confidence: float,
    shift_magnitude: float,
    timestamp: float,
    elcor: bool = False,
) -> ToneEvent:
    """
    Convert a VoiceFrame label into a ToneEvent.

    Used in mock mode and as the bridge from VoiceFrame to AudioEvent.

    Args:
        frame_label: Human-readable VoiceFrame tone label; unrecognized
            labels map to "neutral".
        frame_confidence: Classifier confidence, copied onto the event.
        shift_magnitude: Tone-shift size; below 0.15 the direction is
            "stable" regardless of affect.
        timestamp: Stream-relative timestamp for the event.
        elcor: Subtext format flag (see make_subtext).

    Returns:
        A ToneEvent with affect, subtext, and shift_direction derived from
        the frame label and shift magnitude.
    """
    affect = _FRAME_LABEL_TO_AFFECT.get(frame_label, "neutral")

    # Direction is only derived from affect groups; affects outside all
    # three groups (e.g. apologetic, neutral) stay "stable" even with a
    # large shift magnitude.
    shift_dir = (
        "stable" if shift_magnitude < 0.15
        else "warmer" if affect in ("warm", "genuine", "optimistic")
        else "colder" if affect in ("dismissive", "condescending")
        else "more_urgent" if affect in ("frustrated", "urgent")
        else "stable"
    )

    return ToneEvent(
        timestamp=timestamp,
        event_type="tone",  # __post_init__ pins this anyway; explicit for clarity
        label=frame_label,
        confidence=frame_confidence,
        subtext=make_subtext(affect, elcor),
        affect=affect,
        shift_magnitude=shift_magnitude,
        shift_direction=shift_dir,
        prosody_flags=[],
    )