feat: AudioEvent models, classify_chunk() for per-chunk request-response path

- events.py: AudioEvent dataclass + ToneEvent with affect, shift_magnitude, shift_direction, prosody_flags; make_subtext() for generic/Elcor formats
- context.py: classify_chunk(audio_b64, timestamp, prior_frames, elcor) returns list[AudioEvent]; mock mode uses the MockVoiceIO RNG, real mode raises NotImplementedError
- ToneEvent.__post_init__ pins event_type='tone' (avoids MRO default-field ordering bug)
- Elcor mode: same classifier output, Elcor speech-prefix wording; all tiers
2026-04-06 16:53:10 -07:00 · 2026-04-06 16:53:10 -07:00 · 6e17da9e93
commit 6e17da9e93
parent 35fc0a088c
2 changed files with 197 additions and 1 deletions
--- a/cf_voice/context.py
+++ b/cf_voice/context.py
@ -14,7 +14,8 @@ from __future__ import annotations
 import os
 from typing import AsyncIterator

-from cf_voice.io import VoiceIO, make_io
+from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
+from cf_voice.io import MockVoiceIO, VoiceIO, make_io
 from cf_voice.models import VoiceFrame


@ -64,6 +65,51 @@ class ContextClassifier:
    async def stop(self) -> None:
        await self._io.stop()

+    def classify_chunk(
+        self,
+        audio_b64: str,
+        timestamp: float = 0.0,
+        prior_frames: int = 0,
+        elcor: bool = False,
+    ) -> list[AudioEvent]:
+        """
+        Classify a single audio chunk and return AudioEvents.
+
+        This is the request-response path used by the cf-orch endpoint.
+        The streaming path (async generator) is for continuous consumers.
+
+        Stub: audio_b64 is ignored; returns synthetic events from the mock IO.
+        Real: decode audio, run YAMNet + SER + pyannote, return events.
+
+        Args:
+            audio_b64: Audio payload for the chunk. Currently unused.
+            timestamp: Timestamp assigned to the synthetic frame and its event.
+            prior_frames: When greater than 0, a random shift magnitude in
+                [0.1, 0.7] (rounded to 3 places) is drawn; otherwise the
+                shift magnitude is 0.0.
+            elcor: Forwarded to tone_event_from_voice_frame(). True selects
+                the Mass Effect Elcor prefix style for the subtext; False
+                selects the generic tone annotation.
+
+        Returns:
+            A one-element list containing the synthetic tone event.
+
+        Raises:
+            NotImplementedError: if the underlying IO is not a MockVoiceIO.
+        """
+        if not isinstance(self._io, MockVoiceIO):
+            raise NotImplementedError(
+                "classify_chunk() requires mock mode. "
+                "Real audio inference is not yet implemented."
+            )
+        # Generate a synthetic VoiceFrame to derive events from.
+        # The draw order (label, shift, confidence, speaker) is kept fixed so
+        # the sequence consumed from the mock RNG does not change.
+        rng = self._io._rng
+        label = rng.choice(self._io._labels)
+        shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
+        frame = VoiceFrame(
+            label=label,
+            confidence=rng.uniform(0.6, 0.97),
+            speaker_id=rng.choice(self._io._speakers),
+            shift_magnitude=round(shift, 3),
+            timestamp=timestamp,
+        )
+        tone = tone_event_from_voice_frame(
+            frame_label=frame.label,
+            frame_confidence=frame.confidence,
+            shift_magnitude=frame.shift_magnitude,
+            timestamp=frame.timestamp,
+            elcor=elcor,
+        )
+        return [tone]
+
+
    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.
--- a/cf_voice/events.py
+++ b/cf_voice/events.py
@ -0,0 +1,150 @@
+# cf_voice/events.py — AudioEvent models from the parallel classifier
+#
+# These are the outputs of cf_voice.context (not cf_voice.io).
+# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
+# from the same audio window, running in parallel.
+#
+# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Literal
+
+# Discriminator stored in AudioEvent.event_type.
+EventType = Literal[
+    "queue",
+    "speaker",
+    "environ",
+    "tone",
+]
+
+# ── Queue state labels ────────────────────────────────────────────────────────
+# Produced by YAMNet acoustic event classification.
+QUEUE_LABELS = Literal[
+    "hold_music",
+    "silence",
+    "ringback",
+    "busy",
+    "dead_air",
+    "dtmf_tone",
+]
+
+# ── Speaker type labels ───────────────────────────────────────────────────────
+# Produced by pyannote VAD plus a custom IVR-vs-human head.
+SPEAKER_LABELS = Literal[
+    "ivr_synth",
+    "human_single",
+    "human_multi",
+    "transfer",
+    "no_speaker",
+]
+
+# ── Environmental labels ──────────────────────────────────────────────────────
+# "background_shift" is the primary AMD (answering machine detection) signal.
+ENVIRON_LABELS = Literal[
+    "call_center",
+    "music",
+    "background_shift",
+    "noise_floor_change",
+    "quiet",
+]
+
+# ── Tone / affect labels ──────────────────────────────────────────────────────
+# Produced by the SER model (wav2vec2-based); these feed Elcor label generation.
+AFFECT_LABELS = Literal[
+    "neutral",
+    "warm",
+    "frustrated",
+    "dismissive",
+    "apologetic",
+    "urgent",
+    "condescending",
+    "scripted",
+    "genuine",
+    "confused",
+    "tired",
+    "optimistic",
+]
+
+# Affect label -> Elcor speech prefix, one entry per AFFECT_LABELS member.
+# Generic subtext (the default) looks like "Tone: Frustrated"; the Elcor form
+# (easter egg, elcor=True in the request) looks like
+# "With barely concealed frustration:".
+_ELCOR_MAP: dict[str, str] = {
+    "neutral": "In a measured, neutral tone:",
+    "warm": "Warmly:",
+    "frustrated": "With barely concealed frustration:",
+    "dismissive": "With polite dismissiveness:",
+    "apologetic": "Apologetically:",
+    "urgent": "With evident urgency:",
+    "condescending": "With patronizing brightness:",
+    "scripted": "Reading from a script:",
+    "genuine": "With apparent sincerity:",
+    "confused": "With evident confusion:",
+    "tired": "With audible fatigue:",
+    "optimistic": "With cautious optimism:",
+}
+
+
+@dataclass
+class AudioEvent:
+    """
+    A single classified event from the parallel audio classifier.
+
+    event_type determines how to interpret label and whether subtext is present.
+    This is the base record; ToneEvent extends it with affect-specific fields.
+    """
+    # Position of the event in the audio stream.
+    # NOTE(review): units are not visible here — presumably seconds; confirm.
+    timestamp: float
+    # One of the EventType literals: "queue", "speaker", "environ", "tone".
+    event_type: EventType
+    # Classifier label; how to read it depends on event_type.
+    label: str
+    # Classifier confidence — presumably in [0.0, 1.0]; verify against producers.
+    confidence: float
+    # Tone annotation — present on ToneEvent only.
+    # Generic format (default): "Tone: Frustrated"
+    # Elcor format (easter egg):  "With barely concealed frustration:"
+    subtext: str | None = None
+
+
+@dataclass
+class ToneEvent(AudioEvent):
+    """
+    Tone/affect classification event.
+
+    The subtext field carries the human-readable annotation.
+    Format is controlled by the caller (elcor flag in the classify request).
+
+    event_type is still a required constructor argument (inherited from
+    AudioEvent), but whatever value is passed is replaced with "tone" once
+    the instance has been initialised.
+    """
+    # Affect label for this event; defaults to "neutral".
+    affect: str = "neutral"
+    # Magnitude of the tone shift; defaults to 0.0.
+    # NOTE(review): scale is not visible here — presumably 0–1; confirm.
+    shift_magnitude: float = 0.0
+    shift_direction: str = "stable"   # "warmer" | "colder" | "more_urgent" | "stable"
+    # Prosody markers; default_factory gives each instance its own list.
+    prosody_flags: list[str] = field(default_factory=list)
+
+    def __post_init__(self) -> None:
+        # Force event_type to "tone" regardless of what the caller passed.
+        # Redeclaring event_type in this class with a default would leave it
+        # in its inherited position, ahead of the default-less label and
+        # confidence fields, and the dataclass decorator rejects a
+        # non-default field that follows a default one. Pinning the value
+        # here avoids that.
+        self.event_type = "tone"
+
+
+def make_subtext(affect: str, elcor: bool) -> str:
+    """
+    Generate the subtext annotation for a tone event.
+
+    Args:
+        affect: Affect label, e.g. "frustrated". Underscores are rendered
+            as spaces in any text generated from the label.
+        elcor: When true, return the Elcor speech prefix from _ELCOR_MAP,
+            or a generated "With <affect> tone:" prefix for labels the map
+            does not cover. When false, return the generic
+            "Tone: <Affect>" form.
+
+    Returns:
+        The annotation string.
+    """
+    readable = affect.replace("_", " ")
+    if elcor:
+        # The fallback uses the de-underscored label so an unmapped affect
+        # reads the same way in both formats (no raw "more_urgent" leaking
+        # into user-facing text).
+        return _ELCOR_MAP.get(affect, f"With {readable} tone:")
+    return f"Tone: {readable.capitalize()}"
+
+
+def tone_event_from_voice_frame(
+    frame_label: str,
+    frame_confidence: float,
+    shift_magnitude: float,
+    timestamp: float,
+    elcor: bool = False,
+) -> ToneEvent:
+    """
+    Build a ToneEvent from the fields of a VoiceFrame.
+
+    Serves mock mode and acts as the bridge from VoiceFrame to AudioEvent.
+
+    The frame label is translated to an affect (unknown labels become
+    "neutral"). The shift direction is "stable" whenever shift_magnitude is
+    below 0.15; above that it is derived from the affect, with affects that
+    have no associated direction also reported as "stable".
+    """
+    # VoiceFrame label -> affect label.
+    affect_by_label: dict[str, str] = {
+        "Calm and focused": "neutral",
+        "Warmly impatient": "frustrated",
+        "Deflecting": "dismissive",
+        "Genuinely curious": "genuine",
+        "Politely dismissive": "dismissive",
+        "Nervous but cooperative": "apologetic",
+        "Frustrated but contained": "frustrated",
+        "Enthusiastic": "warm",
+        "Tired and compliant": "tired",
+        "Guardedly optimistic": "optimistic",
+        "Apologetically firm": "apologetic",
+        "Confused but engaged": "confused",
+    }
+    # Affect -> direction of a noticeable shift; anything absent is "stable".
+    direction_by_affect: dict[str, str] = {
+        "warm": "warmer",
+        "genuine": "warmer",
+        "optimistic": "warmer",
+        "dismissive": "colder",
+        "condescending": "colder",
+        "frustrated": "more_urgent",
+        "urgent": "more_urgent",
+    }
+
+    affect = affect_by_label.get(frame_label, "neutral")
+
+    if shift_magnitude < 0.15:
+        # Too small a shift to report a direction.
+        direction = "stable"
+    else:
+        direction = direction_by_affect.get(affect, "stable")
+
+    annotation = make_subtext(affect, elcor)
+    return ToneEvent(
+        timestamp=timestamp,
+        event_type="tone",
+        label=frame_label,
+        confidence=frame_confidence,
+        subtext=annotation,
+        affect=affect,
+        shift_magnitude=shift_magnitude,
+        shift_direction=direction,
+        prosody_flags=[],
+    )