# cf-voice/cf_voice/events.py

# cf_voice/events.py — AudioEvent models from the parallel classifier
#
# These are the outputs of cf_voice.context (not cf_voice.io).
# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
# from the same audio window, running in parallel.
#
# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal

# Discriminator values accepted by AudioEvent.event_type.
EventType = Literal[
    "queue",
    "speaker",
    "environ",
    "tone",
    "transcript",
    "scene",
    "accent",
]

# ── Queue state labels ────────────────────────────────────────────────────────
# Queue states, as detected by the AST acoustic event classification.
QUEUE_LABELS = Literal[
    "hold_music",
    "silence",
    "ringback",
    "busy",
    "dead_air",
    "dtmf_tone",
]

# ── Speaker type labels ───────────────────────────────────────────────────────
# Speaker categories, detected from pyannote VAD plus a custom
# IVR-vs-human head.
SPEAKER_LABELS = Literal[
    "ivr_synth",
    "human_single",
    "human_multi",
    "transfer",
    "no_speaker",
    "background_voices",
]

# ── Environmental labels ──────────────────────────────────────────────────────
# Combines telephony labels with general-purpose acoustic scene labels.
# "background_shift" is the primary AMD (answering machine detection) signal.
ENVIRON_LABELS = Literal[
    # Telephony
    "call_center",
    "music",
    "background_shift",
    "noise_floor_change",
    "quiet",
    # Nature
    "birdsong",
    "wind",
    "rain",
    "water",
    # Urban
    "traffic",
    "crowd_chatter",
    "street_signal",
    "construction",
    # Indoor
    "hvac",
    "keyboard_typing",
    "restaurant",
]

# ── Acoustic scene labels ─────────────────────────────────────────────────────
# Coarse scene category; this is the primary input to privacy risk scoring.
SCENE_LABELS = Literal[
    "indoor_quiet",
    "indoor_crowd",
    "outdoor_urban",
    "outdoor_nature",
    "vehicle",
    "public_transit",
]

# ── Accent / language labels ──────────────────────────────────────────────────
# Regional accent of the primary speaker. Only produced when the feature is
# switched on (gated by CF_VOICE_ACCENT=1).
ACCENT_LABELS = Literal[
    # English variants
    "en_gb",
    "en_us",
    "en_au",
    "en_ca",
    "en_in",
    # Other languages, plus a catch-all
    "fr",
    "es",
    "de",
    "zh",
    "ja",
    "other",
]

# ── Tone / affect labels ──────────────────────────────────────────────────────
# Affect classes from the SER model (wav2vec2-based). These also feed
# Elcor label generation.
AFFECT_LABELS = Literal[
    "neutral",
    "warm",
    "frustrated",
    "dismissive",
    "apologetic",
    "urgent",
    "condescending",
    "scripted",
    "genuine",
    "confused",
    "tired",
    "optimistic",
]

# Two subtext formats exist for tone events:
#   * Generic (default, always on):              "Tone: Frustrated"
#   * Elcor (easter egg, elcor=True in request): "With barely concealed frustration:"
# This table supplies the Elcor phrasing, keyed by affect label.
_ELCOR_MAP: dict[str, str] = {
    "neutral": "In a measured, neutral tone:",
    "warm": "Warmly:",
    "frustrated": "With barely concealed frustration:",
    "dismissive": "With polite dismissiveness:",
    "apologetic": "Apologetically:",
    "urgent": "With evident urgency:",
    "condescending": "With patronizing brightness:",
    "scripted": "Reading from a script:",
    "genuine": "With apparent sincerity:",
    "confused": "With evident confusion:",
    "tired": "With audible fatigue:",
    "optimistic": "With cautious optimism:",
}


@dataclass
class AudioEvent:
    """
    One classified event emitted by the parallel audio classifier.

    Attributes:
        timestamp: Time of the event as a float (unit/epoch is set by the
            producer and is not defined in this module).
        event_type: Kind of event; decides how ``label`` should be read and
            whether ``subtext`` is populated.
        label: The classifier's label for this event.
        confidence: Classifier confidence for ``label``.
        speaker_id: Ephemeral, local diarization label of the speaker this
            event belongs to. Defaults to ``"speaker_a"``.
        subtext: Human-readable tone annotation; ``None`` unless set.
    """

    timestamp: float
    event_type: EventType
    label: str
    confidence: float
    speaker_id: str = "speaker_a"
    # Only tone events (ToneEvent) carry a subtext. Two formats exist:
    #   generic (default):   "Tone: Frustrated"
    #   Elcor (easter egg):  "With barely concealed frustration:"
    subtext: str | None = None


@dataclass
class ToneEvent(AudioEvent):
    """
    AudioEvent specialisation carrying a tone/affect classification.

    Wire format:
      This is the SSE wire type for Linnet's annotation stream and for the
      <LinnetWidget /> embed protocol. Field names are stable as of cf-voice
      v0.1.0 — the wire format spec lives in cf-core#40.

    Subtext:
      The inherited ``subtext`` field holds the human-readable annotation.
      Its format is chosen by the caller (elcor flag in the classify request).

    Optional signal groups (all default to None):
      * Dimensional emotion (Navigation v0.2.x — audeering model):
        valence / arousal / dominance are None when the dimensional
        classifier is not enabled (CF_VOICE_DIMENSIONAL != "1").
      * Prosodic signals (Navigation v0.2.x — openSMILE):
        sarcasm_risk / flat_f0_score are None when the extractor is not
        enabled.

    ``event_type`` is always "tone" after construction, whatever value the
    caller supplied.
    """

    # ── Core affect ──────────────────────────────────────────────────────────
    affect: str = "neutral"
    shift_magnitude: float = 0.0
    # One of: "warmer" | "colder" | "more_urgent" | "stable"
    shift_direction: str = "stable"
    prosody_flags: list[str] = field(default_factory=list)
    # Caller-assigned; correlates events to a session.
    session_id: str = ""

    # ── Dimensional emotion scores (audeering, optional) ─────────────────────
    valence: float | None = None
    arousal: float | None = None
    dominance: float | None = None

    # ── Prosodic signals (openSMILE, optional) ───────────────────────────────
    sarcasm_risk: float | None = None
    flat_f0_score: float | None = None

    # ── Trajectory signals ───────────────────────────────────────────────────
    # Rolling buffer — activates after BASELINE_MIN frames.
    arousal_delta: float | None = None
    valence_delta: float | None = None
    # "stable"|"escalating"|"suppressed"|…
    trend: str | None = None

    # ── Coherence signals (SER vs VAD cross-comparison) ──────────────────────
    coherence_score: float | None = None
    suppression_flag: bool | None = None
    # "none"|"genuine"|"surface"
    reframe_type: str | None = None
    affect_divergence: float | None = None

    def __post_init__(self) -> None:
        """Pin ``event_type`` to "tone", overriding the constructor argument."""
        # Why not redeclare event_type here with a default of "tone"?
        # A redeclared dataclass field keeps its original position, so a
        # defaulted event_type would sit ahead of the parent's non-default
        # label/confidence fields, which dataclasses rejects. Assigning after
        # init avoids touching the field order.
        self.event_type = "tone"


def make_subtext(affect: str, elcor: bool) -> str:
    """
    Build the human-readable subtext annotation for a tone event.

    Args:
        affect: Affect label, e.g. "frustrated".
        elcor: When True, use the Elcor phrasing from ``_ELCOR_MAP``;
            otherwise use the generic "Tone: <Affect>" form.

    Returns:
        The annotation string. In generic mode underscores become spaces and
        only the first character is upper-cased. In Elcor mode an affect that
        has no table entry yields "With <affect> tone:".
    """
    if not elcor:
        words = affect.replace("_", " ")
        return f"Tone: {words.capitalize()}"

    # Affects without a dedicated Elcor phrase get a generic one.
    fallback = f"With {affect} tone:"
    return _ELCOR_MAP.get(affect, fallback)


def tone_event_from_voice_frame(
    frame_label: str,
    frame_confidence: float,
    shift_magnitude: float,
    timestamp: float,
    elcor: bool = False,
) -> ToneEvent:
    """
    Build a ToneEvent out of a VoiceFrame label.

    Serves mock mode and acts as the bridge from VoiceFrame to AudioEvent.

    Args:
        frame_label: VoiceFrame label; also stored as the event's ``label``.
        frame_confidence: Stored as the event's ``confidence``.
        shift_magnitude: Size of the tone shift; below 0.15 the direction is
            always "stable".
        timestamp: Stored as the event's ``timestamp``.
        elcor: Selects the Elcor subtext format instead of the generic one.

    Returns:
        A ToneEvent whose affect is derived from ``frame_label`` (unknown
        labels map to "neutral") and whose shift direction is derived from
        that affect.
    """
    # VoiceFrame label -> affect label.
    affect_by_label: dict[str, str] = {
        "Calm and focused": "neutral",
        "Warmly impatient": "frustrated",
        "Deflecting": "dismissive",
        "Genuinely curious": "genuine",
        "Politely dismissive": "dismissive",
        "Nervous but cooperative": "apologetic",
        "Frustrated but contained": "frustrated",
        "Enthusiastic": "warm",
        "Tired and compliant": "tired",
        "Guardedly optimistic": "optimistic",
        "Apologetically firm": "apologetic",
        "Confused but engaged": "confused",
    }
    affect = affect_by_label.get(frame_label, "neutral")

    # Small shifts are reported as stable; otherwise the affect decides.
    if shift_magnitude < 0.15:
        direction = "stable"
    elif affect in ("warm", "genuine", "optimistic"):
        direction = "warmer"
    elif affect in ("dismissive", "condescending"):
        direction = "colder"
    elif affect in ("frustrated", "urgent"):
        direction = "more_urgent"
    else:
        direction = "stable"

    annotation = make_subtext(affect, elcor)
    return ToneEvent(
        timestamp=timestamp,
        event_type="tone",
        label=frame_label,
        confidence=frame_confidence,
        subtext=annotation,
        affect=affect,
        shift_magnitude=shift_magnitude,
        shift_direction=direction,
        prosody_flags=[],
    )