cf-voice/cf_voice/events.py
pyr0ball 335d51f02f feat: lock ToneEvent SSE wire format (cf-core#40)
- AudioEvent: add speaker_id field (was on VoiceFrame only; needed on all events)
- ToneEvent: add session_id field for session correlation across embedded consumers
- README: full wire format documentation — JSON shape, field reference table,
  SSE envelope, Elcor mode subtext table, module license map
- ToneEvent docstring references cf-core#40 as the wire format spec

Closes cf-core#40
2026-04-06 17:51:09 -07:00

157 lines
6.2 KiB
Python

# cf_voice/events.py — AudioEvent models from the parallel classifier
#
# These are the outputs of cf_voice.context (not cf_voice.io).
# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
# from the same audio window, running in parallel.
#
# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Literal
# Discriminator stored in AudioEvent.event_type; one value per label family.
EventType = Literal[
    "queue",
    "speaker",
    "environ",
    "tone",
]

# ── Queue state labels ────────────────────────────────────────────────────────
# Source: YAMNet acoustic event classification.
QUEUE_LABELS = Literal[
    "hold_music",
    "silence",
    "ringback",
    "busy",
    "dead_air",
    "dtmf_tone",
]

# ── Speaker type labels ───────────────────────────────────────────────────────
# Source: pyannote VAD plus a custom IVR-vs-human head.
SPEAKER_LABELS = Literal[
    "ivr_synth",
    "human_single",
    "human_multi",
    "transfer",
    "no_speaker",
]

# ── Environmental labels ──────────────────────────────────────────────────────
# "background_shift" is the primary answering-machine-detection (AMD) signal.
ENVIRON_LABELS = Literal[
    "call_center",
    "music",
    "background_shift",
    "noise_floor_change",
    "quiet",
]

# ── Tone / affect labels ──────────────────────────────────────────────────────
# Source: wav2vec2-based SER model; these labels feed Elcor label generation.
AFFECT_LABELS = Literal[
    "neutral",
    "warm",
    "frustrated",
    "dismissive",
    "apologetic",
    "urgent",
    "condescending",
    "scripted",
    "genuine",
    "confused",
    "tired",
    "optimistic",
]
# Two subtext renderings exist for a tone event:
#   generic (default, always on):            "Tone: Frustrated"
#   Elcor (easter egg, elcor=True in request): "With barely concealed frustration:"
# This table holds the Elcor phrasing, keyed by affect label.
_ELCOR_MAP: dict[str, str] = {
    "neutral": "In a measured, neutral tone:",
    "warm": "Warmly:",
    "frustrated": "With barely concealed frustration:",
    "dismissive": "With polite dismissiveness:",
    "apologetic": "Apologetically:",
    "urgent": "With evident urgency:",
    "condescending": "With patronizing brightness:",
    "scripted": "Reading from a script:",
    "genuine": "With apparent sincerity:",
    "confused": "With evident confusion:",
    "tired": "With audible fatigue:",
    "optimistic": "With cautious optimism:",
}
@dataclass
class AudioEvent:
    """
    One classification result emitted by the parallel audio classifier.

    ``event_type`` tells the consumer how to read ``label`` and whether a
    ``subtext`` annotation accompanies the event. ``speaker_id`` holds the
    ephemeral local diarization label of the speaker the event belongs to.
    """

    timestamp: float
    event_type: EventType
    label: str
    confidence: float
    speaker_id: str = "speaker_a"
    # Tone annotation; only ToneEvent fills this in, otherwise it stays None.
    #   generic (default):  "Tone: Frustrated"
    #   Elcor (easter egg): "With barely concealed frustration:"
    subtext: str | None = None
@dataclass
class ToneEvent(AudioEvent):
    """
    AudioEvent specialisation carrying a tone/affect classification.

    Serves as the SSE wire type for Linnet's annotation stream and for the
    <LinnetWidget /> embed protocol. Field names are stable as of cf-voice
    v0.1.0; the wire format spec lives in cf-core#40.

    The human-readable annotation travels in ``subtext``. Which format it
    uses is the caller's choice (the elcor flag in the classify request).
    """

    affect: str = "neutral"
    shift_magnitude: float = 0.0
    # One of "warmer" | "colder" | "more_urgent" | "stable".
    shift_direction: str = "stable"
    prosody_flags: list[str] = field(default_factory=list)
    # Caller-assigned; correlates events to a session.
    session_id: str = ""

    def __post_init__(self) -> None:
        """Pin ``event_type`` to "tone", overriding whatever was passed in."""
        # Redeclaring event_type here with a default is not an option: the
        # field would keep its inherited position ahead of the non-default
        # label/confidence fields, which dataclasses rejects. Overwriting it
        # after __init__ achieves the same result.
        self.event_type = "tone"
def make_subtext(affect: str, elcor: bool) -> str:
    """
    Build the human-readable subtext annotation for a tone event.

    Args:
        affect: Affect label to render.
        elcor: When true, use the Elcor phrasing from ``_ELCOR_MAP``;
            otherwise use the generic "Tone: ..." format.

    Returns:
        The annotation string for the requested format.
    """
    if not elcor:
        # Generic format: underscores become spaces, first letter upper-cased.
        readable = affect.replace("_", " ")
        return f"Tone: {readable.capitalize()}"

    # Affects without a dedicated Elcor phrase get a generic Elcor-style one.
    fallback = f"With {affect} tone:"
    return _ELCOR_MAP.get(affect, fallback)
# VoiceFrame label → affect label. Built once at import time rather than on
# every call. Labels missing from this table resolve to "neutral".
_FRAME_LABEL_TO_AFFECT: dict[str, str] = {
    "Calm and focused": "neutral",
    "Warmly impatient": "frustrated",
    "Deflecting": "dismissive",
    "Genuinely curious": "genuine",
    "Politely dismissive": "dismissive",
    "Nervous but cooperative": "apologetic",
    "Frustrated but contained": "frustrated",
    "Enthusiastic": "warm",
    "Tired and compliant": "tired",
    "Guardedly optimistic": "optimistic",
    "Apologetically firm": "apologetic",
    "Confused but engaged": "confused",
}

# Affect → shift direction, used once the shift reaches the threshold below.
# Affects not listed here report "stable".
_AFFECT_TO_SHIFT_DIRECTION: dict[str, str] = {
    "warm": "warmer",
    "genuine": "warmer",
    "optimistic": "warmer",
    "dismissive": "colder",
    "condescending": "colder",
    "frustrated": "more_urgent",
    "urgent": "more_urgent",
}

# Shifts smaller than this are reported as "stable" whatever the affect is.
_STABLE_SHIFT_THRESHOLD = 0.15


def tone_event_from_voice_frame(
    frame_label: str,
    frame_confidence: float,
    shift_magnitude: float,
    timestamp: float,
    elcor: bool = False,
    *,
    speaker_id: str = "speaker_a",
    session_id: str = "",
) -> ToneEvent:
    """
    Convert a VoiceFrame label into a ToneEvent.

    Used in mock mode and as the bridge from VoiceFrame to AudioEvent.

    Args:
        frame_label: VoiceFrame label; copied to the event's ``label`` and
            mapped to an affect. Unrecognised labels map to "neutral".
        frame_confidence: Copied to the event's ``confidence``.
        shift_magnitude: Copied to the event. Values below 0.15 always give
            a "stable" shift direction.
        timestamp: Copied to the event's ``timestamp``.
        elcor: Selects the Elcor subtext format instead of the generic one.
        speaker_id: Diarization label to stamp on the event. Keyword-only;
            defaults to the same value as the dataclass field.
        session_id: Caller-assigned session correlation id to stamp on the
            event. Keyword-only; defaults to the same value as the field.

    Returns:
        A ToneEvent with affect, shift direction and subtext filled in and
        an empty ``prosody_flags`` list.
    """
    affect = _FRAME_LABEL_TO_AFFECT.get(frame_label, "neutral")

    if shift_magnitude < _STABLE_SHIFT_THRESHOLD:
        shift_dir = "stable"
    else:
        shift_dir = _AFFECT_TO_SHIFT_DIRECTION.get(affect, "stable")

    return ToneEvent(
        timestamp=timestamp,
        event_type="tone",
        label=frame_label,
        confidence=frame_confidence,
        speaker_id=speaker_id,
        subtext=make_subtext(affect, elcor),
        affect=affect,
        shift_magnitude=shift_magnitude,
        shift_direction=shift_dir,
        prosody_flags=[],
        session_id=session_id,
    )