New modules shipped (from Linnet integration):

- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
203 lines
8.3 KiB
Python
203 lines
8.3 KiB
Python
# cf_voice/events.py — AudioEvent models from the parallel classifier
#
# These are the outputs of cf_voice.context (not cf_voice.io).
# cf_voice.io produces transcripts; cf_voice.context produces AudioEvents
# from the same audio window, running in parallel.
#
# Consumers (Osprey, Linnet, Peregrine) receive both combined in a VoiceFrame.
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Literal
|
|
|
|
# Discriminator carried by every AudioEvent in its event_type field.
EventType = Literal[
    "queue",
    "speaker",
    "environ",
    "tone",
    "transcript",
    "scene",
    "accent",
]
|
|
|
|
# ── Queue state labels ────────────────────────────────────────────────────────
|
|
# Detected from AST acoustic event classification
|
|
# Allowed label values for events describing the call-queue state.
QUEUE_LABELS = Literal[
    "hold_music",
    "silence",
    "ringback",
    "busy",
    "dead_air",
    "dtmf_tone",
]
|
|
|
|
# ── Speaker type labels ───────────────────────────────────────────────────────
|
|
# Detected from pyannote VAD + custom IVR-vs-human head
|
|
# Allowed label values for events describing who (or what) is speaking.
SPEAKER_LABELS = Literal[
    "ivr_synth",
    "human_single",
    "human_multi",
    "transfer",
    "no_speaker",
    "background_voices",
]
|
|
|
|
# ── Environmental labels ──────────────────────────────────────────────────────
|
|
# Background shift is the primary AMD (answering machine detection) signal.
|
|
# Telephony labels + general-purpose acoustic scene labels.
|
|
# Allowed label values for environmental events, grouped by category.
ENVIRON_LABELS = Literal[
    # Telephony
    "call_center",
    "music",
    "background_shift",
    "noise_floor_change",
    "quiet",
    # Nature
    "birdsong",
    "wind",
    "rain",
    "water",
    # Urban
    "traffic",
    "crowd_chatter",
    "street_signal",
    "construction",
    # Indoor
    "hvac",
    "keyboard_typing",
    "restaurant",
]
|
|
|
|
# ── Acoustic scene labels ─────────────────────────────────────────────────────
|
|
# Broad scene category — primary input to privacy risk scoring.
|
|
# Allowed label values for the broad acoustic scene category.
SCENE_LABELS = Literal[
    "indoor_quiet",
    "indoor_crowd",
    "outdoor_urban",
    "outdoor_nature",
    "vehicle",
    "public_transit",
]
|
|
|
|
# ── Accent / language labels ──────────────────────────────────────────────────
|
|
# Regional accent of primary speaker. Gated by CF_VOICE_ACCENT=1.
|
|
# Allowed label values for accent / language events.
ACCENT_LABELS = Literal[
    # English regional variants
    "en_gb",
    "en_us",
    "en_au",
    "en_ca",
    "en_in",
    # Other languages, plus a catch-all
    "fr",
    "es",
    "de",
    "zh",
    "ja",
    "other",
]
|
|
|
|
# ── Tone / affect labels ──────────────────────────────────────────────────────
|
|
# From SER model (wav2vec2-based); feeds Elcor label generation
|
|
# Allowed affect values for tone events.
AFFECT_LABELS = Literal[
    "neutral",
    "warm",
    "frustrated",
    "dismissive",
    "apologetic",
    "urgent",
    "condescending",
    "scripted",
    "genuine",
    "confused",
    "tired",
    "optimistic",
]
|
|
|
|
# Generic subtext format (default, always on): "Tone: Frustrated"
|
|
# Elcor format (easter egg, elcor=True in request): "With barely concealed frustration:"
|
|
# Affect label → Elcor-style prefix used when the caller asks for elcor output.
# Affects missing from this table are handled by the lookup's caller.
_ELCOR_MAP: dict[str, str] = {
    "neutral":       "In a measured, neutral tone:",
    "warm":          "Warmly:",
    "frustrated":    "With barely concealed frustration:",
    "dismissive":    "With polite dismissiveness:",
    "apologetic":    "Apologetically:",
    "urgent":        "With evident urgency:",
    "condescending": "With patronizing brightness:",
    "scripted":      "Reading from a script:",
    "genuine":       "With apparent sincerity:",
    "confused":      "With evident confusion:",
    "tired":         "With audible fatigue:",
    "optimistic":    "With cautious optimism:",
}
|
|
|
|
|
|
@dataclass
class AudioEvent:
    """
    One classified event emitted by the parallel audio classifier.

    How ``label`` should be read, and whether ``subtext`` is populated,
    depends on ``event_type``.

    Attributes:
        timestamp: Time associated with the event.
        event_type: Category of the event (see ``EventType``).
        label: Classifier label; its meaning depends on ``event_type``.
        confidence: Classifier confidence for ``label``.
        speaker_id: Ephemeral, local diarization label for the speaker this
            event belongs to.
        subtext: Tone annotation; only present on tone events.
    """

    timestamp: float
    event_type: EventType
    label: str
    confidence: float
    speaker_id: str = "speaker_a"

    # Tone annotation — only set on ToneEvent instances.
    #   Generic format (default):  "Tone: Frustrated"
    #   Elcor format (easter egg): "With barely concealed frustration:"
    subtext: str | None = None
|
|
|
|
|
|
@dataclass
class ToneEvent(AudioEvent):
    """
    Event carrying a tone/affect classification.

    Serves as the SSE wire type for Linnet's annotation stream and for the
    <LinnetWidget /> embed protocol. Field names are stable as of cf-voice
    v0.1.0 — the wire format spec lives in cf-core#40.

    ``subtext`` holds the human-readable annotation; the caller chooses its
    format through the elcor flag on the classify request.

    Dimensional emotion (Navigation v0.2.x — audeering model):
        ``valence`` / ``arousal`` / ``dominance`` stay None unless the
        dimensional classifier is enabled (CF_VOICE_DIMENSIONAL == "1").

    Prosodic signals (Navigation v0.2.x — openSMILE):
        ``sarcasm_risk`` / ``flat_f0_score`` stay None when the extractor is
        not enabled.
    """

    # Core tone classification
    affect: str = "neutral"
    shift_magnitude: float = 0.0
    shift_direction: str = "stable"  # "warmer" | "colder" | "more_urgent" | "stable"
    prosody_flags: list[str] = field(default_factory=list)
    session_id: str = ""  # caller-assigned; correlates events to a session

    # Dimensional emotion scores (audeering, optional)
    valence: float | None = None
    arousal: float | None = None
    dominance: float | None = None

    # Prosodic signals (openSMILE, optional)
    sarcasm_risk: float | None = None
    flat_f0_score: float | None = None

    # Trajectory signals (rolling buffer — activates after BASELINE_MIN frames)
    arousal_delta: float | None = None
    valence_delta: float | None = None
    trend: str | None = None  # "stable"|"escalating"|"suppressed"|…

    # Coherence signals (SER vs VAD cross-comparison)
    coherence_score: float | None = None
    suppression_flag: bool | None = None
    reframe_type: str | None = None  # "none"|"genuine"|"surface"
    affect_divergence: float | None = None

    def __post_init__(self) -> None:
        """Pin ``event_type`` to "tone", ignoring whatever the caller passed."""
        # The inherited event_type field cannot simply be re-declared here
        # with a default: that breaks dataclass field ordering, so the value
        # is overwritten after __init__ instead.
        self.event_type = "tone"
|
|
|
|
|
|
def make_subtext(affect: str, elcor: bool) -> str:
    """
    Build the subtext annotation for a tone event.

    Args:
        affect: Affect label to describe.
        elcor: When true, return the Elcor-style prefix for ``affect``;
            otherwise return the generic "Tone: ..." form.

    Returns:
        The annotation string. In Elcor mode an affect missing from
        ``_ELCOR_MAP`` falls back to ``"With <affect> tone:"``.
    """
    if not elcor:
        # Generic form: underscores become spaces, first letter capitalised.
        readable = affect.replace("_", " ")
        return "Tone: " + readable.capitalize()

    fallback = f"With {affect} tone:"
    return _ELCOR_MAP.get(affect, fallback)
|
|
|
|
|
|
# VoiceFrame label → affect label. Built once at import time instead of on
# every call; labels missing from the table resolve to "neutral" at lookup.
_LABEL_TO_AFFECT: dict[str, str] = {
    "Calm and focused": "neutral",
    "Warmly impatient": "frustrated",
    "Deflecting": "dismissive",
    "Genuinely curious": "genuine",
    "Politely dismissive": "dismissive",
    "Nervous but cooperative": "apologetic",
    "Frustrated but contained": "frustrated",
    "Enthusiastic": "warm",
    "Tired and compliant": "tired",
    "Guardedly optimistic": "optimistic",
    "Apologetically firm": "apologetic",
    "Confused but engaged": "confused",
}

# Shifts below this magnitude are reported as "stable" whatever the affect.
_SHIFT_STABLE_THRESHOLD = 0.15

# Affect groups that pick the shift direction once the shift is large enough.
_WARMER_AFFECTS = frozenset({"warm", "genuine", "optimistic"})
_COLDER_AFFECTS = frozenset({"dismissive", "condescending"})
_URGENT_AFFECTS = frozenset({"frustrated", "urgent"})


def tone_event_from_voice_frame(
    frame_label: str,
    frame_confidence: float,
    shift_magnitude: float,
    timestamp: float,
    elcor: bool = False,
) -> ToneEvent:
    """
    Convert a VoiceFrame label into a ToneEvent.

    Used in mock mode and as the bridge from VoiceFrame to AudioEvent.

    Args:
        frame_label: VoiceFrame label; stored verbatim as the event label and
            mapped to an affect (unknown labels map to "neutral").
        frame_confidence: Stored as the event confidence.
        shift_magnitude: Stored on the event; values below 0.15 always yield
            a "stable" shift direction.
        timestamp: Stored as the event timestamp.
        elcor: Selects the Elcor subtext format instead of the generic one.

    Returns:
        A ToneEvent with affect, subtext and shift direction filled in; all
        other optional fields keep their defaults.
    """
    affect = _LABEL_TO_AFFECT.get(frame_label, "neutral")

    # Direction is derived from the affect, but only for a noticeable shift.
    if shift_magnitude < _SHIFT_STABLE_THRESHOLD:
        shift_dir = "stable"
    elif affect in _WARMER_AFFECTS:
        shift_dir = "warmer"
    elif affect in _COLDER_AFFECTS:
        shift_dir = "colder"
    elif affect in _URGENT_AFFECTS:
        shift_dir = "more_urgent"
    else:
        shift_dir = "stable"

    return ToneEvent(
        timestamp=timestamp,
        event_type="tone",
        label=frame_label,
        confidence=frame_confidence,
        subtext=make_subtext(affect, elcor),
        affect=affect,
        shift_magnitude=shift_magnitude,
        shift_direction=shift_dir,
        prosody_flags=[],
    )
|