New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
190 lines
6.7 KiB
Python
190 lines
6.7 KiB
Python
# cf_voice/dimensional.py — audeering dimensional emotion model
|
|
#
|
|
# BSL 1.1: real inference. Requires [inference] extras.
|
|
#
|
|
# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
|
|
# Outputs three continuous 0-1 scores:
|
|
# valence: negative (0) to positive (1)
|
|
# arousal: low energy (0) to high energy (1)
|
|
# dominance: submissive (0) to dominant (1)
|
|
#
|
|
# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech.
|
|
# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP.
|
|
#
|
|
# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is
|
|
# downloaded — ~1.5GB, adds ~800MB GPU VRAM)
|
|
#
|
|
# HuggingFace model page:
|
|
# https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass
|
|
from functools import partial
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SAMPLE_RATE = 16_000
|
|
_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
|
|
|
|
|
|
@dataclass
class DimensionalResult:
    """
    Output of the audeering dimensional emotion model.

    All scores are continuous values in [0.0, 1.0]:
        valence:   negative affect (0) to positive affect (1)
        arousal:   low energy / calm (0) to high energy / excited (1)
        dominance: submissive / uncertain (0) to dominant / assertive (1)

    Sarcasm signal: low arousal plus higher valence gives the
    "calm-positive" profile. Combined with flat F0 (prosody.py) and text
    divergence (linnet#22) for the full multi-signal sarcasm heuristic.
    """

    valence: float
    arousal: float
    dominance: float

    def affect_quadrant(self) -> str:
        """
        Map the VAD position to a descriptive quadrant label.

        These are reference labels for logging and debugging, not user-facing.
        The annotation layer (Elcor) handles user-facing interpretation.
        """
        # Dispatch on (valence high?, arousal high?) with 0.5 as the midline.
        quadrant_labels = {
            (True, True): "enthusiastic",
            (True, False): "calm_positive",  # sarcasm candidate when paired with flat F0
            (False, True): "frustrated_urgent",
            (False, False): "sad_resigned",
        }
        return quadrant_labels[(self.valence >= 0.5, self.arousal >= 0.5)]

    def calm_positive_score(self) -> float:
        """
        0-1 score indicating how strongly the VAD position matches the
        calm-positive sarcasm candidate profile (low arousal, higher valence).

        Used as one component of the combined sarcasm heuristic.
        """
        how_positive = max(self.valence - 0.5, 0.0) * 2.0  # only above-midline valence counts
        how_calm = 1.0 - self.arousal
        # Equal-weight blend of the two components.
        return how_positive * 0.5 + how_calm * 0.5
|
|
|
|
|
|
class DimensionalClassifier:
    """
    Async wrapper around the audeering wav2vec2 dimensional emotion model.

    The model runs in a thread pool executor to avoid blocking asyncio.
    Loaded once on the first successful call and reused; the underlying
    wav2vec2 model lands on CUDA when available (same device as the SER
    model in classify.py).

    Usage
    -----
        clf = DimensionalClassifier.from_env()
        result = await clf.classify_async(audio_float32)
        print(result.valence, result.arousal, result.dominance)
    """

    def __init__(self) -> None:
        # Populated lazily by _ensure_loaded() on the first inference call.
        self._model = None
        self._processor = None
        self._loaded = False

    def _ensure_loaded(self) -> None:
        """
        Load model and processor on first inference call (not at construction).

        Raises ImportError when transformers is missing. The loaded flag is
        set only after the whole load succeeds, so a failed attempt is
        retried on the next call. (The original set ``self._loaded = True``
        *before* loading, which made any load failure permanent: subsequent
        calls skipped loading and crashed on ``self._processor`` being None.)
        """
        if self._loaded:
            return

        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for dimensional emotion classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
        # NOTE(review): the audeering checkpoint ships a custom regression
        # head; loading through Wav2Vec2ForSequenceClassification may warn
        # about (or re-initialize) classifier weights — confirm against the
        # model card's recommended loading code.
        self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)

        try:
            import torch

            if torch.cuda.is_available():
                self._model = self._model.to(torch.device("cuda"))
                logger.info("Dimensional model on CUDA")
        except ImportError:
            # torch is an optional accelerator here; CPU inference still works.
            pass

        self._model.eval()
        self._loaded = True

    def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Synchronous inference. Always call via classify_async.

        The audeering model was fine-tuned on MSP-Podcast with per-dimension
        regression heads (not softmax classification); outputs are continuous
        scores, clipped here to 0-1.

        Raises ImportError if torch is unavailable.
        """
        self._ensure_loaded()

        try:
            import torch
        except ImportError as exc:
            raise ImportError("torch is required for dimensional inference") from exc

        inputs = self._processor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )

        # Move inputs to wherever the model actually lives rather than
        # re-checking cuda availability — keeps input/model devices in sync
        # even if the model stayed on CPU.
        device = next(self._model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits

        # Single (1, 3) tensor. Per the audeering model card the head emits
        # [arousal, dominance, valence] in that order — the previous code
        # read it as [valence, arousal, dominance], swapping valence/arousal.
        scores = logits[0].cpu().float().numpy()
        arousal = float(np.clip(scores[0], 0.0, 1.0))
        dominance = float(np.clip(scores[1], 0.0, 1.0))
        valence = float(np.clip(scores[2], 0.0, 1.0))

        return DimensionalResult(
            valence=round(valence, 4),
            arousal=round(arousal, 4),
            dominance=round(dominance, 4),
        )

    async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Classify audio without blocking the event loop.

        Runs in a thread pool executor. Designed to be gathered alongside
        the SER and diarization coroutines in context._classify_real_async().
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._classify_sync, audio_float32)
        )

    @classmethod
    def from_env(cls) -> "DimensionalClassifier":
        """
        Construct from environment.

        Raises EnvironmentError (an OSError alias) unless
        CF_VOICE_DIMENSIONAL=1 is set — the model download is ~1.5GB and
        uses ~800MB GPU VRAM, so it stays opt-in.
        """
        if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
                "Add it to your .env file. The model requires ~800MB GPU VRAM."
            )
        return cls()
|