# cf_voice/dimensional.py — audeering dimensional emotion model
#
# BSL 1.1: real inference. Requires [inference] extras.
#
# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
# Outputs three continuous 0-1 scores:
#   valence:   negative (0) to positive (1)
#   arousal:   low energy (0) to high energy (1)
#   dominance: submissive (0) to dominant (1)
#
# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech.
# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP.
#
# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is
# downloaded — ~1.5GB, adds ~800MB GPU VRAM)
#
# HuggingFace model page:
#   https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
from __future__ import annotations

import asyncio
import logging
import os
from dataclasses import dataclass
from functools import partial

import numpy as np

logger = logging.getLogger(__name__)

_SAMPLE_RATE = 16_000
_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"


@dataclass
class DimensionalResult:
    """
    Output of the audeering dimensional emotion model.

    All scores are 0.0-1.0 continuous values:
        valence:   negative affect (0) to positive affect (1)
        arousal:   low energy / calm (0) to high energy / excited (1)
        dominance: submissive / uncertain (0) to dominant / assertive (1)

    Sarcasm signal: low arousal + higher valence = "calm-positive" profile.
    Combined with flat F0 (prosody.py) and text divergence (linnet#22) for
    the full multi-signal sarcasm heuristic.
    """

    # valence: 0.0 (negative) .. 1.0 (positive)
    valence: float
    # arousal: 0.0 (calm) .. 1.0 (excited)
    arousal: float
    # dominance: 0.0 (submissive) .. 1.0 (assertive)
    dominance: float

    def affect_quadrant(self) -> str:
        """
        Map VAD position to a descriptive quadrant label.

        These are reference labels for logging and debugging, not user-facing.
        The annotation layer (Elcor) handles user-facing interpretation.
        """
        v_high = self.valence >= 0.5
        a_high = self.arousal >= 0.5
        if v_high and a_high:
            return "enthusiastic"
        if v_high and not a_high:
            return "calm_positive"  # sarcasm candidate when paired with flat F0
        if not v_high and a_high:
            return "frustrated_urgent"
        return "sad_resigned"

    def calm_positive_score(self) -> float:
        """
        0-1 score indicating how strongly the VAD position matches the
        calm-positive sarcasm candidate profile (low arousal, higher valence).

        Used as one component of the combined sarcasm heuristic.
        """
        valence_pos = max(0.0, self.valence - 0.5) * 2.0  # how positive
        arousal_low = 1.0 - self.arousal  # how calm
        return (valence_pos * 0.5 + arousal_low * 0.5)


class DimensionalClassifier:
    """
    Async wrapper around the audeering wav2vec2 dimensional emotion model.

    The model runs in a thread pool executor to avoid blocking asyncio.
    Loaded once on first call and reused; the underlying wav2vec2 model
    lands on CUDA when available (same device as the SER model in
    classify.py).

    Usage
    -----
        clf = DimensionalClassifier.from_env()
        result = await clf.classify_async(audio_float32)
        print(result.valence, result.arousal, result.dominance)
    """

    def __init__(self) -> None:
        # Model/processor are loaded lazily on first inference call.
        self._model = None
        self._processor = None
        self._loaded = False

    def _ensure_loaded(self) -> None:
        """Load model and processor on first inference call (not at construction).

        Raises
        ------
        ImportError
            If transformers is not installed ([inference] extras missing).
        """
        if self._loaded:
            return
        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for dimensional emotion classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
        # NOTE(review): the audeering checkpoint ships a custom regression head
        # (the model card's reference code defines its own EmotionModel class);
        # loading it via Wav2Vec2ForSequenceClassification may re-initialize the
        # classifier weights — watch for "newly initialized" warnings at load
        # time and confirm against the model card.
        self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)
        try:
            import torch

            if torch.cuda.is_available():
                self._model = self._model.to(torch.device("cuda"))
                logger.info("Dimensional model on CUDA")
        except ImportError:
            pass
        self._model.eval()
        # BUG FIX: mark loaded only after everything above succeeded. The
        # original set this flag before loading, so a failed download left the
        # classifier permanently half-initialized (later calls skipped loading
        # and crashed on self._model being None) instead of retrying.
        self._loaded = True

    def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Synchronous inference. Always call via classify_async.

        The audeering model outputs continuous 0-1 scores from per-dimension
        regression heads (fine-tuned on MSP-Podcast), not softmax classes.

        Parameters
        ----------
        audio_float32 : np.ndarray
            Mono float32 waveform at 16 kHz (presumably normalized to [-1, 1]
            — TODO confirm against the capture pipeline).

        Raises
        ------
        ImportError
            If torch is not installed.
        """
        self._ensure_loaded()
        try:
            import torch
        except ImportError as exc:
            raise ImportError("torch is required for dimensional inference") from exc

        inputs = self._processor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        # Move inputs to wherever the model actually lives. The original
        # re-checked torch.cuda.is_available() here, which can disagree with
        # the device chosen at load time.
        device = next(self._model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits

        # BUG FIX: the audeering model card documents the output order as
        # [arousal, dominance, valence] (alphabetical), NOT
        # [valence, arousal, dominance] as the original indexing assumed.
        # The swap silently inverted the calm-positive sarcasm signal.
        scores = logits[0].cpu().float().numpy()
        arousal = float(np.clip(scores[0], 0.0, 1.0))
        dominance = float(np.clip(scores[1], 0.0, 1.0))
        valence = float(np.clip(scores[2], 0.0, 1.0))
        return DimensionalResult(
            valence=round(valence, 4),
            arousal=round(arousal, 4),
            dominance=round(dominance, 4),
        )

    async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Classify audio without blocking the event loop.

        Runs in a thread pool executor. Designed to be gathered alongside the
        SER and diarization coroutines in context._classify_real_async().
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._classify_sync, audio_float32)
        )

    @classmethod
    def from_env(cls) -> "DimensionalClassifier":
        """Construct from environment. Raises if CF_VOICE_DIMENSIONAL is not set.

        Raises
        ------
        EnvironmentError
            If CF_VOICE_DIMENSIONAL is unset or not "1". (Kept as
            EnvironmentError/OSError for backward compatibility with callers.)
        """
        if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
                "Add it to your .env file. The model requires ~800MB GPU VRAM."
            )
        return cls()