cf-voice/cf_voice/dimensional.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

190 lines
6.7 KiB
Python

# cf_voice/dimensional.py — audeering dimensional emotion model
#
# BSL 1.1: real inference. Requires [inference] extras.
#
# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
# Outputs three continuous 0-1 scores:
# valence: negative (0) to positive (1)
# arousal: low energy (0) to high energy (1)
# dominance: submissive (0) to dominant (1)
#
# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech.
# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP.
#
# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is
# downloaded — ~1.5GB, adds ~800MB GPU VRAM)
#
# HuggingFace model page:
# https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass
from functools import partial
import numpy as np
logger = logging.getLogger(__name__)
_SAMPLE_RATE = 16_000
_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
@dataclass
class DimensionalResult:
    """
    Output of the audeering dimensional emotion model.

    All scores are continuous values in the range 0.0-1.0:

    valence:   negative affect (0) to positive affect (1)
    arousal:   low energy / calm (0) to high energy / excited (1)
    dominance: submissive / uncertain (0) to dominant / assertive (1)

    Sarcasm signal: low arousal + higher valence = "calm-positive" profile.
    Combined with flat F0 (prosody.py) and text divergence (linnet#22) for
    the full multi-signal sarcasm heuristic.
    """
    valence: float
    arousal: float
    dominance: float

    def affect_quadrant(self) -> str:
        """
        Map the VAD position onto one of four descriptive quadrant labels.

        These are reference labels for logging and debugging, not user-facing.
        The annotation layer (Elcor) handles user-facing interpretation.
        """
        positive = self.valence >= 0.5
        energetic = self.arousal >= 0.5
        if positive:
            # "calm_positive" is the sarcasm candidate when paired with flat F0.
            return "enthusiastic" if energetic else "calm_positive"
        return "frustrated_urgent" if energetic else "sad_resigned"

    def calm_positive_score(self) -> float:
        """
        0-1 score for how strongly this VAD position matches the
        calm-positive sarcasm candidate profile (low arousal, higher valence).

        Used as one component of the combined sarcasm heuristic.
        """
        how_positive = max(0.0, self.valence - 0.5) * 2.0  # positivity above midpoint
        how_calm = 1.0 - self.arousal                      # inverse of energy
        return how_positive * 0.5 + how_calm * 0.5
class DimensionalClassifier:
    """
    Async wrapper around the audeering wav2vec2 dimensional emotion model.

    The model runs in a thread pool executor to avoid blocking asyncio.
    Loaded once on first call and reused; the underlying wav2vec2 model
    lands on CUDA when available (same device as the SER model in classify.py).

    Usage
    -----
    clf = DimensionalClassifier.from_env()
    result = await clf.classify_async(audio_float32)
    print(result.valence, result.arousal, result.dominance)
    """

    def __init__(self) -> None:
        # Model and processor are lazy-loaded on first inference call,
        # not at construction (see _ensure_loaded).
        self._model = None
        self._processor = None
        self._loaded = False

    def _ensure_loaded(self) -> None:
        """Load model and processor on first inference call (not at construction).

        Raises
        ------
        ImportError
            If transformers is not installed (requires the [inference] extras).
        """
        if self._loaded:
            return
        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for dimensional emotion classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
        self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)
        try:
            import torch
            if torch.cuda.is_available():
                self._model = self._model.to(torch.device("cuda"))
                logger.info("Dimensional model on CUDA")
        except ImportError:
            pass
        self._model.eval()
        # BUGFIX: only mark loaded after everything above succeeded. The
        # previous code set the flag *before* loading, so a failed import or
        # download left the instance permanently broken — subsequent calls
        # early-returned with _model/_processor still None and crashed with
        # AttributeError instead of surfacing the real load error.
        self._loaded = True

    def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Synchronous inference. Always call via classify_async.

        Parameters
        ----------
        audio_float32:
            Mono float32 waveform, assumed 16 kHz (``_SAMPLE_RATE``).

        Returns
        -------
        DimensionalResult with valence/arousal/dominance each clipped to
        0-1 and rounded to 4 decimal places.

        Raises
        ------
        ImportError
            If torch is not installed.
        """
        self._ensure_loaded()
        try:
            import torch
        except ImportError as exc:
            raise ImportError("torch is required for dimensional inference") from exc
        inputs = self._processor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        # Follow the model's actual device rather than re-probing CUDA
        # availability: keeps inputs and weights on the same device even if
        # the .to("cuda") move during loading was skipped.
        device = next(self._model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self._model(**inputs).logits
        # Model outputs a single (1, 3) tensor. Per the audeering model card
        # the dimension order is alphabetical: [arousal, dominance, valence]
        # — NOT [valence, arousal, dominance] as a naive VAD reading suggests.
        scores = logits[0].cpu().float().numpy()
        arousal = float(np.clip(scores[0], 0.0, 1.0))
        dominance = float(np.clip(scores[1], 0.0, 1.0))
        valence = float(np.clip(scores[2], 0.0, 1.0))
        return DimensionalResult(
            valence=round(valence, 4),
            arousal=round(arousal, 4),
            dominance=round(dominance, 4),
        )

    async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Classify audio without blocking the event loop.

        Runs in a thread pool executor. Designed to be gathered alongside
        the SER and diarization coroutines in context._classify_real_async().
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._classify_sync, audio_float32)
        )

    @classmethod
    def from_env(cls) -> "DimensionalClassifier":
        """Construct from environment.

        Raises
        ------
        EnvironmentError
            If CF_VOICE_DIMENSIONAL is not set to "1" (feature gate; the
            model download is ~1.5GB and needs ~800MB GPU VRAM).
        """
        if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
                "Add it to your .env file. The model requires ~800MB GPU VRAM."
            )
        return cls()