cf-voice/cf_voice/dimensional.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

190 lines
6.7 KiB
Python

# cf_voice/dimensional.py — audeering dimensional emotion model
#
# BSL 1.1: real inference. Requires [inference] extras.
#
# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
# Outputs three continuous 0-1 scores:
# valence: negative (0) to positive (1)
# arousal: low energy (0) to high energy (1)
# dominance: submissive (0) to dominant (1)
#
# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech.
# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP.
#
# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is
# downloaded — ~1.5GB, adds ~800MB GPU VRAM)
#
# HuggingFace model page:
# https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass
from functools import partial
import numpy as np
logger = logging.getLogger(__name__)
_SAMPLE_RATE = 16_000
_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
@dataclass
class DimensionalResult:
    """
    Output of the audeering dimensional emotion model.

    All scores are continuous values in the range 0.0-1.0:

    valence:   negative affect (0) to positive affect (1)
    arousal:   low energy / calm (0) to high energy / excited (1)
    dominance: submissive / uncertain (0) to dominant / assertive (1)

    Sarcasm signal: low arousal + higher valence = "calm-positive" profile.
    Combined with flat F0 (prosody.py) and text divergence (linnet#22) for
    the full multi-signal sarcasm heuristic.
    """
    valence: float
    arousal: float
    dominance: float

    def affect_quadrant(self) -> str:
        """
        Map the VAD position onto one of four descriptive quadrant labels.

        These are reference labels for logging and debugging, not user-facing.
        The annotation layer (Elcor) handles user-facing interpretation.
        """
        positive = self.valence >= 0.5
        energetic = self.arousal >= 0.5
        if positive:
            # "calm_positive" is the sarcasm candidate when paired with flat F0.
            return "enthusiastic" if energetic else "calm_positive"
        return "frustrated_urgent" if energetic else "sad_resigned"

    def calm_positive_score(self) -> float:
        """
        0-1 score for how strongly this VAD position matches the
        calm-positive sarcasm candidate profile (low arousal, higher valence).

        Used as one component of the combined sarcasm heuristic.
        """
        how_positive = max(0.0, self.valence - 0.5) * 2.0  # positivity above midpoint
        how_calm = 1.0 - self.arousal                      # inverse of energy
        return how_positive * 0.5 + how_calm * 0.5
class DimensionalClassifier:
    """
    Async wrapper around the audeering wav2vec2 dimensional emotion model.

    The model runs in a thread pool executor to avoid blocking asyncio.
    Loaded once on first call and reused; the underlying wav2vec2 model
    lands on CUDA when available (same device as the SER model in classify.py).

    Usage
    -----
    clf = DimensionalClassifier.from_env()
    result = await clf.classify_async(audio_float32)
    print(result.valence, result.arousal, result.dominance)
    """

    def __init__(self) -> None:
        # Model and processor are lazy-loaded on first inference call,
        # not at construction (see _ensure_loaded).
        self._model = None
        self._processor = None
        self._loaded = False

    def _ensure_loaded(self) -> None:
        """Load model and processor on first inference call (not at construction).

        Raises
        ------
        ImportError
            If transformers is not installed (requires the [inference] extras).
        """
        if self._loaded:
            return
        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for dimensional emotion classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
        self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)
        try:
            import torch
            if torch.cuda.is_available():
                self._model = self._model.to(torch.device("cuda"))
                logger.info("Dimensional model on CUDA")
        except ImportError:
            pass
        self._model.eval()
        # BUGFIX: only mark loaded after everything above succeeded. The
        # previous code set the flag *before* loading, so a failed import or
        # download left the instance permanently broken — subsequent calls
        # early-returned with _model/_processor still None and crashed with
        # AttributeError instead of surfacing the real load error.
        self._loaded = True

    def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Synchronous inference. Always call via classify_async.

        Parameters
        ----------
        audio_float32:
            Mono float32 waveform, assumed 16 kHz (``_SAMPLE_RATE``).

        Returns
        -------
        DimensionalResult with valence/arousal/dominance each clipped to
        0-1 and rounded to 4 decimal places.

        Raises
        ------
        ImportError
            If torch is not installed.
        """
        self._ensure_loaded()
        try:
            import torch
        except ImportError as exc:
            raise ImportError("torch is required for dimensional inference") from exc
        inputs = self._processor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        # Follow the model's actual device rather than re-probing CUDA
        # availability: keeps inputs and weights on the same device even if
        # the .to("cuda") move during loading was skipped.
        device = next(self._model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self._model(**inputs).logits
        # Model outputs a single (1, 3) tensor. Per the audeering model card
        # the dimension order is alphabetical: [arousal, dominance, valence]
        # — NOT [valence, arousal, dominance] as a naive VAD reading suggests.
        scores = logits[0].cpu().float().numpy()
        arousal = float(np.clip(scores[0], 0.0, 1.0))
        dominance = float(np.clip(scores[1], 0.0, 1.0))
        valence = float(np.clip(scores[2], 0.0, 1.0))
        return DimensionalResult(
            valence=round(valence, 4),
            arousal=round(arousal, 4),
            dominance=round(dominance, 4),
        )

    async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Classify audio without blocking the event loop.

        Runs in a thread pool executor. Designed to be gathered alongside
        the SER and diarization coroutines in context._classify_real_async().
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._classify_sync, audio_float32)
        )

    @classmethod
    def from_env(cls) -> "DimensionalClassifier":
        """Construct from environment.

        Raises
        ------
        EnvironmentError
            If CF_VOICE_DIMENSIONAL is not set to "1" (feature gate; the
            model download is ~1.5GB and needs ~800MB GPU VRAM).
        """
        if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
                "Add it to your .env file. The model requires ~800MB GPU VRAM."
            )
        return cls()