# cf_voice/classify.py — tone / affect classifier # # BSL 1.1: real inference. Requires [inference] extras. # Stub behaviour: raises NotImplementedError if inference deps not installed. # # Pipeline: wav2vec2 SER (speech emotion recognition) + librosa prosody # features → AFFECT_LABELS defined in cf_voice.events. from __future__ import annotations import asyncio import logging import os from dataclasses import dataclass, field from functools import partial import numpy as np logger = logging.getLogger(__name__) _SAMPLE_RATE = 16_000 # Confidence floor — results below this are discarded by the caller _DEFAULT_THRESHOLD = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55")) # wav2vec2 SER model from HuggingFace # ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition # Outputs 7 classes: angry, disgust, fear, happy, neutral, sadness, surprise _SER_MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" # ── Affect label mapping ────────────────────────────────────────────────────── # Maps (emotion, prosody_profile) → affect label from cf_voice.events.AFFECT_LABELS # Prosody profile is a tuple of flags present from _extract_prosody_flags(). 
_EMOTION_BASE: dict[str, str] = {
    "angry": "frustrated",
    "disgust": "dismissive",
    "fear": "apologetic",
    "happy": "warm",
    "neutral": "neutral",
    "sadness": "tired",
    "surprise": "confused",
}

# Prosody-driven overrides: (base_affect, flag) → override affect
_PROSODY_OVERRIDES: dict[tuple[str, str], str] = {
    ("neutral", "fast_rate"): "genuine",
    ("neutral", "flat_pitch"): "scripted",
    ("neutral", "low_energy"): "tired",
    ("frustrated", "rising"): "urgent",
    ("warm", "rising"): "genuine",
    ("tired", "rising"): "optimistic",
    ("dismissive", "flat_pitch"): "condescending",
}

# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect)
_AFFECT_TO_LABEL: dict[str, str] = {
    "neutral": "Calm and focused",
    "warm": "Enthusiastic",
    "frustrated": "Frustrated but contained",
    "dismissive": "Politely dismissive",
    "apologetic": "Nervous but cooperative",
    "urgent": "Warmly impatient",
    "condescending": "Politely dismissive",
    "scripted": "Calm and focused",  # scripted reads as neutral to the observer
    "genuine": "Genuinely curious",
    "confused": "Confused but engaged",
    "tired": "Tired and compliant",
    "optimistic": "Guardedly optimistic",
}


@dataclass
class ToneResult:
    """Result of one tone/affect classification over a single audio window."""

    label: str  # human-readable VoiceFrame label
    affect: str  # AFFECT_LABELS key
    confidence: float  # softmax score of the winning SER emotion, in [0, 1]
    prosody_flags: list[str] = field(default_factory=list)


class ToneClassifier:
    """
    Tone/affect classifier: wav2vec2 SER + librosa prosody.

    Loads the model lazily on first call to avoid import-time GPU allocation.
    Safe for concurrent classify() calls — inference itself is stateless
    per-call, and the one-time lazy load is serialized behind a lock; session
    state lives in the caller (ContextClassifier).

    Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
    rather than hf_pipeline to avoid torchcodec audio backend initialization.
    torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x
    systems. Calling the model directly bypasses the pipeline's audio backend
    selection entirely since we already have float32 at 16kHz.
    """

    def __init__(self, threshold: float | None = None) -> None:
        """
        Args:
            threshold: confidence floor. ``None`` (the default) resolves to
                the env-derived module default at call time instead of at
                class-definition time.
        """
        from threading import Lock  # stdlib; local import keeps module import light

        self._threshold = _DEFAULT_THRESHOLD if threshold is None else threshold
        self._feature_extractor = None  # lazy-loaded by _load_pipeline()
        self._model = None  # lazy-loaded by _load_pipeline()
        self._device: str = "cpu"
        self._load_lock = Lock()  # serializes the one-time lazy model load

    @classmethod
    def from_env(cls) -> "ToneClassifier":
        """Build a classifier with the threshold taken from the environment."""
        threshold = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
        return cls(threshold=threshold)

    def _load_pipeline(self) -> None:
        """
        Load the feature extractor + SER model once, lazily.

        Double-checked locking: classify_async() runs classify() on a thread
        pool, so two first calls can race; without the lock both would load
        a multi-GB model.

        Raises:
            ImportError: if the [inference] extras (transformers/torch) are
                not installed.
        """
        if self._model is not None:  # fast path — already loaded
            return
        with self._load_lock:
            if self._model is not None:  # another thread finished the load
                return
            # torch is guarded together with transformers so a missing torch
            # also surfaces the actionable install hint.
            try:
                import torch
                from transformers import (
                    AutoFeatureExtractor,
                    AutoModelForAudioClassification,
                )
            except ImportError as exc:
                raise ImportError(
                    "transformers is required for tone classification. "
                    "Install with: pip install cf-voice[inference]"
                ) from exc

            if _cuda_available():
                self._device = "cuda:0"
                # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000.
                # Only supported on CUDA — CPU must stay float32.
                torch_dtype = torch.float16
            else:
                self._device = "cpu"
                torch_dtype = torch.float32

            logger.info(
                "Loading SER model %s on device=%s dtype=%s",
                _SER_MODEL_ID,
                self._device,
                torch_dtype,
            )
            self._feature_extractor = AutoFeatureExtractor.from_pretrained(
                _SER_MODEL_ID
            )
            model = AutoModelForAudioClassification.from_pretrained(
                _SER_MODEL_ID,
                torch_dtype=torch_dtype,
            ).to(self._device)
            # Inference mode — disables dropout; batch norm uses running stats.
            model.eval()
            # Publish last so the unlocked fast-path check above only ever
            # observes a fully-initialized model.
            self._model = model

    def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
        """
        Classify tone/affect from a float32 16kHz mono audio window.

        Args:
            audio_float32: mono PCM samples, dtype float32 at 16kHz.
            transcript: optional text of the window, used as a weak signal
                for ambiguous cases (e.g. words like "unfortunately" bias
                toward apologetic even on a neutral voice).

        Returns:
            ToneResult with resolved affect, display label, SER confidence,
            and the prosody flags extracted from the raw audio.

        Raises:
            TypeError: if the audio is not float32.
            ImportError: if inference deps are not installed.
        """
        # Load first so a missing dependency surfaces as the friendly
        # ImportError from _load_pipeline(), not a bare torch import error.
        self._load_pipeline()
        import torch

        # Explicit validation, not assert — asserts vanish under `python -O`.
        if audio_float32.dtype != np.float32:
            raise TypeError("audio must be float32")

        # Run SER — call feature extractor + model directly to bypass the
        # hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency).
        inputs = self._feature_extractor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        if self._model.dtype == torch.float16:
            # Cast only floating-point tensors: integer inputs such as an
            # attention_mask must keep their original dtype.
            inputs = {
                k: v.to(torch.float16) if v.is_floating_point() else v
                for k, v in inputs.items()
            }
        with torch.no_grad():
            logits = self._model(**inputs).logits
        # Softmax in float32 for numerically stable scores under fp16.
        probs = torch.softmax(logits.float(), dim=-1)[0]

        best_idx = int(torch.argmax(probs))
        emotion = self._model.config.id2label[best_idx].lower()
        confidence = float(probs[best_idx])

        # Extract prosody features from raw audio
        prosody_flags = _extract_prosody_flags(audio_float32)

        # Resolve affect from base emotion + prosody; first matching flag wins.
        affect = _EMOTION_BASE.get(emotion, "neutral")
        for flag in prosody_flags:
            override = _PROSODY_OVERRIDES.get((affect, flag))
            if override:
                affect = override
                break

        # Weak transcript signals
        affect = _apply_transcript_hints(affect, transcript)

        label = _AFFECT_TO_LABEL.get(affect, "Calm and focused")
        return ToneResult(
            label=label,
            affect=affect,
            confidence=confidence,
            prosody_flags=prosody_flags,
        )

    async def classify_async(
        self, audio_float32: np.ndarray, transcript: str = ""
    ) -> ToneResult:
        """classify() on the default executor, without blocking the event loop."""
        loop = asyncio.get_running_loop()
        fn = partial(self.classify, audio_float32, transcript)
        return await loop.run_in_executor(None, fn)


# ── Prosody helpers ───────────────────────────────────────────────────────────


def _extract_prosody_flags(audio: np.ndarray) -> list[str]:
    """
    Extract lightweight prosody flags from a float32 16kHz mono window.

    Returns a list of string flags consumed by _PROSODY_OVERRIDES.
    Best-effort: returns [] when librosa is unavailable, and skips pitch
    flags when YIN fails.
    """
    try:
        import librosa
    except ImportError:
        return []  # prosody is optional; SER alone still works

    flags: list[str] = []

    # Energy (RMS)
    rms = float(np.sqrt(np.mean(audio ** 2)))
    if rms < 0.02:
        flags.append("low_energy")
    elif rms > 0.15:
        flags.append("high_energy")

    # Speech rate approximation via zero-crossing rate
    zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio)))
    if zcr > 0.12:
        flags.append("fast_rate")
    elif zcr < 0.04:
        flags.append("slow_rate")

    # Pitch contour via YIN
    try:
        f0 = librosa.yin(
            audio,
            fmin=librosa.note_to_hz("C2"),
            fmax=librosa.note_to_hz("C7"),
            sr=_SAMPLE_RATE,
        )
        voiced = f0[f0 > 0]
        if len(voiced) > 5:
            # Rising: last quarter higher than first quarter
            q = len(voiced) // 4
            if q > 0 and np.mean(voiced[-q:]) > np.mean(voiced[:q]) * 1.15:
                flags.append("rising")
            # Flat: variance less than 15Hz
            if np.std(voiced) < 15:
                flags.append("flat_pitch")
    except Exception:
        pass  # pitch extraction is best-effort

    return flags


def _apply_transcript_hints(affect: str, transcript: str) -> str:
    """
    Apply weak keyword signals from transcript text to adjust affect.

    Only overrides when affect is already ambiguous (neutral/tired);
    stronger acoustic affects are never overruled by text alone.
    """
    if not transcript or affect not in ("neutral", "tired"):
        return affect
    t = transcript.lower()
    apologetic_words = {"sorry", "apologize", "unfortunately", "afraid", "regret"}
    urgent_words = {"urgent", "immediately", "asap", "right now", "critical"}
    dismissive_words = {"policy", "unable to", "cannot", "not possible", "outside"}
    # Substring match; first category wins (apologetic > urgent > dismissive).
    if any(w in t for w in apologetic_words):
        return "apologetic"
    if any(w in t for w in urgent_words):
        return "urgent"
    if any(w in t for w in dismissive_words):
        return "dismissive"
    return affect


def _cuda_available() -> bool:
    """True when torch is importable and reports a usable CUDA device."""
    try:
        import torch

        return torch.cuda.is_available()
    except ImportError:
        return False