# cf_voice/classify.py — tone / affect classifier
#
# BSL 1.1: real inference. Requires [inference] extras.
# Stub behaviour: raises NotImplementedError if inference deps not installed.
# NOTE(review): within this module a missing `transformers` surfaces as an
# ImportError from ToneClassifier._load_pipeline(), and a missing `librosa`
# only disables prosody flags — confirm where the NotImplementedError stub lives.
#
# Pipeline: wav2vec2 SER (speech emotion recognition) + librosa prosody
# features → AFFECT_LABELS defined in cf_voice.events.
from __future__ import annotations

import asyncio
import logging
import os
import re
import threading
from dataclasses import dataclass, field
from functools import partial

import numpy as np

logger = logging.getLogger(__name__)

_SAMPLE_RATE = 16_000

_THRESHOLD_ENV_VAR = "CF_VOICE_CONFIDENCE_THRESHOLD"
_THRESHOLD_FALLBACK = "0.55"


def _threshold_from_env() -> float:
    """Return the confidence floor configured in the environment.

    Reads CF_VOICE_CONFIDENCE_THRESHOLD and falls back to 0.55 when unset.

    Raises:
        ValueError: if the variable is set to something float() cannot parse.
    """
    return float(os.environ.get(_THRESHOLD_ENV_VAR, _THRESHOLD_FALLBACK))


# Confidence floor — results below this are discarded by the caller
_DEFAULT_THRESHOLD = _threshold_from_env()

# wav2vec2 SER model from HuggingFace
# ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
# Outputs 7 classes: angry, disgust, fear, happy, neutral, sadness, surprise
_SER_MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"

# ── Affect label mapping ──────────────────────────────────────────────────────
# Maps (emotion, prosody_profile) → affect label from cf_voice.events.AFFECT_LABELS
# Prosody profile is the list of flags returned by _extract_prosody_flags().

_EMOTION_BASE: dict[str, str] = {
    "angry": "frustrated",
    "disgust": "dismissive",
    "fear": "apologetic",
    "happy": "warm",
    "neutral": "neutral",
    "sadness": "tired",
    "surprise": "confused",
}

# Prosody-driven overrides: (base_affect, flag) → override affect
_PROSODY_OVERRIDES: dict[tuple[str, str], str] = {
    ("neutral", "fast_rate"): "genuine",
    ("neutral", "flat_pitch"): "scripted",
    ("neutral", "low_energy"): "tired",
    ("frustrated", "rising"): "urgent",
    ("warm", "rising"): "genuine",
    ("tired", "rising"): "optimistic",
    ("dismissive", "flat_pitch"): "condescending",
}

# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect)
_AFFECT_TO_LABEL: dict[str, str] = {
    "neutral": "Calm and focused",
    "warm": "Enthusiastic",
    "frustrated": "Frustrated but contained",
    "dismissive": "Politely dismissive",
    "apologetic": "Nervous but cooperative",
    "urgent": "Warmly impatient",
    "condescending": "Politely dismissive",
    "scripted": "Calm and focused",  # scripted reads as neutral to the observer
    "genuine": "Genuinely curious",
    "confused": "Confused but engaged",
    "tired": "Tired and compliant",
    "optimistic": "Guardedly optimistic",
}

# Prosody thresholds used by _extract_prosody_flags()
_LOW_ENERGY_RMS = 0.02
_HIGH_ENERGY_RMS = 0.15
_FAST_RATE_ZCR = 0.12
_SLOW_RATE_ZCR = 0.04
_RISING_PITCH_RATIO = 1.15
_FLAT_PITCH_STD_HZ = 15
_MIN_PITCH_FRAMES = 5


def _compile_keywords(*keywords: str) -> re.Pattern[str]:
    """Compile keywords into one pattern that matches at the start of a word.

    Anchoring only the leading edge keeps inflections matching
    ("apologized", "regrettably") while rejecting keywords buried inside
    unrelated words ("insurgent", "unafraid").
    """
    alternatives = "|".join(re.escape(keyword) for keyword in keywords)
    return re.compile(r"\b(?:" + alternatives + ")")


# Transcript keyword hints, checked in priority order (first match wins).
_TRANSCRIPT_HINTS: tuple[tuple[str, re.Pattern[str]], ...] = (
    (
        "apologetic",
        _compile_keywords("sorry", "apologize", "unfortunately", "afraid", "regret"),
    ),
    (
        "urgent",
        _compile_keywords("urgent", "immediately", "asap", "right now", "critical"),
    ),
    (
        "dismissive",
        _compile_keywords("policy", "unable to", "cannot", "not possible", "outside"),
    ),
)


@dataclass
class ToneResult:
    """Outcome of one ToneClassifier.classify() call."""

    label: str  # human-readable VoiceFrame label
    affect: str  # AFFECT_LABELS key
    confidence: float  # score of the winning SER class
    prosody_flags: list[str] = field(default_factory=list)


class ToneClassifier:
    """
    Tone/affect classifier: wav2vec2 SER + librosa prosody.

    Loads the model lazily on first call to avoid import-time GPU allocation.
    The lazy load is serialised by a lock so concurrent first calls (e.g. via
    classify_async(), which runs classify() on executor threads) load the
    model only once. classify() itself stores no per-call state on the
    instance; session state lives in the caller (ContextClassifier).

    The configured threshold is stored but not applied here — classify()
    returns every result and leaves low-confidence filtering to the caller.
    """

    # Class-level so it exists even for instances built without __init__.
    _load_lock = threading.Lock()

    def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
        self._threshold = threshold
        self._pipeline = None  # lazy-loaded

    @classmethod
    def from_env(cls) -> "ToneClassifier":
        """Build a classifier whose threshold is re-read from the environment."""
        return cls(threshold=_threshold_from_env())

    def _load_pipeline(self) -> None:
        """Load the SER pipeline on first use; later calls are no-ops.

        Raises:
            ImportError: if `transformers` is not installed.
        """
        if self._pipeline is not None:
            return
        with self._load_lock:
            # Re-check: another thread may have finished loading while we waited.
            if self._pipeline is not None:
                return
            try:
                from transformers import pipeline as hf_pipeline
            except ImportError as exc:
                raise ImportError(
                    "transformers is required for tone classification. "
                    "Install with: pip install cf-voice[inference]"
                ) from exc

            device = 0 if _cuda_available() else -1
            logger.info("Loading SER model %s on device %s", _SER_MODEL_ID, device)
            self._pipeline = hf_pipeline(
                "audio-classification",
                model=_SER_MODEL_ID,
                device=device,
            )

    def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
        """
        Classify tone/affect from a float32 16kHz mono audio window.

        transcript is used as a weak signal for ambiguous cases (e.g. words
        like "unfortunately" bias toward apologetic even on a neutral voice).

        Returns:
            ToneResult carrying the resolved affect, its human-readable label,
            the top SER score as confidence, and the prosody flags found.

        Raises:
            TypeError: if the audio array is not float32.
            ImportError: if the SER model cannot be loaded because
                `transformers` is missing.
        """
        # Validate before the (potentially slow) model load. An explicit raise
        # is used because `assert` is stripped when Python runs with -O.
        if audio_float32.dtype != np.float32:
            raise TypeError(f"audio must be float32, got {audio_float32.dtype}")

        self._load_pipeline()

        # Run SER and keep the highest-scoring class
        preds = self._pipeline({"raw": audio_float32, "sampling_rate": _SAMPLE_RATE})
        best = max(preds, key=lambda p: p["score"])
        emotion = best["label"].lower()
        confidence = float(best["score"])

        # Extract prosody features from raw audio
        prosody_flags = _extract_prosody_flags(audio_float32)

        # Resolve affect from base emotion + prosody; only the first flag that
        # has an override for the base affect is applied.
        affect = _EMOTION_BASE.get(emotion, "neutral")
        for flag in prosody_flags:
            override = _PROSODY_OVERRIDES.get((affect, flag))
            if override:
                affect = override
                break

        # Weak transcript signals
        affect = _apply_transcript_hints(affect, transcript)

        return ToneResult(
            label=_AFFECT_TO_LABEL.get(affect, "Calm and focused"),
            affect=affect,
            confidence=confidence,
            prosody_flags=prosody_flags,
        )

    async def classify_async(
        self, audio_float32: np.ndarray, transcript: str = ""
    ) -> ToneResult:
        """classify() without blocking the event loop.

        Runs classify() on the running loop's default executor.
        """
        loop = asyncio.get_running_loop()
        fn = partial(self.classify, audio_float32, transcript)
        return await loop.run_in_executor(None, fn)


# ── Prosody helpers ───────────────────────────────────────────────────────────


def _extract_prosody_flags(audio: np.ndarray) -> list[str]:
    """
    Extract lightweight prosody flags from a float32 16kHz mono window.

    Returns a list of string flags consumed by _PROSODY_OVERRIDES, in the
    order energy → rate → pitch. Returns an empty list when the window is
    empty or librosa is not installed.
    """
    if audio.size == 0:
        # Nothing to measure; also avoids a NaN RMS and librosa errors.
        return []

    try:
        import librosa
    except ImportError:
        return []

    flags: list[str] = []

    # Energy (RMS)
    rms = float(np.sqrt(np.mean(audio ** 2)))
    if rms < _LOW_ENERGY_RMS:
        flags.append("low_energy")
    elif rms > _HIGH_ENERGY_RMS:
        flags.append("high_energy")

    # Speech rate approximation via zero-crossing rate
    zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio)))
    if zcr > _FAST_RATE_ZCR:
        flags.append("fast_rate")
    elif zcr < _SLOW_RATE_ZCR:
        flags.append("slow_rate")

    # Pitch contour via YIN
    try:
        f0 = librosa.yin(
            audio,
            fmin=librosa.note_to_hz("C2"),
            fmax=librosa.note_to_hz("C7"),
            sr=_SAMPLE_RATE,
        )
        # NOTE(review): YIN presumably yields an estimate for every frame, so
        # this filter may not actually exclude unvoiced frames — confirm.
        voiced = f0[f0 > 0]
        if len(voiced) > _MIN_PITCH_FRAMES:
            # Rising: last quarter higher than first quarter
            q = len(voiced) // 4
            if q > 0 and np.mean(voiced[-q:]) > np.mean(voiced[:q]) * _RISING_PITCH_RATIO:
                flags.append("rising")
            # Flat: standard deviation below 15 Hz
            if np.std(voiced) < _FLAT_PITCH_STD_HZ:
                flags.append("flat_pitch")
    except Exception:
        # Pitch extraction is best-effort: keep the flags gathered so far,
        # but leave a trace instead of swallowing the failure silently.
        logger.debug("Pitch extraction failed; skipping pitch flags", exc_info=True)

    return flags


def _apply_transcript_hints(affect: str, transcript: str) -> str:
    """
    Apply weak keyword signals from transcript text to adjust affect.

    Only overrides when affect is already ambiguous (neutral/tired).
    Keywords are matched case-insensitively at the start of a word, in the
    priority order apologetic → urgent → dismissive.
    """
    if not transcript or affect not in ("neutral", "tired"):
        return affect

    lowered = transcript.lower()
    for hinted_affect, pattern in _TRANSCRIPT_HINTS:
        if pattern.search(lowered):
            return hinted_affect
    return affect


def _cuda_available() -> bool:
    """Return True when torch is importable and reports a usable CUDA device."""
    try:
        import torch
    except ImportError:
        return False
    return torch.cuda.is_available()