New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
291 lines
10 KiB
Python
291 lines
10 KiB
Python
# cf_voice/classify.py — tone / affect classifier
|
|
#
|
|
# BSL 1.1: real inference. Requires [inference] extras.
|
|
# Stub behaviour: raises NotImplementedError if inference deps not installed.
|
|
#
|
|
# Pipeline: wav2vec2 SER (speech emotion recognition) + librosa prosody
|
|
# features → AFFECT_LABELS defined in cf_voice.events.
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from functools import partial
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Sample rate (Hz) the SER model expects; callers must hand classify()
# mono float32 audio already resampled to this rate.
_SAMPLE_RATE = 16_000


# Confidence floor — results below this are discarded by the caller
# (read once at import time; ToneClassifier.from_env() re-reads the env var)
_DEFAULT_THRESHOLD = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))


# wav2vec2 SER model from HuggingFace
# ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
# Outputs 7 classes: angry, disgust, fear, happy, neutral, sadness, surprise
_SER_MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
|
|
|
|
# ── Affect label mapping ──────────────────────────────────────────────────────
|
|
# Maps (emotion, prosody_profile) → affect label from cf_voice.events.AFFECT_LABELS
|
|
# Prosody profile is a tuple of flags present from _extract_prosody_flags().
|
|
|
|
# Base mapping: SER class name (lowercased) → affect label. SER emotions
# with no direct affect equivalent map to the closest conversational read
# (e.g. fear → apologetic, sadness → tired); unknown classes fall back to
# "neutral" via .get() in ToneClassifier.classify().
_EMOTION_BASE: dict[str, str] = {
    "angry": "frustrated",
    "disgust": "dismissive",
    "fear": "apologetic",
    "happy": "warm",
    "neutral": "neutral",
    "sadness": "tired",
    "surprise": "confused",
}
|
|
|
|
# Prosody-driven overrides: (base_affect, flag) → override affect
|
|
# Prosody-driven overrides: (base_affect, flag) → override affect.
# Consulted in prosody-flag order by ToneClassifier.classify(); the first
# flag with a matching (affect, flag) entry wins and the loop stops.
_PROSODY_OVERRIDES: dict[tuple[str, str], str] = {
    ("neutral", "fast_rate"): "genuine",
    ("neutral", "flat_pitch"): "scripted",
    ("neutral", "low_energy"): "tired",
    ("frustrated", "rising"): "urgent",
    ("warm", "rising"): "genuine",
    ("tired", "rising"): "optimistic",
    ("dismissive", "flat_pitch"): "condescending",
}
|
|
|
|
# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect)
|
|
# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect).
# Several affects intentionally share one label (e.g. dismissive and
# condescending both read as "Politely dismissive" to the observer).
# Unknown affects fall back to "Calm and focused" via .get() in classify().
_AFFECT_TO_LABEL: dict[str, str] = {
    "neutral": "Calm and focused",
    "warm": "Enthusiastic",
    "frustrated": "Frustrated but contained",
    "dismissive": "Politely dismissive",
    "apologetic": "Nervous but cooperative",
    "urgent": "Warmly impatient",
    "condescending": "Politely dismissive",
    "scripted": "Calm and focused",  # scripted reads as neutral to the observer
    "genuine": "Genuinely curious",
    "confused": "Confused but engaged",
    "tired": "Tired and compliant",
    "optimistic": "Guardedly optimistic",
}
|
|
|
|
|
|
@dataclass
class ToneResult:
    """Result of classifying one audio window in ToneClassifier.classify()."""

    label: str  # human-readable VoiceFrame label
    affect: str  # AFFECT_LABELS key
    confidence: float  # softmax score of the winning SER emotion class
    # Flags produced by _extract_prosody_flags() for this window
    prosody_flags: list[str] = field(default_factory=list)
|
|
|
|
|
|
class ToneClassifier:
    """
    Tone/affect classifier: wav2vec2 SER + librosa prosody.

    Loads the model lazily on first call to avoid import-time GPU allocation.
    Thread-safe for concurrent classify() calls — the model is stateless
    per-call; session state lives in the caller (ContextClassifier).

    Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
    rather than hf_pipeline to avoid torchcodec audio backend initialization.
    torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x
    systems. Calling the model directly bypasses the pipeline's audio backend
    selection entirely since we already have float32 at 16kHz.
    """

    def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
        """
        Args:
            threshold: confidence floor; stored for callers — classify()
                itself does not filter by it.
        """
        self._threshold = threshold
        self._feature_extractor = None  # lazy-loaded by _load_pipeline()
        self._model = None  # lazy-loaded by _load_pipeline()
        self._device: str = "cpu"  # becomes "cuda:0" when CUDA is available

    @classmethod
    def from_env(cls) -> "ToneClassifier":
        """Alternate constructor: threshold from CF_VOICE_CONFIDENCE_THRESHOLD."""
        threshold = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
        return cls(threshold=threshold)

    def _load_pipeline(self) -> None:
        """
        Idempotently load the SER feature extractor + model.

        Raises:
            ImportError: if the [inference] extras are not installed.
        """
        if self._model is not None:
            return
        try:
            from transformers import (
                AutoFeatureExtractor,
                AutoModelForAudioClassification,
            )
        except ImportError as exc:
            raise ImportError(
                "transformers is required for tone classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        import torch

        if _cuda_available():
            self._device = "cuda:0"
            # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000.
            # Only supported on CUDA — CPU must stay float32.
            torch_dtype = torch.float16
        else:
            self._device = "cpu"
            torch_dtype = torch.float32

        logger.info(
            "Loading SER model %s on device=%s dtype=%s",
            _SER_MODEL_ID, self._device, torch_dtype,
        )
        self._feature_extractor = AutoFeatureExtractor.from_pretrained(_SER_MODEL_ID)
        self._model = AutoModelForAudioClassification.from_pretrained(
            _SER_MODEL_ID,
            torch_dtype=torch_dtype,
        ).to(self._device)
        # Inference mode — disables dropout. eval() is the idiomatic
        # equivalent of train(False).
        self._model.eval()

    def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
        """
        Classify tone/affect from a float32 16kHz mono audio window.

        transcript is used as a weak signal for ambiguous cases (e.g. words
        like "unfortunately" bias toward apologetic even on a neutral voice).

        Args:
            audio_float32: mono PCM samples at 16 kHz, dtype float32.
            transcript: optional transcript of the same window.

        Returns:
            ToneResult with label, affect, confidence and prosody flags.

        Raises:
            TypeError: if audio_float32 is not float32.
            ImportError: if the inference extras are not installed.
        """
        import torch

        self._load_pipeline()

        # Ensure the model sees float32 at the right rate. A raised
        # TypeError (not assert) so the check survives `python -O`.
        if audio_float32.dtype != np.float32:
            raise TypeError(f"audio must be float32, got {audio_float32.dtype}")

        # Run SER — call feature extractor + model directly to bypass the
        # hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency).
        inputs = self._feature_extractor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        if self._model.dtype == torch.float16:
            # Match the model's half precision, but only for floating-point
            # tensors: casting integer tensors (e.g. an attention mask) to
            # fp16 would corrupt them.
            inputs = {
                k: v.to(torch.float16) if v.is_floating_point() else v
                for k, v in inputs.items()
            }

        with torch.no_grad():
            logits = self._model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)[0]
        id2label = self._model.config.id2label
        preds = [
            {"label": id2label[i], "score": float(probs[i])}
            for i in range(len(probs))
        ]

        best = max(preds, key=lambda p: p["score"])
        emotion = best["label"].lower()
        confidence = float(best["score"])

        # Extract prosody features from raw audio
        prosody_flags = _extract_prosody_flags(audio_float32)

        # Resolve affect from base emotion + prosody — the first prosody
        # flag with a matching override wins.
        affect = _EMOTION_BASE.get(emotion, "neutral")
        for flag in prosody_flags:
            override = _PROSODY_OVERRIDES.get((affect, flag))
            if override:
                affect = override
                break

        # Weak transcript signals (only adjust ambiguous affects)
        affect = _apply_transcript_hints(affect, transcript)

        label = _AFFECT_TO_LABEL.get(affect, "Calm and focused")
        return ToneResult(
            label=label,
            affect=affect,
            confidence=confidence,
            prosody_flags=prosody_flags,
        )

    async def classify_async(
        self, audio_float32: np.ndarray, transcript: str = ""
    ) -> ToneResult:
        """classify() without blocking the event loop (default executor)."""
        loop = asyncio.get_running_loop()
        fn = partial(self.classify, audio_float32, transcript)
        return await loop.run_in_executor(None, fn)
|
|
|
|
|
|
# ── Prosody helpers ───────────────────────────────────────────────────────────
|
|
|
|
def _extract_prosody_flags(audio: np.ndarray) -> list[str]:
|
|
"""
|
|
Extract lightweight prosody flags from a float32 16kHz mono window.
|
|
Returns a list of string flags consumed by _PROSODY_OVERRIDES.
|
|
"""
|
|
try:
|
|
import librosa
|
|
except ImportError:
|
|
return []
|
|
|
|
flags: list[str] = []
|
|
|
|
# Energy (RMS)
|
|
rms = float(np.sqrt(np.mean(audio ** 2)))
|
|
if rms < 0.02:
|
|
flags.append("low_energy")
|
|
elif rms > 0.15:
|
|
flags.append("high_energy")
|
|
|
|
# Speech rate approximation via zero-crossing rate
|
|
zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio)))
|
|
if zcr > 0.12:
|
|
flags.append("fast_rate")
|
|
elif zcr < 0.04:
|
|
flags.append("slow_rate")
|
|
|
|
# Pitch contour via YIN
|
|
try:
|
|
f0 = librosa.yin(
|
|
audio,
|
|
fmin=librosa.note_to_hz("C2"),
|
|
fmax=librosa.note_to_hz("C7"),
|
|
sr=_SAMPLE_RATE,
|
|
)
|
|
voiced = f0[f0 > 0]
|
|
if len(voiced) > 5:
|
|
# Rising: last quarter higher than first quarter
|
|
q = len(voiced) // 4
|
|
if q > 0 and np.mean(voiced[-q:]) > np.mean(voiced[:q]) * 1.15:
|
|
flags.append("rising")
|
|
# Flat: variance less than 15Hz
|
|
if np.std(voiced) < 15:
|
|
flags.append("flat_pitch")
|
|
except Exception:
|
|
pass # pitch extraction is best-effort
|
|
|
|
return flags
|
|
|
|
|
|
def _apply_transcript_hints(affect: str, transcript: str) -> str:
|
|
"""
|
|
Apply weak keyword signals from transcript text to adjust affect.
|
|
Only overrides when affect is already ambiguous (neutral/tired).
|
|
"""
|
|
if not transcript or affect not in ("neutral", "tired"):
|
|
return affect
|
|
|
|
t = transcript.lower()
|
|
apologetic_words = {"sorry", "apologize", "unfortunately", "afraid", "regret"}
|
|
urgent_words = {"urgent", "immediately", "asap", "right now", "critical"}
|
|
dismissive_words = {"policy", "unable to", "cannot", "not possible", "outside"}
|
|
|
|
if any(w in t for w in apologetic_words):
|
|
return "apologetic"
|
|
if any(w in t for w in urgent_words):
|
|
return "urgent"
|
|
if any(w in t for w in dismissive_words):
|
|
return "dismissive"
|
|
|
|
return affect
|
|
|
|
|
|
def _cuda_available() -> bool:
|
|
try:
|
|
import torch
|
|
return torch.cuda.is_available()
|
|
except ImportError:
|
|
return False
|