New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
291 lines
10 KiB
Python
291 lines
10 KiB
Python
# cf_voice/classify.py — tone / affect classifier
|
|
#
|
|
# BSL 1.1: real inference. Requires [inference] extras.
|
|
# Stub behaviour: raises NotImplementedError if inference deps not installed.
|
|
#
|
|
# Pipeline: wav2vec2 SER (speech emotion recognition) + librosa prosody
|
|
# features → AFFECT_LABELS defined in cf_voice.events.
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from functools import partial
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Sample rate (Hz) the SER model expects; callers must hand classify()
# mono float32 audio already resampled to this rate.
_SAMPLE_RATE = 16_000


# Confidence floor — results below this are discarded by the caller
# (read once at import time; ToneClassifier.from_env() re-reads the env var)
_DEFAULT_THRESHOLD = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))


# wav2vec2 SER model from HuggingFace
# ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
# Outputs 7 classes: angry, disgust, fear, happy, neutral, sadness, surprise
_SER_MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
|
|
|
|
# ── Affect label mapping ──────────────────────────────────────────────────────
|
|
# Maps (emotion, prosody_profile) → affect label from cf_voice.events.AFFECT_LABELS
|
|
# Prosody profile is a tuple of flags present from _extract_prosody_flags().
|
|
|
|
# Base mapping: SER class name (lowercased) → affect label. SER emotions
# with no direct affect equivalent map to the closest conversational read
# (e.g. fear → apologetic, sadness → tired); unknown classes fall back to
# "neutral" via .get() in ToneClassifier.classify().
_EMOTION_BASE: dict[str, str] = {
    "angry": "frustrated",
    "disgust": "dismissive",
    "fear": "apologetic",
    "happy": "warm",
    "neutral": "neutral",
    "sadness": "tired",
    "surprise": "confused",
}
|
|
|
|
# Prosody-driven overrides: (base_affect, flag) → override affect
|
|
# Prosody-driven overrides: (base_affect, flag) → override affect.
# Consulted in prosody-flag order by ToneClassifier.classify(); the first
# flag with a matching (affect, flag) entry wins and the loop stops.
_PROSODY_OVERRIDES: dict[tuple[str, str], str] = {
    ("neutral", "fast_rate"): "genuine",
    ("neutral", "flat_pitch"): "scripted",
    ("neutral", "low_energy"): "tired",
    ("frustrated", "rising"): "urgent",
    ("warm", "rising"): "genuine",
    ("tired", "rising"): "optimistic",
    ("dismissive", "flat_pitch"): "condescending",
}
|
|
|
|
# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect)
|
|
# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect).
# Several affects intentionally share one label (e.g. dismissive and
# condescending both read as "Politely dismissive" to the observer).
# Unknown affects fall back to "Calm and focused" via .get() in classify().
_AFFECT_TO_LABEL: dict[str, str] = {
    "neutral": "Calm and focused",
    "warm": "Enthusiastic",
    "frustrated": "Frustrated but contained",
    "dismissive": "Politely dismissive",
    "apologetic": "Nervous but cooperative",
    "urgent": "Warmly impatient",
    "condescending": "Politely dismissive",
    "scripted": "Calm and focused",  # scripted reads as neutral to the observer
    "genuine": "Genuinely curious",
    "confused": "Confused but engaged",
    "tired": "Tired and compliant",
    "optimistic": "Guardedly optimistic",
}
|
|
|
|
|
|
@dataclass
class ToneResult:
    """Result of classifying one audio window in ToneClassifier.classify()."""

    label: str  # human-readable VoiceFrame label
    affect: str  # AFFECT_LABELS key
    confidence: float  # softmax score of the winning SER emotion class
    # Flags produced by _extract_prosody_flags() for this window
    prosody_flags: list[str] = field(default_factory=list)
|
|
|
|
|
|
class ToneClassifier:
    """
    Tone/affect classifier: wav2vec2 SER + librosa prosody.

    Loads the model lazily on first call to avoid import-time GPU allocation.
    Thread-safe for concurrent classify() calls — the model is stateless
    per-call; session state lives in the caller (ContextClassifier).

    Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
    rather than hf_pipeline to avoid torchcodec audio backend initialization.
    torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x
    systems. Calling the model directly bypasses the pipeline's audio backend
    selection entirely since we already have float32 at 16kHz.
    """

    def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
        """
        Args:
            threshold: confidence floor; stored for callers — classify()
                itself does not filter by it.
        """
        self._threshold = threshold
        self._feature_extractor = None  # lazy-loaded by _load_pipeline()
        self._model = None  # lazy-loaded by _load_pipeline()
        self._device: str = "cpu"  # becomes "cuda:0" when CUDA is available

    @classmethod
    def from_env(cls) -> "ToneClassifier":
        """Alternate constructor: threshold from CF_VOICE_CONFIDENCE_THRESHOLD."""
        threshold = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
        return cls(threshold=threshold)

    def _load_pipeline(self) -> None:
        """
        Idempotently load the SER feature extractor + model.

        Raises:
            ImportError: if the [inference] extras are not installed.
        """
        if self._model is not None:
            return
        try:
            from transformers import (
                AutoFeatureExtractor,
                AutoModelForAudioClassification,
            )
        except ImportError as exc:
            raise ImportError(
                "transformers is required for tone classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        import torch

        if _cuda_available():
            self._device = "cuda:0"
            # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000.
            # Only supported on CUDA — CPU must stay float32.
            torch_dtype = torch.float16
        else:
            self._device = "cpu"
            torch_dtype = torch.float32

        logger.info(
            "Loading SER model %s on device=%s dtype=%s",
            _SER_MODEL_ID, self._device, torch_dtype,
        )
        self._feature_extractor = AutoFeatureExtractor.from_pretrained(_SER_MODEL_ID)
        self._model = AutoModelForAudioClassification.from_pretrained(
            _SER_MODEL_ID,
            torch_dtype=torch_dtype,
        ).to(self._device)
        # Inference mode — disables dropout. eval() is the idiomatic
        # equivalent of train(False).
        self._model.eval()

    def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
        """
        Classify tone/affect from a float32 16kHz mono audio window.

        transcript is used as a weak signal for ambiguous cases (e.g. words
        like "unfortunately" bias toward apologetic even on a neutral voice).

        Args:
            audio_float32: mono PCM samples at 16 kHz, dtype float32.
            transcript: optional transcript of the same window.

        Returns:
            ToneResult with label, affect, confidence and prosody flags.

        Raises:
            TypeError: if audio_float32 is not float32.
            ImportError: if the inference extras are not installed.
        """
        import torch

        self._load_pipeline()

        # Ensure the model sees float32 at the right rate. A raised
        # TypeError (not assert) so the check survives `python -O`.
        if audio_float32.dtype != np.float32:
            raise TypeError(f"audio must be float32, got {audio_float32.dtype}")

        # Run SER — call feature extractor + model directly to bypass the
        # hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency).
        inputs = self._feature_extractor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        if self._model.dtype == torch.float16:
            # Match the model's half precision, but only for floating-point
            # tensors: casting integer tensors (e.g. an attention mask) to
            # fp16 would corrupt them.
            inputs = {
                k: v.to(torch.float16) if v.is_floating_point() else v
                for k, v in inputs.items()
            }

        with torch.no_grad():
            logits = self._model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)[0]
        id2label = self._model.config.id2label
        preds = [
            {"label": id2label[i], "score": float(probs[i])}
            for i in range(len(probs))
        ]

        best = max(preds, key=lambda p: p["score"])
        emotion = best["label"].lower()
        confidence = float(best["score"])

        # Extract prosody features from raw audio
        prosody_flags = _extract_prosody_flags(audio_float32)

        # Resolve affect from base emotion + prosody — the first prosody
        # flag with a matching override wins.
        affect = _EMOTION_BASE.get(emotion, "neutral")
        for flag in prosody_flags:
            override = _PROSODY_OVERRIDES.get((affect, flag))
            if override:
                affect = override
                break

        # Weak transcript signals (only adjust ambiguous affects)
        affect = _apply_transcript_hints(affect, transcript)

        label = _AFFECT_TO_LABEL.get(affect, "Calm and focused")
        return ToneResult(
            label=label,
            affect=affect,
            confidence=confidence,
            prosody_flags=prosody_flags,
        )

    async def classify_async(
        self, audio_float32: np.ndarray, transcript: str = ""
    ) -> ToneResult:
        """classify() without blocking the event loop (default executor)."""
        loop = asyncio.get_running_loop()
        fn = partial(self.classify, audio_float32, transcript)
        return await loop.run_in_executor(None, fn)
|
|
|
|
|
|
# ── Prosody helpers ───────────────────────────────────────────────────────────
|
|
|
|
def _extract_prosody_flags(audio: np.ndarray) -> list[str]:
|
|
"""
|
|
Extract lightweight prosody flags from a float32 16kHz mono window.
|
|
Returns a list of string flags consumed by _PROSODY_OVERRIDES.
|
|
"""
|
|
try:
|
|
import librosa
|
|
except ImportError:
|
|
return []
|
|
|
|
flags: list[str] = []
|
|
|
|
# Energy (RMS)
|
|
rms = float(np.sqrt(np.mean(audio ** 2)))
|
|
if rms < 0.02:
|
|
flags.append("low_energy")
|
|
elif rms > 0.15:
|
|
flags.append("high_energy")
|
|
|
|
# Speech rate approximation via zero-crossing rate
|
|
zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio)))
|
|
if zcr > 0.12:
|
|
flags.append("fast_rate")
|
|
elif zcr < 0.04:
|
|
flags.append("slow_rate")
|
|
|
|
# Pitch contour via YIN
|
|
try:
|
|
f0 = librosa.yin(
|
|
audio,
|
|
fmin=librosa.note_to_hz("C2"),
|
|
fmax=librosa.note_to_hz("C7"),
|
|
sr=_SAMPLE_RATE,
|
|
)
|
|
voiced = f0[f0 > 0]
|
|
if len(voiced) > 5:
|
|
# Rising: last quarter higher than first quarter
|
|
q = len(voiced) // 4
|
|
if q > 0 and np.mean(voiced[-q:]) > np.mean(voiced[:q]) * 1.15:
|
|
flags.append("rising")
|
|
# Flat: variance less than 15Hz
|
|
if np.std(voiced) < 15:
|
|
flags.append("flat_pitch")
|
|
except Exception:
|
|
pass # pitch extraction is best-effort
|
|
|
|
return flags
|
|
|
|
|
|
def _apply_transcript_hints(affect: str, transcript: str) -> str:
|
|
"""
|
|
Apply weak keyword signals from transcript text to adjust affect.
|
|
Only overrides when affect is already ambiguous (neutral/tired).
|
|
"""
|
|
if not transcript or affect not in ("neutral", "tired"):
|
|
return affect
|
|
|
|
t = transcript.lower()
|
|
apologetic_words = {"sorry", "apologize", "unfortunately", "afraid", "regret"}
|
|
urgent_words = {"urgent", "immediately", "asap", "right now", "critical"}
|
|
dismissive_words = {"policy", "unable to", "cannot", "not possible", "outside"}
|
|
|
|
if any(w in t for w in apologetic_words):
|
|
return "apologetic"
|
|
if any(w in t for w in urgent_words):
|
|
return "urgent"
|
|
if any(w in t for w in dismissive_words):
|
|
return "dismissive"
|
|
|
|
return affect
|
|
|
|
|
|
def _cuda_available() -> bool:
|
|
try:
|
|
import torch
|
|
return torch.cuda.is_available()
|
|
except ImportError:
|
|
return False
|