cf-voice/cf_voice/classify.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

291 lines
10 KiB
Python

# cf_voice/classify.py — tone / affect classifier
#
# BSL 1.1: real inference. Requires [inference] extras.
# Stub behaviour: raises NotImplementedError if inference deps not installed.
#
# Pipeline: wav2vec2 SER (speech emotion recognition) + librosa prosody
# features → AFFECT_LABELS defined in cf_voice.events.
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass, field
from functools import partial
import numpy as np
logger = logging.getLogger(__name__)
# All audio entering classify() is float32 mono at this rate (see classify()).
_SAMPLE_RATE = 16_000
# Confidence floor — results below this are discarded by the caller
_DEFAULT_THRESHOLD = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
# wav2vec2 SER model from HuggingFace
# ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
# Outputs 7 classes: angry, disgust, fear, happy, neutral, sadness, surprise
_SER_MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
# ── Affect label mapping ──────────────────────────────────────────────────────
# Maps (emotion, prosody_profile) → affect label from cf_voice.events.AFFECT_LABELS
# Prosody profile is a tuple of flags present from _extract_prosody_flags().
_EMOTION_BASE: dict[str, str] = {
"angry": "frustrated",
"disgust": "dismissive",
"fear": "apologetic",
"happy": "warm",
"neutral": "neutral",
"sadness": "tired",
"surprise": "confused",
}
# Prosody-driven overrides: (base_affect, flag) → override affect
_PROSODY_OVERRIDES: dict[tuple[str, str], str] = {
("neutral", "fast_rate"): "genuine",
("neutral", "flat_pitch"): "scripted",
("neutral", "low_energy"): "tired",
("frustrated", "rising"): "urgent",
("warm", "rising"): "genuine",
("tired", "rising"): "optimistic",
("dismissive", "flat_pitch"): "condescending",
}
# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect)
_AFFECT_TO_LABEL: dict[str, str] = {
"neutral": "Calm and focused",
"warm": "Enthusiastic",
"frustrated": "Frustrated but contained",
"dismissive": "Politely dismissive",
"apologetic": "Nervous but cooperative",
"urgent": "Warmly impatient",
"condescending": "Politely dismissive",
"scripted": "Calm and focused", # scripted reads as neutral to the observer
"genuine": "Genuinely curious",
"confused": "Confused but engaged",
"tired": "Tired and compliant",
"optimistic": "Guardedly optimistic",
}
@dataclass
class ToneResult:
    """Outcome of classifying one audio window."""

    # Human-readable VoiceFrame label (a value from _AFFECT_TO_LABEL).
    label: str
    # Machine affect key (a cf_voice.events.AFFECT_LABELS key).
    affect: str
    # Winning SER class score.
    confidence: float
    # Prosody flags observed in the window, e.g. "rising", "low_energy".
    prosody_flags: list[str] = field(default_factory=list)
class ToneClassifier:
    """
    Tone/affect classifier: wav2vec2 SER + librosa prosody.

    Loads the model lazily on first call to avoid import-time GPU allocation.
    Thread-safe for concurrent classify() calls — the model is stateless
    per-call; session state lives in the caller (ContextClassifier).

    Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
    rather than hf_pipeline to avoid torchcodec audio backend initialization.
    torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x
    systems. Calling the model directly bypasses the pipeline's audio backend
    selection entirely since we already have float32 at 16kHz.
    """

    def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
        # threshold: confidence floor — classify() itself does not filter;
        # results below it are expected to be discarded by the caller.
        self._threshold = threshold
        self._feature_extractor = None  # lazy-loaded in _load_pipeline()
        self._model = None              # lazy-loaded in _load_pipeline()
        self._device: str = "cpu"

    @classmethod
    def from_env(cls) -> "ToneClassifier":
        """Alternate constructor reading CF_VOICE_CONFIDENCE_THRESHOLD."""
        threshold = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
        return cls(threshold=threshold)

    def _load_pipeline(self) -> None:
        """Load the feature extractor + SER model once; no-op afterwards.

        Raises ImportError with install instructions when the [inference]
        extras (transformers) are missing.
        """
        if self._model is not None:
            return
        try:
            from transformers import (
                AutoFeatureExtractor,
                AutoModelForAudioClassification,
            )
        except ImportError as exc:
            raise ImportError(
                "transformers is required for tone classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        import torch

        if _cuda_available():
            self._device = "cuda:0"
            # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000.
            # Only supported on CUDA — CPU must stay float32.
            torch_dtype = torch.float16
        else:
            self._device = "cpu"
            torch_dtype = torch.float32
        logger.info(
            "Loading SER model %s on device=%s dtype=%s",
            _SER_MODEL_ID, self._device, torch_dtype,
        )
        self._feature_extractor = AutoFeatureExtractor.from_pretrained(_SER_MODEL_ID)
        self._model = AutoModelForAudioClassification.from_pretrained(
            _SER_MODEL_ID,
            torch_dtype=torch_dtype,
        ).to(self._device)
        # Switch to inference mode — disables dropout; eval() is the
        # idiomatic spelling of train(False).
        self._model.eval()

    def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
        """
        Classify tone/affect from a float32 16kHz mono audio window.

        transcript is used as a weak signal for ambiguous cases (e.g. words
        like "unfortunately" bias toward apologetic even on a neutral voice).

        Raises AssertionError on non-float32 audio (kept as assert for
        backward compatibility with existing callers) and ImportError when
        inference deps are not installed.
        """
        import torch
        self._load_pipeline()
        # Ensure the model sees float32 at the right rate
        assert audio_float32.dtype == np.float32, "audio must be float32"
        # Run SER — call feature extractor + model directly to bypass the
        # hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency).
        inputs = self._feature_extractor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        if self._model.dtype == torch.float16:
            # BUGFIX: cast only floating-point tensors to fp16. The previous
            # blanket cast also converted the extractor's integer
            # attention_mask (int64), silently corrupting its dtype before
            # it reached the model's masking logic.
            inputs = {
                k: v.to(torch.float16) if v.is_floating_point() else v
                for k, v in inputs.items()
            }
        with torch.no_grad():
            logits = self._model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)[0]
        # argmax picks the same winner (first maximal index) as the former
        # build-a-dict-list-then-max scan, without the intermediate list.
        best_idx = int(torch.argmax(probs))
        emotion = self._model.config.id2label[best_idx].lower()
        confidence = float(probs[best_idx])
        # Extract prosody features from raw audio
        prosody_flags = _extract_prosody_flags(audio_float32)
        # Resolve affect from base emotion + prosody; first matching flag wins.
        affect = _EMOTION_BASE.get(emotion, "neutral")
        for flag in prosody_flags:
            override = _PROSODY_OVERRIDES.get((affect, flag))
            if override:
                affect = override
                break
        # Weak transcript signals
        affect = _apply_transcript_hints(affect, transcript)
        label = _AFFECT_TO_LABEL.get(affect, "Calm and focused")
        return ToneResult(
            label=label,
            affect=affect,
            confidence=confidence,
            prosody_flags=prosody_flags,
        )

    async def classify_async(
        self, audio_float32: np.ndarray, transcript: str = ""
    ) -> ToneResult:
        """classify() without blocking the event loop (default executor)."""
        loop = asyncio.get_running_loop()
        fn = partial(self.classify, audio_float32, transcript)
        return await loop.run_in_executor(None, fn)
# ── Prosody helpers ───────────────────────────────────────────────────────────
def _extract_prosody_flags(audio: np.ndarray) -> list[str]:
"""
Extract lightweight prosody flags from a float32 16kHz mono window.
Returns a list of string flags consumed by _PROSODY_OVERRIDES.
"""
try:
import librosa
except ImportError:
return []
flags: list[str] = []
# Energy (RMS)
rms = float(np.sqrt(np.mean(audio ** 2)))
if rms < 0.02:
flags.append("low_energy")
elif rms > 0.15:
flags.append("high_energy")
# Speech rate approximation via zero-crossing rate
zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio)))
if zcr > 0.12:
flags.append("fast_rate")
elif zcr < 0.04:
flags.append("slow_rate")
# Pitch contour via YIN
try:
f0 = librosa.yin(
audio,
fmin=librosa.note_to_hz("C2"),
fmax=librosa.note_to_hz("C7"),
sr=_SAMPLE_RATE,
)
voiced = f0[f0 > 0]
if len(voiced) > 5:
# Rising: last quarter higher than first quarter
q = len(voiced) // 4
if q > 0 and np.mean(voiced[-q:]) > np.mean(voiced[:q]) * 1.15:
flags.append("rising")
# Flat: variance less than 15Hz
if np.std(voiced) < 15:
flags.append("flat_pitch")
except Exception:
pass # pitch extraction is best-effort
return flags
def _apply_transcript_hints(affect: str, transcript: str) -> str:
"""
Apply weak keyword signals from transcript text to adjust affect.
Only overrides when affect is already ambiguous (neutral/tired).
"""
if not transcript or affect not in ("neutral", "tired"):
return affect
t = transcript.lower()
apologetic_words = {"sorry", "apologize", "unfortunately", "afraid", "regret"}
urgent_words = {"urgent", "immediately", "asap", "right now", "critical"}
dismissive_words = {"policy", "unable to", "cannot", "not possible", "outside"}
if any(w in t for w in apologetic_words):
return "apologetic"
if any(w in t for w in urgent_words):
return "urgent"
if any(w in t for w in dismissive_words):
return "dismissive"
return affect
def _cuda_available() -> bool:
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False