New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
152 lines
4.7 KiB
Python
152 lines
4.7 KiB
Python
# cf_voice/accent.py — accent / language identification classifier
|
|
#
|
|
# MIT licensed (AccentResult dataclass + mock). BSL 1.1 (real inference).
|
|
# Gated by CF_VOICE_ACCENT=1 — off by default (GPU cost + privacy sensitivity).
|
|
#
|
|
# Accent alone is not high-risk, but combined with birdsong or a quiet rural
|
|
# background it becomes location-identifying. The privacy scorer accounts for
|
|
# this compound signal.
|
|
#
|
|
# Real backend: facebook/mms-lid-126 for language detection, wav2vec2 accent
|
|
# fine-tune for region. Lazy-loaded to keep startup fast.
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class AccentResult:
    """
    Classification outcome for the primary speaker's language and accent.

    Attributes
    ----------
    language:
        BCP-47 language tag (e.g. "en", "fr", "zh").
    region:
        cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other").
    confidence:
        Model confidence in [0, 1].
    """

    language: str
    region: str
    confidence: float
|
|
|
|
|
|
class MockAccentClassifier:
    """
    Deterministic stand-in accent classifier for development and CI.

    Always reports the same canned result so the privacy scorer can
    exercise every code path without loading a real model.
    """

    # Fixed response: British English at moderate confidence.
    _CANNED = ("en", "en_gb", 0.72)

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        lang, region, conf = self._CANNED
        return AccentResult(language=lang, region=region, confidence=conf)
|
|
|
|
|
|
class AccentClassifier:
    """
    Real accent / language classifier.

    BSL 1.1 — requires [inference] extras.

    Language detection: facebook/mms-lid-126 (126 languages, MIT licensed).
    Accent region: maps the detected language tag to a regional ACCENT_LABEL.

    VRAM: ~500 MB on CUDA.
    """

    _LANG_MODEL_ID = "facebook/mms-lid-126"

    # Smallest clip we attempt to classify: 100 ms at 16 kHz.
    _MIN_SAMPLES = 1600

    def __init__(self) -> None:
        # Lazy import keeps module import cheap when inference extras are absent.
        try:
            from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for accent classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        import torch

        # Prefer GPU when available; the model also runs on CPU.
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading language ID model %s on %s", self._LANG_MODEL_ID, self._device)
        self._extractor = AutoFeatureExtractor.from_pretrained(self._LANG_MODEL_ID)
        model = Wav2Vec2ForSequenceClassification.from_pretrained(self._LANG_MODEL_ID)
        self._model = model.to(self._device)
        self._model.eval()

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """Classify *audio* (float32 PCM, 16 kHz); None when the clip is too short."""
        import numpy as np
        import torch

        # Normalise input to a float32 numpy vector. Bytes input is assumed
        # to be raw float32 PCM — TODO(review): confirm against callers.
        samples = (
            np.frombuffer(audio, dtype=np.float32)
            if isinstance(audio, bytes)
            else np.asarray(audio, dtype=np.float32)
        )

        if len(samples) < self._MIN_SAMPLES:
            return None

        features = self._extractor(
            samples, sampling_rate=16_000, return_tensors="pt", padding=True
        )
        features = {name: tensor.to(self._device) for name, tensor in features.items()}

        with torch.no_grad():
            scores = torch.softmax(self._model(**features).logits, dim=-1)[0]

        best = int(scores.argmax())
        tag = self._model.config.id2label.get(best, "other")
        return AccentResult(
            language=tag,
            region=_lang_to_region(tag),
            confidence=float(scores[best]),
        )
|
|
|
|
|
|
def _lang_to_region(lang: str) -> str:
|
|
"""Map a BCP-47 / ISO 639-3 language tag to a cf-voice ACCENT_LABEL."""
|
|
_MAP: dict[str, str] = {
|
|
"eng": "en_us", # MMS uses ISO 639-3; sub-regional accent needs fine-tune
|
|
"fra": "fr",
|
|
"spa": "es",
|
|
"deu": "de",
|
|
"zho": "zh",
|
|
"jpn": "ja",
|
|
"en": "en_us",
|
|
"en-GB": "en_gb",
|
|
"en-AU": "en_au",
|
|
"en-CA": "en_ca",
|
|
"en-IN": "en_in",
|
|
"fr": "fr",
|
|
"de": "de",
|
|
"es": "es",
|
|
"zh": "zh",
|
|
"ja": "ja",
|
|
}
|
|
return _MAP.get(lang, "other")
|
|
|
|
|
|
def make_accent_classifier(
    mock: bool | None = None,
) -> "MockAccentClassifier | AccentClassifier | None":
    """
    Factory: return an accent classifier if CF_VOICE_ACCENT=1, else None.

    Parameters
    ----------
    mock:
        Force the mock (True) or real (False) backend; None defers to the
        CF_VOICE_MOCK environment variable.

    Callers must check for None before invoking classify().
    """
    if os.environ.get("CF_VOICE_ACCENT", "") != "1":
        return None

    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockAccentClassifier()

    try:
        return AccentClassifier()
    # Was `except (ImportError, Exception)` — redundant, since ImportError is
    # an Exception subclass. The broad catch itself is deliberate: this is a
    # best-effort fallback to the mock backend rather than a hard failure.
    except Exception as exc:
        logger.warning("AccentClassifier unavailable (%s) — using mock", exc)
        return MockAccentClassifier()
|