cf-voice/cf_voice/accent.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns a 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

152 lines
4.7 KiB
Python

# cf_voice/accent.py — accent / language identification classifier
#
# MIT licensed (AccentResult dataclass + mock). BSL 1.1 (real inference).
# Gated by CF_VOICE_ACCENT=1 — off by default (GPU cost + privacy sensitivity).
#
# Accent alone is not high-risk, but combined with birdsong or a quiet rural
# background it becomes location-identifying. The privacy scorer accounts for
# this compound signal.
#
# Real backend: facebook/mms-lid-126 for language detection, wav2vec2 accent
# fine-tune for region. Lazy-loaded to keep startup fast.
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class AccentResult:
    """
    Language + regional accent classification for the primary speaker.

    Attributes:
        language: BCP-47 language tag (e.g. "en", "fr", "zh").
        region: cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other").
        confidence: classifier confidence, a float in [0, 1].
    """

    # BCP-47 language tag reported by the classifier (e.g. "en", "fr", "zh")
    language: str
    # cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other")
    region: str
    # classifier confidence in [0, 1]
    confidence: float
class MockAccentClassifier:
    """
    Deterministic stand-in for the real accent classifier.

    Used in development and CI: always reports British English at a fixed
    confidence, so downstream consumers (e.g. the privacy scorer) can
    exercise every code path without loading any model weights.
    """

    # Fixed (language, region, confidence) payload returned for every call.
    _CANNED = ("en", "en_gb", 0.72)

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """Return the canned result; *audio* is ignored."""
        language, region, confidence = self._CANNED
        return AccentResult(language=language, region=region, confidence=confidence)
class AccentClassifier:
    """
    Real accent / language classifier.

    BSL 1.1 — requires [inference] extras.
    Language detection: facebook/mms-lid-126 (126 languages, MIT licensed).
    Accent region: maps language tag to a regional ACCENT_LABEL.
    VRAM: ~500 MB on CUDA.
    """

    # HuggingFace model id for MMS language identification (126 languages).
    _LANG_MODEL_ID = "facebook/mms-lid-126"

    def __init__(self) -> None:
        """Load the MMS language-ID model onto CUDA if available, else CPU.

        Raises:
            ImportError: if transformers (the [inference] extra) is missing.
        """
        # Heavy imports live here, not at module top, so the module stays
        # importable without the [inference] dependencies installed.
        try:
            from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
        except ImportError as exc:
            raise ImportError(
                "transformers is required for accent classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        import torch
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading language ID model %s on %s", self._LANG_MODEL_ID, self._device)
        self._extractor = AutoFeatureExtractor.from_pretrained(self._LANG_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(
            self._LANG_MODEL_ID
        ).to(self._device)
        # Inference only — switch off dropout / training-mode layers.
        self._model.eval()

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """Identify the language of *audio* and map it to a region label.

        Args:
            audio: mono PCM samples, either a list of floats or raw bytes.
                Bytes are interpreted as float32 (NOTE(review): assumes the
                caller sends float32 PCM, not int16 — confirm at call sites).
                Sampling rate is assumed to be 16 kHz throughout.

        Returns:
            An AccentResult, or None when the clip is shorter than ~100 ms
            (too little signal to classify).
        """
        import numpy as np
        import torch
        if isinstance(audio, bytes):
            audio_np = np.frombuffer(audio, dtype=np.float32)
        else:
            audio_np = np.asarray(audio, dtype=np.float32)
        if len(audio_np) < 1600:  # need at least 100ms at 16kHz
            return None
        inputs = self._extractor(
            audio_np, sampling_rate=16_000, return_tensors="pt", padding=True
        )
        # Move every tensor in the feature dict to the model's device.
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self._model(**inputs).logits
        # Single-clip batch: take row 0, softmax across the language classes.
        probs = torch.softmax(logits, dim=-1)[0]
        top_idx = int(probs.argmax())
        confidence = float(probs[top_idx])
        # id2label yields ISO 639-3 codes (e.g. "eng"); fall back to "other"
        # if the index is somehow missing from the config mapping.
        language = self._model.config.id2label.get(top_idx, "other")
        region = _lang_to_region(language)
        return AccentResult(language=language, region=region, confidence=confidence)
def _lang_to_region(lang: str) -> str:
"""Map a BCP-47 / ISO 639-3 language tag to a cf-voice ACCENT_LABEL."""
_MAP: dict[str, str] = {
"eng": "en_us", # MMS uses ISO 639-3; sub-regional accent needs fine-tune
"fra": "fr",
"spa": "es",
"deu": "de",
"zho": "zh",
"jpn": "ja",
"en": "en_us",
"en-GB": "en_gb",
"en-AU": "en_au",
"en-CA": "en_ca",
"en-IN": "en_in",
"fr": "fr",
"de": "de",
"es": "es",
"zh": "zh",
"ja": "ja",
}
return _MAP.get(lang, "other")
def make_accent_classifier(
mock: bool | None = None,
) -> "MockAccentClassifier | AccentClassifier | None":
"""
Factory: return an AccentClassifier if CF_VOICE_ACCENT=1, else None.
Callers must check for None before invoking classify().
"""
enabled = os.environ.get("CF_VOICE_ACCENT", "") == "1"
if not enabled:
return None
use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
if use_mock:
return MockAccentClassifier()
try:
return AccentClassifier()
except (ImportError, Exception) as exc:
logger.warning("AccentClassifier unavailable (%s) — using mock", exc)
return MockAccentClassifier()