cf-voice/cf_voice/accent.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns a 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

152 lines
4.7 KiB
Python

# cf_voice/accent.py — accent / language identification classifier
#
# MIT licensed (AccentResult dataclass + mock). BSL 1.1 (real inference).
# Gated by CF_VOICE_ACCENT=1 — off by default (GPU cost + privacy sensitivity).
#
# Accent alone is not high-risk, but combined with birdsong or a quiet rural
# background it becomes location-identifying. The privacy scorer accounts for
# this compound signal.
#
# Real backend: facebook/mms-lid-126 for language detection, wav2vec2 accent
# fine-tune for region. Lazy-loaded to keep startup fast.
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class AccentResult:
    """
    Language + regional accent classification for the primary speaker.

    Attributes:
        language: BCP-47 language tag (e.g. "en", "fr", "zh").
        region: cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other").
        confidence: classifier confidence, a float in [0, 1].
    """

    # BCP-47 language tag reported by the classifier (e.g. "en", "fr", "zh")
    language: str
    # cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other")
    region: str
    # classifier confidence in [0, 1]
    confidence: float
class MockAccentClassifier:
    """
    Deterministic stand-in for the real accent classifier.

    Used in development and CI: always reports British English at a fixed
    confidence, so downstream consumers (e.g. the privacy scorer) can
    exercise every code path without loading any model weights.
    """

    # Fixed (language, region, confidence) payload returned for every call.
    _CANNED = ("en", "en_gb", 0.72)

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """Return the canned result; *audio* is ignored."""
        language, region, confidence = self._CANNED
        return AccentResult(language=language, region=region, confidence=confidence)
class AccentClassifier:
    """
    Real accent / language classifier.

    BSL 1.1 — requires [inference] extras.
    Language detection: facebook/mms-lid-126 (126 languages, MIT licensed).
    Accent region: maps language tag to a regional ACCENT_LABEL.
    VRAM: ~500 MB on CUDA.
    """

    # HuggingFace model id for MMS language identification (126 languages).
    _LANG_MODEL_ID = "facebook/mms-lid-126"

    def __init__(self) -> None:
        """Load the MMS language-ID model onto CUDA if available, else CPU.

        Raises:
            ImportError: if transformers (the [inference] extra) is missing.
        """
        # Heavy imports live here, not at module top, so the module stays
        # importable without the [inference] dependencies installed.
        try:
            from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
        except ImportError as exc:
            raise ImportError(
                "transformers is required for accent classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        import torch
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading language ID model %s on %s", self._LANG_MODEL_ID, self._device)
        self._extractor = AutoFeatureExtractor.from_pretrained(self._LANG_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(
            self._LANG_MODEL_ID
        ).to(self._device)
        # Inference only — switch off dropout / training-mode layers.
        self._model.eval()

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """Identify the language of *audio* and map it to a region label.

        Args:
            audio: mono PCM samples, either a list of floats or raw bytes.
                Bytes are interpreted as float32 (NOTE(review): assumes the
                caller sends float32 PCM, not int16 — confirm at call sites).
                Sampling rate is assumed to be 16 kHz throughout.

        Returns:
            An AccentResult, or None when the clip is shorter than ~100 ms
            (too little signal to classify).
        """
        import numpy as np
        import torch
        if isinstance(audio, bytes):
            audio_np = np.frombuffer(audio, dtype=np.float32)
        else:
            audio_np = np.asarray(audio, dtype=np.float32)
        if len(audio_np) < 1600:  # need at least 100ms at 16kHz
            return None
        inputs = self._extractor(
            audio_np, sampling_rate=16_000, return_tensors="pt", padding=True
        )
        # Move every tensor in the feature dict to the model's device.
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self._model(**inputs).logits
        # Single-clip batch: take row 0, softmax across the language classes.
        probs = torch.softmax(logits, dim=-1)[0]
        top_idx = int(probs.argmax())
        confidence = float(probs[top_idx])
        # id2label yields ISO 639-3 codes (e.g. "eng"); fall back to "other"
        # if the index is somehow missing from the config mapping.
        language = self._model.config.id2label.get(top_idx, "other")
        region = _lang_to_region(language)
        return AccentResult(language=language, region=region, confidence=confidence)
def _lang_to_region(lang: str) -> str:
"""Map a BCP-47 / ISO 639-3 language tag to a cf-voice ACCENT_LABEL."""
_MAP: dict[str, str] = {
"eng": "en_us", # MMS uses ISO 639-3; sub-regional accent needs fine-tune
"fra": "fr",
"spa": "es",
"deu": "de",
"zho": "zh",
"jpn": "ja",
"en": "en_us",
"en-GB": "en_gb",
"en-AU": "en_au",
"en-CA": "en_ca",
"en-IN": "en_in",
"fr": "fr",
"de": "de",
"es": "es",
"zh": "zh",
"ja": "ja",
}
return _MAP.get(lang, "other")
def make_accent_classifier(
mock: bool | None = None,
) -> "MockAccentClassifier | AccentClassifier | None":
"""
Factory: return an AccentClassifier if CF_VOICE_ACCENT=1, else None.
Callers must check for None before invoking classify().
"""
enabled = os.environ.get("CF_VOICE_ACCENT", "") == "1"
if not enabled:
return None
use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
if use_mock:
return MockAccentClassifier()
try:
return AccentClassifier()
except (ImportError, Exception) as exc:
logger.warning("AccentClassifier unavailable (%s) — using mock", exc)
return MockAccentClassifier()