New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
366 lines · 17 KiB · Python
# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier
|
|
#
|
|
# MIT licensed (Protocol + mock). BSL 1.1 (real AST inference).
|
|
# Requires [inference] extras for real mode.
|
|
#
|
|
# This module is the AMD (answering machine detection) backbone for Osprey.
|
|
# It runs in parallel with the STT pipeline — it never processes words,
|
|
# only acoustic features (pitch, timbre, background, DTMF tones, ringback).
|
|
#
|
|
# Navigation v0.2.x wires the real AST model (ASTAcousticBackend, below).
|
|
# Current: mock emits a plausible call-lifecycle sequence.
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import random
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import AsyncIterator, Protocol, Sequence, runtime_checkable
|
|
|
|
from cf_voice.events import AudioEvent, QUEUE_LABELS, SPEAKER_LABELS, ENVIRON_LABELS, SCENE_LABELS
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SAMPLE_RATE = 16_000
|
|
|
|
|
|
@dataclass
|
|
class AcousticResult:
|
|
"""Batch of AudioEvents produced from a single audio window."""
|
|
queue: AudioEvent | None
|
|
speaker: AudioEvent | None
|
|
environ: AudioEvent | None
|
|
scene: AudioEvent | None
|
|
timestamp: float
|
|
|
|
|
|
@runtime_checkable
|
|
class AcousticBackend(Protocol):
|
|
"""
|
|
Interface for acoustic event classifiers.
|
|
|
|
classify_window() takes a PCM float32 buffer (mono, 16kHz) and returns an
|
|
AcousticResult covering one analysis window (~2s). It is synchronous and
|
|
runs in a thread pool when called from async code.
|
|
"""
|
|
|
|
def classify_window(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AcousticResult:
|
|
...
|
|
|
|
|
|
@runtime_checkable
|
|
class SceneBackend(Protocol):
|
|
"""
|
|
Interface for dedicated acoustic scene classifiers.
|
|
|
|
Separate from AcousticBackend to allow future swapping to a specialised
|
|
scene model (e.g. AudioSet acoustic-scene subset) without touching the
|
|
telephony event classifier.
|
|
"""
|
|
|
|
def classify_scene(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AudioEvent | None:
|
|
...
|
|
|
|
|
|
# ── Call lifecycle for mock mode ──────────────────────────────────────────────
|
|
# Approximates what a real outbound call looks like acoustically.
|
|
# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center
|
|
|
|
_MOCK_LIFECYCLE: list[dict] = [
|
|
# (min_s, max_s): how long to stay in this phase
|
|
{"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)},
|
|
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)},
|
|
{"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)},
|
|
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)},
|
|
{"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
|
|
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
|
|
{"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)},
|
|
# AMD moment: background_shift is the primary signal
|
|
{"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)},
|
|
{"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)},
|
|
]
|
|
|
|
|
|
class MockAcousticBackend:
|
|
"""
|
|
Synthetic acoustic classifier for development and CI.
|
|
|
|
Cycles through a plausible call lifecycle so Osprey's IVR state machine
|
|
can be tested without real telephony. The AMD signal (background_shift →
|
|
human_single) is emitted at the right point in the sequence.
|
|
|
|
Usage:
|
|
backend = MockAcousticBackend(seed=42)
|
|
result = backend.classify_window(b"", timestamp=4.5)
|
|
print(result.environ.label) # → "hold_music", "background_shift", etc.
|
|
"""
|
|
|
|
def __init__(self, seed: int | None = None) -> None:
|
|
self._rng = random.Random(seed)
|
|
self._phase_idx = 0
|
|
self._phase_start = time.monotonic()
|
|
self._phase_dur = self._draw_phase_dur(0)
|
|
|
|
def _draw_phase_dur(self, idx: int) -> float:
|
|
lo, hi = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"]
|
|
return self._rng.uniform(lo, hi)
|
|
|
|
def _current_phase(self) -> dict:
|
|
now = time.monotonic()
|
|
elapsed = now - self._phase_start
|
|
if elapsed >= self._phase_dur:
|
|
self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE)
|
|
self._phase_start = now
|
|
self._phase_dur = self._draw_phase_dur(self._phase_idx)
|
|
return _MOCK_LIFECYCLE[self._phase_idx]
|
|
|
|
def _make_event(
|
|
self,
|
|
event_type: str,
|
|
label: str,
|
|
timestamp: float,
|
|
) -> AudioEvent:
|
|
return AudioEvent(
|
|
timestamp=timestamp,
|
|
event_type=event_type, # type: ignore[arg-type]
|
|
label=label,
|
|
confidence=self._rng.uniform(0.72, 0.97),
|
|
)
|
|
|
|
def classify_window(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AcousticResult:
|
|
phase = self._current_phase()
|
|
return AcousticResult(
|
|
queue=self._make_event("queue", phase["queue"], timestamp),
|
|
speaker=self._make_event("speaker", phase["speaker"], timestamp),
|
|
environ=self._make_event("environ", phase["environ"], timestamp),
|
|
scene=self._make_event("scene", phase["scene"], timestamp),
|
|
timestamp=timestamp,
|
|
)
|
|
|
|
|
|
# ── AST acoustic backend (BSL 1.1) ───────────────────────────────────────────
|
|
|
|
|
|
class ASTAcousticBackend:
|
|
"""
|
|
Audio Spectrogram Transformer acoustic event classifier.
|
|
|
|
BSL 1.1 — requires [inference] extras.
|
|
|
|
Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to
|
|
classify queue state, speaker type, and background environment from a
|
|
single forward pass. Top-15 predictions are scanned; the highest-confidence
|
|
match per event category is emitted.
|
|
|
|
Model: MIT/ast-finetuned-audioset-10-10-0.4593
|
|
VRAM: ~300 MB on CUDA (fp32)
|
|
Input: float32 16kHz mono audio (any length; feature extractor pads/truncates)
|
|
|
|
Replaces the YAMNet stub. Synchronous — run from a thread pool executor
|
|
when called from async code.
|
|
"""
|
|
|
|
_MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
|
|
_SAMPLE_RATE = 16_000
|
|
_TOP_K = 20 # scan more classes — many relevant ones are in the 10-20 range
|
|
|
|
# Minimum confidence below which an event is suppressed even if it's the
|
|
# top match in its category.
|
|
_MIN_CONFIDENCE: dict[str, float] = {
|
|
"queue": 0.10,
|
|
"speaker": 0.08,
|
|
"environ": 0.12,
|
|
"scene": 0.08, # scenes fire reliably — lower bar is fine
|
|
}
|
|
|
|
# AudioSet class name → (event_type, cf-voice label).
|
|
# Top-K predictions are scanned; highest confidence per category wins.
|
|
# "call_center" requires dedicated call-centre acoustics, not generic indoor.
|
|
# "Music" was previously duplicated (queue + environ) — Python dicts keep the
|
|
# last entry, silently losing the queue mapping. Fixed: use the specific
|
|
# "Musical instrument" AudioSet parent for hold_music; "Music" maps to environ.
|
|
_LABEL_MAP: dict[str, tuple[str, str]] = {
|
|
# ── Queue / call-state labels ──────────────────────────────────────────
|
|
"Ringtone": ("queue", "ringback"),
|
|
"Telephone bell ringing": ("queue", "ringback"),
|
|
"Busy signal": ("queue", "busy"),
|
|
"Dial tone": ("queue", "dtmf_tone"),
|
|
"DTMF": ("queue", "dtmf_tone"),
|
|
"Silence": ("queue", "silence"),
|
|
# ── Speaker type labels ────────────────────────────────────────────────
|
|
"Speech": ("speaker", "human_single"),
|
|
"Male speech, man speaking": ("speaker", "human_single"),
|
|
"Female speech, woman speaking": ("speaker", "human_single"),
|
|
"Child speech, kid speaking": ("speaker", "human_single"),
|
|
"Crowd": ("speaker", "human_multi"),
|
|
"Hubbub, speech noise, speech babble": ("speaker", "human_multi"),
|
|
"Laughter": ("speaker", "human_multi"),
|
|
"Chuckle, chortle": ("speaker", "human_multi"),
|
|
"Speech synthesizer": ("speaker", "ivr_synth"),
|
|
# ── Environmental labels ───────────────────────────────────────────────
|
|
# Telephony — requires specific call-centre acoustics, not generic indoor
|
|
"Telephone": ("environ", "call_center"),
|
|
"Telephone dialing, DTMF": ("environ", "call_center"),
|
|
"Reverberation": ("environ", "background_shift"),
|
|
"Echo": ("environ", "background_shift"),
|
|
"Background noise": ("environ", "noise_floor_change"),
|
|
"Noise": ("environ", "noise_floor_change"),
|
|
"White noise": ("environ", "noise_floor_change"),
|
|
"Pink noise": ("environ", "noise_floor_change"),
|
|
"Static": ("environ", "noise_floor_change"),
|
|
"Music": ("environ", "music"),
|
|
# Nature
|
|
"Bird": ("environ", "birdsong"),
|
|
"Bird vocalization, bird call, bird song": ("environ", "birdsong"),
|
|
"Chirp, tweet": ("environ", "birdsong"),
|
|
"Wind": ("environ", "wind"),
|
|
"Wind noise (microphone)": ("environ", "wind"),
|
|
"Rain": ("environ", "rain"),
|
|
"Rain on surface": ("environ", "rain"),
|
|
"Water": ("environ", "water"),
|
|
"Stream": ("environ", "water"),
|
|
# Urban
|
|
"Traffic noise, roadway noise": ("environ", "traffic"),
|
|
"Vehicle": ("environ", "traffic"),
|
|
"Crowd": ("environ", "crowd_chatter"),
|
|
"Chatter": ("environ", "crowd_chatter"),
|
|
"Construction": ("environ", "construction"),
|
|
"Drill": ("environ", "construction"),
|
|
# Indoor
|
|
"Air conditioning": ("environ", "hvac"),
|
|
"Mechanical fan": ("environ", "hvac"),
|
|
"Computer keyboard": ("environ", "keyboard_typing"),
|
|
"Typing": ("environ", "keyboard_typing"),
|
|
"Restaurant": ("environ", "restaurant"),
|
|
"Dishes, pots, and pans": ("environ", "restaurant"),
|
|
# ── Acoustic scene labels ──────────────────────────────────────────────
|
|
# "Inside, small/large room" moved from environ to scene — they correctly
|
|
# describe the acoustic scene but are NOT specific enough for call_center.
|
|
"Inside, small room": ("scene", "indoor_quiet"),
|
|
"Inside, large room or hall": ("scene", "indoor_crowd"),
|
|
"Outside, urban or manmade": ("scene", "outdoor_urban"),
|
|
"Field recording": ("scene", "outdoor_nature"),
|
|
"Rail transport": ("scene", "public_transit"),
|
|
"Bus": ("scene", "public_transit"),
|
|
"Train": ("scene", "public_transit"),
|
|
"Car": ("scene", "vehicle"),
|
|
"Truck": ("scene", "vehicle"),
|
|
"Motorcycle": ("scene", "vehicle"),
|
|
# Music in the queue sense — "Musical instrument" is more specific
|
|
# than the ambiguous top-level "Music" class
|
|
"Musical instrument": ("queue", "hold_music"),
|
|
"Piano": ("queue", "hold_music"),
|
|
"Guitar": ("queue", "hold_music"),
|
|
}
|
|
|
|
def __init__(self) -> None:
|
|
try:
|
|
from transformers import ASTFeatureExtractor, ASTForAudioClassification
|
|
except ImportError as exc:
|
|
raise ImportError(
|
|
"transformers is required for AST acoustic classification. "
|
|
"Install with: pip install cf-voice[inference]"
|
|
) from exc
|
|
|
|
import torch
|
|
|
|
self._device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device)
|
|
self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID)
|
|
self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to(
|
|
self._device
|
|
)
|
|
self._model.eval()
|
|
|
|
def classify_window(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AcousticResult:
|
|
import numpy as np
|
|
import torch
|
|
|
|
if isinstance(audio, bytes):
|
|
audio_np = np.frombuffer(audio, dtype=np.float32)
|
|
else:
|
|
audio_np = np.asarray(audio, dtype=np.float32)
|
|
|
|
if len(audio_np) == 0:
|
|
return AcousticResult(queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp)
|
|
|
|
inputs = self._extractor(
|
|
audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt"
|
|
)
|
|
inputs = {k: v.to(self._device) for k, v in inputs.items()}
|
|
|
|
with torch.no_grad():
|
|
logits = self._model(**inputs).logits
|
|
probs = torch.softmax(logits, dim=-1)[0]
|
|
id2label = self._model.config.id2label
|
|
|
|
top_k = min(self._TOP_K, len(probs))
|
|
top_indices = probs.topk(top_k).indices.tolist()
|
|
predictions = [(id2label[i], float(probs[i])) for i in top_indices]
|
|
|
|
# Take highest-confidence match per category
|
|
best: dict[str, tuple[str, float]] = {} # event_type → (label, conf)
|
|
for ast_label, conf in predictions:
|
|
mapping = self._LABEL_MAP.get(ast_label)
|
|
if mapping is None:
|
|
continue
|
|
etype, cf_label = mapping
|
|
if etype not in best or conf > best[etype][1]:
|
|
best[etype] = (cf_label, conf)
|
|
|
|
def _make_event(etype: str, label: str, conf: float) -> AudioEvent:
|
|
return AudioEvent(
|
|
timestamp=timestamp,
|
|
event_type=etype, # type: ignore[arg-type]
|
|
label=label,
|
|
confidence=round(conf, 4),
|
|
)
|
|
|
|
def _above_threshold(etype: str) -> bool:
|
|
if etype not in best:
|
|
return False
|
|
_, conf = best[etype]
|
|
return conf >= self._MIN_CONFIDENCE.get(etype, 0.10)
|
|
|
|
return AcousticResult(
|
|
queue=_make_event("queue", *best["queue"]) if _above_threshold("queue") else None,
|
|
speaker=_make_event("speaker", *best["speaker"]) if _above_threshold("speaker") else None,
|
|
environ=_make_event("environ", *best["environ"]) if _above_threshold("environ") else None,
|
|
scene=_make_event("scene", *best["scene"]) if _above_threshold("scene") else None,
|
|
timestamp=timestamp,
|
|
)
|
|
|
|
|
|
def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend":
|
|
"""
|
|
Factory: return an AcousticBackend for the current environment.
|
|
|
|
mock=True or CF_VOICE_MOCK=1 → MockAcousticBackend
|
|
Otherwise → ASTAcousticBackend (falls back to mock on import error)
|
|
"""
|
|
import os
|
|
use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
|
|
if use_mock:
|
|
return MockAcousticBackend()
|
|
try:
|
|
return ASTAcousticBackend()
|
|
except (ImportError, Exception) as exc:
|
|
logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc)
|
|
return MockAcousticBackend()
|