# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier
#
# MIT licensed (Protocol + mock). BSL 1.1 (real model inference).
# Requires [inference] extras for real mode.
#
# This module is the AMD (answering machine detection) backbone for Osprey.
# It runs in parallel with the STT pipeline — it never processes words,
# only acoustic features (pitch, timbre, background, DTMF tones, ringback).
#
# Real mode: ASTAcousticBackend (Audio Spectrogram Transformer), which
# replaces the earlier YAMNet stub.
# Mock mode: MockAcousticBackend emits a plausible call-lifecycle sequence.
from __future__ import annotations

import asyncio
import logging
import random
import time
from dataclasses import dataclass
from typing import AsyncIterator, Protocol, Sequence, runtime_checkable

from cf_voice.events import (
    AudioEvent,
    QUEUE_LABELS,
    SPEAKER_LABELS,
    ENVIRON_LABELS,
    SCENE_LABELS,
)

logger = logging.getLogger(__name__)

# Sample rate (Hz) of the mono PCM audio this module works with.
_SAMPLE_RATE = 16_000


@dataclass
class AcousticResult:
    """Events produced from one audio window, one slot per event category."""

    # Each slot holds the event for that category, or None when the backend
    # emitted nothing for it.
    queue: AudioEvent | None
    speaker: AudioEvent | None
    environ: AudioEvent | None
    scene: AudioEvent | None
    # Timestamp the caller supplied for the window.
    timestamp: float


@runtime_checkable
class AcousticBackend(Protocol):
    """
    Structural interface implemented by every acoustic event classifier.

    Implementations expose a synchronous classify_window(); async callers are
    expected to run it in a thread pool.
    """

    def classify_window(
        self, audio: "list[float] | bytes", timestamp: float = 0.0
    ) -> AcousticResult:
        """
        Classify one analysis window (~2s) of PCM float32 mono 16kHz audio.

        Returns an AcousticResult covering that window.
        """


@runtime_checkable
class SceneBackend(Protocol):
    """
    Structural interface for dedicated acoustic scene classifiers.

    Kept apart from AcousticBackend so a specialised scene model (e.g. an
    AudioSet acoustic-scene subset) can be swapped in later without touching
    the telephony event classifier.
    """

    def classify_scene(
        self, audio: "list[float] | bytes", timestamp: float = 0.0
    ) -> AudioEvent | None:
        """Return the scene event for the given audio, or None if there is none."""
# ── Call lifecycle for mock mode ──────────────────────────────────────────────
# Approximates what a real outbound call looks like acoustically.
# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center
_MOCK_LIFECYCLE: list[dict] = [
    # "dur" is (min_s, max_s): how long to stay in this phase
    {"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)},
    {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)},
    {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)},
    {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)},
    {"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
    {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
    {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)},
    # AMD moment: background_shift is the primary signal
    {"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)},
    {"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)},
]


class MockAcousticBackend:
    """
    Synthetic acoustic classifier for development and CI.

    Cycles through a plausible call lifecycle so Osprey's IVR state machine
    can be tested without real telephony. The AMD signal (background_shift →
    human_single) is emitted at the right point in the sequence.

    Phases advance on the wall clock (time.monotonic()), not on the
    ``timestamp`` argument. The seed fixes the drawn phase durations and
    confidences, but not the moment at which a given call observes a phase.

    Usage:
        backend = MockAcousticBackend(seed=42)
        result = backend.classify_window(b"", timestamp=4.5)
        print(result.environ.label)  # → "quiet" while ringing; later "music",
                                     #   "background_shift", "call_center"
    """

    def __init__(self, seed: int | None = None) -> None:
        """Start in the first lifecycle phase; ``seed`` seeds the private RNG."""
        self._rng = random.Random(seed)
        self._phase_idx = 0
        self._phase_start = time.monotonic()
        self._phase_dur = self._draw_phase_dur(0)

    def _draw_phase_dur(self, idx: int) -> float:
        """Draw a duration (seconds) from the "dur" range of phase ``idx``."""
        lo, hi = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"]
        return self._rng.uniform(lo, hi)

    def _current_phase(self) -> dict:
        """Return the active phase, stepping to the next one once it expires."""
        now = time.monotonic()
        elapsed = now - self._phase_start
        if elapsed >= self._phase_dur:
            # At most one step per call, wrapping back to the first phase.
            self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE)
            self._phase_start = now
            self._phase_dur = self._draw_phase_dur(self._phase_idx)
        return _MOCK_LIFECYCLE[self._phase_idx]

    def _make_event(
        self,
        event_type: str,
        label: str,
        timestamp: float,
    ) -> AudioEvent:
        """Build an AudioEvent with a random confidence in [0.72, 0.97]."""
        return AudioEvent(
            timestamp=timestamp,
            event_type=event_type,  # type: ignore[arg-type]
            label=label,
            confidence=self._rng.uniform(0.72, 0.97),
        )

    def classify_window(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AcousticResult:
        """
        Return the events of the current lifecycle phase.

        ``audio`` is ignored; ``timestamp`` is only stamped onto the result
        and its events.
        """
        phase = self._current_phase()
        return AcousticResult(
            queue=self._make_event("queue", phase["queue"], timestamp),
            speaker=self._make_event("speaker", phase["speaker"], timestamp),
            environ=self._make_event("environ", phase["environ"], timestamp),
            scene=self._make_event("scene", phase["scene"], timestamp),
            timestamp=timestamp,
        )


# ── AST acoustic backend (BSL 1.1) ───────────────────────────────────────────

class ASTAcousticBackend:
    """
    Audio Spectrogram Transformer acoustic event classifier.

    BSL 1.1 — requires [inference] extras.

    Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to
    classify queue state, speaker type, background environment and acoustic
    scene from a single forward pass. The top ``_TOP_K`` predictions are
    scanned; the highest-confidence match per event category is emitted if it
    clears that category's ``_MIN_CONFIDENCE`` threshold.

    Model: MIT/ast-finetuned-audioset-10-10-0.4593
    VRAM: ~300 MB on CUDA (fp32)
    Input: float32 16kHz mono audio (any length; feature extractor pads/truncates)

    Replaces the YAMNet stub.

    Synchronous — run from a thread pool executor when called from async code.
    """

    _MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
    _SAMPLE_RATE = 16_000
    _TOP_K = 20  # scan more classes — many relevant ones are in the 10-20 range

    # Minimum confidence below which an event is suppressed even if it's the
    # top match in its category.
    _MIN_CONFIDENCE: dict[str, float] = {
        "queue": 0.10,
        "speaker": 0.08,
        "environ": 0.12,
        "scene": 0.08,  # scenes fire reliably — lower bar is fine
    }

    # AudioSet class name → (event_type, cf-voice label).
    # Top-K predictions are scanned; highest confidence per category wins.
    # "call_center" requires dedicated call-centre acoustics, not generic indoor.
    # "Music" was previously duplicated (queue + environ) — Python dicts keep the
    # last entry, silently losing the queue mapping. Fixed: use the specific
    # "Musical instrument" AudioSet parent for hold_music; "Music" maps to environ.
    # Every key here must be unique. A class that feeds a second category gets
    # its additional mapping in _EXTRA_LABEL_MAP below.
    _LABEL_MAP: dict[str, tuple[str, str]] = {
        # ── Queue / call-state labels ──────────────────────────────────────────
        "Ringtone": ("queue", "ringback"),
        "Telephone bell ringing": ("queue", "ringback"),
        "Busy signal": ("queue", "busy"),
        "Dial tone": ("queue", "dtmf_tone"),
        "DTMF": ("queue", "dtmf_tone"),
        "Silence": ("queue", "silence"),
        # ── Speaker type labels ────────────────────────────────────────────────
        "Speech": ("speaker", "human_single"),
        "Male speech, man speaking": ("speaker", "human_single"),
        "Female speech, woman speaking": ("speaker", "human_single"),
        "Child speech, kid speaking": ("speaker", "human_single"),
        # "Crowd" → speaker/human_multi lives in _EXTRA_LABEL_MAP (see there).
        "Hubbub, speech noise, speech babble": ("speaker", "human_multi"),
        "Laughter": ("speaker", "human_multi"),
        "Chuckle, chortle": ("speaker", "human_multi"),
        "Speech synthesizer": ("speaker", "ivr_synth"),
        # ── Environmental labels ───────────────────────────────────────────────
        # Telephony — requires specific call-centre acoustics, not generic indoor
        "Telephone": ("environ", "call_center"),
        "Telephone dialing, DTMF": ("environ", "call_center"),
        "Reverberation": ("environ", "background_shift"),
        "Echo": ("environ", "background_shift"),
        "Background noise": ("environ", "noise_floor_change"),
        "Noise": ("environ", "noise_floor_change"),
        "White noise": ("environ", "noise_floor_change"),
        "Pink noise": ("environ", "noise_floor_change"),
        "Static": ("environ", "noise_floor_change"),
        "Music": ("environ", "music"),
        # Nature
        "Bird": ("environ", "birdsong"),
        "Bird vocalization, bird call, bird song": ("environ", "birdsong"),
        "Chirp, tweet": ("environ", "birdsong"),
        "Wind": ("environ", "wind"),
        "Wind noise (microphone)": ("environ", "wind"),
        "Rain": ("environ", "rain"),
        "Rain on surface": ("environ", "rain"),
        "Water": ("environ", "water"),
        "Stream": ("environ", "water"),
        # Urban
        "Traffic noise, roadway noise": ("environ", "traffic"),
        "Vehicle": ("environ", "traffic"),
        "Crowd": ("environ", "crowd_chatter"),
        "Chatter": ("environ", "crowd_chatter"),
        "Construction": ("environ", "construction"),
        "Drill": ("environ", "construction"),
        # Indoor
        "Air conditioning": ("environ", "hvac"),
        "Mechanical fan": ("environ", "hvac"),
        "Computer keyboard": ("environ", "keyboard_typing"),
        "Typing": ("environ", "keyboard_typing"),
        "Restaurant": ("environ", "restaurant"),
        "Dishes, pots, and pans": ("environ", "restaurant"),
        # ── Acoustic scene labels ──────────────────────────────────────────────
        # "Inside, small/large room" moved from environ to scene — they correctly
        # describe the acoustic scene but are NOT specific enough for call_center.
        "Inside, small room": ("scene", "indoor_quiet"),
        "Inside, large room or hall": ("scene", "indoor_crowd"),
        "Outside, urban or manmade": ("scene", "outdoor_urban"),
        "Field recording": ("scene", "outdoor_nature"),
        "Rail transport": ("scene", "public_transit"),
        "Bus": ("scene", "public_transit"),
        "Train": ("scene", "public_transit"),
        "Car": ("scene", "vehicle"),
        "Truck": ("scene", "vehicle"),
        "Motorcycle": ("scene", "vehicle"),
        # Music in the queue sense — "Musical instrument" is more specific
        # than the ambiguous top-level "Music" class
        "Musical instrument": ("queue", "hold_music"),
        "Piano": ("queue", "hold_music"),
        "Guitar": ("queue", "hold_music"),
    }

    # Additional (event_type, label) mappings for AudioSet classes that feed
    # more than one category. "Crowd" used to appear twice in _LABEL_MAP
    # (speaker + environ); the dict literal kept only the environ entry, so
    # the speaker mapping was silently dropped. It is restored here.
    _EXTRA_LABEL_MAP: dict[str, tuple[tuple[str, str], ...]] = {
        "Crowd": (("speaker", "human_multi"),),
    }

    def __init__(self) -> None:
        """
        Load the AST feature extractor and model, on CUDA when available.

        Raises:
            ImportError: if transformers or torch is not installed. The
                chained cause names the package that is actually missing.
        """
        try:
            # torch is imported inside the guard too, so a missing torch gets
            # the same install hint as a missing transformers.
            import torch
            from transformers import ASTFeatureExtractor, ASTForAudioClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for AST acoustic classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device)
        self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID)
        self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to(
            self._device
        )
        self._model.eval()

    @staticmethod
    def _as_float32(audio):
        """
        Convert ``audio`` to a 1-D float32 NumPy array.

        bytes-like input (bytes, bytearray, memoryview) is reinterpreted as
        raw float32 samples. Anything else is converted element-wise.

        Raises:
            ValueError: if a bytes-like buffer's size is not a multiple of 4.
        """
        import numpy as np

        # bytearray/memoryview must not reach np.asarray: that would turn
        # every raw byte into its own sample instead of decoding float32s.
        if isinstance(audio, (bytes, bytearray, memoryview)):
            return np.frombuffer(audio, dtype=np.float32)
        return np.asarray(audio, dtype=np.float32)

    @classmethod
    def _mappings_for(cls, ast_label: str) -> tuple[tuple[str, str], ...]:
        """Return every (event_type, cf-voice label) pair for an AudioSet class."""
        primary = cls._LABEL_MAP.get(ast_label)
        mapped = () if primary is None else (primary,)
        return mapped + cls._EXTRA_LABEL_MAP.get(ast_label, ())

    @classmethod
    def _best_per_category(
        cls,
        predictions: "list[tuple[str, float]]",
    ) -> dict[str, tuple[str, float]]:
        """
        Pick the highest-confidence mapped prediction for each event category.

        Args:
            predictions: (AudioSet class name, confidence) pairs, any order.

        Returns:
            event_type → (cf-voice label, confidence). Unmapped class names
            are ignored. On equal confidence the earlier prediction is kept.
        """
        best: dict[str, tuple[str, float]] = {}
        for ast_label, conf in predictions:
            for etype, cf_label in cls._mappings_for(ast_label):
                current = best.get(etype)
                if current is None or conf > current[1]:
                    best[etype] = (cf_label, conf)
        return best

    def classify_window(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AcousticResult:
        """
        Classify one window of float32 16kHz mono audio.

        Args:
            audio: samples as a float sequence, or a bytes-like buffer of raw
                float32 values.
            timestamp: stamped onto the result and every emitted event.

        Returns:
            AcousticResult with one event per category. A slot is None when
            no mapped class appears in the top ``_TOP_K`` predictions or its
            confidence is below ``_MIN_CONFIDENCE``. Empty audio yields an
            all-None result without running the model.

        Raises:
            ValueError: if a bytes-like ``audio`` is not a whole number of
                float32 samples.
        """
        import torch

        audio_np = self._as_float32(audio)
        if len(audio_np) == 0:
            return AcousticResult(
                queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp
            )

        inputs = self._extractor(
            audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt"
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)[0]

        id2label = self._model.config.id2label
        top_k = min(self._TOP_K, len(probs))
        # Pull values and indices from the single topk call; indexing probs
        # once per class would force a device sync for every prediction.
        top = probs.topk(top_k)
        predictions = [
            (id2label[idx], conf)
            for idx, conf in zip(top.indices.tolist(), top.values.tolist())
        ]
        best = self._best_per_category(predictions)

        def _event_for(etype: str) -> AudioEvent | None:
            """Build the event for ``etype`` if its best match clears the bar."""
            hit = best.get(etype)
            if hit is None:
                return None
            label, conf = hit
            if conf >= self._MIN_CONFIDENCE.get(etype, 0.10):
                return AudioEvent(
                    timestamp=timestamp,
                    event_type=etype,  # type: ignore[arg-type]
                    label=label,
                    confidence=round(conf, 4),
                )
            return None

        return AcousticResult(
            queue=_event_for("queue"),
            speaker=_event_for("speaker"),
            environ=_event_for("environ"),
            scene=_event_for("scene"),
            timestamp=timestamp,
        )


def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend":
    """
    Factory: return an AcousticBackend for the current environment.

    mock=True or CF_VOICE_MOCK=1 → MockAcousticBackend
    Otherwise → ASTAcousticBackend. If constructing it fails for any reason
    (missing [inference] extras, model load error, …) a warning is logged and
    a MockAcousticBackend is returned instead.

    An explicit ``mock`` argument takes precedence over CF_VOICE_MOCK.
    """
    import os

    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockAcousticBackend()
    try:
        return ASTAcousticBackend()
    except Exception as exc:  # deliberate best-effort: never fail to hand back a backend
        logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc)
        return MockAcousticBackend()