New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
366 lines · 17 KiB · Python
# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier
|
|
#
|
|
# MIT licensed (Protocol + mock). BSL 1.1 (real AST inference).
|
|
# Requires [inference] extras for real mode.
|
|
#
|
|
# This module is the AMD (answering machine detection) backbone for Osprey.
|
|
# It runs in parallel with the STT pipeline — it never processes words,
|
|
# only acoustic features (pitch, timbre, background, DTMF tones, ringback).
|
|
#
|
|
# Navigation v0.2.x wires the real AST model (ASTAcousticBackend, below).
|
|
# Current: mock emits a plausible call-lifecycle sequence.
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import random
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import AsyncIterator, Protocol, Sequence, runtime_checkable
|
|
|
|
from cf_voice.events import AudioEvent, QUEUE_LABELS, SPEAKER_LABELS, ENVIRON_LABELS, SCENE_LABELS
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SAMPLE_RATE = 16_000
|
|
|
|
|
|
@dataclass
|
|
class AcousticResult:
|
|
"""Batch of AudioEvents produced from a single audio window."""
|
|
queue: AudioEvent | None
|
|
speaker: AudioEvent | None
|
|
environ: AudioEvent | None
|
|
scene: AudioEvent | None
|
|
timestamp: float
|
|
|
|
|
|
@runtime_checkable
|
|
class AcousticBackend(Protocol):
|
|
"""
|
|
Interface for acoustic event classifiers.
|
|
|
|
classify_window() takes a PCM float32 buffer (mono, 16kHz) and returns an
|
|
AcousticResult covering one analysis window (~2s). It is synchronous and
|
|
runs in a thread pool when called from async code.
|
|
"""
|
|
|
|
def classify_window(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AcousticResult:
|
|
...
|
|
|
|
|
|
@runtime_checkable
|
|
class SceneBackend(Protocol):
|
|
"""
|
|
Interface for dedicated acoustic scene classifiers.
|
|
|
|
Separate from AcousticBackend to allow future swapping to a specialised
|
|
scene model (e.g. AudioSet acoustic-scene subset) without touching the
|
|
telephony event classifier.
|
|
"""
|
|
|
|
def classify_scene(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AudioEvent | None:
|
|
...
|
|
|
|
|
|
# ── Call lifecycle for mock mode ──────────────────────────────────────────────
|
|
# Approximates what a real outbound call looks like acoustically.
|
|
# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center
|
|
|
|
_MOCK_LIFECYCLE: list[dict] = [
|
|
# (min_s, max_s): how long to stay in this phase
|
|
{"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)},
|
|
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)},
|
|
{"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)},
|
|
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)},
|
|
{"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
|
|
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
|
|
{"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)},
|
|
# AMD moment: background_shift is the primary signal
|
|
{"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)},
|
|
{"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)},
|
|
]
|
|
|
|
|
|
class MockAcousticBackend:
|
|
"""
|
|
Synthetic acoustic classifier for development and CI.
|
|
|
|
Cycles through a plausible call lifecycle so Osprey's IVR state machine
|
|
can be tested without real telephony. The AMD signal (background_shift →
|
|
human_single) is emitted at the right point in the sequence.
|
|
|
|
Usage:
|
|
backend = MockAcousticBackend(seed=42)
|
|
result = backend.classify_window(b"", timestamp=4.5)
|
|
print(result.environ.label) # → "hold_music", "background_shift", etc.
|
|
"""
|
|
|
|
def __init__(self, seed: int | None = None) -> None:
|
|
self._rng = random.Random(seed)
|
|
self._phase_idx = 0
|
|
self._phase_start = time.monotonic()
|
|
self._phase_dur = self._draw_phase_dur(0)
|
|
|
|
def _draw_phase_dur(self, idx: int) -> float:
|
|
lo, hi = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"]
|
|
return self._rng.uniform(lo, hi)
|
|
|
|
def _current_phase(self) -> dict:
|
|
now = time.monotonic()
|
|
elapsed = now - self._phase_start
|
|
if elapsed >= self._phase_dur:
|
|
self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE)
|
|
self._phase_start = now
|
|
self._phase_dur = self._draw_phase_dur(self._phase_idx)
|
|
return _MOCK_LIFECYCLE[self._phase_idx]
|
|
|
|
def _make_event(
|
|
self,
|
|
event_type: str,
|
|
label: str,
|
|
timestamp: float,
|
|
) -> AudioEvent:
|
|
return AudioEvent(
|
|
timestamp=timestamp,
|
|
event_type=event_type, # type: ignore[arg-type]
|
|
label=label,
|
|
confidence=self._rng.uniform(0.72, 0.97),
|
|
)
|
|
|
|
def classify_window(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AcousticResult:
|
|
phase = self._current_phase()
|
|
return AcousticResult(
|
|
queue=self._make_event("queue", phase["queue"], timestamp),
|
|
speaker=self._make_event("speaker", phase["speaker"], timestamp),
|
|
environ=self._make_event("environ", phase["environ"], timestamp),
|
|
scene=self._make_event("scene", phase["scene"], timestamp),
|
|
timestamp=timestamp,
|
|
)
|
|
|
|
|
|
# ── AST acoustic backend (BSL 1.1) ───────────────────────────────────────────
|
|
|
|
|
|
class ASTAcousticBackend:
|
|
"""
|
|
Audio Spectrogram Transformer acoustic event classifier.
|
|
|
|
BSL 1.1 — requires [inference] extras.
|
|
|
|
Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to
|
|
classify queue state, speaker type, and background environment from a
|
|
single forward pass. Top-15 predictions are scanned; the highest-confidence
|
|
match per event category is emitted.
|
|
|
|
Model: MIT/ast-finetuned-audioset-10-10-0.4593
|
|
VRAM: ~300 MB on CUDA (fp32)
|
|
Input: float32 16kHz mono audio (any length; feature extractor pads/truncates)
|
|
|
|
Replaces the YAMNet stub. Synchronous — run from a thread pool executor
|
|
when called from async code.
|
|
"""
|
|
|
|
_MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
|
|
_SAMPLE_RATE = 16_000
|
|
_TOP_K = 20 # scan more classes — many relevant ones are in the 10-20 range
|
|
|
|
# Minimum confidence below which an event is suppressed even if it's the
|
|
# top match in its category.
|
|
_MIN_CONFIDENCE: dict[str, float] = {
|
|
"queue": 0.10,
|
|
"speaker": 0.08,
|
|
"environ": 0.12,
|
|
"scene": 0.08, # scenes fire reliably — lower bar is fine
|
|
}
|
|
|
|
# AudioSet class name → (event_type, cf-voice label).
|
|
# Top-K predictions are scanned; highest confidence per category wins.
|
|
# "call_center" requires dedicated call-centre acoustics, not generic indoor.
|
|
# "Music" was previously duplicated (queue + environ) — Python dicts keep the
|
|
# last entry, silently losing the queue mapping. Fixed: use the specific
|
|
# "Musical instrument" AudioSet parent for hold_music; "Music" maps to environ.
|
|
_LABEL_MAP: dict[str, tuple[str, str]] = {
|
|
# ── Queue / call-state labels ──────────────────────────────────────────
|
|
"Ringtone": ("queue", "ringback"),
|
|
"Telephone bell ringing": ("queue", "ringback"),
|
|
"Busy signal": ("queue", "busy"),
|
|
"Dial tone": ("queue", "dtmf_tone"),
|
|
"DTMF": ("queue", "dtmf_tone"),
|
|
"Silence": ("queue", "silence"),
|
|
# ── Speaker type labels ────────────────────────────────────────────────
|
|
"Speech": ("speaker", "human_single"),
|
|
"Male speech, man speaking": ("speaker", "human_single"),
|
|
"Female speech, woman speaking": ("speaker", "human_single"),
|
|
"Child speech, kid speaking": ("speaker", "human_single"),
|
|
"Crowd": ("speaker", "human_multi"),
|
|
"Hubbub, speech noise, speech babble": ("speaker", "human_multi"),
|
|
"Laughter": ("speaker", "human_multi"),
|
|
"Chuckle, chortle": ("speaker", "human_multi"),
|
|
"Speech synthesizer": ("speaker", "ivr_synth"),
|
|
# ── Environmental labels ───────────────────────────────────────────────
|
|
# Telephony — requires specific call-centre acoustics, not generic indoor
|
|
"Telephone": ("environ", "call_center"),
|
|
"Telephone dialing, DTMF": ("environ", "call_center"),
|
|
"Reverberation": ("environ", "background_shift"),
|
|
"Echo": ("environ", "background_shift"),
|
|
"Background noise": ("environ", "noise_floor_change"),
|
|
"Noise": ("environ", "noise_floor_change"),
|
|
"White noise": ("environ", "noise_floor_change"),
|
|
"Pink noise": ("environ", "noise_floor_change"),
|
|
"Static": ("environ", "noise_floor_change"),
|
|
"Music": ("environ", "music"),
|
|
# Nature
|
|
"Bird": ("environ", "birdsong"),
|
|
"Bird vocalization, bird call, bird song": ("environ", "birdsong"),
|
|
"Chirp, tweet": ("environ", "birdsong"),
|
|
"Wind": ("environ", "wind"),
|
|
"Wind noise (microphone)": ("environ", "wind"),
|
|
"Rain": ("environ", "rain"),
|
|
"Rain on surface": ("environ", "rain"),
|
|
"Water": ("environ", "water"),
|
|
"Stream": ("environ", "water"),
|
|
# Urban
|
|
"Traffic noise, roadway noise": ("environ", "traffic"),
|
|
"Vehicle": ("environ", "traffic"),
|
|
"Crowd": ("environ", "crowd_chatter"),
|
|
"Chatter": ("environ", "crowd_chatter"),
|
|
"Construction": ("environ", "construction"),
|
|
"Drill": ("environ", "construction"),
|
|
# Indoor
|
|
"Air conditioning": ("environ", "hvac"),
|
|
"Mechanical fan": ("environ", "hvac"),
|
|
"Computer keyboard": ("environ", "keyboard_typing"),
|
|
"Typing": ("environ", "keyboard_typing"),
|
|
"Restaurant": ("environ", "restaurant"),
|
|
"Dishes, pots, and pans": ("environ", "restaurant"),
|
|
# ── Acoustic scene labels ──────────────────────────────────────────────
|
|
# "Inside, small/large room" moved from environ to scene — they correctly
|
|
# describe the acoustic scene but are NOT specific enough for call_center.
|
|
"Inside, small room": ("scene", "indoor_quiet"),
|
|
"Inside, large room or hall": ("scene", "indoor_crowd"),
|
|
"Outside, urban or manmade": ("scene", "outdoor_urban"),
|
|
"Field recording": ("scene", "outdoor_nature"),
|
|
"Rail transport": ("scene", "public_transit"),
|
|
"Bus": ("scene", "public_transit"),
|
|
"Train": ("scene", "public_transit"),
|
|
"Car": ("scene", "vehicle"),
|
|
"Truck": ("scene", "vehicle"),
|
|
"Motorcycle": ("scene", "vehicle"),
|
|
# Music in the queue sense — "Musical instrument" is more specific
|
|
# than the ambiguous top-level "Music" class
|
|
"Musical instrument": ("queue", "hold_music"),
|
|
"Piano": ("queue", "hold_music"),
|
|
"Guitar": ("queue", "hold_music"),
|
|
}
|
|
|
|
def __init__(self) -> None:
|
|
try:
|
|
from transformers import ASTFeatureExtractor, ASTForAudioClassification
|
|
except ImportError as exc:
|
|
raise ImportError(
|
|
"transformers is required for AST acoustic classification. "
|
|
"Install with: pip install cf-voice[inference]"
|
|
) from exc
|
|
|
|
import torch
|
|
|
|
self._device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device)
|
|
self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID)
|
|
self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to(
|
|
self._device
|
|
)
|
|
self._model.eval()
|
|
|
|
def classify_window(
|
|
self,
|
|
audio: "list[float] | bytes",
|
|
timestamp: float = 0.0,
|
|
) -> AcousticResult:
|
|
import numpy as np
|
|
import torch
|
|
|
|
if isinstance(audio, bytes):
|
|
audio_np = np.frombuffer(audio, dtype=np.float32)
|
|
else:
|
|
audio_np = np.asarray(audio, dtype=np.float32)
|
|
|
|
if len(audio_np) == 0:
|
|
return AcousticResult(queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp)
|
|
|
|
inputs = self._extractor(
|
|
audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt"
|
|
)
|
|
inputs = {k: v.to(self._device) for k, v in inputs.items()}
|
|
|
|
with torch.no_grad():
|
|
logits = self._model(**inputs).logits
|
|
probs = torch.softmax(logits, dim=-1)[0]
|
|
id2label = self._model.config.id2label
|
|
|
|
top_k = min(self._TOP_K, len(probs))
|
|
top_indices = probs.topk(top_k).indices.tolist()
|
|
predictions = [(id2label[i], float(probs[i])) for i in top_indices]
|
|
|
|
# Take highest-confidence match per category
|
|
best: dict[str, tuple[str, float]] = {} # event_type → (label, conf)
|
|
for ast_label, conf in predictions:
|
|
mapping = self._LABEL_MAP.get(ast_label)
|
|
if mapping is None:
|
|
continue
|
|
etype, cf_label = mapping
|
|
if etype not in best or conf > best[etype][1]:
|
|
best[etype] = (cf_label, conf)
|
|
|
|
def _make_event(etype: str, label: str, conf: float) -> AudioEvent:
|
|
return AudioEvent(
|
|
timestamp=timestamp,
|
|
event_type=etype, # type: ignore[arg-type]
|
|
label=label,
|
|
confidence=round(conf, 4),
|
|
)
|
|
|
|
def _above_threshold(etype: str) -> bool:
|
|
if etype not in best:
|
|
return False
|
|
_, conf = best[etype]
|
|
return conf >= self._MIN_CONFIDENCE.get(etype, 0.10)
|
|
|
|
return AcousticResult(
|
|
queue=_make_event("queue", *best["queue"]) if _above_threshold("queue") else None,
|
|
speaker=_make_event("speaker", *best["speaker"]) if _above_threshold("speaker") else None,
|
|
environ=_make_event("environ", *best["environ"]) if _above_threshold("environ") else None,
|
|
scene=_make_event("scene", *best["scene"]) if _above_threshold("scene") else None,
|
|
timestamp=timestamp,
|
|
)
|
|
|
|
|
|
def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend":
|
|
"""
|
|
Factory: return an AcousticBackend for the current environment.
|
|
|
|
mock=True or CF_VOICE_MOCK=1 → MockAcousticBackend
|
|
Otherwise → ASTAcousticBackend (falls back to mock on import error)
|
|
"""
|
|
import os
|
|
use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
|
|
if use_mock:
|
|
return MockAcousticBackend()
|
|
try:
|
|
return ASTAcousticBackend()
|
|
except (ImportError, Exception) as exc:
|
|
logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc)
|
|
return MockAcousticBackend()
|