cf-voice/cf_voice/acoustic.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

366 lines
17 KiB
Python

# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier
#
# MIT licensed (Protocol + mock). BSL 1.1 (real AST inference).
# Requires [inference] extras for real mode.
#
# This module is the AMD (answering machine detection) backbone for Osprey.
# It runs in parallel with the STT pipeline — it never processes words,
# only acoustic features (pitch, timbre, background, DTMF tones, ringback).
#
# Real mode runs the AST model (MIT/ast-finetuned-audioset-10-10-0.4593),
# which replaced the YAMNet stub. Mock mode emits a plausible call-lifecycle sequence.
from __future__ import annotations
import asyncio
import logging
import random
import time
from dataclasses import dataclass
from typing import AsyncIterator, Protocol, Sequence, runtime_checkable
from cf_voice.events import AudioEvent, QUEUE_LABELS, SPEAKER_LABELS, ENVIRON_LABELS, SCENE_LABELS
logger = logging.getLogger(__name__)
_SAMPLE_RATE = 16_000
@dataclass
class AcousticResult:
    """Batch of AudioEvents produced from a single audio window."""

    # One event per category; a field is None when no mapped class cleared
    # its confidence threshold (or, in mock mode, never — all four are set).
    queue: AudioEvent | None     # call-state: ringback, hold_music, dtmf_tone, silence, …
    speaker: AudioEvent | None   # who is talking: human_single, human_multi, ivr_synth, …
    environ: AudioEvent | None   # background environment: music, call_center, background_shift, …
    scene: AudioEvent | None     # overall acoustic scene: indoor_quiet, outdoor_urban, …
    timestamp: float             # window start time in seconds, echoed from the caller
@runtime_checkable
class AcousticBackend(Protocol):
    """
    Interface for acoustic event classifiers.

    classify_window() takes a PCM float32 buffer (mono, 16kHz) and returns an
    AcousticResult covering one analysis window (~2s). It is synchronous and
    runs in a thread pool when called from async code.
    """

    def classify_window(
        self,
        audio: "list[float] | bytes",  # float32 samples, or their raw bytes
        timestamp: float = 0.0,        # window start (seconds), echoed into events
    ) -> AcousticResult:
        ...
@runtime_checkable
class SceneBackend(Protocol):
    """
    Interface for dedicated acoustic scene classifiers.

    Separate from AcousticBackend to allow future swapping to a specialised
    scene model (e.g. AudioSet acoustic-scene subset) without touching the
    telephony event classifier.
    """

    def classify_scene(
        self,
        audio: "list[float] | bytes",  # float32 samples, or their raw bytes
        timestamp: float = 0.0,        # window start (seconds), echoed into the event
    ) -> AudioEvent | None:
        # Returns None when no scene can be identified for this window.
        ...
# ── Call lifecycle for mock mode ──────────────────────────────────────────────
# Approximates what a real outbound call looks like acoustically.
# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center
_MOCK_LIFECYCLE: list[dict] = [
# (min_s, max_s): how long to stay in this phase
{"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)},
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)},
{"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)},
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)},
{"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
{"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
{"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)},
# AMD moment: background_shift is the primary signal
{"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)},
{"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)},
]
class MockAcousticBackend:
"""
Synthetic acoustic classifier for development and CI.
Cycles through a plausible call lifecycle so Osprey's IVR state machine
can be tested without real telephony. The AMD signal (background_shift →
human_single) is emitted at the right point in the sequence.
Usage:
backend = MockAcousticBackend(seed=42)
result = backend.classify_window(b"", timestamp=4.5)
print(result.environ.label) # → "hold_music", "background_shift", etc.
"""
def __init__(self, seed: int | None = None) -> None:
self._rng = random.Random(seed)
self._phase_idx = 0
self._phase_start = time.monotonic()
self._phase_dur = self._draw_phase_dur(0)
def _draw_phase_dur(self, idx: int) -> float:
lo, hi = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"]
return self._rng.uniform(lo, hi)
def _current_phase(self) -> dict:
now = time.monotonic()
elapsed = now - self._phase_start
if elapsed >= self._phase_dur:
self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE)
self._phase_start = now
self._phase_dur = self._draw_phase_dur(self._phase_idx)
return _MOCK_LIFECYCLE[self._phase_idx]
def _make_event(
self,
event_type: str,
label: str,
timestamp: float,
) -> AudioEvent:
return AudioEvent(
timestamp=timestamp,
event_type=event_type, # type: ignore[arg-type]
label=label,
confidence=self._rng.uniform(0.72, 0.97),
)
def classify_window(
self,
audio: "list[float] | bytes",
timestamp: float = 0.0,
) -> AcousticResult:
phase = self._current_phase()
return AcousticResult(
queue=self._make_event("queue", phase["queue"], timestamp),
speaker=self._make_event("speaker", phase["speaker"], timestamp),
environ=self._make_event("environ", phase["environ"], timestamp),
scene=self._make_event("scene", phase["scene"], timestamp),
timestamp=timestamp,
)
# ── AST acoustic backend (BSL 1.1) ───────────────────────────────────────────
class ASTAcousticBackend:
    """
    Audio Spectrogram Transformer acoustic event classifier.
    BSL 1.1 — requires [inference] extras.

    Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to
    classify queue state, speaker type, background environment, and acoustic
    scene from a single forward pass. The top-K predictions are scanned; the
    highest-confidence match per event category is emitted.

    Model: MIT/ast-finetuned-audioset-10-10-0.4593
    VRAM: ~300 MB on CUDA (fp32)
    Input: float32 16kHz mono audio (any length; feature extractor pads/truncates)

    Replaces the YAMNet stub. Synchronous — run from a thread pool executor
    when called from async code.
    """

    _MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
    _SAMPLE_RATE = 16_000
    _TOP_K = 20  # scan more classes — many relevant ones are in the 10-20 range

    # Minimum confidence below which an event is suppressed even if it's the
    # top match in its category.
    _MIN_CONFIDENCE: dict[str, float] = {
        "queue": 0.10,
        "speaker": 0.08,
        "environ": 0.12,
        "scene": 0.08,  # scenes fire reliably — lower bar is fine
    }

    # AudioSet class name → (event_type, cf-voice label).
    # Top-K predictions are scanned; highest confidence per category wins.
    # "call_center" requires dedicated call-centre acoustics, not generic indoor.
    #
    # CAUTION: this is a plain dict, so a duplicated AudioSet class name
    # silently keeps only the LAST entry. "Music" was once duplicated
    # (queue + environ) and "Crowd" (speaker + environ), each losing the
    # earlier mapping. Fixed: "Music" maps to environ only (hold_music uses
    # the more specific "Musical instrument" parent), and "Crowd" maps to
    # speaker only ("Chatter" covers the environ crowd_chatter label).
    _LABEL_MAP: dict[str, tuple[str, str]] = {
        # ── Queue / call-state labels ──────────────────────────────────────────
        "Ringtone": ("queue", "ringback"),
        "Telephone bell ringing": ("queue", "ringback"),
        "Busy signal": ("queue", "busy"),
        "Dial tone": ("queue", "dtmf_tone"),
        "DTMF": ("queue", "dtmf_tone"),
        "Silence": ("queue", "silence"),
        # ── Speaker type labels ────────────────────────────────────────────────
        "Speech": ("speaker", "human_single"),
        "Male speech, man speaking": ("speaker", "human_single"),
        "Female speech, woman speaking": ("speaker", "human_single"),
        "Child speech, kid speaking": ("speaker", "human_single"),
        "Crowd": ("speaker", "human_multi"),
        "Hubbub, speech noise, speech babble": ("speaker", "human_multi"),
        "Laughter": ("speaker", "human_multi"),
        "Chuckle, chortle": ("speaker", "human_multi"),
        "Speech synthesizer": ("speaker", "ivr_synth"),
        # ── Environmental labels ───────────────────────────────────────────────
        # Telephony — requires specific call-centre acoustics, not generic indoor
        "Telephone": ("environ", "call_center"),
        "Telephone dialing, DTMF": ("environ", "call_center"),
        "Reverberation": ("environ", "background_shift"),
        "Echo": ("environ", "background_shift"),
        "Background noise": ("environ", "noise_floor_change"),
        "Noise": ("environ", "noise_floor_change"),
        "White noise": ("environ", "noise_floor_change"),
        "Pink noise": ("environ", "noise_floor_change"),
        "Static": ("environ", "noise_floor_change"),
        "Music": ("environ", "music"),
        # Nature
        "Bird": ("environ", "birdsong"),
        "Bird vocalization, bird call, bird song": ("environ", "birdsong"),
        "Chirp, tweet": ("environ", "birdsong"),
        "Wind": ("environ", "wind"),
        "Wind noise (microphone)": ("environ", "wind"),
        "Rain": ("environ", "rain"),
        "Rain on surface": ("environ", "rain"),
        "Water": ("environ", "water"),
        "Stream": ("environ", "water"),
        # Urban
        "Traffic noise, roadway noise": ("environ", "traffic"),
        "Vehicle": ("environ", "traffic"),
        # NOTE: "Crowd" belongs to the speaker section above — do not re-add
        # it here; a duplicate key would silently drop the speaker mapping.
        "Chatter": ("environ", "crowd_chatter"),
        "Construction": ("environ", "construction"),
        "Drill": ("environ", "construction"),
        # Indoor
        "Air conditioning": ("environ", "hvac"),
        "Mechanical fan": ("environ", "hvac"),
        "Computer keyboard": ("environ", "keyboard_typing"),
        "Typing": ("environ", "keyboard_typing"),
        "Restaurant": ("environ", "restaurant"),
        "Dishes, pots, and pans": ("environ", "restaurant"),
        # ── Acoustic scene labels ──────────────────────────────────────────────
        # "Inside, small/large room" moved from environ to scene — they correctly
        # describe the acoustic scene but are NOT specific enough for call_center.
        "Inside, small room": ("scene", "indoor_quiet"),
        "Inside, large room or hall": ("scene", "indoor_crowd"),
        "Outside, urban or manmade": ("scene", "outdoor_urban"),
        "Field recording": ("scene", "outdoor_nature"),
        "Rail transport": ("scene", "public_transit"),
        "Bus": ("scene", "public_transit"),
        "Train": ("scene", "public_transit"),
        "Car": ("scene", "vehicle"),
        "Truck": ("scene", "vehicle"),
        "Motorcycle": ("scene", "vehicle"),
        # Music in the queue sense — "Musical instrument" is more specific
        # than the ambiguous top-level "Music" class
        "Musical instrument": ("queue", "hold_music"),
        "Piano": ("queue", "hold_music"),
        "Guitar": ("queue", "hold_music"),
    }

    def __init__(self) -> None:
        """Load the AST model and feature extractor (may download weights).

        Raises:
            ImportError: if the [inference] extras (transformers) are missing.
        """
        try:
            from transformers import ASTFeatureExtractor, ASTForAudioClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for AST acoustic classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        import torch

        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device)
        self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID)
        self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to(
            self._device
        )
        self._model.eval()  # inference only — disable dropout etc.

    def classify_window(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AcousticResult:
        """
        Classify one audio window into queue/speaker/environ/scene events.

        Args:
            audio: float32 PCM samples (mono, 16kHz) as a list of floats or
                their raw bytes (interpreted as little-endian float32).
            timestamp: window start time (seconds), echoed into every event.

        Returns:
            AcousticResult with at most one AudioEvent per category; a
            category is None when its best match fell below the floor in
            _MIN_CONFIDENCE. An empty window yields an all-None result.
        """
        import numpy as np
        import torch

        if isinstance(audio, bytes):
            audio_np = np.frombuffer(audio, dtype=np.float32)
        else:
            audio_np = np.asarray(audio, dtype=np.float32)
        if len(audio_np) == 0:
            # Nothing to classify — don't feed an empty tensor to the model.
            return AcousticResult(queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp)

        inputs = self._extractor(
            audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt"
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self._model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)[0]

        # topk returns both values and indices — use them directly instead of
        # re-indexing probs per class.
        id2label = self._model.config.id2label
        top = probs.topk(min(self._TOP_K, len(probs)))
        predictions = [
            (id2label[idx], float(score))
            for idx, score in zip(top.indices.tolist(), top.values.tolist())
        ]

        # Take highest-confidence match per category.
        best: dict[str, tuple[str, float]] = {}  # event_type → (label, conf)
        for ast_label, conf in predictions:
            mapping = self._LABEL_MAP.get(ast_label)
            if mapping is None:
                continue
            etype, cf_label = mapping
            if etype not in best or conf > best[etype][1]:
                best[etype] = (cf_label, conf)

        def _event(etype: str) -> AudioEvent | None:
            """Emit the winning event for *etype*, or None below its floor."""
            if etype not in best:
                return None
            label, conf = best[etype]
            if conf < self._MIN_CONFIDENCE.get(etype, 0.10):
                return None
            return AudioEvent(
                timestamp=timestamp,
                event_type=etype,  # type: ignore[arg-type]
                label=label,
                confidence=round(conf, 4),
            )

        return AcousticResult(
            queue=_event("queue"),
            speaker=_event("speaker"),
            environ=_event("environ"),
            scene=_event("scene"),
            timestamp=timestamp,
        )
def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend":
    """
    Factory: return an AcousticBackend for the current environment.

    Args:
        mock: force mock mode (True) or real mode (False); None defers to the
            CF_VOICE_MOCK environment variable ("1" selects mock).

    Returns:
        MockAcousticBackend when mock mode is selected or the real backend
        cannot be constructed; otherwise ASTAcousticBackend.
    """
    import os

    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockAcousticBackend()
    try:
        return ASTAcousticBackend()
    # Deliberately broad best-effort fallback: model construction can fail
    # with ImportError (missing extras), OSError (no weights / no network),
    # or CUDA errors — any of these should degrade to the mock, not crash.
    # (The original `except (ImportError, Exception)` was redundant:
    # ImportError is already a subclass of Exception.)
    except Exception as exc:
        logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc)
        return MockAcousticBackend()