New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
208 lines
7.4 KiB
Python
# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction
#
# MIT licensed (opensmile-python package is MIT).
#
# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set:
#   - F0 mean / std / percentiles (pitch)
#   - Jitter / Shimmer (cycle-to-cycle variation — vocal tension)
#   - Energy / loudness envelope
#   - MFCCs, spectral centroid
#   - Speaking rate, pause ratio
#
# Runs on CPU in a thread pool executor — no GPU required. Designed to run
# in parallel with the GPU classifiers in context._classify_real_async() via
# asyncio.gather().
#
# Enable with: CF_VOICE_PROSODY=1 (default off)
# Install: pip install opensmile
#
# openSMILE docs: https://audeering.github.io/opensmile-python/
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass
|
|
from functools import partial
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Sample rate (Hz) passed to opensmile's process_signal; callers are expected
# to hand in 16 kHz mono float32 audio.
_SAMPLE_RATE = 16_000

# F0 std normalisation constant: values below this threshold indicate flat prosody.
# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm".
# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat.
# Column names of the eGeMAPS v02 functionals read out of the openSMILE result row.
_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm"  # normalised F0 std (flatness input)
_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean"  # mean pitch, semitones from 27.5 Hz
_LOUDNESS_FEATURE = "loudness_sma3_amean"  # mean loudness envelope (energy proxy)
_JITTER_FEATURE = "jitterLocal_sma3nz_amean"  # cycle-to-cycle pitch variation
_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean"  # cycle-to-cycle amplitude variation (dB)
# NOTE(review): declared but not read anywhere in this module — presumably
# reserved for a speaking-rate feature; confirm before removing.
_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec"
|
|
|
|
|
|
@dataclass
class ProsodicSignal:
    """
    Prosodic feature summary for one audio window.

    Values come from the openSMILE eGeMAPS v02 functionals and are raw
    feature magnitudes unless a field says otherwise.

    f0_mean: Mean F0 in semitones from 27.5Hz reference
    f0_std: Normalised F0 standard deviation (flatness indicator)
    jitter: Cycle-to-cycle pitch variation (vocal tension)
    shimmer: Cycle-to-cycle amplitude variation (vocal stress)
    loudness: Mean loudness (energy proxy)
    flat_f0_score: Normalised flatness: 1.0 = maximally flat, 0.0 = varied.
    sarcasm_risk: 0-1 heuristic score combining flat F0, calm-positive
        audio (from DimensionalResult if available), and optional
        text-audio divergence (linnet#22 signal, not yet wired).
    """

    f0_mean: float
    f0_std: float
    jitter: float
    shimmer: float
    loudness: float
    flat_f0_score: float
    sarcasm_risk: float
|
|
|
|
|
|
def _compute_sarcasm_risk(
|
|
flat_f0: float,
|
|
calm_positive: float = 0.0,
|
|
text_divergence: float = 0.0,
|
|
) -> float:
|
|
"""
|
|
Heuristic sarcasm indicator. Not a trained model — a signal to combine
|
|
with text divergence (linnet#22) for the final confidence score.
|
|
|
|
flat_f0: Normalised F0 flatness (1.0 = flat, 0.0 = varied).
|
|
calm_positive: DimensionalResult.calm_positive_score() when available.
|
|
text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22.
|
|
Pass 0.0 until the parallel text classifier is wired.
|
|
|
|
Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%).
|
|
"""
|
|
return min(1.0, flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3)
|
|
|
|
|
|
class ProsodicExtractor:
|
|
"""
|
|
openSMILE eGeMAPS feature extractor for a single audio window.
|
|
|
|
CPU-bound inference — uses thread pool executor to avoid blocking asyncio.
|
|
Lazy-loads opensmile on first call so import cost is deferred.
|
|
|
|
Usage
|
|
-----
|
|
extractor = ProsodicExtractor()
|
|
signal = await extractor.extract_async(audio_float32)
|
|
print(signal.flat_f0_score, signal.sarcasm_risk)
|
|
"""
|
|
|
|
def __init__(self) -> None:
|
|
self._smile = None
|
|
|
|
def _ensure_loaded(self) -> None:
|
|
"""Lazy-load opensmile on first extract call."""
|
|
if self._smile is not None:
|
|
return
|
|
|
|
try:
|
|
import opensmile
|
|
except ImportError as exc:
|
|
raise ImportError(
|
|
"opensmile is required for prosodic feature extraction. "
|
|
"Install with: pip install opensmile"
|
|
) from exc
|
|
|
|
self._smile = opensmile.Smile(
|
|
feature_set=opensmile.FeatureSet.eGeMAPSv02,
|
|
feature_level=opensmile.FeatureLevel.Functionals,
|
|
)
|
|
logger.info("openSMILE eGeMAPS loaded")
|
|
|
|
def _extract_sync(
|
|
self,
|
|
audio_float32: np.ndarray,
|
|
calm_positive: float = 0.0,
|
|
text_divergence: float = 0.0,
|
|
) -> ProsodicSignal:
|
|
"""
|
|
Synchronous feature extraction. Always call via extract_async.
|
|
|
|
Returns a ProsodicSignal with eGeMAPS features and a sarcasm risk score.
|
|
If opensmile raises (e.g. audio too short, no voiced frames), returns a
|
|
zero-filled ProsodicSignal so the caller does not need to handle exceptions.
|
|
"""
|
|
self._ensure_loaded()
|
|
|
|
try:
|
|
feats = self._smile.process_signal(audio_float32, _SAMPLE_RATE)
|
|
row = feats.iloc[0]
|
|
|
|
f0_mean = float(row.get(_F0_MEAN_FEATURE, 0.0))
|
|
f0_std = float(row.get(_F0_STD_NORM_FEATURE, 0.0))
|
|
jitter = float(row.get(_JITTER_FEATURE, 0.0))
|
|
shimmer = float(row.get(_SHIMMER_FEATURE, 0.0))
|
|
loudness = float(row.get(_LOUDNESS_FEATURE, 0.0))
|
|
|
|
except Exception as exc:
|
|
logger.debug("openSMILE extraction failed (likely silent window): %s", exc)
|
|
return ProsodicSignal(
|
|
f0_mean=0.0, f0_std=0.0, jitter=0.0,
|
|
shimmer=0.0, loudness=0.0, flat_f0_score=0.0, sarcasm_risk=0.0,
|
|
)
|
|
|
|
# Normalise F0 variance to a flatness score.
|
|
# f0_std of 0.4 semitones = neutral baseline → flat_f0 = 0.0
|
|
# f0_std of 0.0 = maximally flat → flat_f0 = 1.0
|
|
flat_f0 = 1.0 - min(f0_std / 0.4, 1.0)
|
|
|
|
sarcasm = _compute_sarcasm_risk(
|
|
flat_f0=flat_f0,
|
|
calm_positive=calm_positive,
|
|
text_divergence=text_divergence,
|
|
)
|
|
|
|
return ProsodicSignal(
|
|
f0_mean=round(f0_mean, 4),
|
|
f0_std=round(f0_std, 4),
|
|
jitter=round(jitter, 6),
|
|
shimmer=round(shimmer, 6),
|
|
loudness=round(loudness, 4),
|
|
flat_f0_score=round(flat_f0, 4),
|
|
sarcasm_risk=round(sarcasm, 4),
|
|
)
|
|
|
|
async def extract_async(
|
|
self,
|
|
audio_float32: np.ndarray,
|
|
calm_positive: float = 0.0,
|
|
text_divergence: float = 0.0,
|
|
) -> ProsodicSignal:
|
|
"""
|
|
Extract prosodic features without blocking the event loop.
|
|
|
|
calm_positive: Pass DimensionalResult.calm_positive_score() when
|
|
dimensional classification has already run.
|
|
text_divergence: Pass abs(transcript_sentiment - valence) when the
|
|
parallel text classifier (linnet#22) is wired.
|
|
"""
|
|
loop = asyncio.get_running_loop()
|
|
return await loop.run_in_executor(
|
|
None,
|
|
partial(self._extract_sync, audio_float32, calm_positive, text_divergence),
|
|
)
|
|
|
|
@classmethod
|
|
def from_env(cls) -> "ProsodicExtractor":
|
|
"""Construct from environment. Raises if CF_VOICE_PROSODY is not set."""
|
|
if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
|
|
raise EnvironmentError(
|
|
"CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. "
|
|
"Add it to your .env and install opensmile: pip install opensmile"
|
|
)
|
|
return cls()
|