# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction
#
# MIT licensed (opensmile-python package is MIT).
#
# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set:
#   F0 mean / std / percentiles (pitch)
#   Jitter / Shimmer (cycle-to-cycle variation — vocal tension)
#   Energy / loudness envelope
#   MFCCs, spectral centroid
#   Speaking rate, pause ratio
#
# Runs on CPU in a thread pool executor — no GPU required. Designed to run
# in parallel with the GPU classifiers in context._classify_real_async() via
# asyncio.gather().
#
# Enable with: CF_VOICE_PROSODY=1 (default off)
# Install:     pip install opensmile
#
# openSMILE docs: https://audeering.github.io/opensmile-python/
from __future__ import annotations

import asyncio
import logging
import os
from dataclasses import dataclass
from functools import partial

import numpy as np

logger = logging.getLogger(__name__)

_SAMPLE_RATE = 16_000

# F0 std normalisation constant: values below this threshold indicate flat prosody.
# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm".
# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat.
# NOTE(review): eGeMAPS "stddevNorm" functionals are the coefficient of variation
# (std / mean), i.e. dimensionless rather than semitones — confirm the baseline
# below against real recordings.
_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm"
_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean"
_LOUDNESS_FEATURE = "loudness_sma3_amean"
_JITTER_FEATURE = "jitterLocal_sma3nz_amean"
_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean"
_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec"

# Neutral baseline for the normalised F0 std: at or above this value the
# flatness score is 0.0; at 0.0 the flatness score is 1.0.
_F0_STD_NEUTRAL = 0.4


@dataclass
class ProsodicSignal:
    """
    Summary prosodic features for a single audio window.

    These are derived from the openSMILE eGeMAPS v02 feature set.
    All values are raw feature magnitudes unless noted otherwise.

    f0_mean:       Mean F0 in semitones from 27.5Hz reference
    f0_std:        Normalised F0 standard deviation (flatness indicator)
    jitter:        Cycle-to-cycle pitch variation (vocal tension)
    shimmer:       Cycle-to-cycle amplitude variation (vocal stress)
    loudness:      Mean loudness (energy proxy)
    sarcasm_risk:  0-1 heuristic score combining flat F0, calm-positive audio
                   (from DimensionalResult if available), and optional
                   text-audio divergence (linnet#22 signal, not yet wired).
    flat_f0_score: Normalised flatness: 1.0 = maximally flat, 0.0 = varied.
                   Also 0.0 when no pitch was detected in the window.
    """

    f0_mean: float
    f0_std: float
    jitter: float
    shimmer: float
    loudness: float
    flat_f0_score: float
    sarcasm_risk: float


def _clamp01(value: float) -> float:
    """Clamp *value* into the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, value))


def _finite_or_zero(value: object) -> float:
    """Convert *value* to float, mapping NaN / +-inf to 0.0."""
    number = float(value)
    return number if np.isfinite(number) else 0.0


def _zero_signal() -> ProsodicSignal:
    """Return the all-zero ProsodicSignal used when extraction fails."""
    return ProsodicSignal(
        f0_mean=0.0,
        f0_std=0.0,
        jitter=0.0,
        shimmer=0.0,
        loudness=0.0,
        flat_f0_score=0.0,
        sarcasm_risk=0.0,
    )


def _compute_sarcasm_risk(
    flat_f0: float,
    calm_positive: float = 0.0,
    text_divergence: float = 0.0,
) -> float:
    """
    Heuristic sarcasm indicator. Not a trained model — a signal to combine with
    text divergence (linnet#22) for the final confidence score.

    flat_f0:         Normalised F0 flatness (1.0 = flat, 0.0 = varied).
    calm_positive:   DimensionalResult.calm_positive_score() when available.
    text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22.
                     Pass 0.0 until the parallel text classifier is wired.

    Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%).

    Returns the weighted sum clamped to [0.0, 1.0]. The lower clamp keeps an
    out-of-range negative input from producing a negative "risk".
    """
    weighted = flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3
    return _clamp01(weighted)


class ProsodicExtractor:
    """
    openSMILE eGeMAPS feature extractor for a single audio window.

    CPU-bound inference — uses thread pool executor to avoid blocking asyncio.
    Lazy-loads opensmile on first call so import cost is deferred.

    Usage
    -----
        extractor = ProsodicExtractor()
        signal = await extractor.extract_async(audio_float32)
        print(signal.flat_f0_score, signal.sarcasm_risk)
    """

    def __init__(self) -> None:
        # opensmile.Smile instance; created lazily by _ensure_loaded().
        self._smile = None

    def _ensure_loaded(self) -> None:
        """
        Lazy-load opensmile on first extract call.

        Raises ImportError (chained to the original) when the opensmile
        package is not installed. No lock guards the load, so two concurrent
        first calls may each construct a Smile instance; the last one wins.
        """
        if self._smile is not None:
            return
        try:
            import opensmile
        except ImportError as exc:
            raise ImportError(
                "opensmile is required for prosodic feature extraction. "
                "Install with: pip install opensmile"
            ) from exc
        self._smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        logger.info("openSMILE eGeMAPS loaded")

    def _extract_sync(
        self,
        audio_float32: np.ndarray,
        calm_positive: float = 0.0,
        text_divergence: float = 0.0,
    ) -> ProsodicSignal:
        """
        Synchronous feature extraction. Always call via extract_async.

        Returns a ProsodicSignal with eGeMAPS features and a sarcasm risk score.

        If opensmile raises (e.g. audio too short), returns a zero-filled
        ProsodicSignal so the caller does not need to handle exceptions.
        ImportError from a missing opensmile package is NOT swallowed.

        When extraction succeeds but no usable pitch was measured (mean F0 is
        not positive, or the F0 std is NaN/inf), the raw features are still
        returned but flat_f0_score is 0.0: flatness is undefined without
        pitch, and a silent window must not count as evidence of sarcasm.
        Non-finite feature values are reported as 0.0.
        """
        self._ensure_loaded()

        try:
            feats = self._smile.process_signal(audio_float32, _SAMPLE_RATE)
            row = feats.iloc[0]
            f0_mean = _finite_or_zero(row.get(_F0_MEAN_FEATURE, 0.0))
            f0_std_raw = float(row.get(_F0_STD_NORM_FEATURE, 0.0))
            jitter = _finite_or_zero(row.get(_JITTER_FEATURE, 0.0))
            shimmer = _finite_or_zero(row.get(_SHIMMER_FEATURE, 0.0))
            loudness = _finite_or_zero(row.get(_LOUDNESS_FEATURE, 0.0))
        except Exception as exc:
            # Deliberate best-effort: any extraction failure degrades to zeros.
            logger.debug("openSMILE extraction failed (likely silent window): %s", exc)
            return _zero_signal()

        f0_std_is_finite = bool(np.isfinite(f0_std_raw))
        f0_std = f0_std_raw if f0_std_is_finite else 0.0

        # NOTE(review): openSMILE appears to report 0.0 for the "nz" F0
        # functionals when a window has no voiced frames instead of raising —
        # confirm against the opensmile version in use. F0 is expressed in
        # semitones above 27.5 Hz, so real speech always has a positive mean.
        has_pitch = f0_mean > 0.0 and f0_std_is_finite

        # Normalise F0 variance to a flatness score in [0, 1].
        #   f0_std >= _F0_STD_NEUTRAL = neutral baseline → flat_f0 = 0.0
        #   f0_std <= 0.0             = maximally flat   → flat_f0 = 1.0
        if has_pitch:
            flat_f0 = 1.0 - _clamp01(f0_std / _F0_STD_NEUTRAL)
        else:
            flat_f0 = 0.0

        sarcasm = _compute_sarcasm_risk(
            flat_f0=flat_f0,
            calm_positive=calm_positive,
            text_divergence=text_divergence,
        )

        return ProsodicSignal(
            f0_mean=round(f0_mean, 4),
            f0_std=round(f0_std, 4),
            jitter=round(jitter, 6),
            shimmer=round(shimmer, 6),
            loudness=round(loudness, 4),
            flat_f0_score=round(flat_f0, 4),
            sarcasm_risk=round(sarcasm, 4),
        )

    async def extract_async(
        self,
        audio_float32: np.ndarray,
        calm_positive: float = 0.0,
        text_divergence: float = 0.0,
    ) -> ProsodicSignal:
        """
        Extract prosodic features without blocking the event loop.

        Runs _extract_sync in the running loop's default executor.

        calm_positive:   Pass DimensionalResult.calm_positive_score() when
                         dimensional classification has already run.
        text_divergence: Pass abs(transcript_sentiment - valence) when the
                         parallel text classifier (linnet#22) is wired.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            partial(self._extract_sync, audio_float32, calm_positive, text_divergence),
        )

    @classmethod
    def from_env(cls) -> "ProsodicExtractor":
        """Construct from environment. Raises if CF_VOICE_PROSODY is not set."""
        if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. "
                "Add it to your .env and install opensmile: pip install opensmile"
            )
        return cls()