cf-voice/cf_voice/prosody.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

208 lines
7.4 KiB
Python

# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction
#
# MIT licensed (opensmile-python package is MIT).
#
# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set:
# F0 mean / std / percentiles (pitch)
# Jitter / Shimmer (cycle-to-cycle variation — vocal tension)
# Energy / loudness envelope
# MFCCs, spectral centroid
# Speaking rate, pause ratio
#
# Runs on CPU in a thread pool executor — no GPU required. Designed to run
# in parallel with the GPU classifiers in context._classify_real_async() via
# asyncio.gather().
#
# Enable with: CF_VOICE_PROSODY=1 (default off)
# Install: pip install opensmile
#
# openSMILE docs: https://audeering.github.io/opensmile-python/
from __future__ import annotations

import asyncio
import logging
import os
import threading
from dataclasses import dataclass
from functools import partial

import numpy as np
logger = logging.getLogger(__name__)
# Expected input sample rate (Hz); passed to opensmile.process_signal in
# ProsodicExtractor._extract_sync. Callers must resample before extraction.
_SAMPLE_RATE = 16_000
# F0 std normalisation constant: values below this threshold indicate flat prosody.
# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm".
# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat.
_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm"
# eGeMAPS v02 functional column names read from the openSMILE output frame.
_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean"  # mean F0 (semitones)
_LOUDNESS_FEATURE = "loudness_sma3_amean"               # mean loudness (energy proxy)
_JITTER_FEATURE = "jitterLocal_sma3nz_amean"            # cycle-to-cycle pitch variation
_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean"        # cycle-to-cycle amplitude variation
# NOTE(review): not referenced by the extraction path in this module —
# presumably reserved for a speech-rate feature; confirm before removing.
_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec"
@dataclass
class ProsodicSignal:
    """
    Summary prosodic features for a single audio window.

    Derived from the openSMILE eGeMAPS v02 feature set (Functionals level).
    All values are raw feature magnitudes unless noted otherwise.

    Attributes
    ----------
    f0_mean: Mean F0 in semitones from a 27.5 Hz reference.
    f0_std: Normalised F0 standard deviation (flatness indicator).
    jitter: Cycle-to-cycle pitch variation (vocal tension).
    shimmer: Cycle-to-cycle amplitude variation (vocal stress).
    loudness: Mean loudness (energy proxy).
    flat_f0_score: Normalised flatness: 1.0 = maximally flat, 0.0 = varied.
    sarcasm_risk: 0-1 heuristic score combining flat F0, calm-positive
        audio (from DimensionalResult if available), and optional
        text-audio divergence (linnet#22 signal, not yet wired).
    """

    f0_mean: float
    f0_std: float
    jitter: float
    shimmer: float
    loudness: float
    flat_f0_score: float
    sarcasm_risk: float
def _compute_sarcasm_risk(
flat_f0: float,
calm_positive: float = 0.0,
text_divergence: float = 0.0,
) -> float:
"""
Heuristic sarcasm indicator. Not a trained model — a signal to combine
with text divergence (linnet#22) for the final confidence score.
flat_f0: Normalised F0 flatness (1.0 = flat, 0.0 = varied).
calm_positive: DimensionalResult.calm_positive_score() when available.
text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22.
Pass 0.0 until the parallel text classifier is wired.
Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%).
"""
return min(1.0, flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3)
class ProsodicExtractor:
    """
    openSMILE eGeMAPS feature extractor for a single audio window.

    CPU-bound inference — runs in a thread pool executor to avoid blocking
    asyncio. Lazy-loads opensmile on the first call so the import cost is
    deferred until prosody extraction is actually requested.

    Usage
    -----
    extractor = ProsodicExtractor()
    signal = await extractor.extract_async(audio_float32)
    print(signal.flat_f0_score, signal.sarcasm_risk)
    """

    def __init__(self) -> None:
        # opensmile.Smile instance; populated lazily by _ensure_loaded().
        self._smile = None
        # Guards the lazy load: concurrent extract_async() calls run
        # _extract_sync() in separate executor threads, and without a lock
        # each thread could construct its own Smile instance.
        self._load_lock = threading.Lock()

    def _ensure_loaded(self) -> None:
        """Lazy-load opensmile on first extract call (thread-safe).

        Raises ImportError with install instructions if opensmile is absent.
        """
        if self._smile is not None:
            return
        try:
            import opensmile
        except ImportError as exc:
            raise ImportError(
                "opensmile is required for prosodic feature extraction. "
                "Install with: pip install opensmile"
            ) from exc
        with self._load_lock:
            # Re-check under the lock: another executor thread may have
            # finished loading while this one waited.
            if self._smile is None:
                self._smile = opensmile.Smile(
                    feature_set=opensmile.FeatureSet.eGeMAPSv02,
                    feature_level=opensmile.FeatureLevel.Functionals,
                )
                logger.info("openSMILE eGeMAPS loaded")

    @staticmethod
    def _zero_signal() -> ProsodicSignal:
        """Neutral, zero-filled signal returned when extraction fails."""
        return ProsodicSignal(
            f0_mean=0.0, f0_std=0.0, jitter=0.0,
            shimmer=0.0, loudness=0.0, flat_f0_score=0.0, sarcasm_risk=0.0,
        )

    def _extract_sync(
        self,
        audio_float32: np.ndarray,
        calm_positive: float = 0.0,
        text_divergence: float = 0.0,
    ) -> ProsodicSignal:
        """
        Synchronous feature extraction. Always call via extract_async.

        Returns a ProsodicSignal with eGeMAPS features and a sarcasm risk
        score. If opensmile raises (e.g. audio too short, no voiced frames),
        returns a zero-filled ProsodicSignal so the caller does not need to
        handle exceptions.
        """
        self._ensure_loaded()
        try:
            feats = self._smile.process_signal(audio_float32, _SAMPLE_RATE)
            row = feats.iloc[0]
            f0_mean = float(row.get(_F0_MEAN_FEATURE, 0.0))
            f0_std = float(row.get(_F0_STD_NORM_FEATURE, 0.0))
            jitter = float(row.get(_JITTER_FEATURE, 0.0))
            shimmer = float(row.get(_SHIMMER_FEATURE, 0.0))
            loudness = float(row.get(_LOUDNESS_FEATURE, 0.0))
        except Exception as exc:  # best-effort: silent/short windows are expected
            logger.debug("openSMILE extraction failed (likely silent window): %s", exc)
            return self._zero_signal()
        # Normalise F0 variance to a flatness score:
        #   f0_std >= 0.4 semitones (neutral baseline) -> flat_f0 = 0.0
        #   f0_std == 0.0 (maximally flat)             -> flat_f0 = 1.0
        flat_f0 = 1.0 - min(f0_std / 0.4, 1.0)
        sarcasm = _compute_sarcasm_risk(
            flat_f0=flat_f0,
            calm_positive=calm_positive,
            text_divergence=text_divergence,
        )
        return ProsodicSignal(
            f0_mean=round(f0_mean, 4),
            f0_std=round(f0_std, 4),
            jitter=round(jitter, 6),
            shimmer=round(shimmer, 6),
            loudness=round(loudness, 4),
            flat_f0_score=round(flat_f0, 4),
            sarcasm_risk=round(sarcasm, 4),
        )

    async def extract_async(
        self,
        audio_float32: np.ndarray,
        calm_positive: float = 0.0,
        text_divergence: float = 0.0,
    ) -> ProsodicSignal:
        """
        Extract prosodic features without blocking the event loop.

        calm_positive: Pass DimensionalResult.calm_positive_score() when
            dimensional classification has already run.
        text_divergence: Pass abs(transcript_sentiment - valence) when the
            parallel text classifier (linnet#22) is wired.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            partial(self._extract_sync, audio_float32, calm_positive, text_divergence),
        )

    @classmethod
    def from_env(cls) -> "ProsodicExtractor":
        """Construct from environment. Raises if CF_VOICE_PROSODY is not set."""
        if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. "
                "Add it to your .env and install opensmile: pip install opensmile"
            )
        return cls()