New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
208 lines
7.4 KiB
Python
# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction
#
# MIT licensed (opensmile-python package is MIT).
#
# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set:
#   - F0 mean / std / percentiles (pitch)
#   - Jitter / Shimmer (cycle-to-cycle variation — vocal tension)
#   - Energy / loudness envelope
#   - MFCCs, spectral centroid
#   - Speaking rate, pause ratio
#
# Runs on CPU in a thread pool executor — no GPU required. Designed to run
# in parallel with the GPU classifiers in context._classify_real_async() via
# asyncio.gather().
#
# Enable with: CF_VOICE_PROSODY=1 (default off)
# Install: pip install opensmile
#
# openSMILE docs: https://audeering.github.io/opensmile-python/
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass
|
|
from functools import partial
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Sample rate (Hz) passed to opensmile's process_signal; callers are expected
# to hand in 16 kHz mono float32 audio.
_SAMPLE_RATE = 16_000

# F0 std normalisation constant: values below this threshold indicate flat prosody.
# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm".
# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat.
# Column names of the eGeMAPS v02 functionals read out of the openSMILE result row.
_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm"  # normalised F0 std (flatness input)
_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean"  # mean pitch, semitones from 27.5 Hz
_LOUDNESS_FEATURE = "loudness_sma3_amean"  # mean loudness envelope (energy proxy)
_JITTER_FEATURE = "jitterLocal_sma3nz_amean"  # cycle-to-cycle pitch variation
_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean"  # cycle-to-cycle amplitude variation (dB)
# NOTE(review): declared but not read anywhere in this module — presumably
# reserved for a speaking-rate feature; confirm before removing.
_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec"
|
|
|
|
|
|
@dataclass
class ProsodicSignal:
    """
    Prosodic feature summary for one audio window.

    Values come from the openSMILE eGeMAPS v02 functionals and are raw
    feature magnitudes unless a field says otherwise.

    f0_mean: Mean F0 in semitones from 27.5Hz reference
    f0_std: Normalised F0 standard deviation (flatness indicator)
    jitter: Cycle-to-cycle pitch variation (vocal tension)
    shimmer: Cycle-to-cycle amplitude variation (vocal stress)
    loudness: Mean loudness (energy proxy)
    flat_f0_score: Normalised flatness: 1.0 = maximally flat, 0.0 = varied.
    sarcasm_risk: 0-1 heuristic score combining flat F0, calm-positive
        audio (from DimensionalResult if available), and optional
        text-audio divergence (linnet#22 signal, not yet wired).
    """

    f0_mean: float
    f0_std: float
    jitter: float
    shimmer: float
    loudness: float
    flat_f0_score: float
    sarcasm_risk: float
|
|
|
|
|
|
def _compute_sarcasm_risk(
|
|
flat_f0: float,
|
|
calm_positive: float = 0.0,
|
|
text_divergence: float = 0.0,
|
|
) -> float:
|
|
"""
|
|
Heuristic sarcasm indicator. Not a trained model — a signal to combine
|
|
with text divergence (linnet#22) for the final confidence score.
|
|
|
|
flat_f0: Normalised F0 flatness (1.0 = flat, 0.0 = varied).
|
|
calm_positive: DimensionalResult.calm_positive_score() when available.
|
|
text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22.
|
|
Pass 0.0 until the parallel text classifier is wired.
|
|
|
|
Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%).
|
|
"""
|
|
return min(1.0, flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3)
|
|
|
|
|
|
class ProsodicExtractor:
|
|
"""
|
|
openSMILE eGeMAPS feature extractor for a single audio window.
|
|
|
|
CPU-bound inference — uses thread pool executor to avoid blocking asyncio.
|
|
Lazy-loads opensmile on first call so import cost is deferred.
|
|
|
|
Usage
|
|
-----
|
|
extractor = ProsodicExtractor()
|
|
signal = await extractor.extract_async(audio_float32)
|
|
print(signal.flat_f0_score, signal.sarcasm_risk)
|
|
"""
|
|
|
|
def __init__(self) -> None:
|
|
self._smile = None
|
|
|
|
def _ensure_loaded(self) -> None:
|
|
"""Lazy-load opensmile on first extract call."""
|
|
if self._smile is not None:
|
|
return
|
|
|
|
try:
|
|
import opensmile
|
|
except ImportError as exc:
|
|
raise ImportError(
|
|
"opensmile is required for prosodic feature extraction. "
|
|
"Install with: pip install opensmile"
|
|
) from exc
|
|
|
|
self._smile = opensmile.Smile(
|
|
feature_set=opensmile.FeatureSet.eGeMAPSv02,
|
|
feature_level=opensmile.FeatureLevel.Functionals,
|
|
)
|
|
logger.info("openSMILE eGeMAPS loaded")
|
|
|
|
def _extract_sync(
|
|
self,
|
|
audio_float32: np.ndarray,
|
|
calm_positive: float = 0.0,
|
|
text_divergence: float = 0.0,
|
|
) -> ProsodicSignal:
|
|
"""
|
|
Synchronous feature extraction. Always call via extract_async.
|
|
|
|
Returns a ProsodicSignal with eGeMAPS features and a sarcasm risk score.
|
|
If opensmile raises (e.g. audio too short, no voiced frames), returns a
|
|
zero-filled ProsodicSignal so the caller does not need to handle exceptions.
|
|
"""
|
|
self._ensure_loaded()
|
|
|
|
try:
|
|
feats = self._smile.process_signal(audio_float32, _SAMPLE_RATE)
|
|
row = feats.iloc[0]
|
|
|
|
f0_mean = float(row.get(_F0_MEAN_FEATURE, 0.0))
|
|
f0_std = float(row.get(_F0_STD_NORM_FEATURE, 0.0))
|
|
jitter = float(row.get(_JITTER_FEATURE, 0.0))
|
|
shimmer = float(row.get(_SHIMMER_FEATURE, 0.0))
|
|
loudness = float(row.get(_LOUDNESS_FEATURE, 0.0))
|
|
|
|
except Exception as exc:
|
|
logger.debug("openSMILE extraction failed (likely silent window): %s", exc)
|
|
return ProsodicSignal(
|
|
f0_mean=0.0, f0_std=0.0, jitter=0.0,
|
|
shimmer=0.0, loudness=0.0, flat_f0_score=0.0, sarcasm_risk=0.0,
|
|
)
|
|
|
|
# Normalise F0 variance to a flatness score.
|
|
# f0_std of 0.4 semitones = neutral baseline → flat_f0 = 0.0
|
|
# f0_std of 0.0 = maximally flat → flat_f0 = 1.0
|
|
flat_f0 = 1.0 - min(f0_std / 0.4, 1.0)
|
|
|
|
sarcasm = _compute_sarcasm_risk(
|
|
flat_f0=flat_f0,
|
|
calm_positive=calm_positive,
|
|
text_divergence=text_divergence,
|
|
)
|
|
|
|
return ProsodicSignal(
|
|
f0_mean=round(f0_mean, 4),
|
|
f0_std=round(f0_std, 4),
|
|
jitter=round(jitter, 6),
|
|
shimmer=round(shimmer, 6),
|
|
loudness=round(loudness, 4),
|
|
flat_f0_score=round(flat_f0, 4),
|
|
sarcasm_risk=round(sarcasm, 4),
|
|
)
|
|
|
|
async def extract_async(
|
|
self,
|
|
audio_float32: np.ndarray,
|
|
calm_positive: float = 0.0,
|
|
text_divergence: float = 0.0,
|
|
) -> ProsodicSignal:
|
|
"""
|
|
Extract prosodic features without blocking the event loop.
|
|
|
|
calm_positive: Pass DimensionalResult.calm_positive_score() when
|
|
dimensional classification has already run.
|
|
text_divergence: Pass abs(transcript_sentiment - valence) when the
|
|
parallel text classifier (linnet#22) is wired.
|
|
"""
|
|
loop = asyncio.get_running_loop()
|
|
return await loop.run_in_executor(
|
|
None,
|
|
partial(self._extract_sync, audio_float32, calm_positive, text_divergence),
|
|
)
|
|
|
|
@classmethod
|
|
def from_env(cls) -> "ProsodicExtractor":
|
|
"""Construct from environment. Raises if CF_VOICE_PROSODY is not set."""
|
|
if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
|
|
raise EnvironmentError(
|
|
"CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. "
|
|
"Add it to your .env and install opensmile: pip install opensmile"
|
|
)
|
|
return cls()
|