New modules shipped (from Linnet integration):

- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces the YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, and the AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy-risk scorer — public_env, background_voices, nature scene, accent signals; returns a 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns the full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, a speaker_at gap now resolves to the dominant speaker rather than UNKNOWN, make_io real path returns MicVoiceIO when sounddevice is installed). 78 tests passing.

Closes #2, #3.
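Since app.py accepts base64 PCM chunks on /classify, a client needs to encode raw audio before POSTing. Below is a minimal sketch of building such a request body; the field names (`audio_b64`, `sample_rate`) and the helper itself are illustrative assumptions — the actual request schema lives in app.py.

```python
import base64


def build_classify_payload(pcm_bytes: bytes, sample_rate: int = 16000) -> dict:
    """Package a raw PCM chunk for POSTing to /classify.

    NOTE: the field names ("audio_b64", "sample_rate") are illustrative
    guesses; check app.py for the real request schema.
    """
    return {
        "audio_b64": base64.b64encode(pcm_bytes).decode("ascii"),
        "sample_rate": sample_rate,
    }


# 10 ms of 16-bit mono silence at 16 kHz: 160 samples, two zero bytes each.
payload = build_classify_payload(b"\x00\x00" * 160)
```

The payload can then be sent with any HTTP client, e.g. `requests.post("http://localhost:8007/classify", json=payload)`, with the service returning an AudioEventOut.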
# cf_voice/models.py — VoiceFrame API contract
#
# This module is MIT licensed. All consumers (Linnet, Osprey, etc.)
# import VoiceFrame from here so the shape is consistent across the stack.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class VoiceFrame:
    """
    A single annotated moment in a voice stream.

    Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
    (tone classification, speaker diarization, dimensional emotion).

    Fields
    ------
    label            Tone annotation, e.g. "Warmly impatient" or "Deflecting".
                     Generic by default; the Elcor-style prefix format is an
                     easter egg surfaced by the product UI, not set here.
    confidence       0.0-1.0. Below ~0.5 the annotation is speculative.
    speaker_id       Ephemeral local label ("speaker_a", "speaker_b").
                     Not tied to identity — resets each session.
    shift_magnitude  Delta from the previous frame's tone, 0.0-1.0.
                     High values indicate a meaningful register shift.
    timestamp        Session-relative seconds since capture started.

    Dimensional emotion (audeering model — Navigation v0.2.x, optional):
    valence          0.0-1.0. Negative affect (0) to positive affect (1).
    arousal          0.0-1.0. Low energy / calm (0) to high energy / excited (1).
    dominance        0.0-1.0. Submissive / uncertain (0) to assertive / dominant (1).

    Prosodic features (openSMILE eGeMAPS — Navigation v0.2.x, optional):
    sarcasm_risk     0.0-1.0 heuristic score: flat F0 + calm-positive VAD +
                     text divergence (linnet#22). All three signals are required
                     for high confidence — audio-only signals are weak priors.
    flat_f0_score    Normalised F0 flatness: 1.0 = maximally flat pitch.
    """

    label: str
    confidence: float
    speaker_id: str
    shift_magnitude: float
    timestamp: float

    # Dimensional emotion scores — None when the dimensional classifier is disabled
    valence: float | None = None
    arousal: float | None = None
    dominance: float | None = None

    # Prosodic signals — None when the prosodic extractor is disabled
    sarcasm_risk: float | None = None
    flat_f0_score: float | None = None

    def is_reliable(self, threshold: float = 0.6) -> bool:
        """Return True when confidence meets the given threshold."""
        return self.confidence >= threshold

    def is_shift(self, threshold: float = 0.3) -> bool:
        """Return True when shift_magnitude indicates a meaningful register change."""
        return self.shift_magnitude >= threshold