New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
288 lines · 11 KiB · Python
# cf_voice/trajectory.py — affect trajectory and SER/VAD coherence signals
#
# MIT licensed — derived computation only, no inference models.
#
# Two signal families:
#
# 1. TrajectorySignal — rolling arousal/valence trend across the last N windows.
#    Detects escalation, de-escalation, suppression, worsening, improving.
#
# 2. CoherenceSignal — cross-model comparison between SER (categorical affect)
#    and VAD (continuous dimensional valence). Disagreement indicates affect
#    suppression, controlled presentation, or surface-only semantic reframe.
#
# Both signals activate only after BASELINE_MIN windows per speaker are buffered.
# All thresholds are relative to the per-speaker rolling mean, not absolute —
# this is required for ND/neurodivergent speaker safety (see design doc).
#
# Safety note: these signals must never be labelled "deception" in any
# user-facing context. Use: "affect divergence", "controlled presentation",
# "framing shift". The user interprets; the system observes.
from __future__ import annotations
|
|
|
|
from collections import deque
|
|
from dataclasses import dataclass
|
|
|
|
from cf_voice.dimensional import DimensionalResult
|
|
|
|
# Rolling window depth per speaker: number of prior frames retained as the
# relative baseline for trend detection.
BUFFER_WINDOW = 5

# Minimum frames before signals activate (relative baseline requirement).
# Below this count compute_trajectory() returns trend="calibrating".
BASELINE_MIN = 3

# Minimum arousal/valence delta per window to count as directional movement
# (smaller moves are treated as "flat").
_DELTA_THRESHOLD = 0.05

# Arousal threshold above which "neutral SER + high arousal" = suppression candidate.
_SUPPRESSION_AROUSAL_MIN = 0.65

# SER affects that imply low arousal presentation (used for suppression detection).
_LOW_PRESENTATION_AFFECTS = frozenset({"neutral", "scripted", "tired", "apologetic"})

# Expected valence ranges derived from MSP-Podcast emotion distribution.
# Used to determine whether SER affect label and dimensional valence agree.
# Keys are SER affect labels; values are inclusive (low, high) valence bounds.
# Unknown affects fall back to (0.30, 0.70) at the lookup sites.
_AFFECT_VALENCE_PRIOR: dict[str, tuple[float, float]] = {
    "warm": (0.60, 1.00),
    "genuine": (0.55, 1.00),
    "optimistic": (0.55, 0.90),
    "neutral": (0.35, 0.65),
    "confused": (0.30, 0.60),
    "scripted": (0.30, 0.65),
    "apologetic": (0.20, 0.55),
    "tired": (0.10, 0.50),
    "frustrated": (0.10, 0.45),
    "dismissive": (0.15, 0.50),
    "condescending": (0.10, 0.45),
    "urgent": (0.15, 0.55),
}

# Ordinal positivity for reframe direction detection.
# Higher = more positive presentation. Affects absent from this map rank 4
# (the "neutral"/"scripted" tier) via the .get(..., 4) default in
# _is_more_positive.
_AFFECT_POSITIVITY: dict[str, int] = {
    "urgent": 1,
    "frustrated": 1,
    "condescending": 1,
    "dismissive": 2,
    "tired": 2,
    "apologetic": 3,
    "confused": 3,
    "scripted": 4,
    "neutral": 4,
    "optimistic": 5,
    "genuine": 5,
    "warm": 6,
}
@dataclass
class TrajectorySignal:
    """
    Rolling trend across recent dimensional frames for one speaker.

    All delta values: current_frame_value - mean(buffer_values).
    Positive arousal_delta = current frame is more activated than baseline.
    Negative valence_delta = current frame is more negative than baseline.

    trend values:
        "calibrating"    not enough frames yet (< BASELINE_MIN)
        "stable"         no significant directional movement
        "escalating"     arousal rising: current > mean by DELTA_THRESHOLD, consecutive
        "de-escalating"  arousal falling after elevated period
        "worsening"      valence falling: current < mean, consecutive
        "improving"      valence rising after depressed period
        "suppressed"     SER affect is calm/neutral, VAD arousal is elevated
    """
    arousal_delta: float        # current arousal minus rolling-mean arousal (rounded to 3 dp)
    valence_delta: float        # current valence minus rolling-mean valence (rounded to 3 dp)
    dominance_delta: float      # current dominance minus rolling-mean dominance (rounded to 3 dp)
    arousal_trend: str          # "rising" | "falling" | "flat"
    valence_trend: str          # "rising" | "falling" | "flat"
    trend: str                  # composite label; see class docstring
    frames_in_buffer: int       # frames available as baseline when this was computed
    baseline_established: bool  # True once frames_in_buffer >= BASELINE_MIN
@dataclass
class CoherenceSignal:
    """
    Cross-signal comparison: SER categorical affect vs. VAD dimensional valence.

    coherence_score:
        1.0 = SER label and VAD valence are fully consistent.
        0.0 = maximum disagreement.

    suppression_flag:
        True when the speaker is presenting as calm/neutral (SER) but VAD arousal
        is elevated. Indicates controlled presentation with activation underneath.
        This is relative to a per-session threshold — not a universal claim.

    reframe_type:
        "none"     no SER category shift this window
        "genuine"  SER shifted toward more positive AND dimensional valence also
                   improved (>= DELTA_THRESHOLD in this window)
        "surface"  SER shifted toward more positive BUT dimensional valence
                   continued its prior trajectory unchanged or worsening

    affect_divergence:
        Signed: VAD-implied valence minus SER-implied valence midpoint.
        Negative = VAD more negative than SER label implies (masking candidate).
        Positive = VAD more positive than SER label implies (unusual).
    """
    coherence_score: float   # 1.0 (consistent) .. 0.0 (maximum disagreement)
    suppression_flag: bool   # calm SER presentation over elevated VAD arousal
    reframe_type: str        # "none" | "genuine" | "surface"
    affect_divergence: float # signed; see class docstring
# ── Public helpers ─────────────────────────────────────────────────────────────
def affect_coherence(affect: str, valence: float) -> float:
    """
    Compute coherence between a SER affect category and a VAD valence score.

    Returns 1.0 when valence falls inside the expected range for the affect.
    Returns 0.0 when the gap between valence and the nearest range boundary
    exceeds 0.40 (the full range of a typical incoherence gap).
    """
    # Unknown affects fall back to a broad mid-range prior.
    expected_lo, expected_hi = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    if expected_lo <= valence <= expected_hi:
        return 1.0
    # Outside the range: score decays linearly with distance to the nearest
    # boundary, hitting 0.0 at a gap of 0.40.
    nearest_gap = min(abs(valence - expected_lo), abs(valence - expected_hi))
    score = 1.0 - (nearest_gap / 0.40)
    if score < 0.0:
        score = 0.0
    return round(score, 3)
def affect_divergence_score(affect: str, valence: float) -> float:
    """
    Signed divergence: actual VAD valence minus the midpoint of the expected range.

    Negative = VAD more negative than SER label implies.
    Positive = VAD more positive than SER label implies.
    """
    # Unknown affects use the same broad (0.30, 0.70) fallback as coherence.
    bounds = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    expected_midpoint = sum(bounds) / 2.0
    return round(valence - expected_midpoint, 3)
def compute_trajectory(
    buffer: deque[DimensionalResult],
    current: DimensionalResult,
    ser_affect: str,
    prior_ser_affect: str | None,
) -> tuple[TrajectorySignal, CoherenceSignal]:
    """
    Compute trajectory and coherence signals for one speaker at one window.

    buffer           Rolling deque of prior DimensionalResult for this speaker.
                     Must be updated AFTER this call (append current to buffer).
    current          DimensionalResult for the window being classified.
    ser_affect       SER affect label for this window (from ToneClassifier).
    prior_ser_affect SER affect label from the previous window, for reframe
                     detection. Pass None on the first window or when not tracking.

    Returns (TrajectorySignal, CoherenceSignal). The TrajectorySignal has
    baseline_established=False and trend="calibrating" while the buffer holds
    fewer than BASELINE_MIN entries; the CoherenceSignal is always computed
    from the current frame alone.
    """
    n = len(buffer)

    # ── Coherence family: needs only the current frame, no baseline ──────────
    coh_score = affect_coherence(ser_affect, current.valence)
    div_score = affect_divergence_score(ser_affect, current.valence)

    # Suppression candidate: calm/neutral categorical presentation while the
    # dimensional model reads elevated arousal and below-midpoint valence.
    suppression = (
        ser_affect in _LOW_PRESENTATION_AFFECTS
        and current.arousal > _SUPPRESSION_AROUSAL_MIN
        and current.valence < 0.50
    )

    # Reframe detection: only a shift toward a MORE positive SER category is
    # a reframe. It is "genuine" when dimensional valence also improved vs.
    # the single prior frame; otherwise it is "surface".
    reframe = "none"
    if prior_ser_affect and prior_ser_affect != ser_affect:
        if _is_more_positive(ser_affect, prior_ser_affect):
            if n >= 1:
                # deque indexes its ends in O(1) — no list() copy needed.
                dim_improved = (current.valence - buffer[-1].valence) >= _DELTA_THRESHOLD
            else:
                dim_improved = False
            reframe = "genuine" if dim_improved else "surface"

    coher = CoherenceSignal(
        coherence_score=coh_score,
        suppression_flag=suppression,
        reframe_type=reframe,
        affect_divergence=div_score,
    )

    # ── Trajectory family: requires the relative per-speaker baseline ────────
    if n < BASELINE_MIN:
        traj = TrajectorySignal(
            arousal_delta=0.0,
            valence_delta=0.0,
            dominance_delta=0.0,
            arousal_trend="flat",
            valence_trend="flat",
            trend="calibrating",
            frames_in_buffer=n,
            baseline_established=False,
        )
        return traj, coher

    mean_arousal = sum(f.arousal for f in buffer) / n
    mean_valence = sum(f.valence for f in buffer) / n
    mean_dominance = sum(f.dominance for f in buffer) / n

    a_delta = current.arousal - mean_arousal
    v_delta = current.valence - mean_valence
    d_delta = current.dominance - mean_dominance

    a_trend = (
        "rising" if a_delta > _DELTA_THRESHOLD else
        "falling" if a_delta < -_DELTA_THRESHOLD else
        "flat"
    )
    v_trend = (
        "rising" if v_delta > _DELTA_THRESHOLD else
        "falling" if v_delta < -_DELTA_THRESHOLD else
        "flat"
    )

    # Consecutive movement: the most recent buffered frame must already have
    # been moving in the same direction as the current frame.
    prev = buffer[-1]  # O(1) end access on a deque — no list() materialization
    a_consecutive = a_trend == "rising" and (current.arousal - prev.arousal) > 0.03
    v_consecutive = v_trend == "falling" and (current.valence - prev.valence) < -0.03

    # Composite trend label, most specific condition first.
    if suppression:
        trend = "suppressed"
    elif a_trend == "rising" and a_consecutive:
        trend = "escalating"
    elif a_trend == "falling" and mean_arousal > 0.55:
        # Arousal falling away from an elevated baseline, not noise near rest.
        trend = "de-escalating"
    elif v_trend == "falling" and v_consecutive:
        trend = "worsening"
    elif v_trend == "rising" and mean_valence < 0.45:
        # Valence rising out of a depressed baseline.
        trend = "improving"
    else:
        trend = "stable"

    traj = TrajectorySignal(
        arousal_delta=round(a_delta, 3),
        valence_delta=round(v_delta, 3),
        dominance_delta=round(d_delta, 3),
        arousal_trend=a_trend,
        valence_trend=v_trend,
        trend=trend,
        frames_in_buffer=n,
        baseline_established=True,
    )
    return traj, coher
# ── Internal helpers ───────────────────────────────────────────────────────────
def _is_more_positive(current: str, prior: str) -> bool:
    """True when the current SER affect is ranked more positive than prior."""
    # Unmapped affects rank 4, the "neutral"/"scripted" tier.
    rank = _AFFECT_POSITIVITY.get
    return rank(current, 4) > rank(prior, 4)