cf-voice/cf_voice/trajectory.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

288 lines
11 KiB
Python

# cf_voice/trajectory.py — affect trajectory and SER/VAD coherence signals
#
# MIT licensed — derived computation only, no inference models.
#
# Two signal families:
#
# 1. TrajectorySignal — rolling arousal/valence trend across the last N windows.
# Detects escalation, de-escalation, suppression, worsening, improving.
#
# 2. CoherenceSignal — cross-model comparison between SER (categorical affect)
# and VAD (continuous dimensional valence). Disagreement indicates affect
# suppression, controlled presentation, or surface-only semantic reframe.
#
# Both signals activate only after BASELINE_MIN windows per speaker are buffered.
# All thresholds are relative to the per-speaker rolling mean, not absolute —
# this is required for ND/neurodivergent speaker safety (see design doc).
#
# Safety note: these signals must never be labelled "deception" in any
# user-facing context. Use: "affect divergence", "controlled presentation",
# "framing shift". The user interprets; the system observes.
from __future__ import annotations
from collections import deque
from dataclasses import dataclass
from cf_voice.dimensional import DimensionalResult
# ── Tuning constants ───────────────────────────────────────────────────────────

# Depth of the per-speaker rolling window of dimensional frames.
BUFFER_WINDOW = 5

# Frames that must be buffered before trend/coherence signals activate
# (the relative-baseline requirement).
BASELINE_MIN = 3

# Smallest per-window arousal/valence change counted as directional movement.
_DELTA_THRESHOLD = 0.05

# Arousal above this, paired with a calm/neutral SER label, marks a
# suppression candidate.
_SUPPRESSION_AROUSAL_MIN = 0.65

# SER affect labels that present as low-arousal; input to suppression detection.
_LOW_PRESENTATION_AFFECTS = frozenset({"neutral", "scripted", "tired", "apologetic"})

# Expected valence range per SER affect label, derived from the MSP-Podcast
# emotion distribution. Used to judge whether the SER label and the
# dimensional valence agree.
_AFFECT_VALENCE_PRIOR: dict[str, tuple[float, float]] = {
    "warm": (0.60, 1.00),
    "genuine": (0.55, 1.00),
    "optimistic": (0.55, 0.90),
    "neutral": (0.35, 0.65),
    "confused": (0.30, 0.60),
    "scripted": (0.30, 0.65),
    "apologetic": (0.20, 0.55),
    "tired": (0.10, 0.50),
    "frustrated": (0.10, 0.45),
    "dismissive": (0.15, 0.50),
    "condescending": (0.10, 0.45),
    "urgent": (0.15, 0.55),
}

# Ordinal positivity per affect label (higher = more positive presentation);
# drives reframe-direction detection.
_AFFECT_POSITIVITY: dict[str, int] = {
    "urgent": 1,
    "frustrated": 1,
    "condescending": 1,
    "dismissive": 2,
    "tired": 2,
    "apologetic": 3,
    "confused": 3,
    "scripted": 4,
    "neutral": 4,
    "optimistic": 5,
    "genuine": 5,
    "warm": 6,
}
@dataclass
class TrajectorySignal:
    """
    Rolling trend over recent dimensional frames for a single speaker.

    Every delta is ``current_frame_value - mean(buffer_values)``: a positive
    arousal_delta means the current frame is more activated than baseline; a
    negative valence_delta means it is more negative than baseline.

    ``trend`` takes one of:
      "calibrating"    fewer than BASELINE_MIN frames buffered
      "stable"         no significant directional movement
      "escalating"     arousal above mean by the delta threshold, consecutively
      "de-escalating"  arousal falling after an elevated period
      "worsening"      valence below mean, consecutively
      "improving"      valence rising after a depressed period
      "suppressed"     calm/neutral SER affect with elevated VAD arousal
    """

    # Current-frame deltas against the per-speaker rolling mean.
    arousal_delta: float
    valence_delta: float
    dominance_delta: float
    # Per-dimension direction labels: "rising" | "falling" | "flat".
    arousal_trend: str
    valence_trend: str
    # Composite label — see class docstring for the value set.
    trend: str
    # Number of frames currently buffered for this speaker.
    frames_in_buffer: int
    # True once at least BASELINE_MIN frames have been buffered.
    baseline_established: bool
@dataclass
class CoherenceSignal:
    """
    Cross-model comparison: SER categorical affect vs. VAD dimensional valence.

    coherence_score   1.0 when the SER label and VAD valence fully agree,
                      0.0 at maximum disagreement.
    suppression_flag  True when the speaker presents as calm/neutral (SER)
                      while VAD arousal is elevated — controlled presentation
                      with activation underneath. Relative to a per-session
                      threshold, never a universal claim.
    reframe_type      "none"    no SER category shift this window
                      "genuine" SER shifted more positive AND dimensional
                                valence also improved this window
                      "surface" SER shifted more positive BUT dimensional
                                valence stayed on its prior trajectory or worsened
    affect_divergence Signed: VAD-implied valence minus the midpoint of the
                      SER label's expected range. Negative = VAD more negative
                      than the label implies (masking candidate); positive =
                      VAD more positive than the label implies (unusual).
    """

    coherence_score: float
    suppression_flag: bool
    # One of "none" | "genuine" | "surface".
    reframe_type: str
    affect_divergence: float
# ── Public helpers ─────────────────────────────────────────────────────────────
def affect_coherence(affect: str, valence: float) -> float:
    """
    Score agreement between a SER affect label and a VAD valence value.

    Returns 1.0 when *valence* lies inside the expected range for *affect*
    (unknown affects fall back to a (0.30, 0.70) prior). Outside the range
    the score decays linearly with distance from the nearest boundary,
    reaching 0.0 at a gap of 0.40. Rounded to three decimals.
    """
    low, high = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    if low <= valence <= high:
        return 1.0
    distance = min(abs(valence - low), abs(valence - high))
    score = 1.0 - (distance / 0.40)
    return round(max(0.0, score), 3)
def affect_divergence_score(affect: str, valence: float) -> float:
    """
    Signed gap between observed VAD valence and the SER label's expectation.

    Computed as *valence* minus the midpoint of the label's expected range
    (unknown affects fall back to the (0.30, 0.70) prior). Negative = VAD
    more negative than the SER label implies; positive = more positive.
    Rounded to three decimals.
    """
    expected = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    center = sum(expected) / 2.0
    return round(valence - center, 3)
def compute_trajectory(
    buffer: deque,
    current: DimensionalResult,
    ser_affect: str,
    prior_ser_affect: str | None,
) -> tuple[TrajectorySignal, CoherenceSignal]:
    """
    Compute trajectory and coherence signals for one speaker at one window.

    buffer            Rolling deque of prior DimensionalResult for this speaker.
                      Must be updated AFTER this call (append current to buffer).
    current           DimensionalResult for the window being classified.
    ser_affect        SER affect label for this window (from ToneClassifier).
    prior_ser_affect  SER affect label from the previous window, for reframe
                      detection. Pass None on the first window or when not
                      tracking.

    Returns (TrajectorySignal, CoherenceSignal). The TrajectorySignal has
    baseline_established=False and trend="calibrating" while the buffer holds
    fewer than BASELINE_MIN entries; the CoherenceSignal is always computed.
    """
    n = len(buffer)

    # Coherence is frame-local: it needs no buffered history.
    coh_score = affect_coherence(ser_affect, current.valence)
    div_score = affect_divergence_score(ser_affect, current.valence)

    # Suppression candidate: calm/neutral presentation with elevated arousal
    # and depressed valence.
    # NOTE(review): _SUPPRESSION_AROUSAL_MIN and the 0.50 valence cutoff are
    # absolute, while the module header says thresholds are relative to the
    # per-speaker rolling mean — confirm this is intentional.
    suppression = (
        ser_affect in _LOW_PRESENTATION_AFFECTS
        and current.arousal > _SUPPRESSION_AROUSAL_MIN
        and current.valence < 0.50
    )

    # Reframe detection: only a shift toward a more positive SER label counts.
    reframe = "none"
    if prior_ser_affect and prior_ser_affect != ser_affect:
        if _is_more_positive(ser_affect, prior_ser_affect):
            # "genuine" requires that dimensional valence also improved vs. the
            # single most recent buffered frame; otherwise the shift is surface.
            if n >= 1:
                # deque indexing at the ends is O(1) — no list() copy needed.
                prev_valence = buffer[-1].valence
                dim_improved = (current.valence - prev_valence) >= _DELTA_THRESHOLD
            else:
                dim_improved = False
            reframe = "genuine" if dim_improved else "surface"

    coher = CoherenceSignal(
        coherence_score=coh_score,
        suppression_flag=suppression,
        reframe_type=reframe,
        affect_divergence=div_score,
    )

    # Not enough history yet: emit a neutral "calibrating" trajectory.
    if n < BASELINE_MIN:
        traj = TrajectorySignal(
            arousal_delta=0.0,
            valence_delta=0.0,
            dominance_delta=0.0,
            arousal_trend="flat",
            valence_trend="flat",
            trend="calibrating",
            frames_in_buffer=n,
            baseline_established=False,
        )
        return traj, coher

    # Relative baseline: deltas are measured against the buffered means.
    mean_arousal = sum(f.arousal for f in buffer) / n
    mean_valence = sum(f.valence for f in buffer) / n
    mean_dominance = sum(f.dominance for f in buffer) / n
    a_delta = current.arousal - mean_arousal
    v_delta = current.valence - mean_valence
    d_delta = current.dominance - mean_dominance
    a_trend = (
        "rising" if a_delta > _DELTA_THRESHOLD else
        "falling" if a_delta < -_DELTA_THRESHOLD else
        "flat"
    )
    v_trend = (
        "rising" if v_delta > _DELTA_THRESHOLD else
        "falling" if v_delta < -_DELTA_THRESHOLD else
        "flat"
    )

    # Consecutive movement: was the most recent buffered frame already moving
    # in the same direction as the current one?
    prev = buffer[-1]
    a_consecutive = a_trend == "rising" and (current.arousal - prev.arousal) > 0.03
    v_consecutive = v_trend == "falling" and (current.valence - prev.valence) < -0.03

    # Composite trend label; suppression takes precedence over movement labels.
    if suppression:
        trend = "suppressed"
    elif a_trend == "rising" and a_consecutive:
        trend = "escalating"
    elif a_trend == "falling" and mean_arousal > 0.55:
        trend = "de-escalating"
    elif v_trend == "falling" and v_consecutive:
        trend = "worsening"
    elif v_trend == "rising" and mean_valence < 0.45:
        trend = "improving"
    else:
        trend = "stable"

    traj = TrajectorySignal(
        arousal_delta=round(a_delta, 3),
        valence_delta=round(v_delta, 3),
        dominance_delta=round(d_delta, 3),
        arousal_trend=a_trend,
        valence_trend=v_trend,
        trend=trend,
        frames_in_buffer=n,
        baseline_established=True,
    )
    return traj, coher
# ── Internal helpers ───────────────────────────────────────────────────────────
def _is_more_positive(current: str, prior: str) -> bool:
    """
    True when *current* ranks strictly more positive than *prior* on the
    ordinal positivity scale. Unknown affects default to the neutral rank (4).
    """
    rank = _AFFECT_POSITIVITY.get
    return rank(current, 4) > rank(prior, 4)