New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
744 lines
31 KiB
Python
# cf_voice/context.py — parallel audio context classifier (orchestrator)
|
||
#
|
||
# BSL 1.1 when real inference models are integrated.
|
||
# Mock mode: MIT licensed (no real inference).
|
||
#
|
||
# Runs three classifiers in parallel against the same audio window:
|
||
# 1. Tone/affect (classify.py) — wav2vec2 SER + librosa prosody
|
||
# 2. Queue/environ (acoustic.py) — AST acoustic event detection
|
||
# 3. Speaker type/VAD (diarize.py) — pyannote.audio (Navigation v0.2.x)
|
||
#
|
||
# Combined output is a list[AudioEvent] per window, merged into VoiceFrame
|
||
# for the streaming path.
|
||
#
|
||
# Elcor mode reads from cf-core preferences (cf_voice.prefs) so that the
|
||
# annotation format is user-configurable without per-request flags.
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
from typing import AsyncIterator
|
||
|
||
from cf_voice.acoustic import MockAcousticBackend, make_acoustic
|
||
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
|
||
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
|
||
from cf_voice.models import VoiceFrame
|
||
from cf_voice.prefs import get_elcor_prior_frames, is_elcor_enabled
|
||
|
||
# Module-level logger, named after the module per stdlib convention.
logger = logging.getLogger(__name__)


# ── Per-model download/load status registry ───────────────────────────────────
# Written by the _load_* methods; read by the /health endpoint in app.py.
# Values: "disabled" | "loading" | "ready" | "error"
# Thread-safe: replacing a value is a single dict-item assignment, which is
# atomic under the CPython GIL.
model_status: dict[str, str] = {}
|
||
|
||
|
||
# ── No-op coroutines for disabled/unavailable classifiers ─────────────────────
|
||
|
||
async def _noop_stt() -> None:
|
||
"""Placeholder when STT is disabled or unavailable."""
|
||
return None
|
||
|
||
|
||
async def _noop_diarize() -> list:
|
||
"""Placeholder when diarization is disabled or unavailable."""
|
||
return []
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and runs three parallel classifiers on each audio
    window: tone (SER), queue/environ (acoustic event detection — AST backend
    per the Linnet integration notes), and speaker (pyannote).

    In mock mode all classifiers produce synthetic events — no GPU, microphone,
    or HuggingFace token required.

    Usage
    -----
    classifier = ContextClassifier.from_env()
    async for frame in classifier.stream():
        print(frame.label, frame.confidence)

    For the full multi-class event list (queue + speaker + tone):
        events = classifier.classify_chunk(audio_b64, timestamp=4.5)
    """
|
||
|
||
    def __init__(
        self,
        io: VoiceIO,
        user_id: str | None = None,
        store=None,
    ) -> None:
        """
        Wire up the classifier around an existing VoiceIO source.

        io        Audio source. A MockVoiceIO instance switches the acoustic
                  backend (and the downstream mock checks) into mock mode.
        user_id   Forwarded to cf-core preference lookups (Elcor mode,
                  thresholds).
        store     Preference store handle forwarded alongside user_id.
        """
        self._io = io
        self._user_id = user_id
        self._store = store
        # Acoustic backend is built eagerly; mock when the IO source is mocked
        # or CF_VOICE_MOCK=1 is set.
        self._acoustic = make_acoustic(
            mock=isinstance(io, MockVoiceIO)
            or os.environ.get("CF_VOICE_MOCK", "") == "1"
        )
        # Lazy — loaded on first real classify call, then reused.
        self._tone: "ToneClassifier | None" = None
        # STT: loaded if faster-whisper is installed. Controlled by CF_VOICE_STT (default: 1).
        self._stt: "WhisperSTT | None" = None
        self._stt_loaded: bool = False  # False = not yet attempted
        # Diarizer: optional — requires HF_TOKEN and CF_VOICE_DIARIZE=1.
        self._diarizer: "Diarizer | None" = None
        self._diarizer_loaded: bool = False
        # Per-session speaker label tracker — maps pyannote IDs → "Speaker A/B/..."
        # Reset at session end (when the ContextClassifier is stopped).
        from cf_voice.diarize import SpeakerTracker
        self._speaker_tracker: SpeakerTracker = SpeakerTracker()
        # One-at-a-time GPU classify gate. All three models share the same GPU;
        # running them "in parallel" just serializes at the CUDA level while
        # filling the thread pool. Drop incoming frames when a classify is
        # already in flight — freshness beats completeness for real-time audio.
        self._classify_lock: asyncio.Lock = asyncio.Lock()
        # Dimensional classifier (audeering) — lazy, CF_VOICE_DIMENSIONAL=1
        self._dimensional: "DimensionalClassifier | None" = None
        self._dimensional_loaded: bool = False
        # Prosodic extractor (openSMILE) — lazy, CF_VOICE_PROSODY=1
        self._prosodic: "ProsodicExtractor | None" = None
        self._prosodic_loaded: bool = False
        # Per-speaker rolling dimensional buffers for trajectory/coherence signals.
        # Keys are speaker_id strings; values are deques of DimensionalResult.
        # Reset at session end alongside SpeakerTracker.
        from collections import deque as _deque
        from cf_voice.trajectory import BUFFER_WINDOW
        self._dim_buffer: dict[str, "_deque"] = {}
        self._last_ser_affect: dict[str, str] = {}
        self._buffer_window = BUFFER_WINDOW
        # Accent classifier — lazy, gated by CF_VOICE_ACCENT=1
        self._accent: "MockAccentClassifier | AccentClassifier | None" = None
        self._accent_loaded: bool = False
|
||
|
||
@classmethod
|
||
def from_env(
|
||
cls,
|
||
interval_s: float = 2.5,
|
||
user_id: str | None = None,
|
||
store=None,
|
||
) -> "ContextClassifier":
|
||
"""
|
||
Create a ContextClassifier from environment.
|
||
|
||
CF_VOICE_MOCK=1 activates full mock mode (no GPU, no audio hardware).
|
||
If real audio hardware is unavailable (faster-whisper not installed),
|
||
falls back to mock mode automatically.
|
||
user_id + store are forwarded to cf-core preferences for Elcor/threshold
|
||
lookups.
|
||
"""
|
||
if os.environ.get("CF_VOICE_MOCK", "") == "1":
|
||
return cls.mock(interval_s=interval_s, user_id=user_id, store=store)
|
||
try:
|
||
io = make_io(interval_s=interval_s)
|
||
except (NotImplementedError, ImportError):
|
||
# Real audio hardware or inference extras unavailable — fall back to
|
||
# mock mode so the coordinator starts cleanly on headless nodes.
|
||
return cls.mock(interval_s=interval_s, user_id=user_id, store=store)
|
||
return cls(io=io, user_id=user_id, store=store)
|
||
|
||
@classmethod
|
||
def mock(
|
||
cls,
|
||
interval_s: float = 2.5,
|
||
seed: int | None = None,
|
||
user_id: str | None = None,
|
||
store=None,
|
||
) -> "ContextClassifier":
|
||
"""Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
|
||
return cls(
|
||
io=MockVoiceIO(interval_s=interval_s, seed=seed),
|
||
user_id=user_id,
|
||
store=store,
|
||
)
|
||
|
||
async def stream(self) -> AsyncIterator[VoiceFrame]:
|
||
"""
|
||
Yield enriched VoiceFrames continuously.
|
||
|
||
Stub: frames from the IO layer pass through unchanged.
|
||
Real (Navigation v0.2.x): acoustic + diarization enrichment runs here.
|
||
"""
|
||
async for frame in self._io.stream():
|
||
yield self._enrich(frame)
|
||
|
||
async def stop(self) -> None:
|
||
await self._io.stop()
|
||
self._speaker_tracker.reset()
|
||
self._dim_buffer.clear()
|
||
self._last_ser_affect.clear()
|
||
|
||
def classify_chunk(
|
||
self,
|
||
audio_b64: str | None = None,
|
||
timestamp: float = 0.0,
|
||
prior_frames: int | None = None,
|
||
elcor: bool | None = None,
|
||
session_id: str = "",
|
||
) -> list[AudioEvent]:
|
||
"""
|
||
Classify a single audio window and return all AudioEvents.
|
||
|
||
Returns a heterogeneous list containing zero or one of each:
|
||
- ToneEvent (event_type="tone")
|
||
- AudioEvent (event_type="queue")
|
||
- AudioEvent (event_type="speaker")
|
||
- AudioEvent (event_type="environ")
|
||
|
||
This is the request-response path used by the cf-orch SSE endpoint.
|
||
The streaming path (async generator) is for continuous consumers.
|
||
|
||
audio_b64 Base64-encoded PCM int16 mono 16kHz bytes.
|
||
Pass None in mock mode (ignored).
|
||
timestamp Session-relative seconds since capture started.
|
||
prior_frames Rolling context window size for Elcor LLM.
|
||
Defaults to user preference (PREF_ELCOR_PRIOR_FRAMES).
|
||
elcor Override Elcor mode for this request.
|
||
None = read from user preference (PREF_ELCOR_MODE).
|
||
session_id Caller-assigned correlation ID for the session.
|
||
"""
|
||
use_elcor = elcor if elcor is not None else is_elcor_enabled(
|
||
user_id=self._user_id, store=self._store
|
||
)
|
||
context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames(
|
||
user_id=self._user_id, store=self._store
|
||
)
|
||
|
||
if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
|
||
return self._classify_mock(timestamp, context_frames, use_elcor, session_id)
|
||
|
||
if not audio_b64:
|
||
return []
|
||
|
||
return self._classify_real(audio_b64, timestamp, use_elcor, session_id)
|
||
|
||
async def classify_chunk_async(
|
||
self,
|
||
audio_b64: str | None = None,
|
||
timestamp: float = 0.0,
|
||
prior_frames: int | None = None,
|
||
elcor: bool | None = None,
|
||
session_id: str = "",
|
||
language: str | None = None,
|
||
num_speakers: int | None = None,
|
||
) -> list[AudioEvent]:
|
||
"""
|
||
Async variant of classify_chunk.
|
||
|
||
Runs tone, STT, diarization, and acoustic classification in parallel
|
||
using asyncio.gather(). Use this from async contexts (FastAPI routes)
|
||
to get true concurrency across all four inference paths.
|
||
"""
|
||
use_elcor = elcor if elcor is not None else is_elcor_enabled(
|
||
user_id=self._user_id, store=self._store
|
||
)
|
||
context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames(
|
||
user_id=self._user_id, store=self._store
|
||
)
|
||
|
||
if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
|
||
return self._classify_mock(timestamp, context_frames, use_elcor, session_id)
|
||
|
||
if not audio_b64:
|
||
return []
|
||
|
||
# Drop frame if a classify is already in flight — GPU models serialize
|
||
# anyway, so queuing just adds latency without improving output.
|
||
if self._classify_lock.locked():
|
||
logger.debug("classify busy — dropping frame at t=%.2f", timestamp)
|
||
return []
|
||
|
||
async with self._classify_lock:
|
||
# Diarization (pyannote) can take 3–8 s on first invocations even with GPU.
|
||
# 25 s gives enough headroom without stalling the stream for too long.
|
||
try:
|
||
return await asyncio.wait_for(
|
||
self._classify_real_async(audio_b64, timestamp, use_elcor, session_id, language, num_speakers),
|
||
timeout=25.0,
|
||
)
|
||
except asyncio.TimeoutError:
|
||
logger.warning("classify_real_async timed out at t=%.2f — dropping frame", timestamp)
|
||
return []
|
||
|
||
def _classify_mock(
|
||
self,
|
||
timestamp: float,
|
||
prior_frames: int,
|
||
elcor: bool,
|
||
session_id: str,
|
||
) -> list[AudioEvent]:
|
||
"""
|
||
Synthetic multi-class event batch.
|
||
|
||
Tone event comes from the MockVoiceIO RNG (consistent seed behaviour).
|
||
Queue/speaker/environ come from MockAcousticBackend (call lifecycle simulation).
|
||
"""
|
||
rng = self._io._rng # type: ignore[attr-defined]
|
||
label = rng.choice(self._io._labels) # type: ignore[attr-defined]
|
||
shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
|
||
|
||
frame = VoiceFrame(
|
||
label=label,
|
||
confidence=rng.uniform(0.6, 0.97),
|
||
speaker_id=rng.choice(self._io._speakers), # type: ignore[attr-defined]
|
||
shift_magnitude=round(shift, 3),
|
||
timestamp=timestamp,
|
||
)
|
||
tone: ToneEvent = tone_event_from_voice_frame(
|
||
frame_label=frame.label,
|
||
frame_confidence=frame.confidence,
|
||
shift_magnitude=frame.shift_magnitude,
|
||
timestamp=frame.timestamp,
|
||
elcor=elcor,
|
||
)
|
||
tone.session_id = session_id
|
||
|
||
acoustic = self._acoustic.classify_window(b"", timestamp=timestamp)
|
||
|
||
events: list[AudioEvent] = [tone]
|
||
if acoustic.queue:
|
||
events.append(acoustic.queue)
|
||
if acoustic.speaker:
|
||
events.append(acoustic.speaker)
|
||
if acoustic.environ:
|
||
events.append(acoustic.environ)
|
||
if acoustic.scene:
|
||
events.append(acoustic.scene)
|
||
return events
|
||
|
||
def _classify_real(
|
||
self,
|
||
audio_b64: str,
|
||
timestamp: float,
|
||
elcor: bool,
|
||
session_id: str,
|
||
) -> list[AudioEvent]:
|
||
"""
|
||
Real inference path — used when CF_VOICE_MOCK is unset.
|
||
|
||
Tone: wav2vec2 SER via ToneClassifier (classify.py).
|
||
Acoustic: YAMNet via YAMNetAcousticBackend (Navigation v0.2.x stub).
|
||
Speaker: pyannote VAD (diarize.py) — merged in ContextClassifier, not here.
|
||
"""
|
||
import base64
|
||
|
||
import numpy as np
|
||
|
||
from cf_voice.classify import ToneClassifier
|
||
|
||
pcm = base64.b64decode(audio_b64)
|
||
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
|
||
|
||
if self._tone is None:
|
||
self._tone = ToneClassifier.from_env()
|
||
tone_result = self._tone.classify(audio)
|
||
|
||
frame = VoiceFrame(
|
||
label=tone_result.label,
|
||
confidence=tone_result.confidence,
|
||
speaker_id="speaker_a",
|
||
shift_magnitude=0.0,
|
||
timestamp=timestamp,
|
||
)
|
||
tone: ToneEvent = tone_event_from_voice_frame(
|
||
frame_label=frame.label,
|
||
frame_confidence=frame.confidence,
|
||
shift_magnitude=frame.shift_magnitude,
|
||
timestamp=frame.timestamp,
|
||
elcor=elcor,
|
||
)
|
||
tone.session_id = session_id
|
||
|
||
events: list[AudioEvent] = [tone]
|
||
|
||
# Acoustic events: Navigation v0.2.x (YAMNet not yet implemented)
|
||
# YAMNetAcousticBackend raises NotImplementedError at construction —
|
||
# we catch and log rather than failing the entire classify call.
|
||
try:
|
||
acoustic = self._acoustic.classify_window(audio.tobytes(), timestamp=timestamp)
|
||
if acoustic.queue:
|
||
events.append(acoustic.queue)
|
||
if acoustic.speaker:
|
||
events.append(acoustic.speaker)
|
||
if acoustic.environ:
|
||
events.append(acoustic.environ)
|
||
if acoustic.scene:
|
||
events.append(acoustic.scene)
|
||
except NotImplementedError:
|
||
pass
|
||
|
||
return events
|
||
|
||
def _load_stt(self) -> "WhisperSTT | None":
|
||
"""Lazy-load WhisperSTT once. Returns None if unavailable or disabled."""
|
||
if self._stt_loaded:
|
||
return self._stt
|
||
self._stt_loaded = True
|
||
if os.environ.get("CF_VOICE_STT", "1") != "1":
|
||
model_status["stt"] = "disabled"
|
||
return None
|
||
model_status["stt"] = "loading"
|
||
try:
|
||
from cf_voice.stt import WhisperSTT
|
||
self._stt = WhisperSTT.from_env()
|
||
model_status["stt"] = "ready"
|
||
logger.info("WhisperSTT loaded (model=%s)", os.environ.get("CF_VOICE_WHISPER_MODEL", "small"))
|
||
except Exception as exc:
|
||
model_status["stt"] = "error"
|
||
logger.warning("WhisperSTT unavailable: %s", exc)
|
||
return self._stt
|
||
|
||
def _load_diarizer(self) -> "Diarizer | None":
|
||
"""Lazy-load Diarizer once. Returns None if HF_TOKEN absent or CF_VOICE_DIARIZE!=1."""
|
||
if self._diarizer_loaded:
|
||
return self._diarizer
|
||
self._diarizer_loaded = True
|
||
if os.environ.get("CF_VOICE_DIARIZE", "0") != "1":
|
||
model_status["diarizer"] = "disabled"
|
||
return None
|
||
model_status["diarizer"] = "loading"
|
||
try:
|
||
from cf_voice.diarize import Diarizer
|
||
self._diarizer = Diarizer.from_env()
|
||
model_status["diarizer"] = "ready"
|
||
logger.info("Diarizer loaded")
|
||
except Exception as exc:
|
||
model_status["diarizer"] = "error"
|
||
logger.warning("Diarizer unavailable: %s", exc)
|
||
return self._diarizer
|
||
|
||
def _load_dimensional(self) -> "DimensionalClassifier | None":
|
||
"""Lazy-load DimensionalClassifier once. Returns None if CF_VOICE_DIMENSIONAL!=1."""
|
||
if self._dimensional_loaded:
|
||
return self._dimensional
|
||
self._dimensional_loaded = True
|
||
if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
|
||
model_status["dimensional"] = "disabled"
|
||
return None
|
||
model_status["dimensional"] = "loading"
|
||
try:
|
||
from cf_voice.dimensional import DimensionalClassifier
|
||
self._dimensional = DimensionalClassifier()
|
||
model_status["dimensional"] = "ready"
|
||
logger.info("DimensionalClassifier loaded (audeering VAD model)")
|
||
except Exception as exc:
|
||
model_status["dimensional"] = "error"
|
||
logger.warning("DimensionalClassifier unavailable: %s", exc)
|
||
return self._dimensional
|
||
|
||
def _load_accent(self) -> "MockAccentClassifier | AccentClassifier | None":
|
||
"""Lazy-load AccentClassifier once. Returns None if CF_VOICE_ACCENT!=1."""
|
||
if self._accent_loaded:
|
||
return self._accent
|
||
self._accent_loaded = True
|
||
from cf_voice.accent import make_accent_classifier
|
||
result = make_accent_classifier(
|
||
mock=isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1"
|
||
)
|
||
self._accent = result
|
||
if result is None:
|
||
model_status["accent"] = "disabled"
|
||
else:
|
||
model_status["accent"] = "ready"
|
||
logger.info("AccentClassifier loaded (mock=%s)", isinstance(result, type(result).__mro__[0]))
|
||
return self._accent
|
||
|
||
def _load_prosodic(self) -> "ProsodicExtractor | None":
|
||
"""Lazy-load ProsodicExtractor once. Returns None if CF_VOICE_PROSODY!=1."""
|
||
if self._prosodic_loaded:
|
||
return self._prosodic
|
||
self._prosodic_loaded = True
|
||
if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
|
||
model_status["prosody"] = "disabled"
|
||
return None
|
||
model_status["prosody"] = "loading"
|
||
try:
|
||
from cf_voice.prosody import ProsodicExtractor
|
||
self._prosodic = ProsodicExtractor()
|
||
model_status["prosody"] = "ready"
|
||
logger.info("ProsodicExtractor loaded (openSMILE eGeMAPS)")
|
||
except Exception as exc:
|
||
model_status["prosody"] = "error"
|
||
logger.warning("ProsodicExtractor unavailable: %s", exc)
|
||
return self._prosodic
|
||
|
||
async def prewarm(self) -> None:
|
||
"""Pre-load all configured models in a thread-pool so downloads happen at
|
||
startup rather than on the first classify call. Safe to call multiple times
|
||
(each _load_* method is idempotent after the first call)."""
|
||
if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
|
||
return
|
||
loop = asyncio.get_running_loop()
|
||
# Load each model in its own executor slot so status updates are visible
|
||
# as each one completes rather than all at once.
|
||
await loop.run_in_executor(None, self._load_stt)
|
||
await loop.run_in_executor(None, self._load_diarizer)
|
||
await loop.run_in_executor(None, self._load_dimensional)
|
||
await loop.run_in_executor(None, self._load_prosodic)
|
||
logger.info("cf-voice prewarm complete: %s", model_status)
|
||
|
||
    async def _classify_real_async(
        self,
        audio_b64: str,
        timestamp: float,
        elcor: bool,
        session_id: str,
        language: str | None = None,
        num_speakers: int | None = None,
    ) -> list[AudioEvent]:
        """
        Real inference path running all classifiers in parallel.

        Tone (wav2vec2) + STT (Whisper) + Diarization (pyannote, optional) +
        Acoustic (AST) all run concurrently via asyncio.gather().  Each result
        is type-checked after gather — a single classifier failure does not
        abort the call.

        Transcript text is fed back to ToneClassifier as a weak signal (e.g.
        "unfortunately" biases toward apologetic).  Diarizer output sets the
        speaker_id on the VoiceFrame.

        Returns [tone_event, transcript?, queue?, speaker?, environ?, scene?,
        accent?] — optional events are appended only when their classifier
        produced one.  Returns [] when the tone classifier itself failed.
        """
        import base64
        from functools import partial

        import numpy as np

        from cf_voice.classify import ToneClassifier, _apply_transcript_hints, _AFFECT_TO_LABEL

        pcm = base64.b64decode(audio_b64)
        # int16 → float32 in [-1, 1) — the scale the downstream models expect.
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0

        # Lazy-load models on first real call.  Each _load_* is idempotent.
        if self._tone is None:
            self._tone = ToneClassifier.from_env()
        stt = self._load_stt()
        diarizer = self._load_diarizer()
        dimensional = self._load_dimensional()
        prosodic = self._load_prosodic()
        accent_clf = self._load_accent()

        # Build coroutines — all run in thread pool executors internally.
        # Disabled classifiers are replaced by no-op coroutines so the gather
        # tuple keeps a fixed arity and unpack order.
        tone_coro = self._tone.classify_async(audio)
        stt_coro = stt.transcribe_chunk_async(pcm, language=language) if stt else _noop_stt()
        diarize_coro = diarizer.diarize_async(audio, num_speakers=num_speakers) if diarizer else _noop_diarize()
        loop = asyncio.get_running_loop()
        # NOTE(review): timestamp is passed positionally here — confirm that
        # classify_window's second positional parameter is the timestamp.
        acoustic_coro = loop.run_in_executor(
            None, partial(self._acoustic.classify_window, audio.tobytes(), timestamp)
        )
        dimensional_coro = dimensional.classify_async(audio) if dimensional else _noop_stt()
        prosodic_coro = prosodic.extract_async(audio) if prosodic else _noop_stt()
        accent_coro = loop.run_in_executor(
            None, partial(accent_clf.classify, audio.tobytes())
        ) if accent_clf else _noop_stt()

        # return_exceptions=True: a failing classifier yields its exception in
        # the tuple rather than cancelling its siblings.
        (
            tone_result, stt_result, diarize_segs, acoustic,
            dimensional_result, prosodic_result, accent_result,
        ) = await asyncio.gather(
            tone_coro, stt_coro, diarize_coro, acoustic_coro,
            dimensional_coro, prosodic_coro, accent_coro,
            return_exceptions=True,
        )

        # Extract transcript text (STT optional).
        transcript = ""
        if stt_result and not isinstance(stt_result, BaseException):
            transcript = stt_result.text  # type: ignore[union-attr]

        # Apply transcript weak signal to affect if STT produced text.
        # ToneResult is rebuilt (not mutated) when the hinting changes affect.
        if transcript and not isinstance(tone_result, BaseException):
            new_affect = _apply_transcript_hints(tone_result.affect, transcript)  # type: ignore[union-attr]
            if new_affect != tone_result.affect:  # type: ignore[union-attr]
                from cf_voice.classify import ToneResult
                tone_result = ToneResult(  # type: ignore[assignment]
                    label=_AFFECT_TO_LABEL.get(new_affect, tone_result.label),  # type: ignore[union-attr]
                    affect=new_affect,
                    confidence=tone_result.confidence,  # type: ignore[union-attr]
                    prosody_flags=tone_result.prosody_flags,  # type: ignore[union-attr]
                )

        # Get speaker_id from diarization (falls back to "speaker_a").
        speaker_id = "speaker_a"
        if isinstance(diarize_segs, BaseException):
            logger.warning("Diarizer failed in gather: %s", diarize_segs)
        elif diarizer and diarize_segs is not None:
            # Midpoint of the window in seconds (mono 16 kHz samples).
            window_mid = len(audio) / 2.0 / 16_000.0
            speaker_id = diarizer.speaker_at(  # type: ignore[arg-type]
                diarize_segs, window_mid, tracker=self._speaker_tracker
            )
            logger.debug("diarize: segs=%d speaker=%s mid=%.3f", len(diarize_segs), speaker_id, window_mid)

        # Tone is the one mandatory classifier — without it there is no frame.
        if isinstance(tone_result, BaseException):
            logger.warning("Tone classifier failed: %s", tone_result)
            return []

        # Unpack dimensional result (None when classifier is disabled or failed).
        dim = None
        if dimensional_result and not isinstance(dimensional_result, BaseException):
            dim = dimensional_result

        # Unpack prosodic result.  If dimensional is also available, pass the
        # calm-positive score so sarcasm_risk benefits from both signals.
        pros = None
        if prosodic_result and not isinstance(prosodic_result, BaseException):
            if dim is not None:
                # Re-compute sarcasm_risk with dimensional context.
                from cf_voice.prosody import _compute_sarcasm_risk
                calm_pos = dim.calm_positive_score()
                updated_risk = _compute_sarcasm_risk(
                    flat_f0=prosodic_result.flat_f0_score,  # type: ignore[union-attr]
                    calm_positive=calm_pos,
                )
                from cf_voice.prosody import ProsodicSignal
                # Rebuild the signal immutably with the updated risk value.
                pros = ProsodicSignal(
                    f0_mean=prosodic_result.f0_mean,  # type: ignore[union-attr]
                    f0_std=prosodic_result.f0_std,  # type: ignore[union-attr]
                    jitter=prosodic_result.jitter,  # type: ignore[union-attr]
                    shimmer=prosodic_result.shimmer,  # type: ignore[union-attr]
                    loudness=prosodic_result.loudness,  # type: ignore[union-attr]
                    flat_f0_score=prosodic_result.flat_f0_score,  # type: ignore[union-attr]
                    sarcasm_risk=updated_risk,
                )
            else:
                pros = prosodic_result

        frame = VoiceFrame(
            label=tone_result.label,  # type: ignore[union-attr]
            confidence=tone_result.confidence,  # type: ignore[union-attr]
            speaker_id=speaker_id,
            shift_magnitude=0.0,
            timestamp=timestamp,
            valence=dim.valence if dim else None,
            arousal=dim.arousal if dim else None,
            dominance=dim.dominance if dim else None,
            sarcasm_risk=pros.sarcasm_risk if pros else None,
            flat_f0_score=pros.flat_f0_score if pros else None,
        )
        tone_event: ToneEvent = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        tone_event.session_id = session_id
        tone_event.speaker_id = speaker_id
        # Attach dimensional and prosodic results to the wire event.
        tone_event.valence = frame.valence
        tone_event.arousal = frame.arousal
        tone_event.dominance = frame.dominance
        tone_event.sarcasm_risk = frame.sarcasm_risk
        tone_event.flat_f0_score = frame.flat_f0_score

        # Trajectory and coherence signals — only when dimensional is running.
        if dim:
            from collections import deque as _deque
            from cf_voice.trajectory import compute_trajectory

            # Per-speaker rolling buffer of dimensional results.
            spk_buffer = self._dim_buffer.setdefault(
                speaker_id, _deque(maxlen=self._buffer_window)
            )
            prior_affect = self._last_ser_affect.get(speaker_id)
            traj, coher = compute_trajectory(
                spk_buffer, dim, tone_result.affect, prior_affect  # type: ignore[union-attr]
            )
            # Update buffer and affect history after computing (not before),
            # so the current window is compared against past windows only.
            spk_buffer.append(dim)
            self._last_ser_affect[speaker_id] = tone_result.affect  # type: ignore[union-attr]

            # Deltas/trend are meaningless until a baseline exists.
            tone_event.arousal_delta = traj.arousal_delta if traj.baseline_established else None
            tone_event.valence_delta = traj.valence_delta if traj.baseline_established else None
            tone_event.trend = traj.trend if traj.baseline_established else None
            tone_event.coherence_score = coher.coherence_score
            tone_event.suppression_flag = coher.suppression_flag
            tone_event.reframe_type = coher.reframe_type if coher.reframe_type != "none" else None
            tone_event.affect_divergence = coher.affect_divergence

            logger.debug(
                "Dimensional: valence=%.3f arousal=%.3f dominance=%.3f quadrant=%s "
                "trend=%s coherence=%.2f suppressed=%s reframe=%s",
                dim.valence, dim.arousal, dim.dominance, dim.affect_quadrant(),
                traj.trend, coher.coherence_score, coher.suppression_flag, coher.reframe_type,
            )

        if pros:
            logger.debug(
                "Prosodic: flat_f0=%.3f sarcasm_risk=%.3f",
                pros.flat_f0_score, pros.sarcasm_risk,
            )

        events: list[AudioEvent] = [tone_event]

        # Emit transcript event so consumers can display live STT.
        if transcript:
            events.append(AudioEvent(
                timestamp=timestamp,
                event_type="transcript",  # type: ignore[arg-type]
                label=transcript,
                confidence=1.0,
                speaker_id=speaker_id,
            ))

        # Acoustic events (queue / speaker type / environ / scene).
        # Labels are also captured for the privacy scorer below.
        scene_label: str | None = None
        environ_labels: list[str] = []
        speaker_label: str | None = None
        if not isinstance(acoustic, BaseException):
            if acoustic.queue:  # type: ignore[union-attr]
                events.append(acoustic.queue)  # type: ignore[union-attr]
            if acoustic.speaker:  # type: ignore[union-attr]
                events.append(acoustic.speaker)  # type: ignore[union-attr]
                speaker_label = acoustic.speaker.label  # type: ignore[union-attr]
            if acoustic.environ:  # type: ignore[union-attr]
                events.append(acoustic.environ)  # type: ignore[union-attr]
                environ_labels = [acoustic.environ.label]  # type: ignore[union-attr]
            if acoustic.scene:  # type: ignore[union-attr]
                events.append(acoustic.scene)  # type: ignore[union-attr]
                scene_label = acoustic.scene.label  # type: ignore[union-attr]

        # Accent event (optional — gated by CF_VOICE_ACCENT=1).
        accent_region: str | None = None
        if accent_result and not isinstance(accent_result, BaseException):
            accent_region = accent_result.region  # type: ignore[union-attr]
            events.append(AudioEvent(
                timestamp=timestamp,
                event_type="accent",  # type: ignore[arg-type]
                label=accent_region,
                confidence=accent_result.confidence,  # type: ignore[union-attr]
                speaker_id=speaker_id,
            ))

        # Privacy risk scoring — local only, never transmitted.
        from cf_voice.privacy import score_privacy_risk
        risk = score_privacy_risk(
            scene=scene_label,
            environ_labels=environ_labels,
            speaker=speaker_label,
            accent=accent_region,
        )
        if risk.level != "low":
            logger.info(
                "privacy_risk=%s flags=%s session=%s",
                risk.level, risk.flags, session_id,
            )
            # Attach risk to the tone event so Linnet can surface the gate.
            tone_event.prosody_flags = list(tone_event.prosody_flags) + [f"privacy:{risk.level}"]

        return events
|
||
|
||
def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
|
||
"""
|
||
Apply tone classification to a raw frame (streaming path).
|
||
|
||
Stub: identity transform — returns frame unchanged.
|
||
Real (Navigation v0.2.x): replace label + confidence with classifier output.
|
||
"""
|
||
return frame
|