- cf_voice/stt.py: WhisperSTT async wrapper (faster-whisper, thread-pool executor, rolling 50-word session prompt for cross-chunk context continuity) - cf_voice/classify.py: ToneClassifier — wav2vec2 SER + librosa prosody flags (energy, ZCR speech rate, YIN pitch contour) mapped to AFFECT_LABELS - cf_voice/diarize.py: Diarizer async wrapper around pyannote/speaker-diarization-3.1; speaker_at() helper for Navigation v0.2.x wiring - cf_voice/capture.py: MicVoiceIO — sounddevice 16kHz mono capture, 2s window accumulation, parallel STT+classify tasks, shift_magnitude from confidence delta - cf_voice/io.py: make_io() now returns MicVoiceIO when CF_VOICE_MOCK is unset - cf_voice/context.py: classify_chunk() split into mock/real paths; real path decodes base64 PCM and runs ToneClassifier synchronously (cf-orch endpoint) - pyproject.toml: inference extras expanded (faster-whisper, sounddevice, librosa, python-dotenv) - .env.example: HF_TOKEN, CF_VOICE_WHISPER_MODEL, CF_VOICE_DEVICE, CF_VOICE_MOCK, CF_VOICE_CONFIDENCE_THRESHOLD Prior art ported from: Plex-Scripts/transcription/diarization.py (pyannote setup), devl/ogma/backend/speech/transcription_engine.py (faster-whisper preprocessing and session prompt pattern).
152 lines
5.4 KiB
Python
# cf_voice/context.py — tone classification and context enrichment
#
# BSL 1.1 when real inference models are integrated.
#
# The streaming path is still a passthrough stub (frames are forwarded
# unchanged); classify_chunk() additionally supports a real inference path
# used by the cf-orch endpoint.
#
# Real implementation (Notation v0.1.x) will:
# - Run YAMNet acoustic event detection on the audio buffer
# - Run wav2vec2-based SER (speech emotion recognition)
# - Run librosa prosody extraction (pitch, energy, rate)
# - Combine into enriched VoiceFrame label + confidence
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
from __future__ import annotations
|
|
|
|
import os
|
|
from typing import AsyncIterator
|
|
|
|
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
|
|
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
|
|
from cf_voice.models import VoiceFrame
|
|
|
|
|
|
class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
    In stub mode the frames pass through unchanged — the enrichment pipeline
    (YAMNet + wav2vec2 + librosa) is filled in incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The wrapped audio source (mock or real microphone capture).
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """
        Create a ContextClassifier from environment.

        CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed).

        Parameters
        ----------
        interval_s:
            Frame emission interval in seconds, forwarded to the IO layer.
        """
        io = make_io(interval_s=interval_s)
        return cls(io=io)

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
        # MockVoiceIO is already imported at module level; the previous
        # redundant local import was removed.
        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: enrichment pipeline runs here before yield.
        """
        async for frame in self._io.stream():
            yield self._enrich(frame)

    async def stop(self) -> None:
        """Stop the underlying IO source."""
        await self._io.stop()

    def classify_chunk(
        self,
        audio_b64: str,
        timestamp: float = 0.0,
        prior_frames: int = 0,
        elcor: bool = False,
    ) -> list[AudioEvent]:
        """
        Classify a single audio chunk and return AudioEvents.

        This is the request-response path used by the cf-orch endpoint.
        The streaming path (async generator) is for continuous consumers.

        elcor=True switches subtext format to Mass Effect Elcor prefix style.
        Generic tone annotation is always present regardless of elcor flag.

        Parameters
        ----------
        audio_b64:
            Base64-encoded 16-bit PCM audio. Ignored in mock mode.
        timestamp:
            Timestamp attached to the resulting frame/events.
        prior_frames:
            Frames already seen this session; in mock mode a nonzero value
            enables a synthetic tone shift.
        elcor:
            Subtext formatting flag, see above.
        """
        if isinstance(self._io, MockVoiceIO):
            return self._classify_chunk_mock(timestamp, prior_frames, elcor)

        return self._classify_chunk_real(audio_b64, timestamp, elcor)

    def _classify_chunk_mock(
        self, timestamp: float, prior_frames: int, elcor: bool
    ) -> list[AudioEvent]:
        """Synthetic path — used in mock mode and CI."""
        # Reuses the mock IO's RNG and label/speaker pools so output is
        # reproducible under a fixed seed. (The unused `import time` that
        # previously lived here was removed.)
        rng = self._io._rng  # type: ignore[attr-defined]
        label = rng.choice(self._io._labels)  # type: ignore[attr-defined]
        # No prior context means no measurable tone shift.
        shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
        frame = VoiceFrame(
            label=label,
            confidence=rng.uniform(0.6, 0.97),
            speaker_id=rng.choice(self._io._speakers),  # type: ignore[attr-defined]
            shift_magnitude=round(shift, 3),
            timestamp=timestamp,
        )
        tone = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [tone]

    def _classify_chunk_real(
        self, audio_b64: str, timestamp: float, elcor: bool
    ) -> list[AudioEvent]:
        """Real inference path — used when CF_VOICE_MOCK is unset."""
        # Heavy dependencies are imported lazily so mock mode never pays
        # for them. (The unused `import asyncio` was removed.)
        import base64

        import numpy as np

        from cf_voice.classify import ToneClassifier

        # Decode 16-bit PCM and normalize to float32 in [-1.0, 1.0).
        pcm = base64.b64decode(audio_b64)
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0

        # ToneClassifier is stateless per-call, safe to instantiate inline
        classifier = ToneClassifier.from_env()
        tone_result = classifier.classify(audio)

        # Single-speaker placeholder and zero shift until diarization
        # (Navigation v0.2.x) and shift tracking reach this path.
        frame = VoiceFrame(
            label=tone_result.label,
            confidence=tone_result.confidence,
            speaker_id="speaker_a",
            shift_magnitude=0.0,
            timestamp=timestamp,
        )
        event = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [event]

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.

        Stub: identity transform — returns frame unchanged.
        Real: replace label + confidence with classifier output.
        """
        return frame
|