# cf_voice/context.py — tone classification and context enrichment # # BSL 1.1 when real inference models are integrated. # Currently a passthrough stub: wraps a VoiceIO source and forwards frames. # # Real implementation (Notation v0.1.x) will: # - Run YAMNet acoustic event detection on the audio buffer # - Run wav2vec2-based SER (speech emotion recognition) # - Run librosa prosody extraction (pitch, energy, rate) # - Combine into enriched VoiceFrame label + confidence # - Support pyannote.audio speaker diarization (Navigation v0.2.x) from __future__ import annotations import os from typing import AsyncIterator from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame from cf_voice.io import MockVoiceIO, VoiceIO, make_io from cf_voice.models import VoiceFrame class ContextClassifier: """ High-level voice context classifier. Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation. In stub mode the frames pass through unchanged — the enrichment pipeline (YAMNet + wav2vec2 + librosa) is filled in incrementally. Usage ----- classifier = ContextClassifier.from_env() async for frame in classifier.stream(): print(frame.label, frame.confidence) """ def __init__(self, io: VoiceIO) -> None: self._io = io @classmethod def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier": """ Create a ContextClassifier from environment. CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed). """ io = make_io(interval_s=interval_s) return cls(io=io) @classmethod def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier": """Create a ContextClassifier backed by MockVoiceIO. Useful in tests.""" from cf_voice.io import MockVoiceIO return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed)) async def stream(self) -> AsyncIterator[VoiceFrame]: """ Yield enriched VoiceFrames continuously. Stub: frames from the IO layer pass through unchanged. Real: enrichment pipeline runs here before yield. """ async for frame in self._io.stream(): yield self._enrich(frame) async def stop(self) -> None: await self._io.stop() def classify_chunk( self, audio_b64: str, timestamp: float = 0.0, prior_frames: int = 0, elcor: bool = False, ) -> list[AudioEvent]: """ Classify a single audio chunk and return AudioEvents. This is the request-response path used by the cf-orch endpoint. The streaming path (async generator) is for continuous consumers. elcor=True switches subtext format to Mass Effect Elcor prefix style. Generic tone annotation is always present regardless of elcor flag. """ if isinstance(self._io, MockVoiceIO): return self._classify_chunk_mock(timestamp, prior_frames, elcor) return self._classify_chunk_real(audio_b64, timestamp, elcor) def _classify_chunk_mock( self, timestamp: float, prior_frames: int, elcor: bool ) -> list[AudioEvent]: """Synthetic path — used in mock mode and CI.""" rng = self._io._rng # type: ignore[attr-defined] import time as _time label = rng.choice(self._io._labels) # type: ignore[attr-defined] shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0 frame = VoiceFrame( label=label, confidence=rng.uniform(0.6, 0.97), speaker_id=rng.choice(self._io._speakers), # type: ignore[attr-defined] shift_magnitude=round(shift, 3), timestamp=timestamp, ) tone = tone_event_from_voice_frame( frame_label=frame.label, frame_confidence=frame.confidence, shift_magnitude=frame.shift_magnitude, timestamp=frame.timestamp, elcor=elcor, ) return [tone] def _classify_chunk_real( self, audio_b64: str, timestamp: float, elcor: bool ) -> list[AudioEvent]: """Real inference path — used when CF_VOICE_MOCK is unset.""" import asyncio import base64 import numpy as np from cf_voice.classify import ToneClassifier pcm = base64.b64decode(audio_b64) audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0 # ToneClassifier is stateless per-call, safe to instantiate inline classifier = ToneClassifier.from_env() tone_result = classifier.classify(audio) frame = VoiceFrame( label=tone_result.label, confidence=tone_result.confidence, speaker_id="speaker_a", shift_magnitude=0.0, timestamp=timestamp, ) event = tone_event_from_voice_frame( frame_label=frame.label, frame_confidence=frame.confidence, shift_magnitude=frame.shift_magnitude, timestamp=frame.timestamp, elcor=elcor, ) return [event] def _enrich(self, frame: VoiceFrame) -> VoiceFrame: """ Apply tone classification to a raw frame. Stub: identity transform — returns frame unchanged. Real: replace label + confidence with classifier output. """ return frame