# cf_voice/context.py — tone classification and context enrichment
#
# BSL 1.1 when real inference models are integrated.
# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
#
# Real implementation (Notation v0.1.x) will:
# - Run YAMNet acoustic event detection on the audio buffer
# - Run wav2vec2-based SER (speech emotion recognition)
# - Run librosa prosody extraction (pitch, energy, rate)
# - Combine into enriched VoiceFrame label + confidence
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)

from __future__ import annotations

import os
from typing import AsyncIterator

from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
from cf_voice.models import VoiceFrame


class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
    In stub mode the frames pass through unchanged — the enrichment pipeline
    (YAMNet + wav2vec2 + librosa) is filled in incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The wrapped frame source; all streaming and (mock) classification
        # is delegated to this object.
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """
        Create a ContextClassifier from environment.

        CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed).

        Parameters
        ----------
        interval_s:
            Frame emission interval, forwarded to the IO layer.
        """
        io = make_io(interval_s=interval_s)
        return cls(io=io)

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
        # MockVoiceIO is already imported at module level; the previous local
        # re-import was redundant and has been removed.
        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: enrichment pipeline runs here before yield.
        """
        async for frame in self._io.stream():
            yield self._enrich(frame)

    async def stop(self) -> None:
        """Stop the underlying IO source."""
        await self._io.stop()

    def classify_chunk(
        self,
        audio_b64: str,
        timestamp: float = 0.0,
        prior_frames: int = 0,
        elcor: bool = False,
    ) -> list[AudioEvent]:
        """
        Classify a single audio chunk and return AudioEvents.

        This is the request-response path used by the cf-orch endpoint. The
        streaming path (async generator) is for continuous consumers.

        Stub: audio_b64 is ignored; returns synthetic events from the mock IO.
        Real: decode audio, run YAMNet + SER + pyannote, return events.

        elcor=True switches subtext format to Mass Effect Elcor prefix style.
        Generic tone annotation is always present regardless of elcor flag.

        Raises
        ------
        NotImplementedError:
            When the wrapped IO is not a MockVoiceIO (real inference is not
            yet implemented).
        """
        if not isinstance(self._io, MockVoiceIO):
            raise NotImplementedError(
                "classify_chunk() requires mock mode. "
                "Real audio inference is not yet implemented."
            )

        # Generate a synthetic VoiceFrame to derive events from.
        # NOTE: reaches into MockVoiceIO private state (_rng/_labels/_speakers);
        # acceptable for the stub since this path is mock-only by the guard above.
        rng = self._io._rng
        label = rng.choice(self._io._labels)
        # The conditional deliberately skips rng.uniform() entirely when there
        # are no prior frames, so RNG state only advances when a shift is drawn.
        shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
        frame = VoiceFrame(
            label=label,
            confidence=rng.uniform(0.6, 0.97),
            speaker_id=rng.choice(self._io._speakers),
            shift_magnitude=round(shift, 3),
            timestamp=timestamp,
        )

        tone = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [tone]

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.

        Stub: identity transform — returns frame unchanged.
        Real: replace label + confidence with classifier output.
        """
        return frame