# cf_voice/io.py — audio capture and VoiceFrame generation
#
# MIT licensed. This layer handles audio I/O only — no inference.
#
# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are
# emitted on a timer. Real audio capture will be added in Notation v0.1.x
# once pyannote.audio and faster-whisper are integrated.
from __future__ import annotations

import asyncio
import os
import random
import time
from abc import ABC, abstractmethod
from typing import AsyncIterator

from cf_voice.models import VoiceFrame

# Generic tone labels for the annotation stream.
# These are the underlying classifier outputs — the Elcor-style prefix format
# ("With barely concealed frustration:") is applied by the UI layer, not here.
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]

_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]


class VoiceIO(ABC):
    """
    Base class for all audio capture sources.

    Subclasses yield VoiceFrame objects from an async generator.
    Consumers should use: `async for frame in io_instance.stream(): ...`
    """

    @abstractmethod
    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield VoiceFrames continuously until stopped."""
        ...

    async def stop(self) -> None:
        """Signal the stream to stop. Override if cleanup is needed."""


class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame generator for development and CI.

    Emits one frame every `interval_s` seconds with randomised labels,
    confidence, and simulated speaker transitions.

    Activated automatically when CF_VOICE_MOCK=1 is set, or instantiated
    directly in tests. Passing `seed` makes the label / confidence / speaker
    sequence reproducible (timestamps still depend on the wall clock).
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        """
        Configure the mock stream.

        interval_s: seconds slept before each emitted frame.
        speakers:   speaker ids to pick from; None or an empty list falls
                    back to the built-in mock speakers.
        labels:     tone labels to pick from; None or an empty list falls
                    back to the built-in mock labels.
        seed:       seed for the private random generator (None = unseeded).
        """
        self._interval_s = interval_s
        # `or` (rather than `is None`) also replaces an empty list, which
        # would otherwise make random.choice() raise IndexError in stream().
        self._speakers = speakers or _MOCK_SPEAKERS
        self._labels = labels or _MOCK_LABELS
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        """
        Yield one synthetic VoiceFrame per interval until stop() is called.

        Frame timestamps are seconds elapsed since the first iteration of
        this generator, rounded to two decimals.
        """
        self._running = True
        start = time.monotonic()
        prev_label = self._rng.choice(self._labels)
        while self._running:
            await asyncio.sleep(self._interval_s)
            # stop() may have been called from another task while we slept.
            # Re-check before touching the RNG so that no frame is emitted
            # after the stream was stopped.
            if not self._running:
                break
            label = self._rng.choice(self._labels)
            # shift_magnitude is 0 when the label repeats; otherwise it is a
            # random value in [0.1, 0.9] (not derived from the label pair).
            shift = 0.0 if label == prev_label else self._rng.uniform(0.1, 0.9)
            prev_label = label
            yield VoiceFrame(
                label=label,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(shift, 3),
                timestamp=round(time.monotonic() - start, 2),
            )

    async def stop(self) -> None:
        """Clear the running flag so stream() exits at its next check."""
        self._running = False


def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
    device_index: int | None = None,
) -> VoiceIO:
    """
    Factory: return a VoiceIO instance appropriate for the current environment.

    mock=True or CF_VOICE_MOCK=1 → MockVoiceIO (no GPU, mic, or HF token needed)
    Otherwise                    → MicVoiceIO (requires [inference] extras)

    mock:         explicit override; when None the CF_VOICE_MOCK environment
                  variable decides (only the exact value "1" enables mock mode).
    interval_s:   forwarded to MockVoiceIO; unused for real capture.
    device_index: forwarded to MicVoiceIO; unused in mock mode.

    Raises NotImplementedError (chained from the ImportError) when real
    capture is requested but importing or constructing MicVoiceIO fails
    with an ImportError.
    """
    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockVoiceIO(interval_s=interval_s)

    # Imported lazily so that mock mode never needs the capture dependencies.
    try:
        from cf_voice.capture import MicVoiceIO

        return MicVoiceIO(device_index=device_index)
    except ImportError as exc:
        raise NotImplementedError(
            "Real audio capture requires [inference] extras. "
            "Install with: pip install cf-voice[inference]\n"
            f"Missing: {exc}"
        ) from exc