# cf-voice/cf_voice/io.py

# cf_voice/io.py — audio capture and VoiceFrame generation
#
# MIT licensed. This layer handles audio I/O only — no inference.
#
# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are
# emitted on a timer. Real audio capture will be added in Notation v0.1.x
# once pyannote.audio and faster-whisper are integrated.
from __future__ import annotations

import asyncio
import os
import random
import time
from abc import ABC, abstractmethod
from typing import AsyncIterator

from cf_voice.models import VoiceFrame

# Generic tone labels for the annotation stream.
# These are the underlying classifier outputs — the Elcor-style prefix format
# ("With barely concealed frustration:") is applied by the UI layer, not here.
# MockVoiceIO picks from this list at random when it is constructed without
# an explicit `labels` argument.
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]

# Default speaker ids; MockVoiceIO picks from these at random when it is
# constructed without an explicit `speakers` argument.
_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]


class VoiceIO(ABC):
    """
    Base class for all audio capture sources.

    Subclasses implement `stream()` as an async generator that yields
    VoiceFrame objects. Consumers should use:
    `async for frame in io_instance.stream(): ...`
    """

    @abstractmethod
    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield VoiceFrames continuously until stopped.

        Raises:
            NotImplementedError: if the base implementation itself is
                iterated (e.g. through `super().stream()`).
        """
        raise NotImplementedError
        # Deliberately unreachable. An `async def` without a `yield` is a
        # coroutine function, so type checkers would read this signature as
        # "awaitable that returns an AsyncIterator" rather than "async
        # generator", which contradicts the `async for` usage documented on
        # the class. The yield makes the declared type match that usage.
        yield  # pragma: no cover

    async def stop(self) -> None:
        """Signal the stream to stop. Override if cleanup is needed."""


class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame generator for development and CI.

    Emits one frame every `interval_s` seconds with a randomly chosen label,
    confidence, and speaker. No audio device is opened.

    Activated automatically when CF_VOICE_MOCK=1 is set, or instantiated
    directly in tests.

    Args:
        interval_s: Seconds to sleep before each emitted frame.
        speakers: Speaker ids to choose from. `None` or an empty list falls
            back to `_MOCK_SPEAKERS`.
        labels: Tone labels to choose from. `None` or an empty list falls
            back to `_MOCK_LABELS`.
        seed: Seed for the instance's private `random.Random`. A fixed seed
            makes the label / shift / confidence / speaker sequence
            repeatable; timestamps are still taken from the monotonic clock.
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        self._interval_s = interval_s
        self._speakers = speakers or _MOCK_SPEAKERS
        self._labels = labels or _MOCK_LABELS
        # Private RNG so seeding here never disturbs the global `random` state.
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        """
        Yield synthetic VoiceFrames until `stop()` is called.

        Each call re-arms the generator (sets the running flag) and restarts
        the timestamp origin, so `timestamp` is seconds since this call began.
        """
        self._running = True
        start = time.monotonic()
        prev_label = self._rng.choice(self._labels)

        while self._running:
            await asyncio.sleep(self._interval_s)
            # stop() may have been called while we were asleep; without this
            # re-check one extra frame would be emitted after the stop request.
            if not self._running:
                break

            label = self._rng.choice(self._labels)
            # shift_magnitude is 0 when the label repeats, otherwise a random
            # value in [0.1, 0.9].
            shift = 0.0 if label == prev_label else self._rng.uniform(0.1, 0.9)
            prev_label = label

            # Keyword arguments are evaluated in order, so the RNG draw order
            # (confidence, then speaker) is part of the seeded sequence.
            yield VoiceFrame(
                label=label,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(shift, 3),
                timestamp=round(time.monotonic() - start, 2),
            )

    async def stop(self) -> None:
        """
        Ask a running `stream()` to finish.

        The generator notices the request when it next resumes — after the
        current sleep or after the consumer asks for the next frame — and
        ends without emitting another frame.
        """
        self._running = False


def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
    device_index: int | None = None,
) -> VoiceIO:
    """
    Factory: build the VoiceIO implementation suited to the environment.

    Selection:
        mock given (not None)  → its truthiness decides
        mock is None           → mock mode iff env var CF_VOICE_MOCK equals "1"

    Mock mode returns MockVoiceIO (no GPU, mic, or HF token needed) and only
    `interval_s` is forwarded. Otherwise MicVoiceIO is returned (requires
    [inference] extras) and only `device_index` is forwarded.
    """
    if mock is None:
        wants_mock = os.environ.get("CF_VOICE_MOCK", "") == "1"
    else:
        wants_mock = mock

    if not wants_mock:
        # Imported lazily so the capture module is only loaded when a real
        # microphone source is actually requested.
        from cf_voice.capture import MicVoiceIO

        return MicVoiceIO(device_index=device_index)

    return MockVoiceIO(interval_s=interval_s)