feat: real inference pipeline — STT, tone classifier, diarization, mic capture

- cf_voice/stt.py: WhisperSTT async wrapper (faster-whisper, thread-pool executor,
  rolling 50-word session prompt for cross-chunk context continuity)
- cf_voice/classify.py: ToneClassifier — wav2vec2 SER + librosa prosody flags
  (energy, ZCR speech rate, YIN pitch contour) mapped to AFFECT_LABELS
- cf_voice/diarize.py: Diarizer async wrapper around pyannote/speaker-diarization-3.1;
  speaker_at() helper for Navigation v0.2.x wiring
- cf_voice/capture.py: MicVoiceIO — sounddevice 16kHz mono capture, 2s window
  accumulation, parallel STT+classify tasks, shift_magnitude from averaged
  confidence on label change
- cf_voice/io.py: make_io() now returns MicVoiceIO when CF_VOICE_MOCK is unset
- cf_voice/context.py: classify_chunk() split into mock/real paths; real path
  decodes base64 PCM and runs ToneClassifier synchronously (cf-orch endpoint)
- pyproject.toml: inference extras expanded (faster-whisper, sounddevice,
  librosa, python-dotenv)
- .env.example: HF_TOKEN, CF_VOICE_WHISPER_MODEL, CF_VOICE_DEVICE, CF_VOICE_MOCK,
  CF_VOICE_CONFIDENCE_THRESHOLD

Prior art ported from: Plex-Scripts/transcription/diarization.py (pyannote
setup), devl/ogma/backend/speech/transcription_engine.py (faster-whisper
preprocessing and session prompt pattern).
This commit is contained in:
pyr0ball 2026-04-06 17:33:51 -07:00
parent 6e17da9e93
commit fed6388b99
8 changed files with 813 additions and 19 deletions

31
.env.example Normal file
View file

@ -0,0 +1,31 @@
# cf-voice environment — copy to .env and fill in values
# cf-voice itself does not auto-load .env; consumers (Linnet, Osprey, etc.)
# load it via python-dotenv in their own startup. For standalone cf-voice
# dev/testing, source this file manually or install python-dotenv.
# ── HuggingFace ───────────────────────────────────────────────────────────────
# Required for pyannote.audio speaker diarization model download.
# Get a free token at https://huggingface.co/settings/tokens
# Also accept the gated model terms at:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
HF_TOKEN=
# ── Whisper STT ───────────────────────────────────────────────────────────────
# Model size: tiny | base | small | medium | large-v2 | large-v3
# Smaller = faster / less VRAM; larger = more accurate.
# Recommended: small (500MB VRAM) for real-time use.
CF_VOICE_WHISPER_MODEL=small
# ── Compute ───────────────────────────────────────────────────────────────────
# auto (detect GPU), cuda, cpu
CF_VOICE_DEVICE=auto
# ── Mock mode ─────────────────────────────────────────────────────────────────
# Set to 1 to use synthetic VoiceFrames — no GPU, mic, or HF token required.
# Unset or 0 for real audio capture.
CF_VOICE_MOCK=
# ── Tone classifier ───────────────────────────────────────────────────────────
# Minimum confidence to emit a VoiceFrame (below this = frame skipped).
CF_VOICE_CONFIDENCE_THRESHOLD=0.55

192
cf_voice/capture.py Normal file
View file

@ -0,0 +1,192 @@
# cf_voice/capture.py — real microphone capture
#
# MIT licensed. This layer handles audio I/O and buffering only.
# Inference (STT, classify) runs here but is imported lazily so that mock
# mode works without the [inference] extras installed.
from __future__ import annotations
import asyncio
import logging
import time
from typing import AsyncIterator
import numpy as np
from cf_voice.models import VoiceFrame
from cf_voice.io import VoiceIO
logger = logging.getLogger(__name__)
_SAMPLE_RATE = 16_000
_CHUNK_FRAMES = 1_600 # 100ms of audio at 16kHz
_WINDOW_CHUNKS = 20 # 20 × 100ms = 2s STT window
# Skip windows whose RMS is below this (microphone noise floor)
_SILENCE_RMS = 0.008
class MicVoiceIO(VoiceIO):
    """
    Real microphone capture producing enriched VoiceFrames.

    Capture loop (sounddevice callback -> asyncio.Queue):

        sounddevice InputStream
          -> 100ms PCM Int16 chunks -> _queue
          -> accumulated 20x (2s window)
          -> parallel async tasks (both run in a thread pool):
               WhisperSTT.transcribe_chunk_async() -> transcript
               ToneClassifier.classify_async()     -> ToneResult
          -> combined into
             VoiceFrame(label, confidence, speaker_id, shift_magnitude, ...)

    speaker_id is always "speaker_a" until Navigation v0.2.x wires in the
    Diarizer. That integration happens in ContextClassifier, not here.

    Requires: pip install cf-voice[inference]
    """

    def __init__(
        self,
        device_index: int | None = None,
    ) -> None:
        """
        device_index: sounddevice input device index, or None for the default.

        Raises ImportError when the [inference] extras are missing.
        """
        # Lazy import so that importing MicVoiceIO doesn't blow up without
        # inference deps — only instantiation requires them.
        try:
            from cf_voice.stt import WhisperSTT
            from cf_voice.classify import ToneClassifier
        except ImportError as exc:
            raise ImportError(
                "Real audio capture requires the [inference] extras. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        self._stt = WhisperSTT.from_env()
        self._classifier = ToneClassifier.from_env()
        self._device_index = device_index
        self._running = False
        # Bounded queue: when inference falls behind, chunks are dropped
        # (see _enqueue in stream()) instead of growing memory without bound.
        self._queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=200)
        # Previous window's tone result, used to compute shift_magnitude.
        self._prev_label: str = ""
        self._prev_confidence: float = 0.0

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        """Yield one enriched VoiceFrame per non-silent 2-second window."""
        try:
            import sounddevice as sd
        except ImportError as exc:
            raise ImportError(
                "sounddevice is required for microphone capture. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        self._running = True
        self._stt.reset_session()
        # get_running_loop(): we are inside a coroutine here, and the
        # deprecated get_event_loop() may create a fresh loop off-thread.
        loop = asyncio.get_running_loop()

        def _enqueue(pcm: bytes) -> None:
            # Executes on the event-loop thread (scheduled below), so the
            # QueueFull from put_nowait is raised HERE and can be handled.
            # Note: catching QueueFull around call_soon_threadsafe in the
            # audio callback would never fire — the put runs later, on the
            # loop, not in the sounddevice thread.
            try:
                self._queue.put_nowait(pcm)
            except asyncio.QueueFull:
                logger.warning("Audio queue full — dropping chunk (inference too slow?)")

        def _sd_callback(
            indata: np.ndarray, frames: int, time_info, status
        ) -> None:
            # Runs on sounddevice's audio thread — never touch asyncio
            # objects directly from here; hand off via call_soon_threadsafe.
            if status:
                logger.debug("sounddevice status: %s", status)
            pcm = (indata[:, 0] * 32_767).astype(np.int16).tobytes()
            loop.call_soon_threadsafe(_enqueue, pcm)

        input_stream = sd.InputStream(
            samplerate=_SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=_CHUNK_FRAMES,
            device=self._device_index,
            callback=_sd_callback,
        )
        window: list[bytes] = []
        session_start = time.monotonic()
        with input_stream:
            while self._running:
                try:
                    chunk = await asyncio.wait_for(self._queue.get(), timeout=0.5)
                except asyncio.TimeoutError:
                    # Timeout lets the loop re-check self._running so stop()
                    # takes effect promptly even with a silent microphone.
                    continue
                window.append(chunk)
                if len(window) < _WINDOW_CHUNKS:
                    continue
                # 2-second window ready
                raw = b"".join(window)
                window.clear()
                audio = (
                    np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32_768.0
                )
                # Skip silent windows (below the microphone noise floor)
                rms = float(np.sqrt(np.mean(audio ** 2)))
                if rms < _SILENCE_RMS:
                    continue
                # Run STT and tone classification in parallel
                stt_task = asyncio.create_task(
                    self._stt.transcribe_chunk_async(raw)
                )
                tone_task = asyncio.create_task(
                    self._classifier.classify_async(audio)
                )
                stt_result, tone = await asyncio.gather(stt_task, tone_task)
                # Re-classify with the transcript now that STT is done
                # (the second pass is accepted as cheap enough to do inline).
                if stt_result.text:
                    tone = await self._classifier.classify_async(
                        audio, stt_result.text
                    )
                shift = _compute_shift(
                    self._prev_label,
                    self._prev_confidence,
                    tone.label,
                    tone.confidence,
                )
                self._prev_label = tone.label
                self._prev_confidence = tone.confidence
                yield VoiceFrame(
                    label=tone.label,
                    confidence=tone.confidence,
                    speaker_id="speaker_a",  # Navigation v0.2.x: diarizer wired here
                    shift_magnitude=round(shift, 3),
                    timestamp=round(time.monotonic() - session_start, 2),
                )

    async def stop(self) -> None:
        """Signal the stream() loop to exit after its current iteration."""
        self._running = False
# ── Helpers ───────────────────────────────────────────────────────────────────
def _compute_shift(
prev_label: str,
prev_confidence: float,
curr_label: str,
curr_confidence: float,
) -> float:
"""
Compute shift_magnitude for a VoiceFrame.
0.0 when the label hasn't changed.
Higher when the label changes with high confidence in both directions.
Capped at 1.0.
"""
if not prev_label or curr_label == prev_label:
return 0.0
# A high-confidence transition in both frames = large shift
return min(1.0, (prev_confidence + curr_confidence) / 2.0)

245
cf_voice/classify.py Normal file
View file

@ -0,0 +1,245 @@
# cf_voice/classify.py — tone / affect classifier
#
# BSL 1.1: real inference. Requires [inference] extras.
# Stub behaviour: raises NotImplementedError if inference deps not installed.
#
# Pipeline: wav2vec2 SER (speech emotion recognition) + librosa prosody
# features → AFFECT_LABELS defined in cf_voice.events.
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass, field
from functools import partial
import numpy as np
logger = logging.getLogger(__name__)
_SAMPLE_RATE = 16_000
# Confidence floor — results below this are discarded by the caller
_DEFAULT_THRESHOLD = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
# wav2vec2 SER model from HuggingFace
# ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
# Outputs 7 classes: angry, disgust, fear, happy, neutral, sadness, surprise
_SER_MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
# ── Affect label mapping ──────────────────────────────────────────────────────
# Maps (emotion, prosody_profile) → affect label from cf_voice.events.AFFECT_LABELS
# Prosody profile is a tuple of flags present from _extract_prosody_flags().
# Base mapping: raw SER emotion class → affect label, applied before prosody
# and transcript adjustments. Keys match the 7-class output of the wav2vec2
# SER model named in _SER_MODEL_ID.
_EMOTION_BASE: dict[str, str] = {
    "angry": "frustrated",
    "disgust": "dismissive",
    "fear": "apologetic",
    "happy": "warm",
    "neutral": "neutral",
    "sadness": "tired",
    "surprise": "confused",
}
# Prosody-driven overrides: (base_affect, flag) → override affect
# Only the FIRST matching flag wins — ToneClassifier.classify breaks out of
# its flag loop after one override.
_PROSODY_OVERRIDES: dict[tuple[str, str], str] = {
    ("neutral", "fast_rate"): "genuine",
    ("neutral", "flat_pitch"): "scripted",
    ("neutral", "low_energy"): "tired",
    ("frustrated", "rising"): "urgent",
    ("warm", "rising"): "genuine",
    ("tired", "rising"): "optimistic",
    ("dismissive", "flat_pitch"): "condescending",
}
# Affect → human-readable VoiceFrame label (reverse of events._label_to_affect)
# Note: several affects deliberately share one label — "condescending" and
# "dismissive" both render as "Politely dismissive".
_AFFECT_TO_LABEL: dict[str, str] = {
    "neutral": "Calm and focused",
    "warm": "Enthusiastic",
    "frustrated": "Frustrated but contained",
    "dismissive": "Politely dismissive",
    "apologetic": "Nervous but cooperative",
    "urgent": "Warmly impatient",
    "condescending": "Politely dismissive",
    "scripted": "Calm and focused",  # scripted reads as neutral to the observer
    "genuine": "Genuinely curious",
    "confused": "Confused but engaged",
    "tired": "Tired and compliant",
    "optimistic": "Guardedly optimistic",
}
@dataclass
class ToneResult:
    """Result of one tone/affect classification over a single audio window."""
    label: str  # human-readable VoiceFrame label
    affect: str  # AFFECT_LABELS key
    confidence: float  # SER score of the winning emotion class
    prosody_flags: list[str] = field(default_factory=list)  # from _extract_prosody_flags()
class ToneClassifier:
    """
    Tone/affect classifier: wav2vec2 SER + librosa prosody.

    Loads the model lazily on the first classify() call to avoid import-time
    GPU allocation. The pipeline is stateless per-call; session state lives
    in the caller (ContextClassifier).
    """

    def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
        # Confidence floor — results below this are discarded by the caller.
        self._threshold = threshold
        self._pipeline = None  # lazy-loaded by _load_pipeline()

    @classmethod
    def from_env(cls) -> "ToneClassifier":
        """Construct from CF_VOICE_CONFIDENCE_THRESHOLD (default 0.55)."""
        threshold = float(os.environ.get("CF_VOICE_CONFIDENCE_THRESHOLD", "0.55"))
        return cls(threshold=threshold)

    def _load_pipeline(self) -> None:
        """Load the HF audio-classification pipeline once (idempotent)."""
        if self._pipeline is not None:
            return
        try:
            from transformers import pipeline as hf_pipeline
        except ImportError as exc:
            raise ImportError(
                "transformers is required for tone classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        device = 0 if _cuda_available() else -1
        logger.info("Loading SER model %s on device %s", _SER_MODEL_ID, device)
        self._pipeline = hf_pipeline(
            "audio-classification",
            model=_SER_MODEL_ID,
            device=device,
        )

    def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
        """
        Classify tone/affect from a 16kHz mono audio window.

        transcript is used as a weak signal for ambiguous cases (e.g. words
        like "unfortunately" bias toward apologetic even on a neutral voice).

        Input is coerced to float32 rather than asserted: `assert` is
        stripped under `python -O`, so validation must not depend on it.
        """
        self._load_pipeline()
        # Ensure the model sees float32 (no-op when already float32)
        audio_float32 = np.asarray(audio_float32, dtype=np.float32)
        # Run SER and keep the top-scoring emotion class
        preds = self._pipeline({"raw": audio_float32, "sampling_rate": _SAMPLE_RATE})
        best = max(preds, key=lambda p: p["score"])
        emotion = best["label"].lower()
        confidence = float(best["score"])
        # Extract prosody features from raw audio
        prosody_flags = _extract_prosody_flags(audio_float32)
        # Resolve affect from base emotion + prosody (first matching flag wins)
        affect = _EMOTION_BASE.get(emotion, "neutral")
        for flag in prosody_flags:
            override = _PROSODY_OVERRIDES.get((affect, flag))
            if override:
                affect = override
                break
        # Weak transcript signals
        affect = _apply_transcript_hints(affect, transcript)
        label = _AFFECT_TO_LABEL.get(affect, "Calm and focused")
        return ToneResult(
            label=label,
            affect=affect,
            confidence=confidence,
            prosody_flags=prosody_flags,
        )

    async def classify_async(
        self, audio_float32: np.ndarray, transcript: str = ""
    ) -> ToneResult:
        """classify() without blocking the event loop (thread-pool executor)."""
        # get_running_loop() instead of the deprecated get_event_loop():
        # this coroutine always runs inside an active loop.
        loop = asyncio.get_running_loop()
        fn = partial(self.classify, audio_float32, transcript)
        return await loop.run_in_executor(None, fn)
# ── Prosody helpers ───────────────────────────────────────────────────────────
def _extract_prosody_flags(audio: np.ndarray) -> list[str]:
"""
Extract lightweight prosody flags from a float32 16kHz mono window.
Returns a list of string flags consumed by _PROSODY_OVERRIDES.
"""
try:
import librosa
except ImportError:
return []
flags: list[str] = []
# Energy (RMS)
rms = float(np.sqrt(np.mean(audio ** 2)))
if rms < 0.02:
flags.append("low_energy")
elif rms > 0.15:
flags.append("high_energy")
# Speech rate approximation via zero-crossing rate
zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio)))
if zcr > 0.12:
flags.append("fast_rate")
elif zcr < 0.04:
flags.append("slow_rate")
# Pitch contour via YIN
try:
f0 = librosa.yin(
audio,
fmin=librosa.note_to_hz("C2"),
fmax=librosa.note_to_hz("C7"),
sr=_SAMPLE_RATE,
)
voiced = f0[f0 > 0]
if len(voiced) > 5:
# Rising: last quarter higher than first quarter
q = len(voiced) // 4
if q > 0 and np.mean(voiced[-q:]) > np.mean(voiced[:q]) * 1.15:
flags.append("rising")
# Flat: variance less than 15Hz
if np.std(voiced) < 15:
flags.append("flat_pitch")
except Exception:
pass # pitch extraction is best-effort
return flags
def _apply_transcript_hints(affect: str, transcript: str) -> str:
"""
Apply weak keyword signals from transcript text to adjust affect.
Only overrides when affect is already ambiguous (neutral/tired).
"""
if not transcript or affect not in ("neutral", "tired"):
return affect
t = transcript.lower()
apologetic_words = {"sorry", "apologize", "unfortunately", "afraid", "regret"}
urgent_words = {"urgent", "immediately", "asap", "right now", "critical"}
dismissive_words = {"policy", "unable to", "cannot", "not possible", "outside"}
if any(w in t for w in apologetic_words):
return "apologetic"
if any(w in t for w in urgent_words):
return "urgent"
if any(w in t for w in dismissive_words):
return "dismissive"
return affect
def _cuda_available() -> bool:
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False

View file

@ -78,26 +78,26 @@ class ContextClassifier:
This is the request-response path used by the cf-orch endpoint. This is the request-response path used by the cf-orch endpoint.
The streaming path (async generator) is for continuous consumers. The streaming path (async generator) is for continuous consumers.
Stub: audio_b64 is ignored; returns synthetic events from the mock IO.
Real: decode audio, run YAMNet + SER + pyannote, return events.
elcor=True switches subtext format to Mass Effect Elcor prefix style. elcor=True switches subtext format to Mass Effect Elcor prefix style.
Generic tone annotation is always present regardless of elcor flag. Generic tone annotation is always present regardless of elcor flag.
""" """
if not isinstance(self._io, MockVoiceIO): if isinstance(self._io, MockVoiceIO):
raise NotImplementedError( return self._classify_chunk_mock(timestamp, prior_frames, elcor)
"classify_chunk() requires mock mode. "
"Real audio inference is not yet implemented." return self._classify_chunk_real(audio_b64, timestamp, elcor)
)
# Generate a synthetic VoiceFrame to derive events from def _classify_chunk_mock(
rng = self._io._rng self, timestamp: float, prior_frames: int, elcor: bool
import time ) -> list[AudioEvent]:
label = rng.choice(self._io._labels) """Synthetic path — used in mock mode and CI."""
rng = self._io._rng # type: ignore[attr-defined]
import time as _time
label = rng.choice(self._io._labels) # type: ignore[attr-defined]
shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0 shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
frame = VoiceFrame( frame = VoiceFrame(
label=label, label=label,
confidence=rng.uniform(0.6, 0.97), confidence=rng.uniform(0.6, 0.97),
speaker_id=rng.choice(self._io._speakers), speaker_id=rng.choice(self._io._speakers), # type: ignore[attr-defined]
shift_magnitude=round(shift, 3), shift_magnitude=round(shift, 3),
timestamp=timestamp, timestamp=timestamp,
) )
@ -110,6 +110,38 @@ class ContextClassifier:
) )
return [tone] return [tone]
def _classify_chunk_real(
    self, audio_b64: str, timestamp: float, elcor: bool
) -> list[AudioEvent]:
    """
    Real inference path — used when CF_VOICE_MOCK is unset.

    Decodes base64 PCM Int16 (16kHz mono assumed — TODO confirm against
    the cf-orch endpoint contract) and runs the ToneClassifier
    synchronously, returning a single tone event.
    """
    import base64

    import numpy as np

    from cf_voice.classify import ToneClassifier

    # Cache the classifier on the instance: ToneClassifier lazy-loads its
    # HF pipeline, so re-instantiating per request would reload the SER
    # model on every call to this endpoint.
    classifier = getattr(self, "_real_classifier", None)
    if classifier is None:
        classifier = ToneClassifier.from_env()
        self._real_classifier = classifier

    pcm = base64.b64decode(audio_b64)
    audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
    tone_result = classifier.classify(audio)
    frame = VoiceFrame(
        label=tone_result.label,
        confidence=tone_result.confidence,
        speaker_id="speaker_a",
        shift_magnitude=0.0,
        timestamp=timestamp,
    )
    event = tone_event_from_voice_frame(
        frame_label=frame.label,
        frame_confidence=frame.confidence,
        shift_magnitude=frame.shift_magnitude,
        timestamp=frame.timestamp,
        elcor=elcor,
    )
    return [event]
def _enrich(self, frame: VoiceFrame) -> VoiceFrame: def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
""" """
Apply tone classification to a raw frame. Apply tone classification to a raw frame.

146
cf_voice/diarize.py Normal file
View file

@ -0,0 +1,146 @@
# cf_voice/diarize.py — async speaker diarization via pyannote.audio
#
# BSL 1.1: real inference. Requires HF_TOKEN + pyannote model access.
# Gate usage with CF_VOICE_MOCK=1 to skip entirely in dev/CI.
#
# Model used: pyannote/speaker-diarization-3.1
# Requires accepting gated model terms at:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass
import numpy as np
logger = logging.getLogger(__name__)
_DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
_SAMPLE_RATE = 16_000
@dataclass
class SpeakerSegment:
    """A speaker-labelled time range within an audio window."""

    speaker_id: str  # ephemeral local label, e.g. "SPEAKER_00"
    start_s: float  # segment start, seconds from window start
    end_s: float  # segment end, seconds from window start

    @property
    def duration_s(self) -> float:
        """Segment length in seconds (end minus start)."""
        return self.end_s - self.start_s
class Diarizer:
    """
    Async wrapper around pyannote.audio speaker diarization pipeline.

    PyAnnote's pipeline is synchronous and GPU-bound. We run it in a thread
    pool executor to avoid blocking the asyncio event loop.

    The pipeline is loaded once at construction time (model download on first
    use, then cached by HuggingFace). CUDA is used automatically if available.

    Usage
    -----
    diarizer = Diarizer.from_env()
    segments = await diarizer.diarize_async(audio_float32)
    for seg in segments:
        print(seg.speaker_id, seg.start_s, seg.end_s)

    Navigation v0.2.x wires this into ContextClassifier so that each
    VoiceFrame carries the correct speaker_id from diarization output.
    """

    def __init__(self, hf_token: str) -> None:
        """
        Load the pyannote diarization pipeline.

        Raises ImportError when pyannote.audio is not installed.
        """
        try:
            from pyannote.audio import Pipeline
        except ImportError as exc:
            raise ImportError(
                "pyannote.audio is required for speaker diarization. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        logger.info("Loading diarization pipeline %s", _DIARIZATION_MODEL)
        # NOTE(review): `use_auth_token` is the legacy parameter name; newer
        # pyannote/huggingface_hub releases prefer `token=` — confirm against
        # the pinned pyannote.audio version before upgrading.
        self._pipeline = Pipeline.from_pretrained(
            _DIARIZATION_MODEL,
            use_auth_token=hf_token,
        )
        # Move to GPU if available (best-effort; CPU works, just slower)
        try:
            import torch
            if torch.cuda.is_available():
                self._pipeline.to(torch.device("cuda"))
                logger.info("Diarization pipeline on CUDA")
        except ImportError:
            pass

    @classmethod
    def from_env(cls) -> "Diarizer":
        """
        Construct from the HF_TOKEN environment variable.

        Raises EnvironmentError when HF_TOKEN is missing or blank.
        """
        token = os.environ.get("HF_TOKEN", "").strip()
        if not token:
            raise EnvironmentError(
                "HF_TOKEN is required for speaker diarization. "
                "Set it in your .env or environment. "
                "See .env.example for setup instructions."
            )
        return cls(hf_token=token)

    def _diarize_sync(
        self, audio_float32: np.ndarray, sample_rate: int = _SAMPLE_RATE
    ) -> list[SpeakerSegment]:
        """Synchronous diarization — always call via diarize_async."""
        import torch
        # pyannote expects a (channels, samples) float32 tensor
        waveform = torch.from_numpy(audio_float32[np.newaxis, :].astype(np.float32))
        diarization = self._pipeline(
            {"waveform": waveform, "sample_rate": sample_rate}
        )
        segments: list[SpeakerSegment] = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            segments.append(
                SpeakerSegment(
                    speaker_id=speaker,
                    start_s=round(turn.start, 3),
                    end_s=round(turn.end, 3),
                )
            )
        return segments

    async def diarize_async(
        self,
        audio_float32: np.ndarray,
        sample_rate: int = _SAMPLE_RATE,
    ) -> list[SpeakerSegment]:
        """
        Diarize an audio window without blocking the event loop.

        audio_float32 should be 16kHz mono float32.
        Typical input is a 2-second window from MicVoiceIO (32000 samples).
        Returns segments ordered by start_s.
        """
        # get_running_loop() instead of the deprecated get_event_loop():
        # this coroutine always executes inside a running loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._diarize_sync, audio_float32, sample_rate
        )

    def speaker_at(
        self, segments: list[SpeakerSegment], timestamp_s: float
    ) -> str:
        """
        Return the speaker_id active at a given timestamp within the window.

        With overlapping segments the first (earliest-listed) match wins.
        Falls back to "speaker_a" if no segment covers the timestamp
        (e.g. during silence or at window boundaries).
        """
        for seg in segments:
            if seg.start_s <= timestamp_s <= seg.end_s:
                return seg.speaker_id
        return "speaker_a"

View file

@ -106,17 +106,17 @@ class MockVoiceIO(VoiceIO):
def make_io( def make_io(
mock: bool | None = None, mock: bool | None = None,
interval_s: float = 2.5, interval_s: float = 2.5,
device_index: int | None = None,
) -> VoiceIO: ) -> VoiceIO:
""" """
Factory: return a VoiceIO instance appropriate for the current environment. Factory: return a VoiceIO instance appropriate for the current environment.
mock=True or CF_VOICE_MOCK=1 MockVoiceIO (no audio hardware needed) mock=True or CF_VOICE_MOCK=1 MockVoiceIO (no GPU, mic, or HF token needed)
Otherwise real audio capture (not yet implemented) Otherwise MicVoiceIO (requires [inference] extras)
""" """
use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1" use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
if use_mock: if use_mock:
return MockVoiceIO(interval_s=interval_s) return MockVoiceIO(interval_s=interval_s)
raise NotImplementedError(
"Real audio capture is not yet implemented. " from cf_voice.capture import MicVoiceIO
"Set CF_VOICE_MOCK=1 or pass mock=True to use synthetic frames." return MicVoiceIO(device_index=device_index)
)

142
cf_voice/stt.py Normal file
View file

@ -0,0 +1,142 @@
# cf_voice/stt.py — faster-whisper STT wrapper
#
# BSL 1.1 when real inference models are integrated.
# Requires the [inference] extras: pip install cf-voice[inference]
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass
import numpy as np
logger = logging.getLogger(__name__)
# Rough VRAM footprint per model size, surfaced by WhisperSTT.vram_mb.
_VRAM_ESTIMATES_MB: dict[str, int] = {
    "tiny": 150, "base": 300, "small": 500,
    "medium": 1500, "large": 3000, "large-v2": 3000, "large-v3": 3500,
}
# Minimum audio duration in seconds before attempting transcription.
# Whisper hallucinates on very short clips.
_MIN_DURATION_S = 0.3


@dataclass
class STTResult:
    """One transcription result for a single audio window."""
    text: str  # transcribed text ("" when the window was too short)
    language: str  # detected language code ("en" default on skip)
    duration_s: float  # window length in seconds
    is_final: bool  # True when window >= 1s and language detection was confident
class WhisperSTT:
    """
    Async wrapper around faster-whisper for real-time chunk transcription.

    Runs transcription in a thread pool executor so it never blocks the
    event loop. Maintains a rolling ~50-word session prompt so each
    2-second window is decoded with the preceding text as context.

    Usage
    -----
    stt = WhisperSTT.from_env()
    result = await stt.transcribe_chunk_async(pcm_int16_bytes)
    print(result.text)
    """

    def __init__(
        self,
        model_name: str = "small",
        device: str = "auto",
        compute_type: str | None = None,
    ) -> None:
        """
        Load the faster-whisper model.

        device="auto" resolves to "cuda" when torch reports a GPU, else
        "cpu". compute_type=None picks float16 on GPU and int8 on CPU.
        Raises ImportError when the [inference] extras are missing.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is required for real STT. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        if device == "auto":
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                device = "cpu"
        if compute_type is None:
            compute_type = "float16" if device == "cuda" else "int8"
        logger.info("Loading Whisper %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(
            model_name, device=device, compute_type=compute_type
        )
        self._device = device
        self._model_name = model_name
        self._session_prompt: str = ""

    @classmethod
    def from_env(cls) -> "WhisperSTT":
        """Construct from CF_VOICE_WHISPER_MODEL and CF_VOICE_DEVICE env vars."""
        return cls(
            model_name=os.environ.get("CF_VOICE_WHISPER_MODEL", "small"),
            device=os.environ.get("CF_VOICE_DEVICE", "auto"),
        )

    @property
    def vram_mb(self) -> int:
        """Estimated VRAM usage in MB for this model (rough table lookup)."""
        return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)

    def _transcribe_sync(self, audio_float32: np.ndarray) -> STTResult:
        """Synchronous transcription — always call via transcribe_chunk_async."""
        duration = len(audio_float32) / 16_000.0
        # Whisper hallucinates on very short clips — skip them outright.
        if duration < _MIN_DURATION_S:
            return STTResult(
                text="", language="en", duration_s=duration, is_final=False
            )
        segments, info = self._model.transcribe(
            audio_float32,
            language=None,  # autodetect
            initial_prompt=self._session_prompt or None,
            vad_filter=False,  # silence gating happens upstream in MicVoiceIO
            word_timestamps=False,
            beam_size=3,
            temperature=0.0,  # deterministic decode
        )
        text = " ".join(s.text.strip() for s in segments).strip()
        # Rolling context: keep last ~50 words so the next chunk has prior text
        if text:
            words = (self._session_prompt + " " + text).split()
            self._session_prompt = " ".join(words[-50:])
        return STTResult(
            text=text,
            language=info.language,
            duration_s=duration,
            is_final=duration >= 1.0 and info.language_probability > 0.5,
        )

    async def transcribe_chunk_async(self, pcm_int16: bytes) -> STTResult:
        """
        Transcribe a raw PCM Int16 chunk, non-blocking.

        pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
        chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).
        """
        audio = (
            np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
        )
        # get_running_loop() instead of the deprecated get_event_loop():
        # this coroutine always executes inside a running loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._transcribe_sync, audio)

    def reset_session(self) -> None:
        """Clear the rolling prompt. Call at the start of each new conversation."""
        self._session_prompt = ""

View file

@ -18,12 +18,18 @@ dependencies = [
inference = [ inference = [
"torch>=2.0", "torch>=2.0",
"torchaudio>=2.0", "torchaudio>=2.0",
"numpy>=1.24",
"faster-whisper>=1.0",
"sounddevice>=0.4",
"transformers>=4.40", "transformers>=4.40",
"librosa>=0.10",
"pyannote.audio>=3.1", "pyannote.audio>=3.1",
"python-dotenv>=1.0",
] ]
dev = [ dev = [
"pytest>=8.0", "pytest>=8.0",
"pytest-asyncio>=0.23", "pytest-asyncio>=0.23",
"numpy>=1.24",
] ]
[project.scripts] [project.scripts]