New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
199 lines
7.2 KiB
Python
199 lines
7.2 KiB
Python
# cf_voice/stt.py — faster-whisper STT wrapper
#
# BSL 1.1 when real inference models are integrated.
# Requires the [inference] extras: pip install cf-voice[inference]
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
from dataclasses import dataclass
|
||
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
_VRAM_ESTIMATES_MB: dict[str, int] = {
|
||
"tiny": 150, "base": 300, "small": 500,
|
||
"medium": 1500, "large": 3000, "large-v2": 3000, "large-v3": 3500,
|
||
}
|
||
|
||
# Minimum audio duration in seconds before attempting transcription.
|
||
# Whisper hallucinates on very short clips.
|
||
_MIN_DURATION_S = 0.3
|
||
|
||
|
||
@dataclass
class STTResult:
    """Outcome of transcribing one audio chunk.

    An empty ``text`` means the chunk was skipped or suppressed (too short,
    near-silent, or caught by a hallucination gate) rather than transcribed.
    """

    # Transcript text; "" when nothing usable was heard.
    text: str
    # Language code reported by Whisper ("en" on skipped/suppressed chunks).
    language: str
    # Chunk length in seconds (computed assuming 16 kHz mono input).
    duration_s: float
    # True only when the chunk was >= 1 s long AND language detection was
    # confident (probability > 0.5) — callers may treat such text as stable.
    is_final: bool
class WhisperSTT:
    """
    Async wrapper around faster-whisper for real-time chunk transcription.

    Runs transcription in a thread pool executor so it never blocks the event
    loop. Keeps rolling per-conversation state (last transcript and a repeat
    counter) to suppress hallucination loops. Note that NO session prompt is
    fed back to Whisper: on short windows an initial_prompt causes phrase
    lock-in (see _transcribe_sync), so context continuity is intentionally
    not attempted. Call reset_session() at conversation boundaries to clear
    the rolling state.

    Usage
    -----
    stt = WhisperSTT.from_env()
    result = await stt.transcribe_chunk_async(pcm_int16_bytes)
    print(result.text)
    """

    # Known single-token hallucinations that Whisper emits on music/noise with
    # low no_speech_prob (i.e. Whisper thinks it heard speech). These are too
    # short to be real utterances in any supported language context.
    _HALLUCINATION_TOKENS: frozenset[str] = frozenset({
        "ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um",
    })

    # Suppress a transcript if it repeats unchanged across this many consecutive
    # windows — indicates Whisper is locked into a hallucination loop.
    _MAX_REPEATS = 2

    # Segments above this no_speech_prob are hallucinations (silence/music/noise).
    # faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks
    # for watching" family without cutting off genuine low-energy speech.
    _NO_SPEECH_THRESHOLD = 0.6

    def __init__(
        self,
        model_name: str = "small",
        device: str = "auto",
        compute_type: str | None = None,
    ) -> None:
        """Load the faster-whisper model.

        Parameters
        ----------
        model_name: Whisper size ("tiny" ... "large-v3").
        device: "cuda", "cpu", or "auto" (picks CUDA when torch sees a GPU).
        compute_type: ctranslate2 compute type; None selects float16 on CUDA
            and int8 on CPU.

        Raises
        ------
        ImportError: when the [inference] extras are not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is required for real STT. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        if device == "auto":
            try:
                import torch

                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                # torch absent — CPU inference still works via ctranslate2.
                device = "cpu"

        if compute_type is None:
            compute_type = "float16" if device == "cuda" else "int8"

        logger.info("Loading Whisper %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(
            model_name, device=device, compute_type=compute_type
        )
        self._device = device
        self._model_name = model_name
        # Rolling per-conversation state — cleared by reset_session().
        # _session_prompt is reserved but never fed to Whisper (see class doc).
        self._session_prompt: str = ""
        self._last_text: str = ""
        self._repeat_count: int = 0

    @classmethod
    def from_env(cls) -> "WhisperSTT":
        """Construct from CF_VOICE_WHISPER_MODEL and CF_VOICE_DEVICE env vars."""
        return cls(
            model_name=os.environ.get("CF_VOICE_WHISPER_MODEL", "small"),
            device=os.environ.get("CF_VOICE_DEVICE", "auto"),
        )

    @property
    def vram_mb(self) -> int:
        """Estimated VRAM usage in MB for this model; 1500 for unknown sizes."""
        return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)

    def _transcribe_sync(
        self, audio_float32: np.ndarray, language: str | None = None
    ) -> STTResult:
        """Synchronous transcription — always call via transcribe_chunk_async.

        Parameters
        ----------
        audio_float32: 16 kHz mono samples, expected in [-1.0, 1.0].
        language: optional language hint passed straight through to Whisper.
        """
        # Duration assumes 16 kHz mono input (see transcribe_chunk_async).
        duration = len(audio_float32) / 16_000.0

        # Too-short clips make Whisper hallucinate — skip outright.
        if duration < _MIN_DURATION_S:
            return STTResult(
                text="", language="en", duration_s=duration, is_final=False
            )

        # Energy gate: skip Whisper entirely on silent/near-silent audio.
        # In the sidecar path there is no upstream MicVoiceIO silence gate,
        # so we must check here. RMS < 0.005 is inaudible; Whisper will
        # hallucinate "thank you" or "thanks for watching" on silence.
        rms = float(np.sqrt(np.mean(audio_float32 ** 2)))
        if rms < 0.005:
            return STTResult(text="", language="en", duration_s=duration, is_final=False)

        segments, info = self._model.transcribe(
            audio_float32,
            language=language or None,  # None = Whisper auto-detect
            initial_prompt=None,  # No session prompt — on 1s windows it causes
            # phrase lock-in (model anchors on prior text
            # rather than fresh audio). Reset via reset_session()
            # at conversation boundaries instead.
            vad_filter=True,  # Silero VAD — skips non-speech frames
            word_timestamps=False,
            beam_size=3,
            temperature=0.0,
        )

        # Filter hallucinated segments: discard any segment where Whisper itself
        # says there is likely no speech (no_speech_prob > threshold). This is
        # the correct defense against "thank you" / music hallucinations — VAD
        # alone is insufficient because music harmonics look speech-like to Silero.
        text = " ".join(
            s.text.strip()
            for s in segments
            if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD
        ).strip()

        # Gate 1: single-token hallucinations that slip past no_speech_prob.
        if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS:
            text = ""

        # Gate 2: repetition lock — same non-empty text N windows in a row.
        if text and text == self._last_text:
            self._repeat_count += 1
            if self._repeat_count >= self._MAX_REPEATS:
                text = ""
        else:
            self._last_text = text
            self._repeat_count = 0

        return STTResult(
            text=text,
            language=info.language,
            duration_s=duration,
            # Final only when the window was long enough AND language
            # detection was reasonably confident.
            is_final=duration >= 1.0 and info.language_probability > 0.5,
        )

    async def transcribe_chunk_async(
        self, pcm_int16: bytes, language: str | None = None
    ) -> STTResult:
        """
        Transcribe a raw PCM Int16 chunk, non-blocking.

        pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
        chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).

        language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects,
        which is slower and more hallucination-prone on short clips.
        """
        # Int16 -> float32 in [-1.0, 1.0).
        audio = (
            np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
        )
        loop = asyncio.get_running_loop()
        # run_in_executor forwards positional args itself — no partial needed.
        return await loop.run_in_executor(
            None, self._transcribe_sync, audio, language
        )

    def reset_session(self) -> None:
        """Clear rolling state. Call at the start of each new conversation."""
        self._session_prompt = ""
        self._last_text = ""
        self._repeat_count = 0