# cf_voice/stt.py — faster-whisper STT wrapper
#
# BSL 1.1 when real inference models are integrated.
# Requires the [inference] extras: pip install cf-voice[inference]
from __future__ import annotations

import asyncio
import logging
import os
from dataclasses import dataclass
from functools import partial

import numpy as np

logger = logging.getLogger(__name__)

# Rough per-model VRAM footprint (MB); consumed by `WhisperSTT.vram_mb` for
# capacity planning only — never enforced.
_VRAM_ESTIMATES_MB: dict[str, int] = {
    "tiny": 150,
    "base": 300,
    "small": 500,
    "medium": 1500,
    "large": 3000,
    "large-v2": 3000,
    "large-v3": 3500,
}

# Minimum audio duration in seconds before attempting transcription.
# Whisper hallucinates on very short clips.
_MIN_DURATION_S = 0.3

# Whisper models consume 16 kHz mono audio; duration math and the
# transcribe_chunk_async input contract both assume this rate.
_SAMPLE_RATE = 16_000


@dataclass
class STTResult:
    """Result of transcribing one audio window."""

    text: str          # transcript; "" when the window was gated or suppressed
    language: str      # detected (or caller-hinted) language code
    duration_s: float  # duration of the input audio window in seconds
    is_final: bool     # True when the window is long/confident enough to commit


class WhisperSTT:
    """
    Async wrapper around faster-whisper for real-time chunk transcription.

    Runs transcription in a thread pool executor so it never blocks the
    event loop. No session prompt is fed back into the model: on short
    windows an initial_prompt causes phrase lock-in (the model anchors on
    prior text rather than fresh audio), so per-conversation state is
    simply cleared at conversation boundaries via reset_session().

    Usage
    -----
        stt = WhisperSTT.from_env()
        result = await stt.transcribe_chunk_async(pcm_int16_bytes)
        print(result.text)
    """

    # Known single-token hallucinations that Whisper emits on music/noise with
    # low no_speech_prob (i.e. Whisper thinks it heard speech). These are too
    # short to be real utterances in any supported language context.
    _HALLUCINATION_TOKENS: frozenset[str] = frozenset({
        "ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um",
    })

    # Suppress a transcript if it repeats unchanged across this many consecutive
    # windows — indicates Whisper is locked into a hallucination loop.
    _MAX_REPEATS = 2

    # Segments above this no_speech_prob are hallucinations (silence/music/noise).
    # faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks
    # for watching" family without cutting off genuine low-energy speech.
    _NO_SPEECH_THRESHOLD = 0.6

    def __init__(
        self,
        model_name: str = "small",
        device: str = "auto",
        compute_type: str | None = None,
    ) -> None:
        """
        Load the faster-whisper model.

        Parameters
        ----------
        model_name:
            Whisper checkpoint name (e.g. "small", "large-v3").
        device:
            "cuda", "cpu", or "auto" — auto picks CUDA when torch is
            importable and reports a GPU, else CPU.
        compute_type:
            ctranslate2 compute type; when None, defaults to "float16" on
            CUDA and "int8" on CPU.

        Raises
        ------
        ImportError
            If faster-whisper is not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is required for real STT. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        if device == "auto":
            try:
                import torch

                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                # torch absent: no way to probe for a GPU, fall back to CPU.
                device = "cpu"

        if compute_type is None:
            compute_type = "float16" if device == "cuda" else "int8"

        logger.info("Loading Whisper %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(model_name, device=device, compute_type=compute_type)
        self._device = device
        self._model_name = model_name
        # Rolling per-conversation state; cleared by reset_session().
        self._session_prompt: str = ""
        self._last_text: str = ""
        self._repeat_count: int = 0

    @classmethod
    def from_env(cls) -> "WhisperSTT":
        """Construct from CF_VOICE_WHISPER_MODEL and CF_VOICE_DEVICE env vars."""
        return cls(
            model_name=os.environ.get("CF_VOICE_WHISPER_MODEL", "small"),
            device=os.environ.get("CF_VOICE_DEVICE", "auto"),
        )

    @property
    def vram_mb(self) -> int:
        """Estimated VRAM usage in MB for this model/compute_type combination."""
        return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)

    def _transcribe_sync(
        self, audio_float32: np.ndarray, language: str | None = None
    ) -> STTResult:
        """Synchronous transcription — always call via transcribe_chunk_async."""
        duration = len(audio_float32) / _SAMPLE_RATE
        if duration < _MIN_DURATION_S:
            # Too short for a reliable transcript; skip the model entirely.
            return STTResult(text="", language="en", duration_s=duration, is_final=False)

        # Energy gate: skip Whisper entirely on silent/near-silent audio.
        # In the sidecar path there is no upstream MicVoiceIO silence gate,
        # so we must check here. RMS < 0.005 is inaudible; Whisper will
        # hallucinate "thank you" or "thanks for watching" on silence.
        rms = float(np.sqrt(np.mean(audio_float32 ** 2)))
        if rms < 0.005:
            return STTResult(text="", language="en", duration_s=duration, is_final=False)

        segments, info = self._model.transcribe(
            audio_float32,
            language=language or None,  # None = Whisper auto-detect
            # No session prompt — on short windows it causes phrase lock-in
            # (model anchors on prior text rather than fresh audio). Session
            # state is reset via reset_session() at conversation boundaries.
            initial_prompt=None,
            vad_filter=True,  # Silero VAD — skips non-speech frames
            word_timestamps=False,
            beam_size=3,
            temperature=0.0,
        )

        # Filter hallucinated segments: discard any segment where Whisper itself
        # says there is likely no speech (no_speech_prob > threshold). This is
        # the correct defense against "thank you" / music hallucinations — VAD
        # alone is insufficient because music harmonics look speech-like to Silero.
        text = " ".join(
            s.text.strip()
            for s in segments
            if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD
        ).strip()

        # Gate 1: single-token hallucinations that slip past no_speech_prob.
        if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS:
            text = ""

        # Gate 2: repetition lock — same non-empty text N windows in a row.
        if text and text == self._last_text:
            self._repeat_count += 1
            if self._repeat_count >= self._MAX_REPEATS:
                text = ""
        else:
            self._last_text = text
            self._repeat_count = 0

        return STTResult(
            text=text,
            language=info.language,
            duration_s=duration,
            is_final=duration >= 1.0 and info.language_probability > 0.5,
        )

    async def transcribe_chunk_async(
        self, pcm_int16: bytes, language: str | None = None
    ) -> STTResult:
        """
        Transcribe a raw PCM Int16 chunk, non-blocking.

        pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
        chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).

        language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects,
        which is slower and more hallucination-prone on short clips.
        """
        # Int16 PCM -> float32 in [-1.0, 1.0), the format faster-whisper expects.
        audio = np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._transcribe_sync, audio, language)
        )

    def reset_session(self) -> None:
        """Clear rolling state. Call at the start of each new conversation."""
        self._session_prompt = ""
        self._last_text = ""
        self._repeat_count = 0