New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
199 lines
7.2 KiB
Python
199 lines
7.2 KiB
Python
# cf_voice/stt.py — faster-whisper STT wrapper
#
# BSL 1.1 when real inference models are integrated.
# Requires the [inference] extras: pip install cf-voice[inference]
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
from dataclasses import dataclass
|
||
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
_VRAM_ESTIMATES_MB: dict[str, int] = {
|
||
"tiny": 150, "base": 300, "small": 500,
|
||
"medium": 1500, "large": 3000, "large-v2": 3000, "large-v3": 3500,
|
||
}
|
||
|
||
# Minimum audio duration in seconds before attempting transcription.
|
||
# Whisper hallucinates on very short clips.
|
||
_MIN_DURATION_S = 0.3
|
||
|
||
|
||
@dataclass
class STTResult:
    """Outcome of transcribing one audio chunk.

    An empty ``text`` means the chunk was skipped or suppressed (too short,
    near-silent, or caught by a hallucination gate) rather than transcribed.
    """

    # Transcript text; "" when nothing usable was heard.
    text: str
    # Language code reported by Whisper ("en" on skipped/suppressed chunks).
    language: str
    # Chunk length in seconds (computed assuming 16 kHz mono input).
    duration_s: float
    # True only when the chunk was >= 1 s long AND language detection was
    # confident (probability > 0.5) — callers may treat such text as stable.
    is_final: bool
class WhisperSTT:
    """
    Async wrapper around faster-whisper for real-time chunk transcription.

    Runs transcription in a thread pool executor so it never blocks the event
    loop. Keeps rolling per-conversation state (last transcript and a repeat
    counter) to suppress hallucination loops. Note that NO session prompt is
    fed back to Whisper: on short windows an initial_prompt causes phrase
    lock-in (see _transcribe_sync), so context continuity is intentionally
    not attempted. Call reset_session() at conversation boundaries to clear
    the rolling state.

    Usage
    -----
    stt = WhisperSTT.from_env()
    result = await stt.transcribe_chunk_async(pcm_int16_bytes)
    print(result.text)
    """

    # Known single-token hallucinations that Whisper emits on music/noise with
    # low no_speech_prob (i.e. Whisper thinks it heard speech). These are too
    # short to be real utterances in any supported language context.
    _HALLUCINATION_TOKENS: frozenset[str] = frozenset({
        "ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um",
    })

    # Suppress a transcript if it repeats unchanged across this many consecutive
    # windows — indicates Whisper is locked into a hallucination loop.
    _MAX_REPEATS = 2

    # Segments above this no_speech_prob are hallucinations (silence/music/noise).
    # faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks
    # for watching" family without cutting off genuine low-energy speech.
    _NO_SPEECH_THRESHOLD = 0.6

    def __init__(
        self,
        model_name: str = "small",
        device: str = "auto",
        compute_type: str | None = None,
    ) -> None:
        """Load the faster-whisper model.

        Parameters
        ----------
        model_name: Whisper size ("tiny" ... "large-v3").
        device: "cuda", "cpu", or "auto" (picks CUDA when torch sees a GPU).
        compute_type: ctranslate2 compute type; None selects float16 on CUDA
            and int8 on CPU.

        Raises
        ------
        ImportError: when the [inference] extras are not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is required for real STT. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        if device == "auto":
            try:
                import torch

                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                # torch absent — CPU inference still works via ctranslate2.
                device = "cpu"

        if compute_type is None:
            compute_type = "float16" if device == "cuda" else "int8"

        logger.info("Loading Whisper %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(
            model_name, device=device, compute_type=compute_type
        )
        self._device = device
        self._model_name = model_name
        # Rolling per-conversation state — cleared by reset_session().
        # _session_prompt is reserved but never fed to Whisper (see class doc).
        self._session_prompt: str = ""
        self._last_text: str = ""
        self._repeat_count: int = 0

    @classmethod
    def from_env(cls) -> "WhisperSTT":
        """Construct from CF_VOICE_WHISPER_MODEL and CF_VOICE_DEVICE env vars."""
        return cls(
            model_name=os.environ.get("CF_VOICE_WHISPER_MODEL", "small"),
            device=os.environ.get("CF_VOICE_DEVICE", "auto"),
        )

    @property
    def vram_mb(self) -> int:
        """Estimated VRAM usage in MB for this model; 1500 for unknown sizes."""
        return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)

    def _transcribe_sync(
        self, audio_float32: np.ndarray, language: str | None = None
    ) -> STTResult:
        """Synchronous transcription — always call via transcribe_chunk_async.

        Parameters
        ----------
        audio_float32: 16 kHz mono samples, expected in [-1.0, 1.0].
        language: optional language hint passed straight through to Whisper.
        """
        # Duration assumes 16 kHz mono input (see transcribe_chunk_async).
        duration = len(audio_float32) / 16_000.0

        # Too-short clips make Whisper hallucinate — skip outright.
        if duration < _MIN_DURATION_S:
            return STTResult(
                text="", language="en", duration_s=duration, is_final=False
            )

        # Energy gate: skip Whisper entirely on silent/near-silent audio.
        # In the sidecar path there is no upstream MicVoiceIO silence gate,
        # so we must check here. RMS < 0.005 is inaudible; Whisper will
        # hallucinate "thank you" or "thanks for watching" on silence.
        rms = float(np.sqrt(np.mean(audio_float32 ** 2)))
        if rms < 0.005:
            return STTResult(text="", language="en", duration_s=duration, is_final=False)

        segments, info = self._model.transcribe(
            audio_float32,
            language=language or None,  # None = Whisper auto-detect
            initial_prompt=None,  # No session prompt — on 1s windows it causes
            # phrase lock-in (model anchors on prior text
            # rather than fresh audio). Reset via reset_session()
            # at conversation boundaries instead.
            vad_filter=True,  # Silero VAD — skips non-speech frames
            word_timestamps=False,
            beam_size=3,
            temperature=0.0,
        )

        # Filter hallucinated segments: discard any segment where Whisper itself
        # says there is likely no speech (no_speech_prob > threshold). This is
        # the correct defense against "thank you" / music hallucinations — VAD
        # alone is insufficient because music harmonics look speech-like to Silero.
        text = " ".join(
            s.text.strip()
            for s in segments
            if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD
        ).strip()

        # Gate 1: single-token hallucinations that slip past no_speech_prob.
        if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS:
            text = ""

        # Gate 2: repetition lock — same non-empty text N windows in a row.
        if text and text == self._last_text:
            self._repeat_count += 1
            if self._repeat_count >= self._MAX_REPEATS:
                text = ""
        else:
            self._last_text = text
            self._repeat_count = 0

        return STTResult(
            text=text,
            language=info.language,
            duration_s=duration,
            # Final only when the window was long enough AND language
            # detection was reasonably confident.
            is_final=duration >= 1.0 and info.language_probability > 0.5,
        )

    async def transcribe_chunk_async(
        self, pcm_int16: bytes, language: str | None = None
    ) -> STTResult:
        """
        Transcribe a raw PCM Int16 chunk, non-blocking.

        pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
        chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).

        language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects,
        which is slower and more hallucination-prone on short clips.
        """
        # Int16 -> float32 in [-1.0, 1.0).
        audio = (
            np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
        )
        loop = asyncio.get_running_loop()
        # run_in_executor forwards positional args itself — no partial needed.
        return await loop.run_in_executor(
            None, self._transcribe_sync, audio, language
        )

    def reset_session(self) -> None:
        """Clear rolling state. Call at the start of each new conversation."""
        self._session_prompt = ""
        self._last_text = ""
        self._repeat_count = 0