cf-voice/cf_voice/stt.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

199 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# cf_voice/stt.py — faster-whisper STT wrapper
#
# BSL 1.1 when real inference models are integrated.
# Requires the [inference] extras: pip install cf-voice[inference]
from __future__ import annotations
import asyncio
import logging
import os
from dataclasses import dataclass
import numpy as np
logger = logging.getLogger(__name__)

# Rough VRAM footprint per Whisper model size, in MB. Surfaced through
# WhisperSTT.vram_mb so callers can budget GPU memory before loading.
# NOTE(review): these look like ballpark estimates, not measurements — confirm.
_VRAM_ESTIMATES_MB: dict[str, int] = {
"tiny": 150, "base": 300, "small": 500,
"medium": 1500, "large": 3000, "large-v2": 3000, "large-v3": 3500,
}
# Minimum audio duration in seconds before attempting transcription.
# Whisper hallucinates on very short clips.
_MIN_DURATION_S = 0.3
@dataclass
class STTResult:
    """Result of transcribing one audio window via WhisperSTT."""

    text: str          # transcript; "" when gated out as silence/hallucination
    language: str      # detected (or hinted) language code, e.g. "en"
    duration_s: float  # length of the analyzed audio window, in seconds
    is_final: bool     # True for full-length windows with confident language ID
class WhisperSTT:
    """
    Async wrapper around faster-whisper for real-time chunk transcription.

    Runs transcription in a thread-pool executor so it never blocks the event
    loop, and applies layered hallucination defenses on every window:

    1. a duration gate (Whisper hallucinates on very short clips),
    2. an RMS energy gate (Whisper invents "thank you" on silence),
    3. per-segment ``no_speech_prob`` filtering,
    4. a single-token blocklist plus a repetition lock that mutes a transcript
       repeated unchanged across consecutive windows.

    Usage
    -----
    stt = WhisperSTT.from_env()
    result = await stt.transcribe_chunk_async(pcm_int16_bytes)
    print(result.text)
    """

    # Known single-token hallucinations that Whisper emits on music/noise with
    # low no_speech_prob (i.e. Whisper thinks it heard speech). These are too
    # short to be real utterances in any supported language context.
    _HALLUCINATION_TOKENS: frozenset[str] = frozenset({
        "ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um",
    })

    # Suppress a transcript if it repeats unchanged across this many consecutive
    # windows — indicates Whisper is locked into a hallucination loop.
    _MAX_REPEATS = 2

    # Segments above this no_speech_prob are hallucinations (silence/music/noise).
    # faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks
    # for watching" family without cutting off genuine low-energy speech.
    _NO_SPEECH_THRESHOLD = 0.6

    def __init__(
        self,
        model_name: str = "small",
        device: str = "auto",
        compute_type: str | None = None,
    ) -> None:
        """
        Load a faster-whisper model.

        Parameters
        ----------
        model_name: Whisper size name ("tiny" … "large-v3").
        device: "cuda", "cpu", or "auto" (CUDA when torch sees a GPU).
        compute_type: faster-whisper precision; when None defaults to
            float16 on CUDA and int8 on CPU.

        Raises
        ------
        ImportError: if the [inference] extras are not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is required for real STT. "
                "Install with: pip install cf-voice[inference]"
            ) from exc
        if device == "auto":
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                # torch absent — faster-whisper (CTranslate2) still runs on CPU.
                device = "cpu"
        if compute_type is None:
            compute_type = "float16" if device == "cuda" else "int8"
        logger.info("Loading Whisper %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(
            model_name, device=device, compute_type=compute_type
        )
        self._device = device
        self._model_name = model_name
        # Rolling per-conversation state; cleared by reset_session().
        self._session_prompt: str = ""  # reserved; not fed to Whisper (see _transcribe_sync)
        self._last_text: str = ""       # previous window's transcript (repetition lock)
        self._repeat_count: int = 0     # consecutive windows with identical text

    @classmethod
    def from_env(cls) -> "WhisperSTT":
        """Construct from CF_VOICE_WHISPER_MODEL and CF_VOICE_DEVICE env vars."""
        return cls(
            model_name=os.environ.get("CF_VOICE_WHISPER_MODEL", "small"),
            device=os.environ.get("CF_VOICE_DEVICE", "auto"),
        )

    @property
    def vram_mb(self) -> int:
        """Estimated VRAM usage in MB for this model/compute_type combination."""
        # Unknown model names fall back to a mid-range estimate.
        return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)

    def _transcribe_sync(
        self, audio_float32: np.ndarray, language: str | None = None
    ) -> STTResult:
        """Synchronous transcription — always call via transcribe_chunk_async.

        audio_float32 is assumed to be 16 kHz mono samples in [-1.0, 1.0]
        (transcribe_chunk_async produces exactly that).
        """
        duration = len(audio_float32) / 16_000.0
        if duration < _MIN_DURATION_S:
            # Too short — skip. Report the caller's language hint when given,
            # rather than unconditionally claiming "en" (previous behavior).
            return STTResult(
                text="", language=language or "en", duration_s=duration, is_final=False
            )
        # Energy gate: skip Whisper entirely on silent/near-silent audio.
        # In the sidecar path there is no upstream MicVoiceIO silence gate,
        # so we must check here. RMS < 0.005 is inaudible; Whisper will
        # hallucinate "thank you" or "thanks for watching" on silence.
        rms = float(np.sqrt(np.mean(audio_float32 ** 2)))
        if rms < 0.005:
            return STTResult(
                text="", language=language or "en", duration_s=duration, is_final=False
            )
        segments, info = self._model.transcribe(
            audio_float32,
            language=language or None,  # None = Whisper auto-detect
            initial_prompt=None,  # No session prompt — on 1s windows it causes
            # phrase lock-in (model anchors on prior text
            # rather than fresh audio). Reset via reset_session()
            # at conversation boundaries instead.
            vad_filter=True,  # Silero VAD — skips non-speech frames
            word_timestamps=False,
            beam_size=3,
            temperature=0.0,
        )
        # Filter hallucinated segments: discard any segment where Whisper itself
        # says there is likely no speech (no_speech_prob > threshold). This is
        # the correct defense against "thank you" / music hallucinations — VAD
        # alone is insufficient because music harmonics look speech-like to Silero.
        text = " ".join(
            s.text.strip()
            for s in segments
            if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD
        ).strip()
        # Gate 1: single-token hallucinations that slip past no_speech_prob.
        if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS:
            text = ""
        # Gate 2: repetition lock — same non-empty text N windows in a row.
        # Counting starts on the second occurrence, so with _MAX_REPEATS = 2
        # the third (and every later) identical window is muted.
        if text and text == self._last_text:
            self._repeat_count += 1
            if self._repeat_count >= self._MAX_REPEATS:
                text = ""
        else:
            self._last_text = text
            self._repeat_count = 0
        return STTResult(
            text=text,
            language=info.language,
            duration_s=duration,
            # Final only for full-length windows where language detection is
            # reasonably confident; shorter/uncertain windows stay provisional.
            is_final=duration >= 1.0 and info.language_probability > 0.5,
        )

    async def transcribe_chunk_async(
        self, pcm_int16: bytes, language: str | None = None
    ) -> STTResult:
        """
        Transcribe a raw PCM Int16 chunk, non-blocking.

        pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
        chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).
        language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects,
        which is slower and more hallucination-prone on short clips.
        """
        if len(pcm_int16) & 1:
            # Defensive: an odd byte count means a truncated trailing sample;
            # drop the dangling byte instead of letting np.frombuffer raise.
            pcm_int16 = pcm_int16[:-1]
        audio = (
            np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
        )
        loop = asyncio.get_running_loop()
        # run_in_executor forwards positional args itself — no partial needed.
        return await loop.run_in_executor(
            None, self._transcribe_sync, audio, language
        )

    def reset_session(self) -> None:
        """Clear rolling state. Call at the start of each new conversation."""
        self._session_prompt = ""
        self._last_text = ""
        self._repeat_count = 0