cf-voice/cf_voice/context.py
pyr0ball fed6388b99 feat: real inference pipeline — STT, tone classifier, diarization, mic capture
- cf_voice/stt.py: WhisperSTT async wrapper (faster-whisper, thread-pool executor,
  rolling 50-word session prompt for cross-chunk context continuity)
- cf_voice/classify.py: ToneClassifier — wav2vec2 SER + librosa prosody flags
  (energy, ZCR speech rate, YIN pitch contour) mapped to AFFECT_LABELS
- cf_voice/diarize.py: Diarizer async wrapper around pyannote/speaker-diarization-3.1;
  speaker_at() helper for Navigation v0.2.x wiring
- cf_voice/capture.py: MicVoiceIO — sounddevice 16kHz mono capture, 2s window
  accumulation, parallel STT+classify tasks, shift_magnitude from confidence delta
- cf_voice/io.py: make_io() now returns MicVoiceIO when CF_VOICE_MOCK is unset
- cf_voice/context.py: classify_chunk() split into mock/real paths; real path
  decodes base64 PCM and runs ToneClassifier synchronously (cf-orch endpoint)
- pyproject.toml: inference extras expanded (faster-whisper, sounddevice,
  librosa, python-dotenv)
- .env.example: HF_TOKEN, CF_VOICE_WHISPER_MODEL, CF_VOICE_DEVICE, CF_VOICE_MOCK,
  CF_VOICE_CONFIDENCE_THRESHOLD

Prior art ported from: Plex-Scripts/transcription/diarization.py (pyannote
setup), devl/ogma/backend/speech/transcription_engine.py (faster-whisper
preprocessing and session prompt pattern).
2026-04-06 17:33:51 -07:00

152 lines
5.4 KiB
Python

# cf_voice/context.py — tone classification and context enrichment
#
# BSL 1.1 when real inference models are integrated.
# The streaming path is still a passthrough stub; classify_chunk() now has a
# real inference path (ToneClassifier) alongside the mock path.
#
# Remaining work (Notation v0.1.x):
# - Run YAMNet acoustic event detection on the audio buffer
# - Run wav2vec2-based SER (speech emotion recognition) on the streaming path
# - Run librosa prosody extraction (pitch, energy, rate) on the streaming path
# - Combine into enriched VoiceFrame label + confidence
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
from __future__ import annotations
import os
from typing import AsyncIterator
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
from cf_voice.models import VoiceFrame
class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
    In stub mode the frames pass through unchanged — the enrichment pipeline
    (YAMNet + wav2vec2 + librosa) is filled in incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # IO backend chosen at construction: MockVoiceIO (synthetic) or a
        # real capture source, depending on which factory built us.
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """
        Create a ContextClassifier from environment.

        CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed).
        The mock-vs-real decision itself lives in make_io().
        """
        io = make_io(interval_s=interval_s)
        return cls(io=io)

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
        # MockVoiceIO is already imported at module level; no local import needed.
        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: enrichment pipeline runs here before yield.
        """
        async for frame in self._io.stream():
            yield self._enrich(frame)

    async def stop(self) -> None:
        """Stop the underlying IO source."""
        await self._io.stop()

    def classify_chunk(
        self,
        audio_b64: str,
        timestamp: float = 0.0,
        prior_frames: int = 0,
        elcor: bool = False,
    ) -> list[AudioEvent]:
        """
        Classify a single audio chunk and return AudioEvents.

        This is the request-response path used by the cf-orch endpoint.
        The streaming path (async generator) is for continuous consumers.

        elcor=True switches subtext format to Mass Effect Elcor prefix style.
        Generic tone annotation is always present regardless of elcor flag.

        Parameters
        ----------
        audio_b64:
            Base64-encoded PCM audio (ignored on the mock path).
        timestamp:
            Timestamp to stamp onto the resulting frame/event.
        prior_frames:
            Number of previously seen frames; mock path only emits a nonzero
            shift_magnitude when prior context exists.
        elcor:
            Subtext formatting flag, forwarded to the event builder.
        """
        if isinstance(self._io, MockVoiceIO):
            return self._classify_chunk_mock(timestamp, prior_frames, elcor)
        return self._classify_chunk_real(audio_b64, timestamp, elcor)

    def _classify_chunk_mock(
        self, timestamp: float, prior_frames: int, elcor: bool
    ) -> list[AudioEvent]:
        """Synthetic path — used in mock mode and CI."""
        # Reuse the mock IO's RNG so output stays reproducible under a fixed seed.
        rng = self._io._rng  # type: ignore[attr-defined]
        label = rng.choice(self._io._labels)  # type: ignore[attr-defined]
        # A tone shift only makes sense when there is prior context to shift from.
        shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
        frame = VoiceFrame(
            label=label,
            confidence=rng.uniform(0.6, 0.97),
            speaker_id=rng.choice(self._io._speakers),  # type: ignore[attr-defined]
            shift_magnitude=round(shift, 3),
            timestamp=timestamp,
        )
        tone = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [tone]

    def _classify_chunk_real(
        self, audio_b64: str, timestamp: float, elcor: bool
    ) -> list[AudioEvent]:
        """Real inference path — used when CF_VOICE_MOCK is unset."""
        # Heavy deps imported lazily so mock mode never pays for them.
        import base64

        import numpy as np

        from cf_voice.classify import ToneClassifier

        # Decode int16 PCM and scale to float32 in [-1.0, 1.0).
        # NOTE(review): np.int16 is native byte order — assumes the producer
        # sends machine-endian PCM; confirm against the capture side.
        pcm = base64.b64decode(audio_b64)
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
        # ToneClassifier is stateless per-call, safe to instantiate inline
        classifier = ToneClassifier.from_env()
        tone_result = classifier.classify(audio)
        frame = VoiceFrame(
            label=tone_result.label,
            confidence=tone_result.confidence,
            # No diarization on this path yet (Navigation v0.2.x): fixed speaker id.
            speaker_id="speaker_a",
            # shift_magnitude needs prior-frame context; none is tracked here.
            shift_magnitude=0.0,
            timestamp=timestamp,
        )
        event = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [event]

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.

        Stub: identity transform — returns frame unchanged.
        Real: replace label + confidence with classifier output.
        """
        return frame