- cf_voice/stt.py: WhisperSTT async wrapper (faster-whisper, thread-pool executor, rolling 50-word session prompt for cross-chunk context continuity) - cf_voice/classify.py: ToneClassifier — wav2vec2 SER + librosa prosody flags (energy, ZCR speech rate, YIN pitch contour) mapped to AFFECT_LABELS - cf_voice/diarize.py: Diarizer async wrapper around pyannote/speaker-diarization-3.1; speaker_at() helper for Navigation v0.2.x wiring - cf_voice/capture.py: MicVoiceIO — sounddevice 16kHz mono capture, 2s window accumulation, parallel STT+classify tasks, shift_magnitude from confidence delta - cf_voice/io.py: make_io() now returns MicVoiceIO when CF_VOICE_MOCK is unset - cf_voice/context.py: classify_chunk() split into mock/real paths; real path decodes base64 PCM and runs ToneClassifier synchronously (cf-orch endpoint) - pyproject.toml: inference extras expanded (faster-whisper, sounddevice, librosa, python-dotenv) - .env.example: HF_TOKEN, CF_VOICE_WHISPER_MODEL, CF_VOICE_DEVICE, CF_VOICE_MOCK, CF_VOICE_CONFIDENCE_THRESHOLD Prior art ported from: Plex-Scripts/transcription/diarization.py (pyannote setup), devl/ogma/backend/speech/transcription_engine.py (faster-whisper preprocessing and session prompt pattern).
152 lines
5.4 KiB
Python
# cf_voice/context.py — tone classification and context enrichment
#
# BSL 1.1 when real inference models are integrated.
#
# The streaming path is still a passthrough stub (frames are forwarded
# unchanged); classify_chunk() additionally supports a real inference path
# used by the cf-orch endpoint.
#
# Real implementation (Notation v0.1.x) will:
# - Run YAMNet acoustic event detection on the audio buffer
# - Run wav2vec2-based SER (speech emotion recognition)
# - Run librosa prosody extraction (pitch, energy, rate)
# - Combine into enriched VoiceFrame label + confidence
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
from __future__ import annotations
|
|
|
|
import os
|
|
from typing import AsyncIterator
|
|
|
|
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
|
|
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
|
|
from cf_voice.models import VoiceFrame
|
|
|
|
|
|
class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
    In stub mode the frames pass through unchanged — the enrichment pipeline
    (YAMNet + wav2vec2 + librosa) is filled in incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The wrapped audio source (mock or real microphone capture).
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """
        Create a ContextClassifier from environment.

        CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed).

        Parameters
        ----------
        interval_s:
            Frame emission interval in seconds, forwarded to the IO layer.
        """
        io = make_io(interval_s=interval_s)
        return cls(io=io)

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
        # MockVoiceIO is already imported at module level; the previous
        # redundant local import was removed.
        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: enrichment pipeline runs here before yield.
        """
        async for frame in self._io.stream():
            yield self._enrich(frame)

    async def stop(self) -> None:
        """Stop the underlying IO source."""
        await self._io.stop()

    def classify_chunk(
        self,
        audio_b64: str,
        timestamp: float = 0.0,
        prior_frames: int = 0,
        elcor: bool = False,
    ) -> list[AudioEvent]:
        """
        Classify a single audio chunk and return AudioEvents.

        This is the request-response path used by the cf-orch endpoint.
        The streaming path (async generator) is for continuous consumers.

        elcor=True switches subtext format to Mass Effect Elcor prefix style.
        Generic tone annotation is always present regardless of elcor flag.

        Parameters
        ----------
        audio_b64:
            Base64-encoded 16-bit PCM audio. Ignored in mock mode.
        timestamp:
            Timestamp attached to the resulting frame/events.
        prior_frames:
            Frames already seen this session; in mock mode a nonzero value
            enables a synthetic tone shift.
        elcor:
            Subtext formatting flag, see above.
        """
        if isinstance(self._io, MockVoiceIO):
            return self._classify_chunk_mock(timestamp, prior_frames, elcor)

        return self._classify_chunk_real(audio_b64, timestamp, elcor)

    def _classify_chunk_mock(
        self, timestamp: float, prior_frames: int, elcor: bool
    ) -> list[AudioEvent]:
        """Synthetic path — used in mock mode and CI."""
        # Reuses the mock IO's RNG and label/speaker pools so output is
        # reproducible under a fixed seed. (The unused `import time` that
        # previously lived here was removed.)
        rng = self._io._rng  # type: ignore[attr-defined]
        label = rng.choice(self._io._labels)  # type: ignore[attr-defined]
        # No prior context means no measurable tone shift.
        shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
        frame = VoiceFrame(
            label=label,
            confidence=rng.uniform(0.6, 0.97),
            speaker_id=rng.choice(self._io._speakers),  # type: ignore[attr-defined]
            shift_magnitude=round(shift, 3),
            timestamp=timestamp,
        )
        tone = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [tone]

    def _classify_chunk_real(
        self, audio_b64: str, timestamp: float, elcor: bool
    ) -> list[AudioEvent]:
        """Real inference path — used when CF_VOICE_MOCK is unset."""
        # Heavy dependencies are imported lazily so mock mode never pays
        # for them. (The unused `import asyncio` was removed.)
        import base64

        import numpy as np

        from cf_voice.classify import ToneClassifier

        # Decode 16-bit PCM and normalize to float32 in [-1.0, 1.0).
        pcm = base64.b64decode(audio_b64)
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0

        # ToneClassifier is stateless per-call, safe to instantiate inline
        classifier = ToneClassifier.from_env()
        tone_result = classifier.classify(audio)

        # Single-speaker placeholder and zero shift until diarization
        # (Navigation v0.2.x) and shift tracking reach this path.
        frame = VoiceFrame(
            label=tone_result.label,
            confidence=tone_result.confidence,
            speaker_id="speaker_a",
            shift_magnitude=0.0,
            timestamp=timestamp,
        )
        event = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [event]

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.

        Stub: identity transform — returns frame unchanged.
        Real: replace label + confidence with classifier output.
        """
        return frame
|