- events.py: AudioEvent dataclass + ToneEvent with affect, shift_magnitude, shift_direction, prosody_flags; make_subtext() for generic/Elcor formats - context.py: classify_chunk(audio_b64, timestamp, prior_frames, elcor) returns list[AudioEvent]; mock mode uses MockVoiceIO RNG, real raises NotImplementedError - ToneEvent.__post_init__ pins event_type='tone' (avoids MRO default-field ordering bug) - Elcor mode: same classifier output, Elcor speech-prefix wording; all tiers
120 lines
4.2 KiB
Python
# cf_voice/context.py — tone classification and context enrichment
#
# BSL 1.1 when real inference models are integrated.
#
# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
#
# Real implementation (Notation v0.1.x) will:
# - Run YAMNet acoustic event detection on the audio buffer
# - Run wav2vec2-based SER (speech emotion recognition)
# - Run librosa prosody extraction (pitch, energy, rate)
# - Combine into enriched VoiceFrame label + confidence
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)

from __future__ import annotations

import os
from typing import AsyncIterator

from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
from cf_voice.models import VoiceFrame


class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
    In stub mode the frames pass through unchanged — the enrichment pipeline
    (YAMNet + wav2vec2 + librosa) is filled in incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The wrapped frame source; all streaming and classification goes
        # through this single IO object.
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """
        Create a ContextClassifier from environment.

        CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed).

        Parameters
        ----------
        interval_s:
            Frame emission interval in seconds, forwarded to make_io().
        """
        io = make_io(interval_s=interval_s)
        return cls(io=io)

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
        # MockVoiceIO is imported at module level; the previous local
        # re-import was redundant and has been removed.
        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: enrichment pipeline runs here before yield.
        """
        async for frame in self._io.stream():
            yield self._enrich(frame)

    async def stop(self) -> None:
        """Stop the underlying VoiceIO source."""
        await self._io.stop()

    def classify_chunk(
        self,
        audio_b64: str,
        timestamp: float = 0.0,
        prior_frames: int = 0,
        elcor: bool = False,
    ) -> list[AudioEvent]:
        """
        Classify a single audio chunk and return AudioEvents.

        This is the request-response path used by the cf-orch endpoint.
        The streaming path (async generator) is for continuous consumers.

        Stub: audio_b64 is ignored; returns synthetic events from the mock IO.
        Real: decode audio, run YAMNet + SER + pyannote, return events.

        elcor=True switches subtext format to Mass Effect Elcor prefix style.
        Generic tone annotation is always present regardless of elcor flag.

        Raises
        ------
        NotImplementedError
            If the wrapped IO is not a MockVoiceIO (real inference pending).
        """
        if not isinstance(self._io, MockVoiceIO):
            raise NotImplementedError(
                "classify_chunk() requires mock mode. "
                "Real audio inference is not yet implemented."
            )
        # Generate a synthetic VoiceFrame to derive events from.
        # NOTE(review): this reaches into MockVoiceIO private state (_rng,
        # _labels, _speakers) — consider exposing a public sampling API on
        # MockVoiceIO so mock internals can change freely.
        rng = self._io._rng
        label = rng.choice(self._io._labels)
        # A tone shift is only meaningful when prior context exists.
        shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
        frame = VoiceFrame(
            label=label,
            confidence=rng.uniform(0.6, 0.97),
            speaker_id=rng.choice(self._io._speakers),
            shift_magnitude=round(shift, 3),
            timestamp=timestamp,
        )
        tone = tone_event_from_voice_frame(
            frame_label=frame.label,
            frame_confidence=frame.confidence,
            shift_magnitude=frame.shift_magnitude,
            timestamp=frame.timestamp,
            elcor=elcor,
        )
        return [tone]

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.

        Stub: identity transform — returns frame unchanged.
        Real: replace label + confidence with classifier output.
        """
        return frame