New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
129 lines
4 KiB
Python
129 lines
4 KiB
Python
# cf_voice/io.py — audio capture and VoiceFrame generation
|
|
#
|
|
# MIT licensed. This layer handles audio I/O only — no inference.
|
|
#
|
|
# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are
|
|
# emitted on a timer. Real audio capture will be added in Notation v0.1.x
|
|
# once pyannote.audio and faster-whisper are integrated.
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import os
|
|
import random
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from typing import AsyncIterator
|
|
|
|
from cf_voice.models import VoiceFrame
|
|
|
|
# Generic tone labels used on the annotation stream.
# They represent the underlying classifier outputs only; the Elcor-style
# prefix format ("With barely concealed frustration:") is applied by the
# UI layer, not here.
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]

# Default speaker ids for synthetic frames.
_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]
|
|
|
|
|
|
class VoiceIO(ABC):
    """
    Abstract interface shared by every audio capture source.

    A concrete source implements `stream()` as an async generator of
    VoiceFrame objects. Consumers drain it with:
    `async for frame in io_instance.stream(): ...`
    """

    @abstractmethod
    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """Produce VoiceFrames one after another until the source is stopped."""
        ...

    async def stop(self) -> None:
        """
        Ask the stream to finish.

        The base implementation does nothing; override it when the source
        needs cleanup.
        """
        return None
|
|
|
|
|
|
class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame generator for development and CI.

    Emits one frame every `interval_s` seconds with randomised labels,
    confidence, and simulated speaker transitions.

    Activated automatically when CF_VOICE_MOCK=1 is set, or instantiated
    directly in tests.
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        """
        Configure the generator.

        interval_s: seconds slept before each emitted frame.
        speakers:   speaker ids to pick from; None or an empty list falls
                    back to _MOCK_SPEAKERS.
        labels:     tone labels to pick from; None or an empty list falls
                    back to _MOCK_LABELS.
        seed:       seed for the private random.Random instance, so the
                    random draws are reproducible for a given seed.
        """
        self._interval_s = interval_s
        self._speakers = speakers or _MOCK_SPEAKERS
        self._labels = labels or _MOCK_LABELS
        # A private RNG keeps the mock stream independent of the global
        # `random` module state.
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        """
        Yield a synthetic VoiceFrame after every `interval_s` sleep.

        The loop runs until stop() clears the running flag. Timestamps are
        seconds elapsed since this call started, measured with
        time.monotonic().
        """
        self._running = True
        start = time.monotonic()
        prev_label = self._rng.choice(self._labels)

        while self._running:
            await asyncio.sleep(self._interval_s)

            # stop() may have been called while this coroutine was asleep.
            # Honour it now rather than emitting one more frame after the
            # stop request.
            if not self._running:
                break

            label = self._rng.choice(self._labels)
            # shift_magnitude is 0 when the label repeats; otherwise it is a
            # random value drawn from [0.1, 0.9].
            shift = 0.0 if label == prev_label else self._rng.uniform(0.1, 0.9)
            prev_label = label

            yield VoiceFrame(
                label=label,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(shift, 3),
                timestamp=round(time.monotonic() - start, 2),
            )

    async def stop(self) -> None:
        """Clear the running flag so stream() ends without another frame."""
        self._running = False
|
|
|
|
|
|
def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
    device_index: int | None = None,
) -> VoiceIO:
    """
    Factory: build the VoiceIO implementation that fits the environment.

    mock=True or CF_VOICE_MOCK=1 → MockVoiceIO (no GPU, mic, or HF token needed)
    Otherwise                    → MicVoiceIO (requires [inference] extras)

    An explicit `mock` argument wins over the environment variable; the
    variable is consulted only when `mock` is None. `interval_s` is passed
    to MockVoiceIO and `device_index` to MicVoiceIO.

    Raises NotImplementedError (chained from the ImportError) when the real
    capture backend cannot be imported.
    """
    if mock is None:
        wants_mock = os.environ.get("CF_VOICE_MOCK", "") == "1"
    else:
        wants_mock = mock

    if wants_mock:
        return MockVoiceIO(interval_s=interval_s)

    # The capture module is imported lazily so that mock mode works without
    # the optional dependencies. Construction stays inside the try so an
    # ImportError raised there is reported the same way.
    try:
        from cf_voice.capture import MicVoiceIO

        mic_io = MicVoiceIO(device_index=device_index)
    except ImportError as exc:
        hint = (
            "Real audio capture requires [inference] extras. "
            "Install with: pip install cf-voice[inference]\n"
            f"Missing: {exc}"
        )
        raise NotImplementedError(hint) from exc
    return mic_io
|