feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
This commit is contained in:
parent
335d51f02f
commit
24f04b67db
26 changed files with 3974 additions and 111 deletions
40
.env.example
40
.env.example
|
|
@ -3,14 +3,29 @@
|
||||||
# load it via python-dotenv in their own startup. For standalone cf-voice
|
# load it via python-dotenv in their own startup. For standalone cf-voice
|
||||||
# dev/testing, source this file manually or install python-dotenv.
|
# dev/testing, source this file manually or install python-dotenv.
|
||||||
|
|
||||||
# ── HuggingFace ───────────────────────────────────────────────────────────────
|
# ── HuggingFace — free tier / local use ──────────────────────────────────────
|
||||||
# Required for pyannote.audio speaker diarization model download.
|
# Used by the local diarization path (free tier, user's own machine).
|
||||||
# Get a free token at https://huggingface.co/settings/tokens
|
# Each user must:
|
||||||
# Also accept the gated model terms at:
|
# 1. Create a free account at huggingface.co
|
||||||
# https://huggingface.co/pyannote/speaker-diarization-3.1
|
# 2. Accept the gated model terms at:
|
||||||
# https://huggingface.co/pyannote/segmentation-3.0
|
# https://huggingface.co/pyannote/speaker-diarization-3.1
|
||||||
|
# https://huggingface.co/pyannote/segmentation-3.0
|
||||||
|
# 3. Generate a read token at huggingface.co/settings/tokens
|
||||||
HF_TOKEN=
|
HF_TOKEN=
|
||||||
|
|
||||||
|
# ── HuggingFace — paid tier / cf-orch backend ─────────────────────────────────
|
||||||
|
# Used by cf-orch when running diarization as a managed service on Heimdall.
|
||||||
|
# This is a CircuitForge org token — NOT the user's personal token.
|
||||||
|
#
|
||||||
|
# Prerequisites (one-time, manual — tracked in circuitforge-orch#27):
|
||||||
|
# 1. Create CircuitForge org on huggingface.co
|
||||||
|
# 2. Accept pyannote/speaker-diarization-3.1 terms under the org account
|
||||||
|
# 3. Accept pyannote/segmentation-3.0 terms under the org account
|
||||||
|
# 4. Generate a read-only org token and set it here
|
||||||
|
#
|
||||||
|
# Leave blank on local installs — HF_TOKEN above is used instead.
|
||||||
|
CF_HF_TOKEN=
|
||||||
|
|
||||||
# ── Whisper STT ───────────────────────────────────────────────────────────────
|
# ── Whisper STT ───────────────────────────────────────────────────────────────
|
||||||
# Model size: tiny | base | small | medium | large-v2 | large-v3
|
# Model size: tiny | base | small | medium | large-v2 | large-v3
|
||||||
# Smaller = faster / less VRAM; larger = more accurate.
|
# Smaller = faster / less VRAM; larger = more accurate.
|
||||||
|
|
@ -29,3 +44,16 @@ CF_VOICE_MOCK=
|
||||||
# ── Tone classifier ───────────────────────────────────────────────────────────
|
# ── Tone classifier ───────────────────────────────────────────────────────────
|
||||||
# Minimum confidence to emit a VoiceFrame (below this = frame skipped).
|
# Minimum confidence to emit a VoiceFrame (below this = frame skipped).
|
||||||
CF_VOICE_CONFIDENCE_THRESHOLD=0.55
|
CF_VOICE_CONFIDENCE_THRESHOLD=0.55
|
||||||
|
|
||||||
|
# ── Elcor annotation mode ─────────────────────────────────────────────────────
|
||||||
|
# Accessibility feature for autistic and ND users. Switches tone subtext from
|
||||||
|
# generic format ("Tone: Frustrated") to Elcor-style prefix format
|
||||||
|
# ("With barely concealed frustration:"). Opt-in, local-only.
|
||||||
|
# Overridden by cf-core preferences store when circuitforge_core is installed.
|
||||||
|
# 1 = enabled, 0 or unset = disabled (default).
|
||||||
|
CF_VOICE_ELCOR=0
|
||||||
|
|
||||||
|
# Number of prior VoiceFrames to include as context for Elcor label generation.
|
||||||
|
# Larger windows = more contextually aware annotations, higher LLM prompt cost.
|
||||||
|
# Default: 4 frames (~10 seconds of rolling context at 2.5s intervals).
|
||||||
|
CF_VOICE_ELCOR_PRIOR_FRAMES=4
|
||||||
|
|
|
||||||
72
README.md
72
README.md
|
|
@ -126,6 +126,64 @@ Host apps subscribing via `<LinnetWidget />` receive `MessageEvent` with `type =
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Telephony
|
||||||
|
|
||||||
|
`cf_voice.telephony` provides the outbound call abstraction for Osprey, Harrier, Ibis, and Kestrel.
|
||||||
|
|
||||||
|
### Quick start
|
||||||
|
|
||||||
|
```python
|
||||||
|
from cf_voice.telephony import make_telephony
|
||||||
|
|
||||||
|
# Mock mode — no real calls placed (CF_VOICE_MOCK=1 or mock=True)
|
||||||
|
backend = make_telephony(mock=True)
|
||||||
|
|
||||||
|
session = await backend.dial(
|
||||||
|
to="+15551234567",
|
||||||
|
from_="+18005550000",
|
||||||
|
webhook_url="https://yourapp.example.com/voice/events",
|
||||||
|
amd=True, # answering machine detection
|
||||||
|
)
|
||||||
|
|
||||||
|
# Adaptive service identification (osprey#21)
|
||||||
|
await backend.announce(session.call_sid, "This is an automated assistant.")
|
||||||
|
|
||||||
|
# Navigate IVR
|
||||||
|
await backend.send_dtmf(session.call_sid, "2") # Press 2 for billing
|
||||||
|
|
||||||
|
# Bridge to user's phone once human agent answers
|
||||||
|
await backend.bridge(session.call_sid, "+14155550100")
|
||||||
|
|
||||||
|
await backend.hangup(session.call_sid)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Backend selection
|
||||||
|
|
||||||
|
`make_telephony()` resolves the backend in this order:
|
||||||
|
|
||||||
|
| Condition | Backend |
|
||||||
|
|---|---|
|
||||||
|
| `CF_VOICE_MOCK=1` or `mock=True` | `MockTelephonyBackend` (dev/CI) |
|
||||||
|
| `CF_SW_PROJECT_ID` env set | `SignalWireBackend` (paid tier) |
|
||||||
|
| `CF_ESL_PASSWORD` env set | `FreeSWITCHBackend` (free tier, self-hosted) |
|
||||||
|
| none | `RuntimeError` |
|
||||||
|
|
||||||
|
### Installing real backends
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Paid tier — SignalWire managed telephony
|
||||||
|
pip install cf-voice[signalwire]
|
||||||
|
|
||||||
|
# Free tier — self-hosted FreeSWITCH (requires compiled ESL bindings)
|
||||||
|
pip install cf-voice[freeswitch]
|
||||||
|
```
|
||||||
|
|
||||||
|
Set credentials in `.env` (see `.env.example`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Mock mode
|
## Mock mode
|
||||||
|
|
||||||
Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `VoiceFrame` objects on a timer. No GPU, microphone, or `HF_TOKEN` required. All API surface is identical to real mode.
|
Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `VoiceFrame` objects on a timer. No GPU, microphone, or `HF_TOKEN` required. All API surface is identical to real mode.
|
||||||
|
|
@ -139,6 +197,7 @@ Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `Voice
|
||||||
| `cf_voice.models` | MIT | `VoiceFrame` dataclass |
|
| `cf_voice.models` | MIT | `VoiceFrame` dataclass |
|
||||||
| `cf_voice.events` | MIT | `AudioEvent`, `ToneEvent`, wire format types |
|
| `cf_voice.events` | MIT | `AudioEvent`, `ToneEvent`, wire format types |
|
||||||
| `cf_voice.io` | MIT | `VoiceIO` base, `MockVoiceIO`, `make_io()` factory |
|
| `cf_voice.io` | MIT | `VoiceIO` base, `MockVoiceIO`, `make_io()` factory |
|
||||||
|
| `cf_voice.telephony` | MIT (Protocol + Mock), BSL (backends) | `TelephonyBackend` Protocol, `MockTelephonyBackend`, `SignalWireBackend`, `FreeSWITCHBackend`, `make_telephony()` |
|
||||||
| `cf_voice.capture` | BSL 1.1 | `MicVoiceIO` — real mic capture, 2s windowing |
|
| `cf_voice.capture` | BSL 1.1 | `MicVoiceIO` — real mic capture, 2s windowing |
|
||||||
| `cf_voice.stt` | BSL 1.1 | `WhisperSTT` — faster-whisper async wrapper |
|
| `cf_voice.stt` | BSL 1.1 | `WhisperSTT` — faster-whisper async wrapper |
|
||||||
| `cf_voice.classify` | BSL 1.1 | `ToneClassifier` — wav2vec2 SER + librosa prosody |
|
| `cf_voice.classify` | BSL 1.1 | `ToneClassifier` — wav2vec2 SER + librosa prosody |
|
||||||
|
|
@ -149,6 +208,19 @@ BSL applies to inference modules. IO + types + wire format = MIT.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
Speaker diarization uses [pyannote.audio](https://github.com/pyannote/pyannote-audio) (MIT) and the following gated HuggingFace models (CC BY 4.0):
|
||||||
|
|
||||||
|
- `pyannote/speaker-diarization-3.1` — Hervé Bredin et al.
|
||||||
|
- `pyannote/segmentation-3.0` — Hervé Bredin et al.
|
||||||
|
|
||||||
|
CC BY 4.0 requires attribution in any distributed product. The models are gated: each user must accept the license terms on HuggingFace before their `HF_TOKEN` will authorize a download.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Consumed by
|
## Consumed by
|
||||||
|
|
||||||
- `Circuit-Forge/linnet` — real-time tone annotation PWA (primary consumer)
|
- `Circuit-Forge/linnet` — real-time tone annotation PWA (primary consumer)
|
||||||
|
|
|
||||||
152
cf_voice/accent.py
Normal file
152
cf_voice/accent.py
Normal file
|
|
@ -0,0 +1,152 @@
|
||||||
|
# cf_voice/accent.py — accent / language identification classifier
|
||||||
|
#
|
||||||
|
# MIT licensed (AccentResult dataclass + mock). BSL 1.1 (real inference).
|
||||||
|
# Gated by CF_VOICE_ACCENT=1 — off by default (GPU cost + privacy sensitivity).
|
||||||
|
#
|
||||||
|
# Accent alone is not high-risk, but combined with birdsong or a quiet rural
|
||||||
|
# background it becomes location-identifying. The privacy scorer accounts for
|
||||||
|
# this compound signal.
|
||||||
|
#
|
||||||
|
# Real backend: facebook/mms-lid-126 for language detection, wav2vec2 accent
|
||||||
|
# fine-tune for region. Lazy-loaded to keep startup fast.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AccentResult:
    """
    Result of language + regional-accent classification for the primary speaker.

    Attributes:
        language: BCP-47 language tag (e.g. "en", "fr", "zh").
        region: cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other").
        confidence: classifier confidence, a float in [0, 1].
    """
    language: str
    region: str
    confidence: float
|
||||||
|
|
||||||
|
|
||||||
|
class MockAccentClassifier:
    """
    Deterministic stand-in accent classifier for development and CI.

    Always yields the same AccentResult, so the privacy scorer and other
    consumers can exercise their full code paths without loading a model.
    """

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """Return a fixed en/en_gb result regardless of the audio passed in."""
        return AccentResult(language="en", region="en_gb", confidence=0.72)
|
||||||
|
|
||||||
|
|
||||||
|
class AccentClassifier:
    """
    Real accent / language classifier.

    BSL 1.1 — requires [inference] extras.

    Language detection: facebook/mms-lid-126 (126 languages, MIT licensed).
    Accent region: maps language tag to a regional ACCENT_LABEL.

    VRAM: ~500 MB on CUDA.
    """

    _LANG_MODEL_ID = "facebook/mms-lid-126"

    def __init__(self) -> None:
        """Load the MMS language-ID model onto CUDA when available, else CPU."""
        try:
            from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
        except ImportError as exc:
            raise ImportError(
                "transformers is required for accent classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        import torch

        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading language ID model %s on %s", self._LANG_MODEL_ID, self._device)
        self._extractor = AutoFeatureExtractor.from_pretrained(self._LANG_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(
            self._LANG_MODEL_ID
        ).to(self._device)
        self._model.eval()

    def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
        """
        Identify the language of one audio window and map it to a region label.

        Args:
            audio: float32 PCM samples, mono 16 kHz, as raw bytes or a list.

        Returns:
            AccentResult with language tag, region label and softmax
            confidence, or None when the window is shorter than ~100 ms.
        """
        import numpy as np
        import torch

        samples = (
            np.frombuffer(audio, dtype=np.float32)
            if isinstance(audio, bytes)
            else np.asarray(audio, dtype=np.float32)
        )

        if len(samples) < 1600:  # need at least 100ms at 16kHz
            return None

        features = self._extractor(
            samples, sampling_rate=16_000, return_tensors="pt", padding=True
        )
        features = {name: tensor.to(self._device) for name, tensor in features.items()}

        with torch.no_grad():
            scores = torch.softmax(self._model(**features).logits, dim=-1)[0]

        best_idx = int(scores.argmax())
        language = self._model.config.id2label.get(best_idx, "other")
        return AccentResult(
            language=language,
            region=_lang_to_region(language),
            confidence=float(scores[best_idx]),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _lang_to_region(lang: str) -> str:
|
||||||
|
"""Map a BCP-47 / ISO 639-3 language tag to a cf-voice ACCENT_LABEL."""
|
||||||
|
_MAP: dict[str, str] = {
|
||||||
|
"eng": "en_us", # MMS uses ISO 639-3; sub-regional accent needs fine-tune
|
||||||
|
"fra": "fr",
|
||||||
|
"spa": "es",
|
||||||
|
"deu": "de",
|
||||||
|
"zho": "zh",
|
||||||
|
"jpn": "ja",
|
||||||
|
"en": "en_us",
|
||||||
|
"en-GB": "en_gb",
|
||||||
|
"en-AU": "en_au",
|
||||||
|
"en-CA": "en_ca",
|
||||||
|
"en-IN": "en_in",
|
||||||
|
"fr": "fr",
|
||||||
|
"de": "de",
|
||||||
|
"es": "es",
|
||||||
|
"zh": "zh",
|
||||||
|
"ja": "ja",
|
||||||
|
}
|
||||||
|
return _MAP.get(lang, "other")
|
||||||
|
|
||||||
|
|
||||||
|
def make_accent_classifier(
    mock: bool | None = None,
) -> "MockAccentClassifier | AccentClassifier | None":
    """
    Factory: return an accent classifier, or None when the feature is off.

    Resolution order:
      1. CF_VOICE_ACCENT != "1" → None (gated off by default: GPU cost +
         privacy sensitivity).
      2. mock=True or CF_VOICE_MOCK=1 → MockAccentClassifier.
      3. Otherwise the real AccentClassifier; any load failure (missing
         [inference] extras, model download error, CUDA issues) degrades to
         the mock with a warning instead of crashing the caller.

    Callers must check for None before invoking classify().
    """
    enabled = os.environ.get("CF_VOICE_ACCENT", "") == "1"
    if not enabled:
        return None

    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockAccentClassifier()

    try:
        return AccentClassifier()
    except Exception as exc:  # ImportError is an Exception — one clause suffices
        logger.warning("AccentClassifier unavailable (%s) — using mock", exc)
        return MockAccentClassifier()
|
||||||
366
cf_voice/acoustic.py
Normal file
366
cf_voice/acoustic.py
Normal file
|
|
@ -0,0 +1,366 @@
|
||||||
|
# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier
|
||||||
|
#
|
||||||
|
# MIT licensed (Protocol + mock). BSL 1.1 (real AST inference).
|
||||||
|
# Requires [inference] extras for real mode.
|
||||||
|
#
|
||||||
|
# This module is the AMD (answering machine detection) backbone for Osprey.
|
||||||
|
# It runs in parallel with the STT pipeline — it never processes words,
|
||||||
|
# only acoustic features (pitch, timbre, background, DTMF tones, ringback).
|
||||||
|
#
|
||||||
|
# Real backend: ASTAcousticBackend (below) replaces the earlier YAMNet stub.
# MockAcousticBackend emits a plausible call-lifecycle sequence for dev/CI.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import AsyncIterator, Protocol, Sequence, runtime_checkable
|
||||||
|
|
||||||
|
from cf_voice.events import AudioEvent, QUEUE_LABELS, SPEAKER_LABELS, ENVIRON_LABELS, SCENE_LABELS
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_SAMPLE_RATE = 16_000
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AcousticResult:
    """One analysis window's worth of AudioEvents (any field may be None)."""
    # Highest-confidence event per category, or None when nothing in that
    # category was detected / cleared its threshold.
    queue: AudioEvent | None
    speaker: AudioEvent | None
    environ: AudioEvent | None
    scene: AudioEvent | None
    # Window timestamp in seconds (caller-defined epoch).
    timestamp: float
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class AcousticBackend(Protocol):
    """
    Structural interface for acoustic event classifiers.

    Implementations take one PCM float32 buffer (mono, 16kHz) covering a
    single ~2s analysis window and return an AcousticResult. The call is
    synchronous; async callers should dispatch it to a thread pool.
    """

    def classify_window(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AcousticResult:
        ...
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class SceneBackend(Protocol):
    """
    Structural interface for dedicated acoustic scene classifiers.

    Kept separate from AcousticBackend so a specialised scene model (e.g. an
    AudioSet acoustic-scene subset) can be swapped in later without touching
    the telephony event classifier.
    """

    def classify_scene(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AudioEvent | None:
        ...
|
||||||
|
|
||||||
|
|
||||||
|
# ── Call lifecycle for mock mode ──────────────────────────────────────────────
# Approximates what a real outbound call looks like acoustically.
# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center
# Each dict is one phase: the queue/speaker/environ/scene labels the mock
# backend emits while in that phase, plus a "dur" (min_s, max_s) range from
# which MockAcousticBackend draws the phase's duration uniformly at random.
_MOCK_LIFECYCLE: list[dict] = [
    # (min_s, max_s): how long to stay in this phase
    {"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)},
    {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)},
    {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)},
    {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)},
    {"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
    {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
    {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)},
    # AMD moment: background_shift is the primary signal
    {"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)},
    {"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)},
]
|
||||||
|
|
||||||
|
|
||||||
|
class MockAcousticBackend:
    """
    Synthetic acoustic classifier for development and CI.

    Walks the _MOCK_LIFECYCLE phase table on a wall-clock schedule so that
    Osprey's IVR state machine can be tested without real telephony. The AMD
    signal (background_shift → human_single) appears at the correct position
    in the sequence.

    Usage:
        backend = MockAcousticBackend(seed=42)
        result = backend.classify_window(b"", timestamp=4.5)
        print(result.environ.label)  # → "hold_music", "background_shift", etc.
    """

    def __init__(self, seed: int | None = None) -> None:
        # Seeded RNG keeps phase durations and confidences reproducible in CI.
        self._rng = random.Random(seed)
        self._phase_idx = 0
        self._phase_start = time.monotonic()
        self._phase_dur = self._draw_phase_dur(0)

    def _draw_phase_dur(self, idx: int) -> float:
        """Draw a uniform duration (seconds) for lifecycle phase *idx*."""
        bounds = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"]
        return self._rng.uniform(*bounds)

    def _current_phase(self) -> dict:
        """Return the active phase, advancing first if the current one expired."""
        now = time.monotonic()
        if now - self._phase_start >= self._phase_dur:
            self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE)
            self._phase_start = now
            self._phase_dur = self._draw_phase_dur(self._phase_idx)
        return _MOCK_LIFECYCLE[self._phase_idx]

    def _make_event(
        self,
        event_type: str,
        label: str,
        timestamp: float,
    ) -> AudioEvent:
        """Wrap a phase label in an AudioEvent with a plausible confidence."""
        return AudioEvent(
            timestamp=timestamp,
            event_type=event_type,  # type: ignore[arg-type]
            label=label,
            confidence=self._rng.uniform(0.72, 0.97),
        )

    def classify_window(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AcousticResult:
        """Ignore *audio*; emit events for whichever lifecycle phase is active."""
        phase = self._current_phase()
        # Build events in the fixed category order so the RNG draw sequence
        # is stable for a given seed.
        events = {
            kind: self._make_event(kind, phase[kind], timestamp)
            for kind in ("queue", "speaker", "environ", "scene")
        }
        return AcousticResult(timestamp=timestamp, **events)
|
||||||
|
|
||||||
|
|
||||||
|
# ── AST acoustic backend (BSL 1.1) ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class ASTAcousticBackend:
    """
    Audio Spectrogram Transformer acoustic event classifier.

    BSL 1.1 — requires [inference] extras.

    Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to
    classify queue state, speaker type, background environment, and acoustic
    scene from a single forward pass. The top-20 predictions (_TOP_K) are
    scanned; the highest-confidence match per event category is emitted,
    subject to the per-category _MIN_CONFIDENCE floor.

    Model: MIT/ast-finetuned-audioset-10-10-0.4593
    VRAM: ~300 MB on CUDA (fp32)
    Input: float32 16kHz mono audio (any length; feature extractor pads/truncates)

    Replaces the YAMNet stub. Synchronous — run from a thread pool executor
    when called from async code.
    """

    _MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
    _SAMPLE_RATE = 16_000
    _TOP_K = 20  # scan more classes — many relevant ones are in the 10-20 range

    # Minimum confidence below which an event is suppressed even if it's the
    # top match in its category.
    _MIN_CONFIDENCE: dict[str, float] = {
        "queue": 0.10,
        "speaker": 0.08,
        "environ": 0.12,
        "scene": 0.08,  # scenes fire reliably — lower bar is fine
    }

    # AudioSet class name → (event_type, cf-voice label).
    # Top-K predictions are scanned; highest confidence per category wins.
    # "call_center" requires dedicated call-centre acoustics, not generic indoor.
    # NOTE: every AudioSet class name may appear only ONCE here — Python dict
    # literals keep the last entry, silently losing the earlier mapping.
    # "Music" previously collided (queue + environ) and "Crowd" collided
    # (speaker + environ). Fixed: "Music" maps to environ and the specific
    # "Musical instrument" parent covers hold_music; "Crowd" maps to
    # speaker/human_multi and "Chatter" covers environ/crowd_chatter.
    _LABEL_MAP: dict[str, tuple[str, str]] = {
        # ── Queue / call-state labels ──────────────────────────────────────────
        "Ringtone": ("queue", "ringback"),
        "Telephone bell ringing": ("queue", "ringback"),
        "Busy signal": ("queue", "busy"),
        "Dial tone": ("queue", "dtmf_tone"),
        "DTMF": ("queue", "dtmf_tone"),
        "Silence": ("queue", "silence"),
        # ── Speaker type labels ────────────────────────────────────────────────
        "Speech": ("speaker", "human_single"),
        "Male speech, man speaking": ("speaker", "human_single"),
        "Female speech, woman speaking": ("speaker", "human_single"),
        "Child speech, kid speaking": ("speaker", "human_single"),
        "Crowd": ("speaker", "human_multi"),
        "Hubbub, speech noise, speech babble": ("speaker", "human_multi"),
        "Laughter": ("speaker", "human_multi"),
        "Chuckle, chortle": ("speaker", "human_multi"),
        "Speech synthesizer": ("speaker", "ivr_synth"),
        # ── Environmental labels ───────────────────────────────────────────────
        # Telephony — requires specific call-centre acoustics, not generic indoor
        "Telephone": ("environ", "call_center"),
        "Telephone dialing, DTMF": ("environ", "call_center"),
        "Reverberation": ("environ", "background_shift"),
        "Echo": ("environ", "background_shift"),
        "Background noise": ("environ", "noise_floor_change"),
        "Noise": ("environ", "noise_floor_change"),
        "White noise": ("environ", "noise_floor_change"),
        "Pink noise": ("environ", "noise_floor_change"),
        "Static": ("environ", "noise_floor_change"),
        "Music": ("environ", "music"),
        # Nature
        "Bird": ("environ", "birdsong"),
        "Bird vocalization, bird call, bird song": ("environ", "birdsong"),
        "Chirp, tweet": ("environ", "birdsong"),
        "Wind": ("environ", "wind"),
        "Wind noise (microphone)": ("environ", "wind"),
        "Rain": ("environ", "rain"),
        "Rain on surface": ("environ", "rain"),
        "Water": ("environ", "water"),
        "Stream": ("environ", "water"),
        # Urban
        "Traffic noise, roadway noise": ("environ", "traffic"),
        "Vehicle": ("environ", "traffic"),
        "Chatter": ("environ", "crowd_chatter"),
        "Construction": ("environ", "construction"),
        "Drill": ("environ", "construction"),
        # Indoor
        "Air conditioning": ("environ", "hvac"),
        "Mechanical fan": ("environ", "hvac"),
        "Computer keyboard": ("environ", "keyboard_typing"),
        "Typing": ("environ", "keyboard_typing"),
        "Restaurant": ("environ", "restaurant"),
        "Dishes, pots, and pans": ("environ", "restaurant"),
        # ── Acoustic scene labels ──────────────────────────────────────────────
        # "Inside, small/large room" moved from environ to scene — they correctly
        # describe the acoustic scene but are NOT specific enough for call_center.
        "Inside, small room": ("scene", "indoor_quiet"),
        "Inside, large room or hall": ("scene", "indoor_crowd"),
        "Outside, urban or manmade": ("scene", "outdoor_urban"),
        "Field recording": ("scene", "outdoor_nature"),
        "Rail transport": ("scene", "public_transit"),
        "Bus": ("scene", "public_transit"),
        "Train": ("scene", "public_transit"),
        "Car": ("scene", "vehicle"),
        "Truck": ("scene", "vehicle"),
        "Motorcycle": ("scene", "vehicle"),
        # Music in the queue sense — "Musical instrument" is more specific
        # than the ambiguous top-level "Music" class
        "Musical instrument": ("queue", "hold_music"),
        "Piano": ("queue", "hold_music"),
        "Guitar": ("queue", "hold_music"),
    }

    def __init__(self) -> None:
        """Load the AST model onto CUDA when available, else CPU."""
        try:
            from transformers import ASTFeatureExtractor, ASTForAudioClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for AST acoustic classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        import torch

        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device)
        self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID)
        self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to(
            self._device
        )
        self._model.eval()

    def classify_window(
        self,
        audio: "list[float] | bytes",
        timestamp: float = 0.0,
    ) -> AcousticResult:
        """
        Classify one audio window into per-category AudioEvents.

        Args:
            audio: float32 PCM samples, mono 16 kHz, as raw bytes or a list.
            timestamp: seconds offset attached to every emitted event.

        Returns:
            AcousticResult with queue/speaker/environ/scene events; a field
            is None when no mapped AudioSet class cleared that category's
            confidence floor in the top-K predictions.
        """
        import numpy as np
        import torch

        if isinstance(audio, bytes):
            audio_np = np.frombuffer(audio, dtype=np.float32)
        else:
            audio_np = np.asarray(audio, dtype=np.float32)

        # Empty window → nothing to classify; preserve the timestamp.
        if len(audio_np) == 0:
            return AcousticResult(queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp)

        inputs = self._extractor(
            audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt"
        )
        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)[0]
        id2label = self._model.config.id2label

        top_k = min(self._TOP_K, len(probs))
        top_indices = probs.topk(top_k).indices.tolist()
        predictions = [(id2label[i], float(probs[i])) for i in top_indices]

        # Take highest-confidence match per category
        best: dict[str, tuple[str, float]] = {}  # event_type → (label, conf)
        for ast_label, conf in predictions:
            mapping = self._LABEL_MAP.get(ast_label)
            if mapping is None:
                continue
            etype, cf_label = mapping
            if etype not in best or conf > best[etype][1]:
                best[etype] = (cf_label, conf)

        def _make_event(etype: str, label: str, conf: float) -> AudioEvent:
            return AudioEvent(
                timestamp=timestamp,
                event_type=etype,  # type: ignore[arg-type]
                label=label,
                confidence=round(conf, 4),
            )

        def _above_threshold(etype: str) -> bool:
            if etype not in best:
                return False
            _, conf = best[etype]
            return conf >= self._MIN_CONFIDENCE.get(etype, 0.10)

        return AcousticResult(
            queue=_make_event("queue", *best["queue"]) if _above_threshold("queue") else None,
            speaker=_make_event("speaker", *best["speaker"]) if _above_threshold("speaker") else None,
            environ=_make_event("environ", *best["environ"]) if _above_threshold("environ") else None,
            scene=_make_event("scene", *best["scene"]) if _above_threshold("scene") else None,
            timestamp=timestamp,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend":
    """
    Factory: return an AcousticBackend for the current environment.

    mock=True or CF_VOICE_MOCK=1 → MockAcousticBackend
    Otherwise → ASTAcousticBackend (falls back to mock on any load error)

    Parameters
    ----------
    mock : explicit override; None defers to the CF_VOICE_MOCK env var.
    """
    import os

    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockAcousticBackend()
    try:
        return ASTAcousticBackend()
    except Exception as exc:  # noqa: BLE001 — deliberate best-effort fallback
        # FIX: was `except (ImportError, Exception)` — the tuple was redundant
        # (Exception already covers ImportError) and read as if the catch were
        # narrower than it is. The broad catch itself is intentional: any
        # model-load failure (missing extras, no network, bad weights) must
        # degrade to the mock backend rather than crash the service.
        logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc)
        return MockAcousticBackend()
||||||
197
cf_voice/app.py
Normal file
197
cf_voice/app.py
Normal file
|
|
@ -0,0 +1,197 @@
|
||||||
|
"""
|
||||||
|
cf-voice FastAPI service — managed by cf-orch.
|
||||||
|
|
||||||
|
Tone/affect classification sidecar for Linnet and any product that needs
|
||||||
|
real-time audio context annotation. Wraps ContextClassifier so it runs as an
|
||||||
|
independent managed process rather than embedded in the consumer's process.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /health → {"status": "ok", "mode": "mock"|"real"}
|
||||||
|
POST /classify → ClassifyResponse
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m cf_voice.app --port 8007 --gpu-id 0
|
||||||
|
|
||||||
|
Mock mode (no GPU, no audio hardware required):
|
||||||
|
CF_VOICE_MOCK=1 python -m cf_voice.app --port 8007
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from cf_voice.context import ContextClassifier, model_status
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_classifier: ContextClassifier | None = None
|
||||||
|
_mock_mode: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
# ── Request / response models ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class ClassifyRequest(BaseModel):
    """Request body for POST /classify — one audio window plus session options."""

    audio_chunk: str | None = None  # base64-encoded PCM int16 mono 16kHz; None in mock mode
    timestamp: float = 0.0  # session-relative seconds since capture started
    elcor: bool | None = None  # override Elcor subtext mode; None = use stored user preference
    prior_frames: int | None = None  # rolling context window size; None = use stored user preference
    session_id: str = ""  # caller-assigned correlation ID for the session
    language: str | None = None  # BCP-47 hint for Whisper ("en", "es", …); None = auto-detect
    num_speakers: int | None = None  # pyannote hint: None = auto; 1–8 = fixed min+max
||||||
|
|
||||||
|
|
||||||
|
class AudioEventOut(BaseModel):
    """One classified audio event in a /classify response.

    The fields from `affect` onward are populated only when the source event
    is a ToneEvent; they remain None (or empty) for queue/speaker/environ
    events — see the response-building loop in the /classify route.
    """

    event_type: str  # "tone" | "queue" | "speaker" | "environ" | ...
    label: str
    confidence: float  # rounded to 4 decimal places by the route
    timestamp: float
    speaker_id: str = "speaker_a"
    subtext: str | None = None
    affect: str | None = None
    shift_magnitude: float | None = None
    shift_direction: str | None = None
    prosody_flags: list[str] = []
    # Dimensional emotion (audeering model) — None when classifier disabled
    valence: float | None = None
    arousal: float | None = None
    dominance: float | None = None
    # Prosodic signals (openSMILE) — None when extractor disabled
    sarcasm_risk: float | None = None
    flat_f0_score: float | None = None
    # Trajectory signals — None until BASELINE_MIN frames buffered per speaker
    arousal_delta: float | None = None
    valence_delta: float | None = None
    trend: str | None = None
    # Coherence signals (SER vs VAD)
    coherence_score: float | None = None
    suppression_flag: bool | None = None
    reframe_type: str | None = None
    affect_divergence: float | None = None
||||||
|
|
||||||
|
|
||||||
|
class ClassifyResponse(BaseModel):
    """Response body for POST /classify: zero or more events for the window."""

    events: list[AudioEventOut]
||||||
|
|
||||||
|
|
||||||
|
# ── App factory ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def create_app(gpu_id: int = 0, mock: bool = False) -> FastAPI:
    """Construct the cf-voice FastAPI application.

    Parameters
    ----------
    gpu_id : CUDA device index pinned via CUDA_VISIBLE_DEVICES (real mode only;
        setdefault, so an explicit environment setting wins).
    mock : force mock mode; CF_VOICE_MOCK=1 in the environment also enables it.

    Side effects: initialises the module-level ``_classifier`` / ``_mock_mode``
    globals used by the route handlers.
    """
    global _classifier, _mock_mode

    # Signal GPU to the inference backends (wav2vec2 loads via transformers pipeline)
    if not mock:
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))

    _mock_mode = mock or os.environ.get("CF_VOICE_MOCK", "") == "1"
    _classifier = ContextClassifier.mock() if _mock_mode else ContextClassifier.from_env()
    logger.info("cf-voice ready: mode=%s", "mock" if _mock_mode else "real")

    app = FastAPI(title="cf-voice", version="0.1.0")

    @app.on_event("startup")
    async def _startup_prewarm() -> None:
        """Pre-warm all configured models so downloads happen at startup, not
        on the first classify call (which has a hard timeout)."""
        if _classifier is not None:
            import asyncio as _asyncio

            # FIX: keep a strong reference to the task. The event loop holds
            # only weak references to tasks, so an unreferenced create_task()
            # result can be garbage-collected before prewarm completes.
            app.state.prewarm_task = _asyncio.create_task(_classifier.prewarm())

    @app.get("/health")
    def health() -> dict:
        """Liveness probe plus model-status snapshot and config warnings."""
        result: dict = {
            "status": "ok",
            "mode": "mock" if _mock_mode else "real",
            "models": dict(model_status),
        }
        # Surface misconfigured-but-silent diarizer so Linnet can warn the user.
        # Check env vars only — no model loading needed at health-check time.
        warnings: list[str] = []
        if os.environ.get("CF_VOICE_DIARIZE", "0") == "1":
            token = os.environ.get("HF_TOKEN", "").strip()
            if not token:
                warnings.append(
                    "Diarization is enabled (CF_VOICE_DIARIZE=1) but HF_TOKEN is not set. "
                    "Speaker identity badges will not appear. "
                    "Set HF_TOKEN in your .env and accept pyannote model terms at huggingface.co."
                )
        if warnings:
            result["warnings"] = warnings
        return result

    @app.post("/classify")
    async def classify(req: ClassifyRequest) -> ClassifyResponse:
        """Classify one audio window; returns every event the pipeline emits."""
        if _classifier is None:
            raise HTTPException(503, detail="classifier not initialised")
        try:
            events = await _classifier.classify_chunk_async(
                audio_b64=req.audio_chunk,
                timestamp=req.timestamp,
                prior_frames=req.prior_frames,
                elcor=req.elcor,
                session_id=req.session_id,
                language=req.language,
                num_speakers=req.num_speakers,
            )
        except NotImplementedError as exc:
            # FIX: chain the cause so server logs keep the original traceback.
            raise HTTPException(501, detail=str(exc)) from exc

        # Local import avoids a module-level cycle with cf_voice.events.
        from cf_voice.events import ToneEvent

        out: list[AudioEventOut] = []
        for e in events:
            # Tone-only fields are suppressed for non-tone events even if the
            # attribute happens to exist on the object.
            is_tone = isinstance(e, ToneEvent)
            out.append(AudioEventOut(
                event_type=e.event_type,
                label=e.label,
                confidence=round(e.confidence, 4),
                timestamp=e.timestamp,
                speaker_id=getattr(e, "speaker_id", "speaker_a") or "speaker_a",
                subtext=getattr(e, "subtext", None),
                affect=getattr(e, "affect", None) if is_tone else None,
                shift_magnitude=getattr(e, "shift_magnitude", None) if is_tone else None,
                shift_direction=getattr(e, "shift_direction", None) if is_tone else None,
                prosody_flags=getattr(e, "prosody_flags", []) if is_tone else [],
                valence=getattr(e, "valence", None) if is_tone else None,
                arousal=getattr(e, "arousal", None) if is_tone else None,
                dominance=getattr(e, "dominance", None) if is_tone else None,
                sarcasm_risk=getattr(e, "sarcasm_risk", None) if is_tone else None,
                flat_f0_score=getattr(e, "flat_f0_score", None) if is_tone else None,
                arousal_delta=getattr(e, "arousal_delta", None) if is_tone else None,
                valence_delta=getattr(e, "valence_delta", None) if is_tone else None,
                trend=getattr(e, "trend", None) if is_tone else None,
                coherence_score=getattr(e, "coherence_score", None) if is_tone else None,
                suppression_flag=getattr(e, "suppression_flag", None) if is_tone else None,
                reframe_type=getattr(e, "reframe_type", None) if is_tone else None,
                affect_divergence=getattr(e, "affect_divergence", None) if is_tone else None,
            ))
        return ClassifyResponse(events=out)

    return app
||||||
|
|
||||||
|
|
||||||
|
# ── CLI entrypoint ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_args() -> argparse.Namespace:
    """Parse command-line options for the cf-voice server."""
    cli = argparse.ArgumentParser(description="cf-voice tone classification server")
    cli.add_argument("--port", type=int, default=8007)
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--gpu-id", type=int, default=0)
    cli.add_argument(
        "--mock",
        action="store_true",
        help="Run in mock mode (no GPU, no audio hardware needed)",
    )
    return cli.parse_args()
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Configure root logging before building the app so model-load progress
    # emitted during create_app() is visible on stdout.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(name)s — %(message)s")
    args = _parse_args()
    app = create_app(gpu_id=args.gpu_id, mock=args.mock)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
||||||
|
|
@ -82,13 +82,21 @@ class ToneClassifier:
|
||||||
Tone/affect classifier: wav2vec2 SER + librosa prosody.
|
Tone/affect classifier: wav2vec2 SER + librosa prosody.
|
||||||
|
|
||||||
Loads the model lazily on first call to avoid import-time GPU allocation.
|
Loads the model lazily on first call to avoid import-time GPU allocation.
|
||||||
Thread-safe for concurrent classify() calls — the pipeline is stateless
|
Thread-safe for concurrent classify() calls — the model is stateless
|
||||||
per-call; session state lives in the caller (ContextClassifier).
|
per-call; session state lives in the caller (ContextClassifier).
|
||||||
|
|
||||||
|
Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
|
||||||
|
rather than hf_pipeline to avoid torchcodec audio backend initialization.
|
||||||
|
torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x
|
||||||
|
systems. Calling the model directly bypasses the pipeline's audio backend
|
||||||
|
selection entirely since we already have float32 at 16kHz.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
    """Create a tone classifier; models are loaded lazily on first use."""
    # Confidence threshold — applied in classify() (not fully visible in
    # this view; confirm exact semantics there).
    self._threshold = threshold
    self._feature_extractor = None  # lazy-loaded
    self._model = None  # lazy-loaded
    self._device: str = "cpu"  # switched to "cuda:0" by _load_pipeline when CUDA is available
@classmethod
|
@classmethod
|
||||||
def from_env(cls) -> "ToneClassifier":
|
def from_env(cls) -> "ToneClassifier":
|
||||||
|
|
@ -96,23 +104,41 @@ class ToneClassifier:
|
||||||
return cls(threshold=threshold)
|
return cls(threshold=threshold)
|
||||||
|
|
||||||
def _load_pipeline(self) -> None:
    """Lazily load the SER feature extractor + model. Idempotent.

    Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
    rather than hf_pipeline to avoid torchcodec audio backend initialization
    (torchcodec 0.11.0 requires libnvrtc.so.13, absent on CUDA 12.x systems).

    Raises
    ------
    ImportError : when transformers is not installed (inference extra missing).
    """
    if self._model is not None:
        return  # already loaded
    try:
        from transformers import (
            AutoFeatureExtractor,
            AutoModelForAudioClassification,
        )
    except ImportError as exc:
        raise ImportError(
            "transformers is required for tone classification. "
            "Install with: pip install cf-voice[inference]"
        ) from exc

    import torch

    if _cuda_available():
        self._device = "cuda:0"
        # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000.
        # Only supported on CUDA — CPU must stay float32.
        torch_dtype = torch.float16
    else:
        self._device = "cpu"
        torch_dtype = torch.float32

    logger.info(
        "Loading SER model %s on device=%s dtype=%s",
        _SER_MODEL_ID, self._device, torch_dtype,
    )
    self._feature_extractor = AutoFeatureExtractor.from_pretrained(_SER_MODEL_ID)
    self._model = AutoModelForAudioClassification.from_pretrained(
        _SER_MODEL_ID,
        torch_dtype=torch_dtype,
    ).to(self._device)
    # FIX: `.eval()` is the idiomatic spelling of `.train(False)` (it is the
    # same call under the hood) and matches ASTAcousticBackend, which already
    # uses self._model.eval(). Disables dropout / batch-norm updates.
    self._model.eval()
||||||
def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
|
def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
|
||||||
"""
|
"""
|
||||||
|
|
@ -121,13 +147,33 @@ class ToneClassifier:
|
||||||
transcript is used as a weak signal for ambiguous cases (e.g. words
|
transcript is used as a weak signal for ambiguous cases (e.g. words
|
||||||
like "unfortunately" bias toward apologetic even on a neutral voice).
|
like "unfortunately" bias toward apologetic even on a neutral voice).
|
||||||
"""
|
"""
|
||||||
|
import torch
|
||||||
|
|
||||||
self._load_pipeline()
|
self._load_pipeline()
|
||||||
|
|
||||||
# Ensure the model sees float32 at the right rate
|
# Ensure the model sees float32 at the right rate
|
||||||
assert audio_float32.dtype == np.float32, "audio must be float32"
|
assert audio_float32.dtype == np.float32, "audio must be float32"
|
||||||
|
|
||||||
# Run SER
|
# Run SER — call feature extractor + model directly to bypass the
|
||||||
preds = self._pipeline({"raw": audio_float32, "sampling_rate": _SAMPLE_RATE})
|
# hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency).
|
||||||
|
inputs = self._feature_extractor(
|
||||||
|
audio_float32,
|
||||||
|
sampling_rate=_SAMPLE_RATE,
|
||||||
|
return_tensors="pt",
|
||||||
|
)
|
||||||
|
inputs = {k: v.to(self._device) for k, v in inputs.items()}
|
||||||
|
if self._model.dtype == torch.float16:
|
||||||
|
inputs = {k: v.to(torch.float16) for k, v in inputs.items()}
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
logits = self._model(**inputs).logits
|
||||||
|
probs = torch.softmax(logits, dim=-1)[0]
|
||||||
|
id2label = self._model.config.id2label
|
||||||
|
preds = [
|
||||||
|
{"label": id2label[i], "score": float(probs[i])}
|
||||||
|
for i in range(len(probs))
|
||||||
|
]
|
||||||
|
|
||||||
best = max(preds, key=lambda p: p["score"])
|
best = max(preds, key=lambda p: p["score"])
|
||||||
emotion = best["label"].lower()
|
emotion = best["label"].lower()
|
||||||
confidence = float(best["score"])
|
confidence = float(best["score"])
|
||||||
|
|
@ -158,7 +204,7 @@ class ToneClassifier:
|
||||||
self, audio_float32: np.ndarray, transcript: str = ""
|
self, audio_float32: np.ndarray, transcript: str = ""
|
||||||
) -> ToneResult:
|
) -> ToneResult:
|
||||||
"""classify() without blocking the event loop."""
|
"""classify() without blocking the event loop."""
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_running_loop()
|
||||||
fn = partial(self.classify, audio_float32, transcript)
|
fn = partial(self.classify, audio_float32, transcript)
|
||||||
return await loop.run_in_executor(None, fn)
|
return await loop.run_in_executor(None, fn)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,99 +1,289 @@
|
||||||
# cf_voice/context.py — tone classification and context enrichment
|
# cf_voice/context.py — parallel audio context classifier (orchestrator)
|
||||||
#
|
#
|
||||||
# BSL 1.1 when real inference models are integrated.
|
# BSL 1.1 when real inference models are integrated.
|
||||||
# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
|
# Mock mode: MIT licensed (no real inference).
|
||||||
#
|
#
|
||||||
# Real implementation (Notation v0.1.x) will:
|
# Runs three classifiers in parallel against the same audio window:
|
||||||
# - Run YAMNet acoustic event detection on the audio buffer
|
# 1. Tone/affect (classify.py) — wav2vec2 SER + librosa prosody
|
||||||
# - Run wav2vec2-based SER (speech emotion recognition)
|
# 2. Queue/environ (acoustic.py) — YAMNet acoustic event detection
|
||||||
# - Run librosa prosody extraction (pitch, energy, rate)
|
# 3. Speaker type/VAD (diarize.py) — pyannote.audio (Navigation v0.2.x)
|
||||||
# - Combine into enriched VoiceFrame label + confidence
|
#
|
||||||
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
|
# Combined output is a list[AudioEvent] per window, merged into VoiceFrame
|
||||||
|
# for the streaming path.
|
||||||
|
#
|
||||||
|
# Elcor mode reads from cf-core preferences (cf_voice.prefs) so that the
|
||||||
|
# annotation format is user-configurable without per-request flags.
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import AsyncIterator
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
from cf_voice.acoustic import MockAcousticBackend, make_acoustic
|
||||||
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
|
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
|
||||||
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
|
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
|
||||||
from cf_voice.models import VoiceFrame
|
from cf_voice.models import VoiceFrame
|
||||||
|
from cf_voice.prefs import get_elcor_prior_frames, is_elcor_enabled
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ── Per-model download/load status registry ───────────────────────────────────
# Written by _load_* methods; read by the /health endpoint in app.py.
# Values: "disabled" | "loading" | "ready" | "error"
# Thread-safe: individual str assignment is atomic in CPython.
# Keys are model/classifier names (populated lazily as loaders run).
model_status: dict[str, str] = {}
||||||
|
|
||||||
|
|
||||||
|
# ── No-op coroutines for disabled/unavailable classifiers ─────────────────────
|
||||||
|
|
||||||
|
async def _noop_stt() -> None:
    """No-op stand-in awaited in place of STT when it is disabled/unavailable."""
    return None
||||||
|
|
||||||
|
|
||||||
|
async def _noop_diarize() -> list:
    """No-op stand-in awaited in place of diarization when it is disabled/unavailable."""
    empty: list = []
    return empty
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
class ContextClassifier:
|
class ContextClassifier:
|
||||||
"""
|
"""
|
||||||
High-level voice context classifier.
|
High-level voice context classifier.
|
||||||
|
|
||||||
Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
|
Wraps a VoiceIO source and runs three parallel classifiers on each audio
|
||||||
In stub mode the frames pass through unchanged — the enrichment pipeline
|
window: tone (SER), queue/environ (YAMNet), and speaker (pyannote).
|
||||||
(YAMNet + wav2vec2 + librosa) is filled in incrementally.
|
|
||||||
|
In mock mode all classifiers produce synthetic events — no GPU, microphone,
|
||||||
|
or HuggingFace token required.
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
classifier = ContextClassifier.from_env()
|
classifier = ContextClassifier.from_env()
|
||||||
async for frame in classifier.stream():
|
async for frame in classifier.stream():
|
||||||
print(frame.label, frame.confidence)
|
print(frame.label, frame.confidence)
|
||||||
|
|
||||||
|
For the full multi-class event list (queue + speaker + tone):
|
||||||
|
events = classifier.classify_chunk(audio_b64, timestamp=4.5)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
    self,
    io: VoiceIO,
    user_id: str | None = None,
    store=None,
) -> None:
    """Wire up the classifier around a VoiceIO source.

    Parameters
    ----------
    io : audio source (MockVoiceIO forces mock acoustic backend).
    user_id : forwarded to cf-core preference lookups (Elcor mode etc.).
    store : preference store handle — opaque here; passed through to prefs
        helpers (shape not visible in this view).
    """
    self._io = io
    self._user_id = user_id
    self._store = store
    # Acoustic backend is mock whenever the IO source is mock OR the global
    # mock env flag is set — keeps CI free of model downloads.
    self._acoustic = make_acoustic(
        mock=isinstance(io, MockVoiceIO)
        or os.environ.get("CF_VOICE_MOCK", "") == "1"
    )
    # Lazy — loaded on first real classify call, then reused.
    self._tone: "ToneClassifier | None" = None
    # STT: loaded if faster-whisper is installed. Controlled by CF_VOICE_STT (default: 1).
    self._stt: "WhisperSTT | None" = None
    self._stt_loaded: bool = False  # False = not yet attempted
    # Diarizer: optional — requires HF_TOKEN and CF_VOICE_DIARIZE=1.
    self._diarizer: "Diarizer | None" = None
    self._diarizer_loaded: bool = False
    # Per-session speaker label tracker — maps pyannote IDs → "Speaker A/B/..."
    # Reset at session end (when the ContextClassifier is stopped).
    from cf_voice.diarize import SpeakerTracker
    self._speaker_tracker: SpeakerTracker = SpeakerTracker()
    # One-at-a-time GPU classify gate. All three models share the same GPU;
    # running them "in parallel" just serializes at the CUDA level while
    # filling the thread pool. Drop incoming frames when a classify is
    # already in flight — freshness beats completeness for real-time audio.
    self._classify_lock: asyncio.Lock = asyncio.Lock()
    # Dimensional classifier (audeering) — lazy, CF_VOICE_DIMENSIONAL=1
    self._dimensional: "DimensionalClassifier | None" = None
    self._dimensional_loaded: bool = False
    # Prosodic extractor (openSMILE) — lazy, CF_VOICE_PROSODY=1
    self._prosodic: "ProsodicExtractor | None" = None
    self._prosodic_loaded: bool = False
    # Per-speaker rolling dimensional buffers for trajectory/coherence signals.
    # Keys are speaker_id strings; values are deques of DimensionalResult.
    # Reset at session end alongside SpeakerTracker.
    from collections import deque as _deque
    from cf_voice.trajectory import BUFFER_WINDOW
    self._dim_buffer: dict[str, "_deque"] = {}
    self._last_ser_affect: dict[str, str] = {}
    self._buffer_window = BUFFER_WINDOW
    # Accent classifier — lazy, gated by CF_VOICE_ACCENT=1
    self._accent: "MockAccentClassifier | AccentClassifier | None" = None
    self._accent_loaded: bool = False
||||||
@classmethod
def from_env(
    cls,
    interval_s: float = 2.5,
    user_id: str | None = None,
    store=None,
) -> "ContextClassifier":
    """
    Create a ContextClassifier from environment.

    CF_VOICE_MOCK=1 activates full mock mode (no GPU, no audio hardware).
    If real audio hardware is unavailable (faster-whisper not installed),
    falls back to mock mode automatically.
    user_id + store are forwarded to cf-core preferences for Elcor/threshold
    lookups.
    """
    if os.environ.get("CF_VOICE_MOCK", "") == "1":
        return cls.mock(interval_s=interval_s, user_id=user_id, store=store)
    try:
        io = make_io(interval_s=interval_s)
    except (NotImplementedError, ImportError):
        # Real audio hardware or inference extras unavailable — fall back to
        # mock mode so the coordinator starts cleanly on headless nodes.
        return cls.mock(interval_s=interval_s, user_id=user_id, store=store)
    return cls(io=io, user_id=user_id, store=store)
||||||
@classmethod
def mock(
    cls,
    interval_s: float = 2.5,
    seed: int | None = None,
    user_id: str | None = None,
    store=None,
) -> "ContextClassifier":
    """Create a ContextClassifier backed by MockVoiceIO. Useful in tests.

    seed makes the mock RNG deterministic; user_id/store are still forwarded
    so preference lookups behave as in real mode.
    """
    return cls(
        io=MockVoiceIO(interval_s=interval_s, seed=seed),
        user_id=user_id,
        store=store,
    )
||||||
async def stream(self) -> AsyncIterator[VoiceFrame]:
    """
    Yield enriched VoiceFrames continuously.

    Stub: frames from the IO layer pass through unchanged.
    Real (Navigation v0.2.x): acoustic + diarization enrichment runs here.
    """
    # _enrich is defined elsewhere in this class (outside this view).
    async for frame in self._io.stream():
        yield self._enrich(frame)
||||||
async def stop(self) -> None:
    """Stop the IO source and clear per-session state (speaker labels,
    dimensional buffers, last-affect cache) so a reused instance starts
    the next session fresh."""
    await self._io.stop()
    self._speaker_tracker.reset()
    self._dim_buffer.clear()
    self._last_ser_affect.clear()
||||||
def classify_chunk(
    self,
    audio_b64: str | None = None,
    timestamp: float = 0.0,
    prior_frames: int | None = None,
    elcor: bool | None = None,
    session_id: str = "",
) -> list[AudioEvent]:
    """
    Classify a single audio window and return all AudioEvents.

    Returns a heterogeneous list containing zero or one of each:
    - ToneEvent (event_type="tone")
    - AudioEvent (event_type="queue")
    - AudioEvent (event_type="speaker")
    - AudioEvent (event_type="environ")

    This is the request-response path used by the cf-orch SSE endpoint.
    The streaming path (async generator) is for continuous consumers.

    audio_b64     Base64-encoded PCM int16 mono 16kHz bytes.
                  Pass None in mock mode (ignored).
    timestamp     Session-relative seconds since capture started.
    prior_frames  Rolling context window size for Elcor LLM.
                  Defaults to user preference (PREF_ELCOR_PRIOR_FRAMES).
    elcor         Override Elcor mode for this request.
                  None = read from user preference (PREF_ELCOR_MODE).
    session_id    Caller-assigned correlation ID for the session.
    """
    # Resolve None overrides against stored user preferences.
    use_elcor = elcor if elcor is not None else is_elcor_enabled(
        user_id=self._user_id, store=self._store
    )
    context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames(
        user_id=self._user_id, store=self._store
    )

    # Mock path: either the IO source is mock or the global env flag is set.
    if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
        return self._classify_mock(timestamp, context_frames, use_elcor, session_id)

    # Real mode with no audio supplied → nothing to classify.
    if not audio_b64:
        return []

    return self._classify_real(audio_b64, timestamp, use_elcor, session_id)
|
async def classify_chunk_async(
    self,
    audio_b64: str | None = None,
    timestamp: float = 0.0,
    prior_frames: int | None = None,
    elcor: bool | None = None,
    session_id: str = "",
    language: str | None = None,
    num_speakers: int | None = None,
) -> list[AudioEvent]:
    """
    Async variant of classify_chunk.

    Runs tone, STT, diarization, and acoustic classification in parallel
    using asyncio.gather(). Use this from async contexts (FastAPI routes)
    to get true concurrency across all four inference paths.

    language      BCP-47 hint for Whisper; None = auto-detect.
    num_speakers  pyannote speaker-count hint; None = auto.

    Returns [] when the frame is dropped (busy classifier or timeout) or
    when no audio is supplied in real mode.
    """
    # Resolve None overrides against stored user preferences.
    use_elcor = elcor if elcor is not None else is_elcor_enabled(
        user_id=self._user_id, store=self._store
    )
    context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames(
        user_id=self._user_id, store=self._store
    )

    if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
        return self._classify_mock(timestamp, context_frames, use_elcor, session_id)

    if not audio_b64:
        return []

    # Drop frame if a classify is already in flight — GPU models serialize
    # anyway, so queuing just adds latency without improving output.
    if self._classify_lock.locked():
        logger.debug("classify busy — dropping frame at t=%.2f", timestamp)
        return []

    async with self._classify_lock:
        # Diarization (pyannote) can take 3–8 s on first invocations even with GPU.
        # 25 s gives enough headroom without stalling the stream for too long.
        try:
            return await asyncio.wait_for(
                self._classify_real_async(audio_b64, timestamp, use_elcor, session_id, language, num_speakers),
                timeout=25.0,
            )
        except asyncio.TimeoutError:
            logger.warning("classify_real_async timed out at t=%.2f — dropping frame", timestamp)
            return []
|
def _classify_mock(
|
||||||
|
self,
|
||||||
|
timestamp: float,
|
||||||
|
prior_frames: int,
|
||||||
|
elcor: bool,
|
||||||
|
session_id: str,
|
||||||
|
) -> list[AudioEvent]:
|
||||||
|
"""
|
||||||
|
Synthetic multi-class event batch.
|
||||||
|
|
||||||
|
Tone event comes from the MockVoiceIO RNG (consistent seed behaviour).
|
||||||
|
Queue/speaker/environ come from MockAcousticBackend (call lifecycle simulation).
|
||||||
|
"""
|
||||||
rng = self._io._rng # type: ignore[attr-defined]
|
rng = self._io._rng # type: ignore[attr-defined]
|
||||||
import time as _time
|
|
||||||
label = rng.choice(self._io._labels) # type: ignore[attr-defined]
|
label = rng.choice(self._io._labels) # type: ignore[attr-defined]
|
||||||
shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
|
shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
|
||||||
|
|
||||||
frame = VoiceFrame(
|
frame = VoiceFrame(
|
||||||
label=label,
|
label=label,
|
||||||
confidence=rng.uniform(0.6, 0.97),
|
confidence=rng.uniform(0.6, 0.97),
|
||||||
|
|
@ -101,30 +291,54 @@ class ContextClassifier:
|
||||||
shift_magnitude=round(shift, 3),
|
shift_magnitude=round(shift, 3),
|
||||||
timestamp=timestamp,
|
timestamp=timestamp,
|
||||||
)
|
)
|
||||||
tone = tone_event_from_voice_frame(
|
tone: ToneEvent = tone_event_from_voice_frame(
|
||||||
frame_label=frame.label,
|
frame_label=frame.label,
|
||||||
frame_confidence=frame.confidence,
|
frame_confidence=frame.confidence,
|
||||||
shift_magnitude=frame.shift_magnitude,
|
shift_magnitude=frame.shift_magnitude,
|
||||||
timestamp=frame.timestamp,
|
timestamp=frame.timestamp,
|
||||||
elcor=elcor,
|
elcor=elcor,
|
||||||
)
|
)
|
||||||
return [tone]
|
tone.session_id = session_id
|
||||||
|
|
||||||
def _classify_chunk_real(
|
acoustic = self._acoustic.classify_window(b"", timestamp=timestamp)
|
||||||
self, audio_b64: str, timestamp: float, elcor: bool
|
|
||||||
|
events: list[AudioEvent] = [tone]
|
||||||
|
if acoustic.queue:
|
||||||
|
events.append(acoustic.queue)
|
||||||
|
if acoustic.speaker:
|
||||||
|
events.append(acoustic.speaker)
|
||||||
|
if acoustic.environ:
|
||||||
|
events.append(acoustic.environ)
|
||||||
|
if acoustic.scene:
|
||||||
|
events.append(acoustic.scene)
|
||||||
|
return events
|
||||||
|
|
||||||
|
def _classify_real(
|
||||||
|
self,
|
||||||
|
audio_b64: str,
|
||||||
|
timestamp: float,
|
||||||
|
elcor: bool,
|
||||||
|
session_id: str,
|
||||||
) -> list[AudioEvent]:
|
) -> list[AudioEvent]:
|
||||||
"""Real inference path — used when CF_VOICE_MOCK is unset."""
|
"""
|
||||||
import asyncio
|
Real inference path — used when CF_VOICE_MOCK is unset.
|
||||||
|
|
||||||
|
Tone: wav2vec2 SER via ToneClassifier (classify.py).
|
||||||
|
Acoustic: YAMNet via YAMNetAcousticBackend (Navigation v0.2.x stub).
|
||||||
|
Speaker: pyannote VAD (diarize.py) — merged in ContextClassifier, not here.
|
||||||
|
"""
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from cf_voice.classify import ToneClassifier
|
from cf_voice.classify import ToneClassifier
|
||||||
|
|
||||||
pcm = base64.b64decode(audio_b64)
|
pcm = base64.b64decode(audio_b64)
|
||||||
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
|
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
|
||||||
|
|
||||||
# ToneClassifier is stateless per-call, safe to instantiate inline
|
if self._tone is None:
|
||||||
classifier = ToneClassifier.from_env()
|
self._tone = ToneClassifier.from_env()
|
||||||
tone_result = classifier.classify(audio)
|
tone_result = self._tone.classify(audio)
|
||||||
|
|
||||||
frame = VoiceFrame(
|
frame = VoiceFrame(
|
||||||
label=tone_result.label,
|
label=tone_result.label,
|
||||||
|
|
@ -133,20 +347,398 @@ class ContextClassifier:
|
||||||
shift_magnitude=0.0,
|
shift_magnitude=0.0,
|
||||||
timestamp=timestamp,
|
timestamp=timestamp,
|
||||||
)
|
)
|
||||||
event = tone_event_from_voice_frame(
|
tone: ToneEvent = tone_event_from_voice_frame(
|
||||||
frame_label=frame.label,
|
frame_label=frame.label,
|
||||||
frame_confidence=frame.confidence,
|
frame_confidence=frame.confidence,
|
||||||
shift_magnitude=frame.shift_magnitude,
|
shift_magnitude=frame.shift_magnitude,
|
||||||
timestamp=frame.timestamp,
|
timestamp=frame.timestamp,
|
||||||
elcor=elcor,
|
elcor=elcor,
|
||||||
)
|
)
|
||||||
return [event]
|
tone.session_id = session_id
|
||||||
|
|
||||||
|
events: list[AudioEvent] = [tone]
|
||||||
|
|
||||||
|
# Acoustic events: Navigation v0.2.x (YAMNet not yet implemented)
|
||||||
|
# YAMNetAcousticBackend raises NotImplementedError at construction —
|
||||||
|
# we catch and log rather than failing the entire classify call.
|
||||||
|
try:
|
||||||
|
acoustic = self._acoustic.classify_window(audio.tobytes(), timestamp=timestamp)
|
||||||
|
if acoustic.queue:
|
||||||
|
events.append(acoustic.queue)
|
||||||
|
if acoustic.speaker:
|
||||||
|
events.append(acoustic.speaker)
|
||||||
|
if acoustic.environ:
|
||||||
|
events.append(acoustic.environ)
|
||||||
|
if acoustic.scene:
|
||||||
|
events.append(acoustic.scene)
|
||||||
|
except NotImplementedError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return events
|
||||||
|
|
||||||
|
def _load_stt(self) -> "WhisperSTT | None":
|
||||||
|
"""Lazy-load WhisperSTT once. Returns None if unavailable or disabled."""
|
||||||
|
if self._stt_loaded:
|
||||||
|
return self._stt
|
||||||
|
self._stt_loaded = True
|
||||||
|
if os.environ.get("CF_VOICE_STT", "1") != "1":
|
||||||
|
model_status["stt"] = "disabled"
|
||||||
|
return None
|
||||||
|
model_status["stt"] = "loading"
|
||||||
|
try:
|
||||||
|
from cf_voice.stt import WhisperSTT
|
||||||
|
self._stt = WhisperSTT.from_env()
|
||||||
|
model_status["stt"] = "ready"
|
||||||
|
logger.info("WhisperSTT loaded (model=%s)", os.environ.get("CF_VOICE_WHISPER_MODEL", "small"))
|
||||||
|
except Exception as exc:
|
||||||
|
model_status["stt"] = "error"
|
||||||
|
logger.warning("WhisperSTT unavailable: %s", exc)
|
||||||
|
return self._stt
|
||||||
|
|
||||||
|
def _load_diarizer(self) -> "Diarizer | None":
|
||||||
|
"""Lazy-load Diarizer once. Returns None if HF_TOKEN absent or CF_VOICE_DIARIZE!=1."""
|
||||||
|
if self._diarizer_loaded:
|
||||||
|
return self._diarizer
|
||||||
|
self._diarizer_loaded = True
|
||||||
|
if os.environ.get("CF_VOICE_DIARIZE", "0") != "1":
|
||||||
|
model_status["diarizer"] = "disabled"
|
||||||
|
return None
|
||||||
|
model_status["diarizer"] = "loading"
|
||||||
|
try:
|
||||||
|
from cf_voice.diarize import Diarizer
|
||||||
|
self._diarizer = Diarizer.from_env()
|
||||||
|
model_status["diarizer"] = "ready"
|
||||||
|
logger.info("Diarizer loaded")
|
||||||
|
except Exception as exc:
|
||||||
|
model_status["diarizer"] = "error"
|
||||||
|
logger.warning("Diarizer unavailable: %s", exc)
|
||||||
|
return self._diarizer
|
||||||
|
|
||||||
|
def _load_dimensional(self) -> "DimensionalClassifier | None":
|
||||||
|
"""Lazy-load DimensionalClassifier once. Returns None if CF_VOICE_DIMENSIONAL!=1."""
|
||||||
|
if self._dimensional_loaded:
|
||||||
|
return self._dimensional
|
||||||
|
self._dimensional_loaded = True
|
||||||
|
if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
|
||||||
|
model_status["dimensional"] = "disabled"
|
||||||
|
return None
|
||||||
|
model_status["dimensional"] = "loading"
|
||||||
|
try:
|
||||||
|
from cf_voice.dimensional import DimensionalClassifier
|
||||||
|
self._dimensional = DimensionalClassifier()
|
||||||
|
model_status["dimensional"] = "ready"
|
||||||
|
logger.info("DimensionalClassifier loaded (audeering VAD model)")
|
||||||
|
except Exception as exc:
|
||||||
|
model_status["dimensional"] = "error"
|
||||||
|
logger.warning("DimensionalClassifier unavailable: %s", exc)
|
||||||
|
return self._dimensional
|
||||||
|
|
||||||
|
def _load_accent(self) -> "MockAccentClassifier | AccentClassifier | None":
|
||||||
|
"""Lazy-load AccentClassifier once. Returns None if CF_VOICE_ACCENT!=1."""
|
||||||
|
if self._accent_loaded:
|
||||||
|
return self._accent
|
||||||
|
self._accent_loaded = True
|
||||||
|
from cf_voice.accent import make_accent_classifier
|
||||||
|
result = make_accent_classifier(
|
||||||
|
mock=isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1"
|
||||||
|
)
|
||||||
|
self._accent = result
|
||||||
|
if result is None:
|
||||||
|
model_status["accent"] = "disabled"
|
||||||
|
else:
|
||||||
|
model_status["accent"] = "ready"
|
||||||
|
logger.info("AccentClassifier loaded (mock=%s)", isinstance(result, type(result).__mro__[0]))
|
||||||
|
return self._accent
|
||||||
|
|
||||||
|
def _load_prosodic(self) -> "ProsodicExtractor | None":
|
||||||
|
"""Lazy-load ProsodicExtractor once. Returns None if CF_VOICE_PROSODY!=1."""
|
||||||
|
if self._prosodic_loaded:
|
||||||
|
return self._prosodic
|
||||||
|
self._prosodic_loaded = True
|
||||||
|
if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
|
||||||
|
model_status["prosody"] = "disabled"
|
||||||
|
return None
|
||||||
|
model_status["prosody"] = "loading"
|
||||||
|
try:
|
||||||
|
from cf_voice.prosody import ProsodicExtractor
|
||||||
|
self._prosodic = ProsodicExtractor()
|
||||||
|
model_status["prosody"] = "ready"
|
||||||
|
logger.info("ProsodicExtractor loaded (openSMILE eGeMAPS)")
|
||||||
|
except Exception as exc:
|
||||||
|
model_status["prosody"] = "error"
|
||||||
|
logger.warning("ProsodicExtractor unavailable: %s", exc)
|
||||||
|
return self._prosodic
|
||||||
|
|
||||||
|
async def prewarm(self) -> None:
|
||||||
|
"""Pre-load all configured models in a thread-pool so downloads happen at
|
||||||
|
startup rather than on the first classify call. Safe to call multiple times
|
||||||
|
(each _load_* method is idempotent after the first call)."""
|
||||||
|
if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
|
||||||
|
return
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
# Load each model in its own executor slot so status updates are visible
|
||||||
|
# as each one completes rather than all at once.
|
||||||
|
await loop.run_in_executor(None, self._load_stt)
|
||||||
|
await loop.run_in_executor(None, self._load_diarizer)
|
||||||
|
await loop.run_in_executor(None, self._load_dimensional)
|
||||||
|
await loop.run_in_executor(None, self._load_prosodic)
|
||||||
|
logger.info("cf-voice prewarm complete: %s", model_status)
|
||||||
|
|
||||||
|
async def _classify_real_async(
|
||||||
|
self,
|
||||||
|
audio_b64: str,
|
||||||
|
timestamp: float,
|
||||||
|
elcor: bool,
|
||||||
|
session_id: str,
|
||||||
|
language: str | None = None,
|
||||||
|
num_speakers: int | None = None,
|
||||||
|
) -> list[AudioEvent]:
|
||||||
|
"""
|
||||||
|
Real inference path running all classifiers in parallel.
|
||||||
|
|
||||||
|
Tone (wav2vec2) + STT (Whisper) + Diarization (pyannote, optional) +
|
||||||
|
Acoustic (AST) all run concurrently via asyncio.gather(). Each result
|
||||||
|
is type-checked after gather — a single classifier failure does not
|
||||||
|
abort the call.
|
||||||
|
|
||||||
|
Transcript text is fed back to ToneClassifier as a weak signal (e.g.
|
||||||
|
"unfortunately" biases toward apologetic). Diarizer output sets the
|
||||||
|
speaker_id on the VoiceFrame.
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from cf_voice.classify import ToneClassifier, _apply_transcript_hints, _AFFECT_TO_LABEL
|
||||||
|
|
||||||
|
pcm = base64.b64decode(audio_b64)
|
||||||
|
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
|
||||||
|
|
||||||
|
# Lazy-load models on first real call
|
||||||
|
if self._tone is None:
|
||||||
|
self._tone = ToneClassifier.from_env()
|
||||||
|
stt = self._load_stt()
|
||||||
|
diarizer = self._load_diarizer()
|
||||||
|
dimensional = self._load_dimensional()
|
||||||
|
prosodic = self._load_prosodic()
|
||||||
|
accent_clf = self._load_accent()
|
||||||
|
|
||||||
|
# Build coroutines — all run in thread pool executors internally.
|
||||||
|
# Dimensional, prosodic, and accent run in parallel with SER/STT/diarization.
|
||||||
|
tone_coro = self._tone.classify_async(audio)
|
||||||
|
stt_coro = stt.transcribe_chunk_async(pcm, language=language) if stt else _noop_stt()
|
||||||
|
diarize_coro = diarizer.diarize_async(audio, num_speakers=num_speakers) if diarizer else _noop_diarize()
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
acoustic_coro = loop.run_in_executor(
|
||||||
|
None, partial(self._acoustic.classify_window, audio.tobytes(), timestamp)
|
||||||
|
)
|
||||||
|
dimensional_coro = dimensional.classify_async(audio) if dimensional else _noop_stt()
|
||||||
|
prosodic_coro = prosodic.extract_async(audio) if prosodic else _noop_stt()
|
||||||
|
accent_coro = loop.run_in_executor(
|
||||||
|
None, partial(accent_clf.classify, audio.tobytes())
|
||||||
|
) if accent_clf else _noop_stt()
|
||||||
|
|
||||||
|
(
|
||||||
|
tone_result, stt_result, diarize_segs, acoustic,
|
||||||
|
dimensional_result, prosodic_result, accent_result,
|
||||||
|
) = await asyncio.gather(
|
||||||
|
tone_coro, stt_coro, diarize_coro, acoustic_coro,
|
||||||
|
dimensional_coro, prosodic_coro, accent_coro,
|
||||||
|
return_exceptions=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract transcript text (STT optional)
|
||||||
|
transcript = ""
|
||||||
|
if stt_result and not isinstance(stt_result, BaseException):
|
||||||
|
transcript = stt_result.text # type: ignore[union-attr]
|
||||||
|
|
||||||
|
# Apply transcript weak signal to affect if STT produced text
|
||||||
|
if transcript and not isinstance(tone_result, BaseException):
|
||||||
|
new_affect = _apply_transcript_hints(tone_result.affect, transcript) # type: ignore[union-attr]
|
||||||
|
if new_affect != tone_result.affect: # type: ignore[union-attr]
|
||||||
|
from cf_voice.classify import ToneResult
|
||||||
|
tone_result = ToneResult( # type: ignore[assignment]
|
||||||
|
label=_AFFECT_TO_LABEL.get(new_affect, tone_result.label), # type: ignore[union-attr]
|
||||||
|
affect=new_affect,
|
||||||
|
confidence=tone_result.confidence, # type: ignore[union-attr]
|
||||||
|
prosody_flags=tone_result.prosody_flags, # type: ignore[union-attr]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get speaker_id from diarization (falls back to "speaker_a")
|
||||||
|
speaker_id = "speaker_a"
|
||||||
|
if isinstance(diarize_segs, BaseException):
|
||||||
|
logger.warning("Diarizer failed in gather: %s", diarize_segs)
|
||||||
|
elif diarizer and diarize_segs is not None:
|
||||||
|
window_mid = len(audio) / 2.0 / 16_000.0
|
||||||
|
speaker_id = diarizer.speaker_at( # type: ignore[arg-type]
|
||||||
|
diarize_segs, window_mid, tracker=self._speaker_tracker
|
||||||
|
)
|
||||||
|
logger.debug("diarize: segs=%d speaker=%s mid=%.3f", len(diarize_segs), speaker_id, window_mid)
|
||||||
|
|
||||||
|
if isinstance(tone_result, BaseException):
|
||||||
|
logger.warning("Tone classifier failed: %s", tone_result)
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Unpack dimensional result (None when classifier is disabled or failed)
|
||||||
|
dim = None
|
||||||
|
if dimensional_result and not isinstance(dimensional_result, BaseException):
|
||||||
|
dim = dimensional_result
|
||||||
|
|
||||||
|
# Unpack prosodic result. If dimensional is also available, pass the
|
||||||
|
# calm-positive score so sarcasm_risk benefits from both signals.
|
||||||
|
pros = None
|
||||||
|
if prosodic_result and not isinstance(prosodic_result, BaseException):
|
||||||
|
if dim is not None:
|
||||||
|
# Re-compute sarcasm_risk with dimensional context
|
||||||
|
from cf_voice.prosody import _compute_sarcasm_risk
|
||||||
|
calm_pos = dim.calm_positive_score()
|
||||||
|
updated_risk = _compute_sarcasm_risk(
|
||||||
|
flat_f0=prosodic_result.flat_f0_score, # type: ignore[union-attr]
|
||||||
|
calm_positive=calm_pos,
|
||||||
|
)
|
||||||
|
from cf_voice.prosody import ProsodicSignal
|
||||||
|
pros = ProsodicSignal(
|
||||||
|
f0_mean=prosodic_result.f0_mean, # type: ignore[union-attr]
|
||||||
|
f0_std=prosodic_result.f0_std, # type: ignore[union-attr]
|
||||||
|
jitter=prosodic_result.jitter, # type: ignore[union-attr]
|
||||||
|
shimmer=prosodic_result.shimmer, # type: ignore[union-attr]
|
||||||
|
loudness=prosodic_result.loudness, # type: ignore[union-attr]
|
||||||
|
flat_f0_score=prosodic_result.flat_f0_score, # type: ignore[union-attr]
|
||||||
|
sarcasm_risk=updated_risk,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
pros = prosodic_result
|
||||||
|
|
||||||
|
frame = VoiceFrame(
|
||||||
|
label=tone_result.label, # type: ignore[union-attr]
|
||||||
|
confidence=tone_result.confidence, # type: ignore[union-attr]
|
||||||
|
speaker_id=speaker_id,
|
||||||
|
shift_magnitude=0.0,
|
||||||
|
timestamp=timestamp,
|
||||||
|
valence=dim.valence if dim else None,
|
||||||
|
arousal=dim.arousal if dim else None,
|
||||||
|
dominance=dim.dominance if dim else None,
|
||||||
|
sarcasm_risk=pros.sarcasm_risk if pros else None,
|
||||||
|
flat_f0_score=pros.flat_f0_score if pros else None,
|
||||||
|
)
|
||||||
|
tone_event: ToneEvent = tone_event_from_voice_frame(
|
||||||
|
frame_label=frame.label,
|
||||||
|
frame_confidence=frame.confidence,
|
||||||
|
shift_magnitude=frame.shift_magnitude,
|
||||||
|
timestamp=frame.timestamp,
|
||||||
|
elcor=elcor,
|
||||||
|
)
|
||||||
|
tone_event.session_id = session_id
|
||||||
|
tone_event.speaker_id = speaker_id
|
||||||
|
# Attach dimensional and prosodic results to the wire event
|
||||||
|
tone_event.valence = frame.valence
|
||||||
|
tone_event.arousal = frame.arousal
|
||||||
|
tone_event.dominance = frame.dominance
|
||||||
|
tone_event.sarcasm_risk = frame.sarcasm_risk
|
||||||
|
tone_event.flat_f0_score = frame.flat_f0_score
|
||||||
|
|
||||||
|
# Trajectory and coherence signals — only when dimensional is running
|
||||||
|
if dim:
|
||||||
|
from collections import deque as _deque
|
||||||
|
from cf_voice.trajectory import compute_trajectory
|
||||||
|
|
||||||
|
spk_buffer = self._dim_buffer.setdefault(
|
||||||
|
speaker_id, _deque(maxlen=self._buffer_window)
|
||||||
|
)
|
||||||
|
prior_affect = self._last_ser_affect.get(speaker_id)
|
||||||
|
traj, coher = compute_trajectory(
|
||||||
|
spk_buffer, dim, tone_result.affect, prior_affect # type: ignore[union-attr]
|
||||||
|
)
|
||||||
|
# Update buffer and affect history after computing (not before)
|
||||||
|
spk_buffer.append(dim)
|
||||||
|
self._last_ser_affect[speaker_id] = tone_result.affect # type: ignore[union-attr]
|
||||||
|
|
||||||
|
tone_event.arousal_delta = traj.arousal_delta if traj.baseline_established else None
|
||||||
|
tone_event.valence_delta = traj.valence_delta if traj.baseline_established else None
|
||||||
|
tone_event.trend = traj.trend if traj.baseline_established else None
|
||||||
|
tone_event.coherence_score = coher.coherence_score
|
||||||
|
tone_event.suppression_flag = coher.suppression_flag
|
||||||
|
tone_event.reframe_type = coher.reframe_type if coher.reframe_type != "none" else None
|
||||||
|
tone_event.affect_divergence = coher.affect_divergence
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Dimensional: valence=%.3f arousal=%.3f dominance=%.3f quadrant=%s "
|
||||||
|
"trend=%s coherence=%.2f suppressed=%s reframe=%s",
|
||||||
|
dim.valence, dim.arousal, dim.dominance, dim.affect_quadrant(),
|
||||||
|
traj.trend, coher.coherence_score, coher.suppression_flag, coher.reframe_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
if pros:
|
||||||
|
logger.debug(
|
||||||
|
"Prosodic: flat_f0=%.3f sarcasm_risk=%.3f",
|
||||||
|
pros.flat_f0_score, pros.sarcasm_risk,
|
||||||
|
)
|
||||||
|
|
||||||
|
events: list[AudioEvent] = [tone_event]
|
||||||
|
|
||||||
|
# Emit transcript event so consumers can display live STT
|
||||||
|
if transcript:
|
||||||
|
events.append(AudioEvent(
|
||||||
|
timestamp=timestamp,
|
||||||
|
event_type="transcript", # type: ignore[arg-type]
|
||||||
|
label=transcript,
|
||||||
|
confidence=1.0,
|
||||||
|
speaker_id=speaker_id,
|
||||||
|
))
|
||||||
|
|
||||||
|
# Acoustic events (queue / speaker type / environ / scene)
|
||||||
|
scene_label: str | None = None
|
||||||
|
environ_labels: list[str] = []
|
||||||
|
speaker_label: str | None = None
|
||||||
|
if not isinstance(acoustic, BaseException):
|
||||||
|
if acoustic.queue: # type: ignore[union-attr]
|
||||||
|
events.append(acoustic.queue) # type: ignore[union-attr]
|
||||||
|
if acoustic.speaker: # type: ignore[union-attr]
|
||||||
|
events.append(acoustic.speaker) # type: ignore[union-attr]
|
||||||
|
speaker_label = acoustic.speaker.label # type: ignore[union-attr]
|
||||||
|
if acoustic.environ: # type: ignore[union-attr]
|
||||||
|
events.append(acoustic.environ) # type: ignore[union-attr]
|
||||||
|
environ_labels = [acoustic.environ.label] # type: ignore[union-attr]
|
||||||
|
if acoustic.scene: # type: ignore[union-attr]
|
||||||
|
events.append(acoustic.scene) # type: ignore[union-attr]
|
||||||
|
scene_label = acoustic.scene.label # type: ignore[union-attr]
|
||||||
|
|
||||||
|
# Accent event (optional — gated by CF_VOICE_ACCENT=1)
|
||||||
|
accent_region: str | None = None
|
||||||
|
if accent_result and not isinstance(accent_result, BaseException):
|
||||||
|
accent_region = accent_result.region # type: ignore[union-attr]
|
||||||
|
events.append(AudioEvent(
|
||||||
|
timestamp=timestamp,
|
||||||
|
event_type="accent", # type: ignore[arg-type]
|
||||||
|
label=accent_region,
|
||||||
|
confidence=accent_result.confidence, # type: ignore[union-attr]
|
||||||
|
speaker_id=speaker_id,
|
||||||
|
))
|
||||||
|
|
||||||
|
# Privacy risk scoring — local only, never transmitted
|
||||||
|
from cf_voice.privacy import score_privacy_risk
|
||||||
|
risk = score_privacy_risk(
|
||||||
|
scene=scene_label,
|
||||||
|
environ_labels=environ_labels,
|
||||||
|
speaker=speaker_label,
|
||||||
|
accent=accent_region,
|
||||||
|
)
|
||||||
|
if risk.level != "low":
|
||||||
|
logger.info(
|
||||||
|
"privacy_risk=%s flags=%s session=%s",
|
||||||
|
risk.level, risk.flags, session_id,
|
||||||
|
)
|
||||||
|
# Attach risk to the tone event so Linnet can surface the gate
|
||||||
|
tone_event.prosody_flags = list(tone_event.prosody_flags) + [f"privacy:{risk.level}"]
|
||||||
|
|
||||||
|
return events
|
||||||
|
|
||||||
def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
|
def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
|
||||||
"""
|
"""
|
||||||
Apply tone classification to a raw frame.
|
Apply tone classification to a raw frame (streaming path).
|
||||||
|
|
||||||
Stub: identity transform — returns frame unchanged.
|
Stub: identity transform — returns frame unchanged.
|
||||||
Real: replace label + confidence with classifier output.
|
Real (Navigation v0.2.x): replace label + confidence with classifier output.
|
||||||
"""
|
"""
|
||||||
return frame
|
return frame
|
||||||
|
|
|
||||||
|
|
@ -7,12 +7,16 @@
|
||||||
# Requires accepting gated model terms at:
|
# Requires accepting gated model terms at:
|
||||||
# https://huggingface.co/pyannote/speaker-diarization-3.1
|
# https://huggingface.co/pyannote/speaker-diarization-3.1
|
||||||
# https://huggingface.co/pyannote/segmentation-3.0
|
# https://huggingface.co/pyannote/segmentation-3.0
|
||||||
|
#
|
||||||
|
# Enable with: CF_VOICE_DIARIZE=1 (default off)
|
||||||
|
# Requires: HF_TOKEN set in environment
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
import string
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
@ -21,11 +25,16 @@ logger = logging.getLogger(__name__)
|
||||||
_DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
|
_DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
|
||||||
_SAMPLE_RATE = 16_000
|
_SAMPLE_RATE = 16_000
|
||||||
|
|
||||||
|
# Label returned when two speakers overlap in the same window
|
||||||
|
SPEAKER_MULTIPLE = "Multiple"
|
||||||
|
# Label returned when no speaker segment covers the timestamp (silence / VAD miss)
|
||||||
|
SPEAKER_UNKNOWN = "speaker_a"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class SpeakerSegment:
|
class SpeakerSegment:
|
||||||
"""A speaker-labelled time range within an audio window."""
|
"""A speaker-labelled time range within an audio window."""
|
||||||
speaker_id: str # ephemeral local label, e.g. "SPEAKER_00"
|
speaker_id: str # raw pyannote label, e.g. "SPEAKER_00"
|
||||||
start_s: float
|
start_s: float
|
||||||
end_s: float
|
end_s: float
|
||||||
|
|
||||||
|
|
@ -34,6 +43,51 @@ class SpeakerSegment:
|
||||||
return self.end_s - self.start_s
|
return self.end_s - self.start_s
|
||||||
|
|
||||||
|
|
||||||
|
class SpeakerTracker:
|
||||||
|
"""
|
||||||
|
Maps ephemeral pyannote speaker IDs to stable per-session friendly labels.
|
||||||
|
|
||||||
|
pyannote returns IDs like "SPEAKER_00", "SPEAKER_01" which are opaque and
|
||||||
|
may differ across audio windows. SpeakerTracker assigns a consistent
|
||||||
|
friendly label ("Speaker A", "Speaker B", ...) for the lifetime of one
|
||||||
|
session, based on first-seen order.
|
||||||
|
|
||||||
|
Speaker embeddings are never stored — only the raw_id → label string map,
|
||||||
|
which contains no biometric information. Call reset() at session end to
|
||||||
|
discard the map.
|
||||||
|
|
||||||
|
For sessions with more than 26 speakers, labels wrap to "Speaker AA",
|
||||||
|
"Speaker AB", etc. (unlikely in practice but handled gracefully).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._map: dict[str, str] = {}
|
||||||
|
self._counter: int = 0
|
||||||
|
|
||||||
|
def label(self, raw_id: str) -> str:
|
||||||
|
"""Return the friendly label for a pyannote speaker ID."""
|
||||||
|
if raw_id not in self._map:
|
||||||
|
self._map[raw_id] = self._next_label()
|
||||||
|
return self._map[raw_id]
|
||||||
|
|
||||||
|
def reset(self) -> None:
|
||||||
|
"""Discard all label mappings. Call at session end."""
|
||||||
|
self._map.clear()
|
||||||
|
self._counter = 0
|
||||||
|
|
||||||
|
def _next_label(self) -> str:
|
||||||
|
idx = self._counter
|
||||||
|
self._counter += 1
|
||||||
|
letters = string.ascii_uppercase
|
||||||
|
n = len(letters)
|
||||||
|
if idx < n:
|
||||||
|
return f"Speaker {letters[idx]}"
|
||||||
|
# Two-letter suffix for >26 speakers
|
||||||
|
outer = idx // n
|
||||||
|
inner = idx % n
|
||||||
|
return f"Speaker {letters[outer - 1]}{letters[inner]}"
|
||||||
|
|
||||||
|
|
||||||
class Diarizer:
|
class Diarizer:
|
||||||
"""
|
"""
|
||||||
Async wrapper around pyannote.audio speaker diarization pipeline.
|
Async wrapper around pyannote.audio speaker diarization pipeline.
|
||||||
|
|
@ -47,9 +101,9 @@ class Diarizer:
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
diarizer = Diarizer.from_env()
|
diarizer = Diarizer.from_env()
|
||||||
|
tracker = SpeakerTracker()
|
||||||
segments = await diarizer.diarize_async(audio_float32)
|
segments = await diarizer.diarize_async(audio_float32)
|
||||||
for seg in segments:
|
label = diarizer.speaker_at(segments, timestamp_s=1.0, tracker=tracker)
|
||||||
print(seg.speaker_id, seg.start_s, seg.end_s)
|
|
||||||
|
|
||||||
Navigation v0.2.x wires this into ContextClassifier so that each
|
Navigation v0.2.x wires this into ContextClassifier so that each
|
||||||
VoiceFrame carries the correct speaker_id from diarization output.
|
VoiceFrame carries the correct speaker_id from diarization output.
|
||||||
|
|
@ -67,7 +121,7 @@ class Diarizer:
|
||||||
logger.info("Loading diarization pipeline %s", _DIARIZATION_MODEL)
|
logger.info("Loading diarization pipeline %s", _DIARIZATION_MODEL)
|
||||||
self._pipeline = Pipeline.from_pretrained(
|
self._pipeline = Pipeline.from_pretrained(
|
||||||
_DIARIZATION_MODEL,
|
_DIARIZATION_MODEL,
|
||||||
use_auth_token=hf_token,
|
token=hf_token,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Move to GPU if available
|
# Move to GPU if available
|
||||||
|
|
@ -92,16 +146,29 @@ class Diarizer:
|
||||||
return cls(hf_token=token)
|
return cls(hf_token=token)
|
||||||
|
|
||||||
def _diarize_sync(
|
def _diarize_sync(
|
||||||
self, audio_float32: np.ndarray, sample_rate: int = _SAMPLE_RATE
|
self,
|
||||||
|
audio_float32: np.ndarray,
|
||||||
|
sample_rate: int = _SAMPLE_RATE,
|
||||||
|
num_speakers: int | None = None,
|
||||||
) -> list[SpeakerSegment]:
|
) -> list[SpeakerSegment]:
|
||||||
"""Synchronous diarization — always call via diarize_async."""
|
"""Synchronous diarization — always call via diarize_async.
|
||||||
|
|
||||||
|
num_speakers: when set, passed as min_speakers=max_speakers to pyannote,
|
||||||
|
which skips the agglomeration heuristic and improves boundary accuracy
|
||||||
|
for known-size conversations (e.g. 2-person call).
|
||||||
|
"""
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
# pyannote expects (channels, samples) float32 tensor
|
# pyannote expects (channels, samples) float32 tensor
|
||||||
waveform = torch.from_numpy(audio_float32[np.newaxis, :].astype(np.float32))
|
waveform = torch.from_numpy(audio_float32[np.newaxis, :].astype(np.float32))
|
||||||
diarization = self._pipeline(
|
pipeline_kwargs: dict = {"waveform": waveform, "sample_rate": sample_rate}
|
||||||
{"waveform": waveform, "sample_rate": sample_rate}
|
if num_speakers and num_speakers > 0:
|
||||||
)
|
pipeline_kwargs["min_speakers"] = num_speakers
|
||||||
|
pipeline_kwargs["max_speakers"] = num_speakers
|
||||||
|
output = self._pipeline(pipeline_kwargs)
|
||||||
|
# pyannote >= 3.3 wraps results in DiarizeOutput; earlier versions return
|
||||||
|
# Annotation directly. Normalise to Annotation before iterating.
|
||||||
|
diarization = getattr(output, "speaker_diarization", output)
|
||||||
|
|
||||||
segments: list[SpeakerSegment] = []
|
segments: list[SpeakerSegment] = []
|
||||||
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
||||||
|
|
@ -118,6 +185,7 @@ class Diarizer:
|
||||||
self,
|
self,
|
||||||
audio_float32: np.ndarray,
|
audio_float32: np.ndarray,
|
||||||
sample_rate: int = _SAMPLE_RATE,
|
sample_rate: int = _SAMPLE_RATE,
|
||||||
|
num_speakers: int | None = None,
|
||||||
) -> list[SpeakerSegment]:
|
) -> list[SpeakerSegment]:
|
||||||
"""
|
"""
|
||||||
Diarize an audio window without blocking the event loop.
|
Diarize an audio window without blocking the event loop.
|
||||||
|
|
@ -125,22 +193,58 @@ class Diarizer:
|
||||||
audio_float32 should be 16kHz mono float32.
|
audio_float32 should be 16kHz mono float32.
|
||||||
Typical input is a 2-second window from MicVoiceIO (32000 samples).
|
Typical input is a 2-second window from MicVoiceIO (32000 samples).
|
||||||
Returns segments ordered by start_s.
|
Returns segments ordered by start_s.
|
||||||
|
|
||||||
|
num_speakers: passed through to pyannote as min_speakers=max_speakers
|
||||||
|
when set and > 0. Improves accuracy for known speaker counts.
|
||||||
"""
|
"""
|
||||||
loop = asyncio.get_event_loop()
|
from functools import partial
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
return await loop.run_in_executor(
|
return await loop.run_in_executor(
|
||||||
None, self._diarize_sync, audio_float32, sample_rate
|
None,
|
||||||
|
partial(self._diarize_sync, audio_float32, sample_rate, num_speakers),
|
||||||
)
|
)
|
||||||
|
|
||||||
def speaker_at(
    self,
    segments: list[SpeakerSegment],
    timestamp_s: float,
    tracker: SpeakerTracker | None = None,
    window_s: float = 1.0,
) -> str:
    """
    Return the speaker label active at timestamp_s.

    Strategy (in order):
    1. Exactly one segment covers timestamp_s: that segment's speaker.
    2. Two or more segments cover it (overlapping speech): SPEAKER_MULTIPLE.
    3. timestamp_s falls in a silence gap between segments: the speaker with
       the most total speaking time inside a window_s-wide window centred on
       timestamp_s. This handles pauses between pyannote segments without
       falling back to an arbitrary default label.
    4. No segments at all, or nothing overlaps the window: SPEAKER_UNKNOWN.

    tracker is optional; if omitted, raw pyannote IDs are returned as-is.
    """
    if not segments:
        return SPEAKER_UNKNOWN

    covering = [seg for seg in segments if seg.start_s <= timestamp_s <= seg.end_s]

    if len(covering) >= 2:
        return SPEAKER_MULTIPLE

    if len(covering) == 1:
        raw_id = covering[0].speaker_id
        return tracker.label(raw_id) if tracker else raw_id

    # Timestamp fell in a silence gap — find the dominant speaker over a
    # window centred on timestamp_s (clamped at 0 on the left).
    win_start = max(0.0, timestamp_s - window_s / 2)
    win_end = timestamp_s + window_s / 2
    duration_by_speaker: dict[str, float] = {}
    for seg in segments:
        overlap = min(seg.end_s, win_end) - max(seg.start_s, win_start)
        if overlap > 0:
            duration_by_speaker[seg.speaker_id] = (
                duration_by_speaker.get(seg.speaker_id, 0.0) + overlap
            )
    if not duration_by_speaker:
        return SPEAKER_UNKNOWN
    raw_id = max(duration_by_speaker, key=duration_by_speaker.get)
    return tracker.label(raw_id) if tracker else raw_id
|
||||||
|
|
|
||||||
190
cf_voice/dimensional.py
Normal file
190
cf_voice/dimensional.py
Normal file
|
|
@ -0,0 +1,190 @@
|
||||||
|
# cf_voice/dimensional.py — audeering dimensional emotion model
|
||||||
|
#
|
||||||
|
# BSL 1.1: real inference. Requires [inference] extras.
|
||||||
|
#
|
||||||
|
# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
|
||||||
|
# Outputs three continuous 0-1 scores:
|
||||||
|
# valence: negative (0) to positive (1)
|
||||||
|
# arousal: low energy (0) to high energy (1)
|
||||||
|
# dominance: submissive (0) to dominant (1)
|
||||||
|
#
|
||||||
|
# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech.
|
||||||
|
# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP.
|
||||||
|
#
|
||||||
|
# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is
|
||||||
|
# downloaded — ~1.5GB, adds ~800MB GPU VRAM)
|
||||||
|
#
|
||||||
|
# HuggingFace model page:
|
||||||
|
# https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_SAMPLE_RATE = 16_000
|
||||||
|
_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DimensionalResult:
    """
    Output of the audeering dimensional emotion model.

    All scores are 0.0-1.0 continuous values:
        valence: negative affect (0) to positive affect (1)
        arousal: low energy / calm (0) to high energy / excited (1)
        dominance: submissive / uncertain (0) to dominant / assertive (1)

    Sarcasm signal: low arousal + higher valence = "calm-positive" profile.
    Combined with flat F0 (prosody.py) and text divergence (linnet#22) for
    the full multi-signal sarcasm heuristic.
    """
    valence: float
    arousal: float
    dominance: float

    def affect_quadrant(self) -> str:
        """
        Map VAD position to a descriptive quadrant label.

        These are reference labels for logging and debugging, not user-facing.
        The annotation layer (Elcor) handles user-facing interpretation.
        """
        # Quadrants keyed on (valence high?, arousal high?) at the 0.5 midline.
        # "calm_positive" is the sarcasm candidate when paired with flat F0.
        quadrant_names = {
            (True, True): "enthusiastic",
            (True, False): "calm_positive",
            (False, True): "frustrated_urgent",
            (False, False): "sad_resigned",
        }
        return quadrant_names[(self.valence >= 0.5, self.arousal >= 0.5)]

    def calm_positive_score(self) -> float:
        """
        0-1 score indicating how strongly the VAD position matches the
        calm-positive sarcasm candidate profile (low arousal, higher valence).

        Used as one component of the combined sarcasm heuristic.
        """
        # Equal-weight blend of "how positive" and "how calm".
        how_positive = max(0.0, self.valence - 0.5) * 2.0
        how_calm = 1.0 - self.arousal
        return how_positive * 0.5 + how_calm * 0.5
|
||||||
|
|
||||||
|
|
||||||
|
class DimensionalClassifier:
    """
    Async wrapper around the audeering wav2vec2 dimensional emotion model.

    The model runs in a thread pool executor to avoid blocking asyncio.
    Loaded once on first call and reused; the underlying wav2vec2 model
    lands on CUDA when available (same device as the SER model in classify.py).

    Usage
    -----
        clf = DimensionalClassifier.from_env()
        result = await clf.classify_async(audio_float32)
        print(result.valence, result.arousal, result.dominance)
    """

    def __init__(self) -> None:
        # Lazy: nothing heavy happens at construction time.
        self._model = None
        self._processor = None
        self._loaded = False

    def _ensure_loaded(self) -> None:
        """Load model and processor on first inference call (not at construction)."""
        if self._loaded:
            return

        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for dimensional emotion classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
        self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)

        try:
            import torch
            if torch.cuda.is_available():
                self._model = self._model.to(torch.device("cuda"))
                logger.info("Dimensional model on CUDA")
        except ImportError:
            pass

        self._model.eval()
        # BUGFIX: mark loaded only after everything above succeeded. Setting
        # the flag before the load (as before) made a failed first load
        # unrecoverable — later calls skipped loading and crashed on the
        # still-None processor/model. Now a failed load is retried next call.
        self._loaded = True

    def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Synchronous inference. Always call via classify_async.

        The audeering model outputs [valence, arousal, dominance] as logits
        in the range 0-1 (sigmoid regression heads, not softmax). The model was
        fine-tuned on MSP-Podcast with per-dimension regression, not classification.
        """
        self._ensure_loaded()

        try:
            import torch
        except ImportError as exc:
            raise ImportError("torch is required for dimensional inference") from exc

        inputs = self._processor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )

        # Move inputs to wherever the model actually landed (CPU or CUDA),
        # rather than assuming CUDA availability implies the model is on CUDA.
        device = next(self._model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits

        # Model outputs [valence, arousal, dominance] in a single (1, 3) tensor.
        scores = logits[0].cpu().float().numpy()
        valence = float(np.clip(scores[0], 0.0, 1.0))
        arousal = float(np.clip(scores[1], 0.0, 1.0))
        dominance = float(np.clip(scores[2], 0.0, 1.0))

        return DimensionalResult(
            valence=round(valence, 4),
            arousal=round(arousal, 4),
            dominance=round(dominance, 4),
        )

    async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Classify audio without blocking the event loop.

        Runs in a thread pool executor. Designed to be gathered alongside
        the SER and diarization coroutines in context._classify_real_async().
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._classify_sync, audio_float32)
        )

    @classmethod
    def from_env(cls) -> "DimensionalClassifier":
        """Construct from environment. Raises if CF_VOICE_DIMENSIONAL is not set."""
        if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
                "Add it to your .env file. The model requires ~800MB GPU VRAM."
            )
        return cls()
|
||||||
|
|
@ -10,10 +10,10 @@ from __future__ import annotations
|
||||||
from dataclasses import dataclass, field
from typing import Literal

EventType = Literal["queue", "speaker", "environ", "tone", "transcript", "scene", "accent"]

# ── Queue state labels ────────────────────────────────────────────────────────
# Detected from AST acoustic event classification
QUEUE_LABELS = Literal[
    "hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone"
]

# ── Speaker type labels ───────────────────────────────────────────────────────
# Detected from pyannote VAD + custom IVR-vs-human head
SPEAKER_LABELS = Literal[
    "ivr_synth", "human_single", "human_multi", "transfer", "no_speaker",
    "background_voices",
]

# ── Environmental labels ──────────────────────────────────────────────────────
# Background shift is the primary AMD (answering machine detection) signal.
# Telephony labels + general-purpose acoustic scene labels.
ENVIRON_LABELS = Literal[
    # Telephony
    "call_center", "music", "background_shift", "noise_floor_change", "quiet",
    # Nature
    "birdsong", "wind", "rain", "water",
    # Urban
    "traffic", "crowd_chatter", "street_signal", "construction",
    # Indoor
    "hvac", "keyboard_typing", "restaurant",
]

# ── Acoustic scene labels ─────────────────────────────────────────────────────
# Broad scene category — primary input to privacy risk scoring.
SCENE_LABELS = Literal[
    "indoor_quiet", "indoor_crowd", "outdoor_urban", "outdoor_nature",
    "vehicle", "public_transit",
]

# ── Accent / language labels ──────────────────────────────────────────────────
# Regional accent of primary speaker. Gated by CF_VOICE_ACCENT=1.
ACCENT_LABELS = Literal[
    "en_gb", "en_us", "en_au", "en_ca", "en_in",
    "fr", "es", "de", "zh", "ja", "other",
]
|
||||||
|
|
||||||
# ── Tone / affect labels ──────────────────────────────────────────────────────
|
# ── Tone / affect labels ──────────────────────────────────────────────────────
|
||||||
|
|
@ -86,12 +109,35 @@ class ToneEvent(AudioEvent):
|
||||||
|
|
||||||
The subtext field carries the human-readable annotation.
|
The subtext field carries the human-readable annotation.
|
||||||
Format is controlled by the caller (elcor flag in the classify request).
|
Format is controlled by the caller (elcor flag in the classify request).
|
||||||
|
|
||||||
|
Dimensional emotion (Navigation v0.2.x — audeering model):
|
||||||
|
valence / arousal / dominance are None when the dimensional classifier
|
||||||
|
is not enabled (CF_VOICE_DIMENSIONAL != "1").
|
||||||
|
|
||||||
|
Prosodic signals (Navigation v0.2.x — openSMILE):
|
||||||
|
sarcasm_risk / flat_f0_score are None when extractor is not enabled.
|
||||||
"""
|
"""
|
||||||
affect: str = "neutral"
|
affect: str = "neutral"
|
||||||
shift_magnitude: float = 0.0
|
shift_magnitude: float = 0.0
|
||||||
shift_direction: str = "stable" # "warmer" | "colder" | "more_urgent" | "stable"
|
shift_direction: str = "stable" # "warmer" | "colder" | "more_urgent" | "stable"
|
||||||
prosody_flags: list[str] = field(default_factory=list)
|
prosody_flags: list[str] = field(default_factory=list)
|
||||||
session_id: str = "" # caller-assigned; correlates events to a session
|
session_id: str = "" # caller-assigned; correlates events to a session
|
||||||
|
# Dimensional emotion scores (audeering, optional)
|
||||||
|
valence: float | None = None
|
||||||
|
arousal: float | None = None
|
||||||
|
dominance: float | None = None
|
||||||
|
# Prosodic signals (openSMILE, optional)
|
||||||
|
sarcasm_risk: float | None = None
|
||||||
|
flat_f0_score: float | None = None
|
||||||
|
# Trajectory signals (rolling buffer — activates after BASELINE_MIN frames)
|
||||||
|
arousal_delta: float | None = None
|
||||||
|
valence_delta: float | None = None
|
||||||
|
trend: str | None = None # "stable"|"escalating"|"suppressed"|…
|
||||||
|
# Coherence signals (SER vs VAD cross-comparison)
|
||||||
|
coherence_score: float | None = None
|
||||||
|
suppression_flag: bool | None = None
|
||||||
|
reframe_type: str | None = None # "none"|"genuine"|"surface"
|
||||||
|
affect_divergence: float | None = None
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
# Force event_type to "tone" regardless of what the caller passed.
|
# Force event_type to "tone" regardless of what the caller passed.
|
||||||
|
|
|
||||||
|
|
@ -118,5 +118,12 @@ def make_io(
|
||||||
if use_mock:
|
if use_mock:
|
||||||
return MockVoiceIO(interval_s=interval_s)
|
return MockVoiceIO(interval_s=interval_s)
|
||||||
|
|
||||||
from cf_voice.capture import MicVoiceIO
|
try:
|
||||||
return MicVoiceIO(device_index=device_index)
|
from cf_voice.capture import MicVoiceIO
|
||||||
|
return MicVoiceIO(device_index=device_index)
|
||||||
|
except ImportError as exc:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Real audio capture requires [inference] extras. "
|
||||||
|
"Install with: pip install cf-voice[inference]\n"
|
||||||
|
f"Missing: {exc}"
|
||||||
|
) from exc
|
||||||
|
|
|
||||||
|
|
@ -13,19 +13,30 @@ class VoiceFrame:
|
||||||
A single annotated moment in a voice stream.
|
A single annotated moment in a voice stream.
|
||||||
|
|
||||||
Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
|
Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
|
||||||
(tone classification, speaker diarization).
|
(tone classification, speaker diarization, dimensional emotion).
|
||||||
|
|
||||||
Fields
|
Fields
|
||||||
------
|
------
|
||||||
label Tone annotation, e.g. "Warmly impatient" or "Deflecting".
|
label Tone annotation, e.g. "Warmly impatient" or "Deflecting".
|
||||||
Generic by default; Elcor-style prefix format is an
|
Generic by default; Elcor-style prefix format is an
|
||||||
easter egg surfaced by the product UI, not set here.
|
easter egg surfaced by the product UI, not set here.
|
||||||
confidence 0.0–1.0. Below ~0.5 the annotation is speculative.
|
confidence 0.0-1.0. Below ~0.5 the annotation is speculative.
|
||||||
speaker_id Ephemeral local label ("speaker_a", "speaker_b").
|
speaker_id Ephemeral local label ("speaker_a", "speaker_b").
|
||||||
Not tied to identity — resets each session.
|
Not tied to identity — resets each session.
|
||||||
shift_magnitude Delta from the previous frame's tone, 0.0–1.0.
|
shift_magnitude Delta from the previous frame's tone, 0.0-1.0.
|
||||||
High values indicate a meaningful register shift.
|
High values indicate a meaningful register shift.
|
||||||
timestamp Session-relative seconds since capture started.
|
timestamp Session-relative seconds since capture started.
|
||||||
|
|
||||||
|
Dimensional emotion (audeering model — Navigation v0.2.x, optional):
|
||||||
|
valence 0.0-1.0. Negative affect (0) to positive affect (1).
|
||||||
|
arousal 0.0-1.0. Low energy / calm (0) to high energy / excited (1).
|
||||||
|
dominance 0.0-1.0. Submissive / uncertain (0) to assertive / dominant (1).
|
||||||
|
|
||||||
|
Prosodic features (openSMILE eGeMAPS — Navigation v0.2.x, optional):
|
||||||
|
sarcasm_risk 0.0-1.0 heuristic score: flat F0 + calm-positive VAD +
|
||||||
|
text divergence (linnet#22). All three signals required for
|
||||||
|
high confidence — audio-only signals are weak priors.
|
||||||
|
flat_f0_score Normalised F0 flatness: 1.0 = maximally flat pitch.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
label: str
|
label: str
|
||||||
|
|
@ -34,6 +45,15 @@ class VoiceFrame:
|
||||||
shift_magnitude: float
|
shift_magnitude: float
|
||||||
timestamp: float
|
timestamp: float
|
||||||
|
|
||||||
|
# Dimensional emotion scores — None when dimensional classifier is disabled
|
||||||
|
valence: float | None = None
|
||||||
|
arousal: float | None = None
|
||||||
|
dominance: float | None = None
|
||||||
|
|
||||||
|
# Prosodic signals — None when prosodic extractor is disabled
|
||||||
|
sarcasm_risk: float | None = None
|
||||||
|
flat_f0_score: float | None = None
|
||||||
|
|
||||||
def is_reliable(self, threshold: float = 0.6) -> bool:
    """Return True when confidence meets the given threshold."""
    # Frames below the threshold are treated as speculative by consumers.
    meets_threshold = self.confidence >= threshold
    return meets_threshold
|
||||||
|
|
|
||||||
181
cf_voice/prefs.py
Normal file
181
cf_voice/prefs.py
Normal file
|
|
@ -0,0 +1,181 @@
|
||||||
|
# cf_voice/prefs.py — user preference hooks for cf-core preferences module
|
||||||
|
#
|
||||||
|
# MIT licensed. Provides voice-specific preference keys and helpers.
|
||||||
|
#
|
||||||
|
# When circuitforge_core is installed, reads/writes from the shared preference
|
||||||
|
# store (LocalFileStore or cloud backend). When it is not installed (standalone
|
||||||
|
# cf-voice use), falls back to environment variables only.
|
||||||
|
#
|
||||||
|
# Preference paths use dot-separated notation (cf-core convention):
|
||||||
|
# "voice.elcor_mode" bool — Elcor-style tone annotations
|
||||||
|
# "voice.confidence_threshold" float — minimum confidence to emit a frame
|
||||||
|
# "voice.whisper_model" str — faster-whisper model size
|
||||||
|
# "voice.elcor_prior_frames" int — rolling context window for Elcor LLM
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
import os
from typing import Any

logger = logging.getLogger(__name__)

# ── Preference key constants ──────────────────────────────────────────────────

PREF_ELCOR_MODE = "voice.elcor_mode"
PREF_CONFIDENCE_THRESHOLD = "voice.confidence_threshold"
PREF_WHISPER_MODEL = "voice.whisper_model"
PREF_ELCOR_PRIOR_FRAMES = "voice.elcor_prior_frames"

# Defaults used when neither preference store nor environment has a value
_DEFAULTS: dict[str, Any] = {
    PREF_ELCOR_MODE: False,
    PREF_CONFIDENCE_THRESHOLD: 0.55,
    PREF_WHISPER_MODEL: "small",
    PREF_ELCOR_PRIOR_FRAMES: 4,
}

# ── Environment variable fallbacks ────────────────────────────────────────────

_ENV_KEYS: dict[str, str] = {
    PREF_ELCOR_MODE: "CF_VOICE_ELCOR",
    PREF_CONFIDENCE_THRESHOLD: "CF_VOICE_CONFIDENCE_THRESHOLD",
    PREF_WHISPER_MODEL: "CF_VOICE_WHISPER_MODEL",
    PREF_ELCOR_PRIOR_FRAMES: "CF_VOICE_ELCOR_PRIOR_FRAMES",
}

# Coercion applied to raw environment strings, keyed by preference path.
_COERCE: dict[str, type] = {
    PREF_ELCOR_MODE: bool,
    PREF_CONFIDENCE_THRESHOLD: float,
    PREF_WHISPER_MODEL: str,
    PREF_ELCOR_PRIOR_FRAMES: int,
}


def _from_env(pref_path: str) -> Any:
    """Read a preference from its environment variable fallback."""
    env_key = _ENV_KEYS.get(pref_path)
    raw = os.environ.get(env_key) if env_key is not None else None
    if raw is None:
        # Unknown preference path, or the variable is simply not set.
        return None
    coerce = _COERCE.get(pref_path, str)
    if coerce is bool:
        # Booleans accept a small set of truthy spellings; everything else is False.
        return raw.strip().lower() in ("1", "true", "yes")
    try:
        return coerce(raw)
    except (ValueError, TypeError):
        logger.warning("prefs: could not parse env %s=%r as %s", env_key, raw, coerce)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _cf_core_store():
    """Return the cf-core default preference store, or None if not available."""
    try:
        from circuitforge_core.preferences import store as _store_mod
        # We reach into a private attribute of cf-core; also guard
        # AttributeError so a cf-core version that renames _DEFAULT_STORE
        # degrades to env-only mode instead of crashing every preference read.
        return _store_mod._DEFAULT_STORE
    except (ImportError, AttributeError):
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public API ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def get_voice_pref(
    pref_path: str,
    user_id: str | None = None,
    store=None,
) -> Any:
    """
    Read a voice preference value.

    Resolution order:
    1. Explicit store (passed in by caller — used for testing or cloud backends)
    2. cf-core LocalFileStore (if circuitforge_core is installed)
    3. Environment variable fallback
    4. Built-in default

    pref_path  One of the PREF_* constants, e.g. PREF_ELCOR_MODE.
    user_id    Passed to the store for cloud backends; local store ignores it.
    """
    # 1. Caller-supplied store wins when it has a value.
    if store is not None:
        hit = store.get(user_id=user_id, path=pref_path, default=None)
        if hit is not None:
            return hit

    # 2. Shared cf-core store, when the package is installed.
    shared = _cf_core_store()
    if shared is not None:
        hit = shared.get(user_id=user_id, path=pref_path, default=None)
        if hit is not None:
            return hit

    # 3. Environment variable fallback.
    env_val = _from_env(pref_path)
    if env_val is not None:
        return env_val

    # 4. Built-in default (None for unknown paths).
    return _DEFAULTS.get(pref_path)
|
||||||
|
|
||||||
|
|
||||||
|
def set_voice_pref(
    pref_path: str,
    value: Any,
    user_id: str | None = None,
    store=None,
) -> None:
    """
    Write a voice preference value.

    Writes to the explicit store if provided, otherwise to the cf-core default
    store. Raises RuntimeError if neither is available (env-only mode has no
    writable persistence).
    """
    # BUGFIX: `store or _cf_core_store()` tested truthiness of an arbitrary
    # store object, silently discarding a falsy-but-valid store. Test identity
    # against None instead.
    target = store if store is not None else _cf_core_store()
    if target is None:
        raise RuntimeError(
            "No writable preference store available. "
            "Install circuitforge_core or pass a store explicitly."
        )
    target.set(user_id=user_id, path=pref_path, value=value)
|
||||||
|
|
||||||
|
|
||||||
|
def is_elcor_enabled(user_id: str | None = None, store=None) -> bool:
    """
    Convenience: return True if the user has Elcor annotation mode enabled.

    Elcor mode switches tone subtext from generic format ("Tone: Frustrated")
    to the Mass Effect Elcor prefix format ("With barely concealed frustration:").
    It is an accessibility feature for autistic and ND users who benefit from
    explicit tonal annotation. Opt-in, local-only — no data leaves the device.

    Defaults to False.
    """
    enabled = get_voice_pref(PREF_ELCOR_MODE, user_id=user_id, store=store)
    return bool(enabled)


def get_confidence_threshold(user_id: str | None = None, store=None) -> float:
    """Return the minimum confidence threshold for emitting VoiceFrames (0.0–1.0)."""
    raw = get_voice_pref(PREF_CONFIDENCE_THRESHOLD, user_id=user_id, store=store)
    return float(raw)


def get_whisper_model(user_id: str | None = None, store=None) -> str:
    """Return the faster-whisper model name to use (e.g. "small", "medium")."""
    raw = get_voice_pref(PREF_WHISPER_MODEL, user_id=user_id, store=store)
    return str(raw)


def get_elcor_prior_frames(user_id: str | None = None, store=None) -> int:
    """
    Return the number of prior VoiceFrames to include as context for Elcor
    label generation. Larger windows produce more contextually aware annotations
    but increase LLM prompt length and latency.

    Default: 4 frames (~8–10 seconds of rolling context at 2s intervals).
    """
    raw = get_voice_pref(PREF_ELCOR_PRIOR_FRAMES, user_id=user_id, store=store)
    return int(raw)
|
||||||
115
cf_voice/privacy.py
Normal file
115
cf_voice/privacy.py
Normal file
|
|
@ -0,0 +1,115 @@
|
||||||
|
# cf_voice/privacy.py — local acoustic privacy risk scoring
|
||||||
|
#
|
||||||
|
# MIT licensed. Never transmitted to cloud. Never logged server-side.
|
||||||
|
#
|
||||||
|
# Derives a privacy_risk level (low / moderate / high) from the combined
|
||||||
|
# acoustic fingerprint: scene + environ labels + speaker type + accent.
|
||||||
|
#
|
||||||
|
# Design rationale (#20):
|
||||||
|
# - "outdoor_urban" + "crowd_chatter" + "traffic" → low: clearly public
|
||||||
|
# - "indoor_quiet" + "background_voices" → moderate: conversation overheard
|
||||||
|
# - "outdoor_nature" + "birdsong" + regional accent → moderate-high: location-identifying compound
|
||||||
|
# - "indoor_quiet" + no background voices → low
|
||||||
|
#
|
||||||
|
# Risk gates (Linnet):
|
||||||
|
# high: warn before sending audio chunk to cloud STT; offer local-only fallback
|
||||||
|
# moderate: attach privacy_flags to session state, no blocking action
|
||||||
|
# low: proceed normally
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
from typing import Literal

PrivacyLevel = Literal["low", "moderate", "high"]


@dataclass
class PrivacyRisk:
    """
    Locally-computed privacy risk for a single audio window.

    level: aggregate risk level
    flags: ordered list of contributing signal descriptions
    """
    level: PrivacyLevel
    flags: list[str] = field(default_factory=list)


# ── Signal sets ───────────────────────────────────────────────────────────────

_PUBLIC_SCENES = {"outdoor_urban", "public_transit"}
_NATURE_SCENES = {"outdoor_nature"}
_QUIET_SCENES = {"indoor_quiet"}

_LOCATION_ENVIRON = {"birdsong", "wind", "rain", "water"}
_URBAN_ENVIRON = {"traffic", "crowd_chatter", "street_signal", "construction"}


def score_privacy_risk(
    scene: str | None,
    environ_labels: list[str],
    speaker: str | None,
    accent: str | None,
) -> PrivacyRisk:
    """
    Derive a PrivacyRisk from the current acoustic fingerprint.

    All inputs are nullable — this function handles partial signals gracefully.
    Called per audio window; results are never persisted or transmitted.

    Args:
        scene: SCENE_LABEL string or None
        environ_labels: list of ENVIRON_LABEL strings active in this window
        speaker: SPEAKER_LABEL string or None
        accent: ACCENT_LABEL string or None (None when CF_VOICE_ACCENT disabled)
    """
    flags: list[str] = []
    score = 0  # internal accumulator; maps to level at the end

    environ_set = set(environ_labels)

    # ── Clearly public environments → reduce risk ─────────────────────────────
    if scene in _PUBLIC_SCENES or environ_set & _URBAN_ENVIRON:
        flags.append("public_environment")
        score -= 1

    # ── Background voices: conversation may be overheard ─────────────────────
    if speaker == "background_voices":
        flags.append("background_voices_detected")
        score += 2

    # ── Quiet indoor: no background noise reduces identifiability ────────────
    if scene in _QUIET_SCENES and speaker not in ("background_voices", "human_multi"):
        flags.append("controlled_environment")
        # No score change — neutral

    # ── Nature sounds: alone they suggest a quiet, potentially identifiable location
    nature_match = environ_set & _LOCATION_ENVIRON
    if nature_match:
        flags.append(f"location_signal: {', '.join(sorted(nature_match))}")
        score += 1

    # ── Nature scene + nature sounds: compound location-identifying signal ────
    if scene in _NATURE_SCENES and nature_match:
        flags.append("compound_location_signal")
        score += 1

    # ── Regional accent + nature: narrows location to region + environment ────
    if accent and accent not in ("en_us", "other") and nature_match:
        flags.append(f"accent_plus_location: {accent}")
        score += 1

    # ── Quiet indoor + background voices: overheard conversation ─────────────
    # BUGFIX: flag only, no extra score. background_voices already added +2,
    # and the module's design rationale pins "indoor_quiet + background_voices"
    # at "moderate"; the previous additional +1 here pushed it to "high".
    if scene in _QUIET_SCENES and speaker == "background_voices":
        flags.append("overheard_conversation")

    # ── Map score to level ────────────────────────────────────────────────────
    if score <= 0:
        level: PrivacyLevel = "low"
    elif score <= 2:
        level = "moderate"
    else:
        level = "high"

    return PrivacyRisk(level=level, flags=flags)
|
||||||
208
cf_voice/prosody.py
Normal file
208
cf_voice/prosody.py
Normal file
|
|
@ -0,0 +1,208 @@
|
||||||
|
# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction
|
||||||
|
#
|
||||||
|
# MIT licensed (opensmile-python package is MIT).
|
||||||
|
#
|
||||||
|
# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set:
|
||||||
|
# F0 mean / std / percentiles (pitch)
|
||||||
|
# Jitter / Shimmer (cycle-to-cycle variation — vocal tension)
|
||||||
|
# Energy / loudness envelope
|
||||||
|
# MFCCs, spectral centroid
|
||||||
|
# Speaking rate, pause ratio
|
||||||
|
#
|
||||||
|
# Runs on CPU in a thread pool executor — no GPU required. Designed to run
|
||||||
|
# in parallel with the GPU classifiers in context._classify_real_async() via
|
||||||
|
# asyncio.gather().
|
||||||
|
#
|
||||||
|
# Enable with: CF_VOICE_PROSODY=1 (default off)
|
||||||
|
# Install: pip install opensmile
|
||||||
|
#
|
||||||
|
# openSMILE docs: https://audeering.github.io/opensmile-python/
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# All pipeline audio is 16kHz mono; openSMILE is told the same rate.
_SAMPLE_RATE = 16_000

# F0 std normalisation constant: values below this threshold indicate flat prosody.
# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm".
# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat.
_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm"
# Mean F0, in semitones from a 27.5Hz reference (eGeMAPS naming).
_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean"
# Mean loudness — used as the energy proxy in ProsodicSignal.
_LOUDNESS_FEATURE = "loudness_sma3_amean"
# Cycle-to-cycle pitch variation — vocal tension indicator.
_JITTER_FEATURE = "jitterLocal_sma3nz_amean"
# Cycle-to-cycle amplitude variation (dB) — vocal stress indicator.
_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean"
# Speaking-rate proxy: voiced segments per second.
# NOTE(review): declared but not read by the visible extractor code — presumably
# reserved for the speech_rate field mentioned in the module header; confirm.
_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ProsodicSignal:
    """
    Prosodic feature summary for one audio window.

    Values are derived from the openSMILE eGeMAPS v02 functionals and are
    raw feature magnitudes unless noted:

    f0_mean:       Mean F0 in semitones relative to a 27.5Hz reference.
    f0_std:        Normalised F0 standard deviation (low = flat prosody).
    jitter:        Cycle-to-cycle pitch variation (vocal tension).
    shimmer:       Cycle-to-cycle amplitude variation (vocal stress).
    loudness:      Mean loudness (energy proxy).
    flat_f0_score: Normalised flatness — 1.0 is maximally flat, 0.0 is varied.
    sarcasm_risk:  0-1 heuristic blending flat F0, calm-positive audio
                   (from DimensionalResult when available), and optional
                   text-audio divergence (linnet#22 signal, not yet wired).
    """
    f0_mean: float
    f0_std: float
    jitter: float
    shimmer: float
    loudness: float
    flat_f0_score: float
    sarcasm_risk: float
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_sarcasm_risk(
|
||||||
|
flat_f0: float,
|
||||||
|
calm_positive: float = 0.0,
|
||||||
|
text_divergence: float = 0.0,
|
||||||
|
) -> float:
|
||||||
|
"""
|
||||||
|
Heuristic sarcasm indicator. Not a trained model — a signal to combine
|
||||||
|
with text divergence (linnet#22) for the final confidence score.
|
||||||
|
|
||||||
|
flat_f0: Normalised F0 flatness (1.0 = flat, 0.0 = varied).
|
||||||
|
calm_positive: DimensionalResult.calm_positive_score() when available.
|
||||||
|
text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22.
|
||||||
|
Pass 0.0 until the parallel text classifier is wired.
|
||||||
|
|
||||||
|
Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%).
|
||||||
|
"""
|
||||||
|
return min(1.0, flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3)
|
||||||
|
|
||||||
|
|
||||||
|
class ProsodicExtractor:
    """
    openSMILE eGeMAPS functionals extractor for a single audio window.

    Inference is CPU-bound, so extract_async() dispatches work to the default
    thread pool executor instead of blocking the event loop. opensmile is
    imported lazily so module import stays cheap when prosody is disabled.

    Usage
    -----
        extractor = ProsodicExtractor()
        signal = await extractor.extract_async(audio_float32)
        print(signal.flat_f0_score, signal.sarcasm_risk)
    """

    def __init__(self) -> None:
        # opensmile.Smile instance; created lazily in _ensure_loaded().
        self._smile = None

    def _ensure_loaded(self) -> None:
        """Import and configure opensmile the first time extraction runs."""
        if self._smile is not None:
            return

        try:
            import opensmile
        except ImportError as exc:
            raise ImportError(
                "opensmile is required for prosodic feature extraction. "
                "Install with: pip install opensmile"
            ) from exc

        self._smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        logger.info("openSMILE eGeMAPS loaded")

    def _extract_sync(
        self,
        audio_float32: np.ndarray,
        calm_positive: float = 0.0,
        text_divergence: float = 0.0,
    ) -> ProsodicSignal:
        """
        Blocking extraction — always invoke through extract_async.

        If opensmile raises (e.g. audio too short, no voiced frames), a
        zero-filled ProsodicSignal is returned so callers never need their
        own exception handling around this path.
        """
        self._ensure_loaded()

        try:
            functionals = self._smile.process_signal(audio_float32, _SAMPLE_RATE)
            first_row = functionals.iloc[0]

            f0_mean = float(first_row.get(_F0_MEAN_FEATURE, 0.0))
            f0_std = float(first_row.get(_F0_STD_NORM_FEATURE, 0.0))
            jitter = float(first_row.get(_JITTER_FEATURE, 0.0))
            shimmer = float(first_row.get(_SHIMMER_FEATURE, 0.0))
            loudness = float(first_row.get(_LOUDNESS_FEATURE, 0.0))
        except Exception as exc:
            logger.debug("openSMILE extraction failed (likely silent window): %s", exc)
            return ProsodicSignal(
                f0_mean=0.0, f0_std=0.0, jitter=0.0,
                shimmer=0.0, loudness=0.0, flat_f0_score=0.0, sarcasm_risk=0.0,
            )

        # Flatness: an F0 std of 0.4 semitones (conversational baseline) or
        # more maps to 0.0; an std of 0.0 (monotone) maps to 1.0.
        flat_f0 = 1.0 - min(f0_std / 0.4, 1.0)

        sarcasm = _compute_sarcasm_risk(
            flat_f0=flat_f0,
            calm_positive=calm_positive,
            text_divergence=text_divergence,
        )

        return ProsodicSignal(
            f0_mean=round(f0_mean, 4),
            f0_std=round(f0_std, 4),
            jitter=round(jitter, 6),
            shimmer=round(shimmer, 6),
            loudness=round(loudness, 4),
            flat_f0_score=round(flat_f0, 4),
            sarcasm_risk=round(sarcasm, 4),
        )

    async def extract_async(
        self,
        audio_float32: np.ndarray,
        calm_positive: float = 0.0,
        text_divergence: float = 0.0,
    ) -> ProsodicSignal:
        """
        Extract prosodic features without blocking the event loop.

        calm_positive: Pass DimensionalResult.calm_positive_score() when
                       dimensional classification has already run.
        text_divergence: Pass abs(transcript_sentiment - valence) when the
                         parallel text classifier (linnet#22) is wired.
        """
        loop = asyncio.get_running_loop()
        job = partial(self._extract_sync, audio_float32, calm_positive, text_divergence)
        return await loop.run_in_executor(None, job)

    @classmethod
    def from_env(cls) -> "ProsodicExtractor":
        """Construct from environment. Raises if CF_VOICE_PROSODY is not set."""
        if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. "
                "Add it to your .env and install opensmile: pip install opensmile"
            )
        return cls()
|
||||||
|
|
@ -46,6 +46,17 @@ class WhisperSTT:
|
||||||
print(result.text)
|
print(result.text)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Known single-token hallucinations that Whisper emits on music/noise with
|
||||||
|
# low no_speech_prob (i.e. Whisper thinks it heard speech). These are too
|
||||||
|
# short to be real utterances in any supported language context.
|
||||||
|
_HALLUCINATION_TOKENS: frozenset[str] = frozenset({
|
||||||
|
"ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um",
|
||||||
|
})
|
||||||
|
|
||||||
|
# Suppress a transcript if it repeats unchanged across this many consecutive
|
||||||
|
# windows — indicates Whisper is locked into a hallucination loop.
|
||||||
|
_MAX_REPEATS = 2
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_name: str = "small",
|
model_name: str = "small",
|
||||||
|
|
@ -77,6 +88,8 @@ class WhisperSTT:
|
||||||
self._device = device
|
self._device = device
|
||||||
self._model_name = model_name
|
self._model_name = model_name
|
||||||
self._session_prompt: str = ""
|
self._session_prompt: str = ""
|
||||||
|
self._last_text: str = ""
|
||||||
|
self._repeat_count: int = 0
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_env(cls) -> "WhisperSTT":
|
def from_env(cls) -> "WhisperSTT":
|
||||||
|
|
@ -91,7 +104,14 @@ class WhisperSTT:
|
||||||
"""Estimated VRAM usage in MB for this model/compute_type combination."""
|
"""Estimated VRAM usage in MB for this model/compute_type combination."""
|
||||||
return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)
|
return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)
|
||||||
|
|
||||||
def _transcribe_sync(self, audio_float32: np.ndarray) -> STTResult:
|
# Segments above this no_speech_prob are hallucinations (silence/music/noise).
|
||||||
|
# faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks
|
||||||
|
# for watching" family without cutting off genuine low-energy speech.
|
||||||
|
_NO_SPEECH_THRESHOLD = 0.6
|
||||||
|
|
||||||
|
def _transcribe_sync(
|
||||||
|
self, audio_float32: np.ndarray, language: str | None = None
|
||||||
|
) -> STTResult:
|
||||||
"""Synchronous transcription — always call via transcribe_chunk_async."""
|
"""Synchronous transcription — always call via transcribe_chunk_async."""
|
||||||
duration = len(audio_float32) / 16_000.0
|
duration = len(audio_float32) / 16_000.0
|
||||||
|
|
||||||
|
|
@ -100,22 +120,49 @@ class WhisperSTT:
|
||||||
text="", language="en", duration_s=duration, is_final=False
|
text="", language="en", duration_s=duration, is_final=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Energy gate: skip Whisper entirely on silent/near-silent audio.
|
||||||
|
# In the sidecar path there is no upstream MicVoiceIO silence gate,
|
||||||
|
# so we must check here. RMS < 0.005 is inaudible; Whisper will
|
||||||
|
# hallucinate "thank you" or "thanks for watching" on silence.
|
||||||
|
rms = float(np.sqrt(np.mean(audio_float32 ** 2)))
|
||||||
|
if rms < 0.005:
|
||||||
|
return STTResult(text="", language="en", duration_s=duration, is_final=False)
|
||||||
|
|
||||||
segments, info = self._model.transcribe(
|
segments, info = self._model.transcribe(
|
||||||
audio_float32,
|
audio_float32,
|
||||||
language=None,
|
language=language or None, # None = Whisper auto-detect
|
||||||
initial_prompt=self._session_prompt or None,
|
initial_prompt=None, # No session prompt — on 1s windows it causes
|
||||||
vad_filter=False, # silence gating happens upstream in MicVoiceIO
|
# phrase lock-in (model anchors on prior text
|
||||||
|
# rather than fresh audio). Reset via reset_session()
|
||||||
|
# at conversation boundaries instead.
|
||||||
|
vad_filter=True, # Silero VAD — skips non-speech frames
|
||||||
word_timestamps=False,
|
word_timestamps=False,
|
||||||
beam_size=3,
|
beam_size=3,
|
||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
text = " ".join(s.text.strip() for s in segments).strip()
|
# Filter hallucinated segments: discard any segment where Whisper itself
|
||||||
|
# says there is likely no speech (no_speech_prob > threshold). This is
|
||||||
|
# the correct defense against "thank you" / music hallucinations — VAD
|
||||||
|
# alone is insufficient because music harmonics look speech-like to Silero.
|
||||||
|
text = " ".join(
|
||||||
|
s.text.strip()
|
||||||
|
for s in segments
|
||||||
|
if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD
|
||||||
|
).strip()
|
||||||
|
|
||||||
# Rolling context: keep last ~50 words so the next chunk has prior text
|
# Gate 1: single-token hallucinations that slip past no_speech_prob.
|
||||||
if text:
|
if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS:
|
||||||
words = (self._session_prompt + " " + text).split()
|
text = ""
|
||||||
self._session_prompt = " ".join(words[-50:])
|
|
||||||
|
# Gate 2: repetition lock — same non-empty text N windows in a row.
|
||||||
|
if text and text == self._last_text:
|
||||||
|
self._repeat_count += 1
|
||||||
|
if self._repeat_count >= self._MAX_REPEATS:
|
||||||
|
text = ""
|
||||||
|
else:
|
||||||
|
self._last_text = text
|
||||||
|
self._repeat_count = 0
|
||||||
|
|
||||||
return STTResult(
|
return STTResult(
|
||||||
text=text,
|
text=text,
|
||||||
|
|
@ -124,19 +171,29 @@ class WhisperSTT:
|
||||||
is_final=duration >= 1.0 and info.language_probability > 0.5,
|
is_final=duration >= 1.0 and info.language_probability > 0.5,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def transcribe_chunk_async(self, pcm_int16: bytes) -> STTResult:
|
async def transcribe_chunk_async(
|
||||||
|
self, pcm_int16: bytes, language: str | None = None
|
||||||
|
) -> STTResult:
|
||||||
"""
|
"""
|
||||||
Transcribe a raw PCM Int16 chunk, non-blocking.
|
Transcribe a raw PCM Int16 chunk, non-blocking.
|
||||||
|
|
||||||
pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
|
pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
|
||||||
chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).
|
chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).
|
||||||
|
|
||||||
|
language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects,
|
||||||
|
which is slower and more hallucination-prone on short clips.
|
||||||
"""
|
"""
|
||||||
|
from functools import partial
|
||||||
audio = (
|
audio = (
|
||||||
np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
|
np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
)
|
)
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_running_loop()
|
||||||
return await loop.run_in_executor(None, self._transcribe_sync, audio)
|
return await loop.run_in_executor(
|
||||||
|
None, partial(self._transcribe_sync, audio, language)
|
||||||
|
)
|
||||||
|
|
||||||
def reset_session(self) -> None:
|
def reset_session(self) -> None:
|
||||||
"""Clear the rolling prompt. Call at the start of each new conversation."""
|
"""Clear rolling state. Call at the start of each new conversation."""
|
||||||
self._session_prompt = ""
|
self._session_prompt = ""
|
||||||
|
self._last_text = ""
|
||||||
|
self._repeat_count = 0
|
||||||
|
|
|
||||||
500
cf_voice/telephony.py
Normal file
500
cf_voice/telephony.py
Normal file
|
|
@ -0,0 +1,500 @@
|
||||||
|
# cf_voice/telephony.py — outbound telephony abstraction
|
||||||
|
#
|
||||||
|
# Protocol + mock backend: MIT licensed.
|
||||||
|
# SignalWireBackend, FreeSWITCHBackend: BSL 1.1 (real telephony, cloud credentials).
|
||||||
|
#
|
||||||
|
# Consumers (Osprey, Harrier, Ibis, Kestrel) depend only on TelephonyBackend
|
||||||
|
# and CallSession — both MIT. The concrete backends are selected by make_telephony()
|
||||||
|
# based on the tier and available credentials.
|
||||||
|
#
|
||||||
|
# Requires optional extras for real backends:
|
||||||
|
# pip install cf-voice[signalwire] — SignalWire (paid tier, CF-provisioned)
|
||||||
|
# pip install cf-voice[freeswitch] — FreeSWITCH ESL (free tier, self-hosted)
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Literal, Protocol, runtime_checkable
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Lifecycle states for an outbound call; shared vocabulary across backends.
CallState = Literal[
    "dialing", "ringing", "in_progress",
    "hold", "bridged",
    "completed", "failed", "no_answer", "busy",
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CallSession:
    """
    One outbound call, active or completed.

    call_sid is assigned by the backend: a Twilio-compatible SID string for
    SignalWire, the channel UUID for FreeSWITCH.

    state is advanced by the backend as the call progresses; consumers poll
    backend.get_state() or subscribe to webhook events.
    """
    call_sid: str
    to: str
    from_: str
    state: CallState = "dialing"
    duration_s: float = 0.0
    # Answering machine detection outcome: "human" | "machine" | "unknown".
    # Written once the backend resolves AMD.
    amd_result: str = "unknown"
    error: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class TelephonyBackend(Protocol):
    """
    Structural interface every telephony backend implements.

    All methods are coroutines and must be safe to call from an asyncio
    event loop. Implementations push long-running network operations into
    a thread pool themselves — that is never the caller's responsibility.

    Field names are stable as of cf-voice v0.1.0.
    """

    async def dial(
        self,
        to: str,
        from_: str,
        webhook_url: str,
        *,
        amd: bool = False,
    ) -> CallSession:
        """
        Start an outbound call; returns a CallSession with state="dialing".

        to / from_    E.164 numbers ("+15551234567").
        webhook_url   URL the backend will POST call events to
                      (SignalWire/TwiML style).
        amd           Request answering machine detection; the outcome is
                      written to CallSession.amd_result once resolved.
        """
        ...

    async def send_dtmf(self, call_sid: str, digits: str) -> None:
        """
        Play DTMF (dual-tone multi-frequency) tones into the live call.

        digits is a string of 0-9, *, #, A-D — one tone per character.
        Backends that support pauses accept 'w' (0.5s) and 'W' (1s).
        """
        ...

    async def bridge(self, call_sid: str, target: str) -> None:
        """
        Connect the active call to a second E.164 number or SIP URI.

        Used to hand the user to a human agent once Osprey has navigated
        the IVR; the original call leg stays connected.
        """
        ...

    async def hangup(self, call_sid: str) -> None:
        """End the call. Idempotent — safe to call on already-ended calls."""
        ...

    async def announce(
        self,
        call_sid: str,
        text: str,
        voice: str = "default",
    ) -> None:
        """
        Speak synthesised text into the call.

        Implements the adaptive service identification requirement
        (osprey#21): Osprey announces its identity before navigating an
        IVR so the other party can consent to automated interaction.

        voice is a backend-specific TTS voice id; "default" selects the
        backend's default voice.
        """
        ...

    async def get_state(self, call_sid: str) -> CallState:
        """Return the backend's current view of the call's state."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
# ── Mock backend (MIT) ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class MockTelephonyBackend:
    """
    Synthetic telephony backend for development and CI.

    No real calls are placed. Operations log to cf_voice.telephony and update
    in-memory CallSession objects. AMD resolves to "human" after a simulated
    delay.

    Usage:
        backend = MockTelephonyBackend()
        session = await backend.dial("+15551234567", "+18005550000", "https://...")
        await backend.send_dtmf(session.call_sid, "1")
        await backend.hangup(session.call_sid)
    """

    def __init__(self, amd_delay_s: float = 0.5) -> None:
        # amd_delay_s: simulated delay before AMD resolves to "human".
        self._sessions: dict[str, CallSession] = {}
        self._amd_delay_s = amd_delay_s
        self._call_counter = 0
        # Strong references to in-flight _progress tasks. The event loop keeps
        # only weak references to tasks, so a discarded create_task() result
        # can be garbage-collected before it finishes, silently losing the
        # state transition. Done-callback removes each task when it completes.
        self._tasks: set["asyncio.Task[None]"] = set()

    def _next_sid(self) -> str:
        """Return the next sequential mock call SID."""
        self._call_counter += 1
        return f"mock_sid_{self._call_counter:04d}"

    async def dial(
        self,
        to: str,
        from_: str,
        webhook_url: str,
        *,
        amd: bool = False,
    ) -> CallSession:
        """
        Simulate an outbound call.

        Returns immediately with state="ringing"; a background task moves
        the session to "in_progress" (and resolves AMD to "human" when
        amd=True) after short simulated delays.
        """
        sid = self._next_sid()
        session = CallSession(call_sid=sid, to=to, from_=from_, state="ringing")
        self._sessions[sid] = session
        logger.debug("MockTelephony: dial %s → %s (sid=%s)", from_, to, sid)

        async def _progress() -> None:
            await asyncio.sleep(0.05)
            session.state = "in_progress"
            if amd:
                await asyncio.sleep(self._amd_delay_s)
                session.amd_result = "human"
                logger.debug("MockTelephony: AMD resolved human (sid=%s)", sid)

        # Hold a strong reference so the task cannot be GC'd mid-flight.
        task = asyncio.create_task(_progress())
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)
        return session

    async def send_dtmf(self, call_sid: str, digits: str) -> None:
        """Log the DTMF send. Raises KeyError on an unknown call_sid."""
        self._sessions[call_sid]  # KeyError if unknown — intentional
        logger.debug("MockTelephony: DTMF %r (sid=%s)", digits, call_sid)

    async def bridge(self, call_sid: str, target: str) -> None:
        """Mark the call bridged. Raises KeyError on an unknown call_sid."""
        session = self._sessions[call_sid]
        session.state = "bridged"
        logger.debug("MockTelephony: bridge → %s (sid=%s)", target, call_sid)

    async def hangup(self, call_sid: str) -> None:
        """Mark the call completed. Unknown SIDs are ignored (idempotent)."""
        session = self._sessions.get(call_sid)
        if session:
            session.state = "completed"
        logger.debug("MockTelephony: hangup (sid=%s)", call_sid)

    async def announce(
        self,
        call_sid: str,
        text: str,
        voice: str = "default",
    ) -> None:
        """Log the announcement. Raises KeyError on an unknown call_sid."""
        self._sessions[call_sid]  # KeyError if unknown — intentional
        logger.debug(
            "MockTelephony: announce voice=%s text=%r (sid=%s)", voice, text, call_sid
        )

    async def get_state(self, call_sid: str) -> CallState:
        """Return the current state. Raises KeyError on an unknown call_sid."""
        return self._sessions[call_sid].state
|
||||||
|
|
||||||
|
|
||||||
|
# ── SignalWire backend (BSL 1.1) ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class SignalWireBackend:
    """
    SignalWire outbound telephony (Twilio-compatible REST API).

    BSL 1.1 — requires paid tier or self-hosted CF SignalWire project.

    Credentials sourced from environment:
        CF_SW_PROJECT_ID — SignalWire project ID
        CF_SW_AUTH_TOKEN — SignalWire auth token
        CF_SW_SPACE_URL  — space URL, e.g. "yourspace.signalwire.com"

    Requires: pip install cf-voice[signalwire]

    The SignalWire SDK is synchronous; every API call is pushed onto the
    default thread pool executor so the asyncio event loop never blocks.
    """

    # SignalWire (Twilio-compatible) call status → our CallState vocabulary.
    # Hoisted to a class constant so it is built once, not per get_state call.
    _SW_STATE_MAP: dict[str, CallState] = {
        "queued": "dialing", "ringing": "ringing", "in-progress": "in_progress",
        "completed": "completed", "failed": "failed", "busy": "busy",
        "no-answer": "no_answer",
    }

    def __init__(
        self,
        project_id: str | None = None,
        auth_token: str | None = None,
        space_url: str | None = None,
    ) -> None:
        """
        Credentials fall back to environment variables when not passed.

        Raises ImportError when the signalwire SDK is missing, KeyError when
        a credential is absent from both the arguments and the environment.
        """
        try:
            from signalwire.rest import Client as SWClient  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(
                "SignalWire SDK is required for SignalWireBackend. "
                "Install with: pip install cf-voice[signalwire]"
            ) from exc

        self._project_id = project_id or os.environ["CF_SW_PROJECT_ID"]
        self._auth_token = auth_token or os.environ["CF_SW_AUTH_TOKEN"]
        self._space_url = space_url or os.environ["CF_SW_SPACE_URL"]
        self._client = SWClient(
            self._project_id,
            self._auth_token,
            signalwire_space_url=self._space_url,
        )
        # The previous `self._loop = asyncio.get_event_loop()` was removed:
        # the attribute was never read, and get_event_loop() is deprecated
        # outside a running loop. Each coroutine resolves the running loop
        # at call time via asyncio.get_running_loop().

    async def dial(
        self,
        to: str,
        from_: str,
        webhook_url: str,
        *,
        amd: bool = False,
    ) -> CallSession:
        """
        Create an outbound call via the REST API.

        to / from_   E.164 numbers. webhook_url receives call events and
        status callbacks. amd=True enables async answering machine detection.
        Returns a CallSession with state="dialing".
        """
        call_kwargs: dict = dict(
            to=to,
            from_=from_,
            url=webhook_url,
            status_callback=webhook_url,
        )
        if amd:
            call_kwargs["machine_detection"] = "Enable"
            call_kwargs["async_amd"] = True

        call = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: self._client.calls.create(**call_kwargs),
        )
        return CallSession(
            call_sid=call.sid,
            to=to,
            from_=from_,
            state="dialing",
        )

    async def send_dtmf(self, call_sid: str, digits: str) -> None:
        """
        Send DTMF via a <Play digits> TwiML update.

        digits is expected to be the restricted DTMF charset
        (0-9, *, #, A-D, w/W) — it is interpolated into TwiML verbatim.
        """
        await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: self._client.calls(call_sid).update(
                twiml=f"<Response><Play digits='{digits}'/></Response>"
            ),
        )

    async def bridge(self, call_sid: str, target: str) -> None:
        """Redirect the call to dial a second number, bridging the legs."""
        await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: self._client.calls(call_sid).update(
                twiml=(
                    f"<Response><Dial><Number>{target}</Number></Dial></Response>"
                )
            ),
        )

    async def hangup(self, call_sid: str) -> None:
        """Terminate the call by forcing its status to completed."""
        await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: self._client.calls(call_sid).update(status="completed"),
        )

    async def announce(
        self,
        call_sid: str,
        text: str,
        voice: str = "alice",
    ) -> None:
        """
        Speak text into the call via a <Say> TwiML update.

        text is XML-escaped before interpolation: unescaped '<' or '&' in
        caller-supplied text would otherwise break — or inject verbs into —
        the generated TwiML document.
        """
        from xml.sax.saxutils import escape

        safe_text = escape(text)
        await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: self._client.calls(call_sid).update(
                twiml=f"<Response><Say voice='{voice}'>{safe_text}</Say></Response>"
            ),
        )

    async def get_state(self, call_sid: str) -> CallState:
        """
        Fetch the live call status and translate it to a CallState.

        Unrecognised statuses map to "failed".
        """
        call = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: self._client.calls(call_sid).fetch(),
        )
        return self._SW_STATE_MAP.get(call.status, "failed")
|
||||||
|
|
||||||
|
|
||||||
|
# ── FreeSWITCH backend (BSL 1.1) ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class FreeSWITCHBackend:
    """
    Self-hosted FreeSWITCH outbound telephony via ESL (event socket layer).

    BSL 1.1 — requires free tier + user-provisioned FreeSWITCH + VoIP.ms SIP trunk.

    Credentials sourced from environment:
        CF_ESL_HOST     — FreeSWITCH ESL host (default: 127.0.0.1)
        CF_ESL_PORT     — FreeSWITCH ESL port (default: 8021)
        CF_ESL_PASSWORD — FreeSWITCH ESL password

    Requires: pip install cf-voice[freeswitch]

    Note: FreeSWITCH AMD (mod_vad + custom heuristic or Whisper pipe) is not
    yet implemented. The amd parameter is accepted but amd_result stays "unknown".
    """

    # FreeSWITCH channel state → our CallState vocabulary.
    # Hoisted to a class constant so it is built once, not per get_state call.
    _FS_STATE_MAP: dict[str, CallState] = {
        "CS_INIT": "dialing", "CS_ROUTING": "ringing",
        "CS_EXECUTE": "in_progress", "CS_HANGUP": "completed",
        "CS_DESTROY": "completed",
    }

    def __init__(
        self,
        host: str | None = None,
        port: int | None = None,
        password: str | None = None,
    ) -> None:
        """
        Connection parameters fall back to environment variables.

        Raises ImportError when the ESL bindings are missing, KeyError when
        CF_ESL_PASSWORD is absent from both arguments and environment.
        """
        try:
            import ESL  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(
                "FreeSWITCH ESL bindings are required for FreeSWITCHBackend. "
                "Install with: pip install cf-voice[freeswitch]"
            ) from exc

        self._host = host or os.environ.get("CF_ESL_HOST", "127.0.0.1")
        # `is not None` rather than `or`: an explicit port=0 must not be
        # silently replaced by the environment default.
        self._port = int(
            port if port is not None else os.environ.get("CF_ESL_PORT", 8021)
        )
        self._password = password or os.environ["CF_ESL_PASSWORD"]
        self._esl = ESL

    def _connect(self):
        """Open a fresh ESL connection; raises RuntimeError when unreachable."""
        conn = self._esl.ESLconnection(self._host, str(self._port), self._password)
        if not conn.connected():
            raise RuntimeError(
                f"Could not connect to FreeSWITCH ESL at {self._host}:{self._port}"
            )
        return conn

    async def dial(
        self,
        to: str,
        from_: str,
        webhook_url: str,
        *,
        amd: bool = False,
    ) -> CallSession:
        """
        Originate an outbound call through the voipms SIP gateway.

        webhook_url and amd are accepted for interface compatibility but not
        yet used by this backend (no event webhooks, no AMD).
        Raises RuntimeError when FreeSWITCH rejects the originate.
        """
        def _originate() -> str:
            conn = self._connect()
            # Pass only the arguments here: ESLconnection.api(command, args)
            # sends the command verb itself, so prefixing args with
            # "originate " (as the original code did) would send the verb
            # twice and fail the command.
            args = (
                f"{{origination_caller_id_number={from_},"
                f"origination_caller_id_name=CircuitForge}}"
                f"sofia/gateway/voipms/{to.lstrip('+')} &park()"
            )
            result = conn.api("originate", args)
            return result.getBody().strip()

        body = await asyncio.get_running_loop().run_in_executor(None, _originate)
        # FreeSWITCH returns "+OK <uuid>" on success
        if not body.startswith("+OK"):
            raise RuntimeError(f"FreeSWITCH originate failed: {body}")
        uuid = body.removeprefix("+OK").strip()
        return CallSession(call_sid=uuid, to=to, from_=from_, state="dialing")

    async def send_dtmf(self, call_sid: str, digits: str) -> None:
        """Send DTMF to the call UUID via uuid_send_dtmf."""
        def _dtmf() -> None:
            conn = self._connect()
            conn.api("uuid_send_dtmf", f"{call_sid} {digits}")

        await asyncio.get_running_loop().run_in_executor(None, _dtmf)

    async def bridge(self, call_sid: str, target: str) -> None:
        """Bridge the call UUID to a second leg through the voipms gateway."""
        def _bridge() -> None:
            conn = self._connect()
            conn.api(
                "uuid_bridge",
                f"{call_sid} sofia/gateway/voipms/{target.lstrip('+')}",
            )

        await asyncio.get_running_loop().run_in_executor(None, _bridge)

    async def hangup(self, call_sid: str) -> None:
        """Kill the call UUID via uuid_kill."""
        def _hangup() -> None:
            conn = self._connect()
            conn.api("uuid_kill", call_sid)

        await asyncio.get_running_loop().run_in_executor(None, _hangup)

    async def announce(
        self,
        call_sid: str,
        text: str,
        voice: str = "default",
    ) -> None:
        """
        Speak text into the call via uuid_broadcast + the say engine.

        voice is accepted for interface compatibility but not used —
        FreeSWITCH TTS goes through mod_tts_commandline or a Piper pipe.
        """
        def _say() -> None:
            conn = self._connect()
            conn.api("uuid_broadcast", f"{call_sid} say::en CHAT SPOKEN {text}")

        await asyncio.get_running_loop().run_in_executor(None, _say)

    async def get_state(self, call_sid: str) -> CallState:
        """
        Read the channel's call_state variable and translate it.

        Unrecognised states map to "failed".
        """
        def _fetch() -> str:
            conn = self._connect()
            return conn.api("uuid_getvar", f"{call_sid} call_state").getBody().strip()

        raw = await asyncio.get_running_loop().run_in_executor(None, _fetch)
        return self._FS_STATE_MAP.get(raw, "failed")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Factory ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def make_telephony(
    mock: bool | None = None,
    backend: str | None = None,
) -> MockTelephonyBackend | SignalWireBackend | FreeSWITCHBackend:
    """
    Build the TelephonyBackend that fits the current environment.

    Resolution order:
      1. mock=True or CF_VOICE_MOCK=1 → MockTelephonyBackend
      2. backend="signalwire" or CF_SW_PROJECT_ID present → SignalWireBackend
      3. backend="freeswitch" or CF_ESL_PASSWORD present → FreeSWITCHBackend
      4. Raises RuntimeError — no usable backend configured

    In production the tier system drives the choice:
      Free tier → FreeSWITCHBackend (BYOK VoIP)
      Paid tier → SignalWireBackend (CF-provisioned)
    """
    if mock is None:
        mock = os.environ.get("CF_VOICE_MOCK", "") == "1"
    if mock:
        return MockTelephonyBackend()

    choice = backend
    if choice is None:
        if os.environ.get("CF_SW_PROJECT_ID"):
            choice = "signalwire"
        elif os.environ.get("CF_ESL_PASSWORD"):
            choice = "freeswitch"

    if choice == "signalwire":
        return SignalWireBackend()
    if choice == "freeswitch":
        return FreeSWITCHBackend()

    raise RuntimeError(
        "No telephony backend configured. "
        "Set CF_VOICE_MOCK=1 for mock mode, or provide SignalWire / FreeSWITCH credentials."
    )
|
||||||
288
cf_voice/trajectory.py
Normal file
288
cf_voice/trajectory.py
Normal file
|
|
@ -0,0 +1,288 @@
|
||||||
|
# cf_voice/trajectory.py — affect trajectory and SER/VAD coherence signals
|
||||||
|
#
|
||||||
|
# MIT licensed — derived computation only, no inference models.
|
||||||
|
#
|
||||||
|
# Two signal families:
|
||||||
|
#
|
||||||
|
# 1. TrajectorySignal — rolling arousal/valence trend across the last N windows.
|
||||||
|
# Detects escalation, de-escalation, suppression, worsening, improving.
|
||||||
|
#
|
||||||
|
# 2. CoherenceSignal — cross-model comparison between SER (categorical affect)
|
||||||
|
# and VAD (continuous dimensional valence). Disagreement indicates affect
|
||||||
|
# suppression, controlled presentation, or surface-only semantic reframe.
|
||||||
|
#
|
||||||
|
# Both signals activate only after BASELINE_MIN windows per speaker are buffered.
|
||||||
|
# All thresholds are relative to the per-speaker rolling mean, not absolute —
|
||||||
|
# this is required for ND/neurodivergent speaker safety (see design doc).
|
||||||
|
#
|
||||||
|
# Safety note: these signals must never be labelled "deception" in any
|
||||||
|
# user-facing context. Use: "affect divergence", "controlled presentation",
|
||||||
|
# "framing shift". The user interprets; the system observes.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import deque
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from cf_voice.dimensional import DimensionalResult
|
||||||
|
|
||||||
|
# Rolling window depth per speaker (frames retained for baseline statistics)
BUFFER_WINDOW = 5

# Minimum frames before signals activate (relative baseline requirement)
BASELINE_MIN = 3

# Minimum arousal/valence delta per window to count as directional movement
_DELTA_THRESHOLD = 0.05

# Arousal threshold above which "neutral SER + high arousal" = suppression candidate
_SUPPRESSION_AROUSAL_MIN = 0.65

# SER affects that imply low-arousal presentation (used for suppression detection)
_LOW_PRESENTATION_AFFECTS = frozenset({"neutral", "scripted", "tired", "apologetic"})

# Expected valence ranges derived from MSP-Podcast emotion distribution.
# Used to determine whether SER affect label and dimensional valence agree.
# Values are (low, high) inclusive bounds on the 0..1 valence scale;
# unknown affects fall back to (0.30, 0.70) at the lookup sites.
_AFFECT_VALENCE_PRIOR: dict[str, tuple[float, float]] = {
    "warm": (0.60, 1.00),
    "genuine": (0.55, 1.00),
    "optimistic": (0.55, 0.90),
    "neutral": (0.35, 0.65),
    "confused": (0.30, 0.60),
    "scripted": (0.30, 0.65),
    "apologetic": (0.20, 0.55),
    "tired": (0.10, 0.50),
    "frustrated": (0.10, 0.45),
    "dismissive": (0.15, 0.50),
    "condescending": (0.10, 0.45),
    "urgent": (0.15, 0.55),
}

# Ordinal positivity for reframe direction detection.
# Higher = more positive presentation.
# Unlisted affects default to rank 4 (neutral) in _is_more_positive().
_AFFECT_POSITIVITY: dict[str, int] = {
    "urgent": 1,
    "frustrated": 1,
    "condescending": 1,
    "dismissive": 2,
    "tired": 2,
    "apologetic": 3,
    "confused": 3,
    "scripted": 4,
    "neutral": 4,
    "optimistic": 5,
    "genuine": 5,
    "warm": 6,
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TrajectorySignal:
    """
    Rolling trend across recent dimensional frames for one speaker.

    All delta values: current_frame_value - mean(buffer_values).
    Positive arousal_delta = current frame is more activated than baseline.
    Negative valence_delta = current frame is more negative than baseline.

    trend values:
      "calibrating"    not enough frames yet (< BASELINE_MIN)
      "stable"         no significant directional movement
      "escalating"     arousal rising: current > mean by DELTA_THRESHOLD, consecutive
      "de-escalating"  arousal falling after elevated period
      "worsening"      valence falling: current < mean, consecutive
      "improving"      valence rising after depressed period
      "suppressed"     SER affect is calm/neutral, VAD arousal is elevated
    """
    # Deltas are relative to the per-speaker rolling mean, not absolute values.
    arousal_delta: float
    valence_delta: float
    dominance_delta: float
    arousal_trend: str  # "rising" | "falling" | "flat"
    valence_trend: str  # "rising" | "falling" | "flat"
    # Composite label combining the per-axis trends (see docstring above).
    trend: str
    # Number of prior frames buffered when this signal was computed.
    frames_in_buffer: int
    # False until BASELINE_MIN frames exist; deltas are zeroed until then.
    baseline_established: bool
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CoherenceSignal:
    """
    Cross-signal comparison: SER categorical affect vs. VAD dimensional valence.

    coherence_score:
      1.0 = SER label and VAD valence are fully consistent.
      0.0 = maximum disagreement.

    suppression_flag:
      True when the speaker is presenting as calm/neutral (SER) but VAD arousal
      is elevated. Indicates controlled presentation with activation underneath.
      This is relative to a per-session threshold — not a universal claim.

    reframe_type:
      "none"     no SER category shift this window
      "genuine"  SER shifted toward more positive AND dimensional valence also
                 improved (>= DELTA_THRESHOLD in this window)
      "surface"  SER shifted toward more positive BUT dimensional valence
                 continued its prior trajectory unchanged or worsening

    affect_divergence:
      Signed: VAD-implied valence minus SER-implied valence midpoint.
      Negative = VAD more negative than SER label implies (masking candidate).
      Positive = VAD more positive than SER label implies (unusual).
    """
    coherence_score: float
    suppression_flag: bool
    reframe_type: str  # "none" | "genuine" | "surface"
    affect_divergence: float
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def affect_coherence(affect: str, valence: float) -> float:
    """
    Score agreement between a SER affect category and a VAD valence value.

    Returns 1.0 when the valence lies inside the expected band for the
    affect, decaying linearly to 0.0 as the distance from the nearest band
    edge approaches 0.40 (the full range of a typical incoherence gap).
    Unknown affects use the default band (0.30, 0.70).
    """
    low, high = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    if low <= valence <= high:
        return 1.0
    distance = min(abs(valence - low), abs(valence - high))
    score = 1.0 - distance / 0.40
    return round(score if score > 0.0 else 0.0, 3)
|
||||||
|
|
||||||
|
|
||||||
|
def affect_divergence_score(affect: str, valence: float) -> float:
    """
    Signed gap between observed VAD valence and the centre of the SER
    affect's expected valence band.

    Negative = VAD more negative than the SER label implies.
    Positive = VAD more positive than the SER label implies.
    """
    band = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    centre = sum(band) / 2.0
    return round(valence - centre, 3)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_trajectory(
    buffer: deque,
    current: DimensionalResult,
    ser_affect: str,
    prior_ser_affect: str | None,
) -> tuple[TrajectorySignal, CoherenceSignal]:
    """
    Compute trajectory and coherence signals for one speaker at one window.

    buffer            Rolling deque of prior DimensionalResult for this speaker.
                      Must be updated AFTER this call (append current to buffer).
    current           DimensionalResult for the window being classified.
    ser_affect        SER affect label for this window (from ToneClassifier).
    prior_ser_affect  SER affect label from the previous window, for reframe
                      detection. Pass None on the first window or when not tracking.

    Returns (TrajectorySignal, CoherenceSignal). Both have baseline_established=False
    and trend="calibrating" when buffer has fewer than BASELINE_MIN entries.
    """
    n = len(buffer)

    # Coherence can be computed without a buffer
    coh_score = affect_coherence(ser_affect, current.valence)
    div_score = affect_divergence_score(ser_affect, current.valence)

    # Suppression candidate: calm/neutral SER presentation with elevated
    # arousal and below-midpoint valence from the dimensional model.
    suppression = (
        ser_affect in _LOW_PRESENTATION_AFFECTS
        and current.arousal > _SUPPRESSION_AROUSAL_MIN
        and current.valence < 0.50
    )

    # Reframe detection: SER category changed toward more positive this
    # window; "genuine" only when the dimensional valence moved up too.
    reframe = "none"
    if prior_ser_affect and prior_ser_affect != ser_affect:
        if _is_more_positive(ser_affect, prior_ser_affect):
            # Valence actually improved in this window vs. single prior frame
            if n >= 1:
                prev_valence = list(buffer)[-1].valence
                dim_improved = (current.valence - prev_valence) >= _DELTA_THRESHOLD
            else:
                dim_improved = False
            reframe = "genuine" if dim_improved else "surface"

    coher = CoherenceSignal(
        coherence_score=coh_score,
        suppression_flag=suppression,
        reframe_type=reframe,
        affect_divergence=div_score,
    )

    # Not enough history yet: return a neutral "calibrating" trajectory.
    if n < BASELINE_MIN:
        traj = TrajectorySignal(
            arousal_delta=0.0,
            valence_delta=0.0,
            dominance_delta=0.0,
            arousal_trend="flat",
            valence_trend="flat",
            trend="calibrating",
            frames_in_buffer=n,
            baseline_established=False,
        )
        return traj, coher

    # Per-speaker rolling means — all thresholds below are relative to these.
    mean_arousal = sum(f.arousal for f in buffer) / n
    mean_valence = sum(f.valence for f in buffer) / n
    mean_dominance = sum(f.dominance for f in buffer) / n

    a_delta = current.arousal - mean_arousal
    v_delta = current.valence - mean_valence
    d_delta = current.dominance - mean_dominance

    a_trend = (
        "rising" if a_delta > _DELTA_THRESHOLD else
        "falling" if a_delta < -_DELTA_THRESHOLD else
        "flat"
    )
    v_trend = (
        "rising" if v_delta > _DELTA_THRESHOLD else
        "falling" if v_delta < -_DELTA_THRESHOLD else
        "flat"
    )

    # Consecutive movement: check whether the most recent buffered frame
    # was already moving in the same direction as the current frame.
    buf_list = list(buffer)
    prev = buf_list[-1]
    a_consecutive = a_trend == "rising" and (current.arousal - prev.arousal) > 0.03
    v_consecutive = v_trend == "falling" and (current.valence - prev.valence) < -0.03

    # Composite trend label — order matters: suppression wins, then arousal
    # movement, then valence movement, else stable.
    if suppression:
        trend = "suppressed"
    elif a_trend == "rising" and a_consecutive:
        trend = "escalating"
    elif a_trend == "falling" and mean_arousal > 0.55:
        trend = "de-escalating"
    elif v_trend == "falling" and v_consecutive:
        trend = "worsening"
    elif v_trend == "rising" and mean_valence < 0.45:
        trend = "improving"
    else:
        trend = "stable"

    traj = TrajectorySignal(
        arousal_delta=round(a_delta, 3),
        valence_delta=round(v_delta, 3),
        dominance_delta=round(d_delta, 3),
        arousal_trend=a_trend,
        valence_trend=v_trend,
        trend=trend,
        frames_in_buffer=n,
        baseline_established=True,
    )
    return traj, coher
|
||||||
|
|
||||||
|
|
||||||
|
# ── Internal helpers ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _is_more_positive(current: str, prior: str) -> bool:
    """Return True when *current* ranks above *prior* on the positivity scale."""
    rank = _AFFECT_POSITIVITY
    return rank.get(current, 4) > rank.get(prior, 4)
|
||||||
|
|
@ -11,6 +11,8 @@ requires-python = ">=3.11"
|
||||||
license = {text = "MIT"}
|
license = {text = "MIT"}
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pydantic>=2.0",
|
"pydantic>=2.0",
|
||||||
|
"fastapi>=0.111",
|
||||||
|
"uvicorn[standard]>=0.29",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|
@ -26,6 +28,14 @@ inference = [
|
||||||
"pyannote.audio>=3.1",
|
"pyannote.audio>=3.1",
|
||||||
"python-dotenv>=1.0",
|
"python-dotenv>=1.0",
|
||||||
]
|
]
|
||||||
|
signalwire = [
|
||||||
|
"signalwire>=2.0",
|
||||||
|
]
|
||||||
|
freeswitch = [
|
||||||
|
# ESL Python bindings are compiled from FreeSWITCH source.
|
||||||
|
# See: https://developer.signalwire.com/freeswitch/FreeSWITCH-Explained/Client-and-Developer-Interfaces/Event-Socket-Library/
|
||||||
|
"python-ESL",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"pytest>=8.0",
|
"pytest>=8.0",
|
||||||
"pytest-asyncio>=0.23",
|
"pytest-asyncio>=0.23",
|
||||||
|
|
|
||||||
69
scripts/test_classify_e2e.py
Normal file
69
scripts/test_classify_e2e.py
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
"""
|
||||||
|
End-to-end integration test for the cf-voice /classify endpoint.
|
||||||
|
|
||||||
|
Extracts a 2-second window from a local media file, base64-encodes the
|
||||||
|
raw PCM, and POSTs it to the running cf-voice service at localhost:8009.
|
||||||
|
Prints each returned AudioEvent for quick inspection.
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- cf-voice running at localhost:8009 (CF_VOICE_DIARIZE=1 for speaker labels)
|
||||||
|
- ffmpeg on PATH
|
||||||
|
- A local audio/video file (edit MEDIA_FILE below)
|
||||||
|
|
||||||
|
Run:
|
||||||
|
python scripts/test_classify_e2e.py
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
|
||||||
|
START_S = 120
|
||||||
|
DURATION_S = 2
|
||||||
|
SAMPLE_RATE = 16_000
|
||||||
|
CF_VOICE_URL = "http://localhost:8009"
|
||||||
|
|
||||||
|
proc = subprocess.run(
|
||||||
|
[
|
||||||
|
"ffmpeg", "-i", MEDIA_FILE,
|
||||||
|
"-ss", str(START_S),
|
||||||
|
"-t", str(DURATION_S),
|
||||||
|
"-ar", str(SAMPLE_RATE),
|
||||||
|
"-ac", "1",
|
||||||
|
"-f", "s16le",
|
||||||
|
"-",
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
pcm = proc.stdout
|
||||||
|
audio = np.frombuffer(pcm, dtype=np.int16)
|
||||||
|
print(f"audio samples: {len(audio)}, duration: {len(audio) / SAMPLE_RATE:.2f}s")
|
||||||
|
|
||||||
|
payload = json.dumps({
|
||||||
|
"audio_chunk": base64.b64encode(pcm).decode(),
|
||||||
|
"timestamp": float(START_S),
|
||||||
|
"session_id": "test",
|
||||||
|
}).encode()
|
||||||
|
|
||||||
|
req = urllib.request.Request(
|
||||||
|
f"{CF_VOICE_URL}/classify",
|
||||||
|
data=payload,
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
method="POST",
|
||||||
|
)
|
||||||
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||||
|
result = json.loads(resp.read())
|
||||||
|
|
||||||
|
for ev in result["events"]:
|
||||||
|
print(
|
||||||
|
f" {ev['event_type']:10}"
|
||||||
|
f" speaker_id={ev.get('speaker_id', 'N/A'):14}"
|
||||||
|
f" label={ev.get('label', '')}"
|
||||||
|
)
|
||||||
65
scripts/test_diarize_real.py
Normal file
65
scripts/test_diarize_real.py
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
"""
|
||||||
|
Manual integration test for speaker diarization via pyannote.
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- HF_TOKEN env var (or set below)
|
||||||
|
- CF_VOICE_DIARIZE=1
|
||||||
|
- ffmpeg on PATH
|
||||||
|
- A local audio/video file (edit MEDIA_FILE below)
|
||||||
|
- pip install cf-voice[inference]
|
||||||
|
|
||||||
|
Run:
|
||||||
|
HF_TOKEN=hf_... CF_VOICE_DIARIZE=1 python scripts/test_diarize_real.py
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Override if not in env
|
||||||
|
if not os.environ.get("HF_TOKEN"):
|
||||||
|
raise SystemExit("Set HF_TOKEN in env before running this script.")
|
||||||
|
os.environ.setdefault("CF_VOICE_DIARIZE", "1")
|
||||||
|
|
||||||
|
MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
|
||||||
|
START_S = 120
|
||||||
|
DURATION_S = 2
|
||||||
|
SAMPLE_RATE = 16_000
|
||||||
|
|
||||||
|
from cf_voice.diarize import Diarizer, SpeakerTracker # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
|
||||||
|
d = Diarizer.from_env()
|
||||||
|
tracker = SpeakerTracker()
|
||||||
|
|
||||||
|
proc = subprocess.run(
|
||||||
|
[
|
||||||
|
"ffmpeg", "-i", MEDIA_FILE,
|
||||||
|
"-ss", str(START_S),
|
||||||
|
"-t", str(DURATION_S),
|
||||||
|
"-ar", str(SAMPLE_RATE),
|
||||||
|
"-ac", "1",
|
||||||
|
"-f", "s16le",
|
||||||
|
"-",
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
|
audio = np.frombuffer(proc.stdout, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
|
rms = float(np.sqrt(np.mean(audio**2)))
|
||||||
|
print(f"audio: {len(audio)} samples, {len(audio) / SAMPLE_RATE:.2f}s, rms={rms:.4f}")
|
||||||
|
|
||||||
|
segs = await d.diarize_async(audio)
|
||||||
|
print(f"segments ({len(segs)}): {segs}")
|
||||||
|
|
||||||
|
mid = len(audio) / 2.0 / SAMPLE_RATE
|
||||||
|
label = d.speaker_at(segs, mid, tracker)
|
||||||
|
print(f"speaker_at({mid:.2f}s): {label}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
119
tests/test_acoustic.py
Normal file
119
tests/test_acoustic.py
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
import pytest
|
||||||
|
from cf_voice.acoustic import (
|
||||||
|
AcousticBackend,
|
||||||
|
AcousticResult,
|
||||||
|
ASTAcousticBackend,
|
||||||
|
MockAcousticBackend,
|
||||||
|
make_acoustic,
|
||||||
|
)
|
||||||
|
from cf_voice.events import AudioEvent
|
||||||
|
|
||||||
|
|
||||||
|
class TestAcousticResult:
    """AcousticResult is a plain container: fields round-trip unchanged."""

    def test_fields(self):
        queue_evt = AudioEvent(timestamp=1.0, event_type="queue", label="ringback", confidence=0.9)
        res = AcousticResult(queue=queue_evt, speaker=None, environ=None, scene=None, timestamp=1.0)
        assert res.queue.label == "ringback"
        for optional in (res.speaker, res.environ, res.scene):
            assert optional is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestMockAcousticBackend:
    """Behavioural tests for the deterministic mock acoustic backend."""

    def test_classify_returns_result(self):
        # A window classification always yields an AcousticResult echoing the timestamp.
        backend = MockAcousticBackend(seed=0)
        result = backend.classify_window(b"", timestamp=0.0)
        assert isinstance(result, AcousticResult)
        assert result.timestamp == 0.0

    def test_all_events_present(self):
        # Mock mode populates every event slot.
        backend = MockAcousticBackend(seed=1)
        result = backend.classify_window(b"", timestamp=1.0)
        assert result.queue is not None
        assert result.speaker is not None
        assert result.environ is not None
        assert result.scene is not None

    def test_event_types_correct(self):
        # Each slot carries its matching event_type tag.
        backend = MockAcousticBackend(seed=2)
        result = backend.classify_window(b"", timestamp=2.0)
        assert result.queue.event_type == "queue"
        assert result.speaker.event_type == "speaker"
        assert result.environ.event_type == "environ"
        assert result.scene.event_type == "scene"

    def test_confidence_in_range(self):
        # Confidences stay within [0, 1] across repeated windows.
        backend = MockAcousticBackend(seed=3)
        for _ in range(5):
            result = backend.classify_window(b"", timestamp=0.0)
            assert 0.0 <= result.queue.confidence <= 1.0
            assert 0.0 <= result.speaker.confidence <= 1.0
            assert 0.0 <= result.environ.confidence <= 1.0
            assert 0.0 <= result.scene.confidence <= 1.0

    def test_lifecycle_advances(self):
        """Phases should change after their duration elapses."""
        # (The original had an unused `import time` here — removed.)
        backend = MockAcousticBackend(seed=42)
        # Force phase to advance by manipulating phase_start
        backend._phase_start -= 1000  # pretend 1000s elapsed
        result = backend.classify_window(b"", timestamp=0.0)
        # Should have advanced — just verify it doesn't crash and returns valid
        assert result.queue.label in (
            "hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone"
        )

    def test_isinstance_protocol(self):
        # The mock satisfies the AcousticBackend runtime-checkable Protocol.
        backend = MockAcousticBackend()
        assert isinstance(backend, AcousticBackend)

    def test_deterministic_with_seed(self):
        # Identical seeds yield identical classifications.
        b1 = MockAcousticBackend(seed=99)
        b2 = MockAcousticBackend(seed=99)
        r1 = b1.classify_window(b"", timestamp=0.0)
        r2 = b2.classify_window(b"", timestamp=0.0)
        assert r1.queue.label == r2.queue.label
        assert r1.queue.confidence == r2.queue.confidence
|
||||||
|
|
||||||
|
|
||||||
|
class TestASTAcousticBackend:
    """Import-failure behaviour of the real AST backend (no model download)."""

    def test_raises_import_error_without_deps(self, monkeypatch):
        """ASTAcousticBackend should raise ImportError when transformers is unavailable."""
        import builtins
        real_import = builtins.__import__

        def mock_import(name, *args, **kwargs):
            # Simulate only the optional dependency missing; all other
            # imports go through the real __import__.
            if name in ("transformers",):
                raise ImportError(f"Mocked: {name} not available")
            return real_import(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", mock_import)
        with pytest.raises(ImportError, match="transformers"):
            ASTAcousticBackend()
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeAcoustic:
    """Factory selection: explicit flag, env var, and graceful degradation."""

    def test_mock_flag(self):
        assert isinstance(make_acoustic(mock=True), MockAcousticBackend)

    def test_mock_env(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        assert isinstance(make_acoustic(), MockAcousticBackend)

    def test_real_falls_back_to_mock_without_deps(self, monkeypatch, capsys):
        """make_acoustic(mock=False) falls back to mock when deps are missing."""
        import builtins
        real_import = builtins.__import__

        def failing_import(name, *args, **kwargs):
            if name in ("transformers",):
                raise ImportError(f"Mocked: {name} not available")
            return real_import(name, *args, **kwargs)

        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        monkeypatch.setattr(builtins, "__import__", failing_import)
        # Should fall back gracefully, never raise
        assert isinstance(make_acoustic(mock=False), MockAcousticBackend)
|
||||||
131
tests/test_diarize.py
Normal file
131
tests/test_diarize.py
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
# tests/test_diarize.py — SpeakerTracker and speaker_at() diarization logic
|
||||||
|
#
|
||||||
|
# All tests are pure Python — no GPU, no pyannote, no HF_TOKEN required.
|
||||||
|
# The Diarizer class itself is only tested for its from_env() guard and the
|
||||||
|
# speaker_at() method, both of which run without loading the model.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from cf_voice.diarize import (
|
||||||
|
Diarizer,
|
||||||
|
SpeakerSegment,
|
||||||
|
SpeakerTracker,
|
||||||
|
SPEAKER_MULTIPLE,
|
||||||
|
SPEAKER_UNKNOWN,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SpeakerTracker ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_tracker_first_speaker_is_a():
    # A fresh tracker maps the first raw diarizer id to "Speaker A".
    t = SpeakerTracker()
    assert t.label("SPEAKER_00") == "Speaker A"
|
||||||
|
|
||||||
|
|
||||||
|
def test_tracker_second_speaker_is_b():
    # The second distinct id gets the next letter.
    t = SpeakerTracker()
    t.label("SPEAKER_00")
    assert t.label("SPEAKER_01") == "Speaker B"
|
||||||
|
|
||||||
|
|
||||||
|
def test_tracker_same_id_returns_same_label():
    # Repeated lookups of the same id are stable.
    t = SpeakerTracker()
    first = t.label("SPEAKER_00")
    second = t.label("SPEAKER_00")
    assert first == second == "Speaker A"
|
||||||
|
|
||||||
|
|
||||||
|
def test_tracker_26_speakers():
    """Labels run Speaker A .. Speaker Z for the first 26 distinct ids."""
    tracker = SpeakerTracker()
    labels = []
    for idx in range(26):
        labels.append(tracker.label(f"SPEAKER_{idx:02d}"))
    assert labels[0] == "Speaker A"
    assert labels[25] == "Speaker Z"
|
||||||
|
|
||||||
|
|
||||||
|
def test_tracker_27th_speaker_wraps():
    """After Z the label sequence continues with double letters (AA)."""
    tracker = SpeakerTracker()
    for idx in range(26):
        tracker.label(f"SPEAKER_{idx:02d}")
    assert tracker.label("SPEAKER_26") == "Speaker AA"
|
||||||
|
|
||||||
|
|
||||||
|
def test_tracker_reset_clears_map():
    # reset() forgets all prior id→label assignments.
    t = SpeakerTracker()
    t.label("SPEAKER_00")
    t.label("SPEAKER_01")
    t.reset()
    # After reset, SPEAKER_01 is seen as new and maps to "Speaker A" again
    assert t.label("SPEAKER_01") == "Speaker A"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Diarizer.speaker_at() ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _segs(*items: tuple[str, float, float]) -> list[SpeakerSegment]:
    """Build SpeakerSegment fixtures from (speaker_id, start_s, end_s) tuples."""
    out = []
    for speaker, start, end in items:
        out.append(SpeakerSegment(speaker_id=speaker, start_s=start, end_s=end))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_single_speaker():
    # object.__new__ bypasses __init__ so no model/GPU is needed.
    d = object.__new__(Diarizer)  # bypass __init__ (no GPU needed)
    segs = _segs(("SPEAKER_00", 0.0, 2.0))
    t = SpeakerTracker()
    assert d.speaker_at(segs, 1.0, tracker=t) == "Speaker A"
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_no_coverage_returns_unknown():
    # A timestamp past all segments resolves to the UNKNOWN sentinel.
    d = object.__new__(Diarizer)
    segs = _segs(("SPEAKER_00", 0.0, 1.0))
    assert d.speaker_at(segs, 1.5) == SPEAKER_UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_empty_segments_returns_unknown():
    # No segments at all → UNKNOWN sentinel.
    d = object.__new__(Diarizer)
    assert d.speaker_at([], 1.0) == SPEAKER_UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_overlap_returns_multiple():
    """Two speakers covering the same instant resolve to SPEAKER_MULTIPLE."""
    d = object.__new__(Diarizer)
    overlapping = _segs(
        ("SPEAKER_00", 0.0, 2.0),
        ("SPEAKER_01", 0.5, 2.0),  # overlaps SPEAKER_00 from 0.5s
    )
    assert d.speaker_at(overlapping, 1.0) == SPEAKER_MULTIPLE
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_boundary_inclusive():
    """Timestamps equal to a segment's start or end count as inside it."""
    d = object.__new__(Diarizer)
    tracker = SpeakerTracker()
    segments = _segs(("SPEAKER_00", 1.0, 2.0))
    for ts in (1.0, 2.0):
        assert d.speaker_at(segments, ts, tracker=tracker) == "Speaker A"
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_without_tracker_returns_raw_id():
    # Without a tracker the raw diarizer id is returned, not "Speaker A".
    d = object.__new__(Diarizer)
    segs = _segs(("SPEAKER_00", 0.0, 2.0))
    assert d.speaker_at(segs, 1.0) == "SPEAKER_00"
|
||||||
|
|
||||||
|
|
||||||
|
def test_speaker_at_two_speakers_no_overlap():
    """Non-overlapping speakers resolve individually; a gap between them
    resolves to the dominant speaker in the lookup window."""
    d = object.__new__(Diarizer)
    tracker = SpeakerTracker()
    segments = _segs(
        ("SPEAKER_00", 0.0, 1.0),
        ("SPEAKER_01", 1.5, 2.5),
    )
    assert d.speaker_at(segments, 0.5, tracker=tracker) == "Speaker A"
    assert d.speaker_at(segments, 2.0, tracker=tracker) == "Speaker B"
    # Gap at 1.2s: window [0.7, 1.7] → SPEAKER_00 has 0.3s, SPEAKER_01 has 0.2s
    # Dominant speaker (SPEAKER_00 = "Speaker A") is returned, not SPEAKER_UNKNOWN.
    assert d.speaker_at(segments, 1.2, tracker=tracker) == "Speaker A"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Diarizer.from_env() guard ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_from_env_raises_without_hf_token():
    # from_env() must refuse to construct a Diarizer when HF_TOKEN is absent.
    # NOTE: signature in SOURCE takes monkeypatch — preserved below.
    pass
|
||||||
|
|
@ -75,10 +75,60 @@ class TestMockVoiceIO:
|
||||||
io = make_io()
|
io = make_io()
|
||||||
assert isinstance(io, MockVoiceIO)
|
assert isinstance(io, MockVoiceIO)
|
||||||
|
|
||||||
def test_make_io_real_raises(self, monkeypatch):
|
def test_make_io_real_returns_mic_io(self, monkeypatch):
    """With mock mode disabled, make_io() hands back a real MicVoiceIO instance."""
    from cf_voice.capture import MicVoiceIO

    # Make sure the env flag cannot force mock mode on.
    monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
    voice_io = make_io(mock=False)
    assert isinstance(voice_io, MicVoiceIO)
|
||||||
|
|
||||||
|
|
||||||
|
class TestContextClassifierChunk:
    """Tests for classify_chunk() — multi-class event output."""

    def test_mock_returns_four_event_types(self):
        clf = ContextClassifier.mock(interval_s=0.05, seed=10)
        emitted = {e.event_type for e in clf.classify_chunk(timestamp=1.0)}
        # Mock mode emits every one of the four event classes.
        assert {"tone", "queue", "speaker", "environ"} <= emitted

    def test_mock_tone_event_has_subtext(self):
        clf = ContextClassifier.mock(interval_s=0.05, seed=11)
        tones = [e for e in clf.classify_chunk(timestamp=0.0) if e.event_type == "tone"]
        assert len(tones) == 1
        assert tones[0].subtext is not None

    def test_elcor_override_flag(self):
        clf = ContextClassifier.mock(interval_s=0.05, seed=12)

        def tone_subtext(evs):
            return next(e.subtext for e in evs if e.event_type == "tone")

        generic_sub = tone_subtext(clf.classify_chunk(timestamp=0.0, elcor=False))
        elcor_sub = tone_subtext(clf.classify_chunk(timestamp=0.0, elcor=True))
        # Generic format: "Tone: X". Elcor format ends with ":" ("With X:", "Warmly:", ...).
        assert generic_sub.startswith("Tone:") or not generic_sub.endswith(":")
        assert elcor_sub.endswith(":")

    def test_session_id_propagates(self):
        clf = ContextClassifier.mock(interval_s=0.05, seed=13)
        events = clf.classify_chunk(timestamp=0.0, session_id="ses_test")
        tone = next(e for e in events if e.event_type == "tone")
        assert tone.session_id == "ses_test"

    def test_prior_frames_zero_means_no_shift(self):
        clf = ContextClassifier.mock(interval_s=0.05, seed=14)
        events = clf.classify_chunk(timestamp=0.0, prior_frames=0)
        tone = next(e for e in events if e.event_type == "tone")
        # No history → no detectable tone shift.
        assert tone.shift_magnitude == 0.0
|
||||||
|
|
||||||
|
|
||||||
class TestContextClassifier:
|
class TestContextClassifier:
|
||||||
|
|
|
||||||
109
tests/test_prefs.py
Normal file
109
tests/test_prefs.py
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from cf_voice.prefs import (
|
||||||
|
PREF_CONFIDENCE_THRESHOLD,
|
||||||
|
PREF_ELCOR_MODE,
|
||||||
|
PREF_ELCOR_PRIOR_FRAMES,
|
||||||
|
PREF_WHISPER_MODEL,
|
||||||
|
get_confidence_threshold,
|
||||||
|
get_elcor_prior_frames,
|
||||||
|
get_voice_pref,
|
||||||
|
get_whisper_model,
|
||||||
|
is_elcor_enabled,
|
||||||
|
set_voice_pref,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _DictStore:
|
||||||
|
"""In-memory preference store for testing."""
|
||||||
|
|
||||||
|
def __init__(self, data: dict | None = None) -> None:
|
||||||
|
self._data: dict = data or {}
|
||||||
|
|
||||||
|
def get(self, user_id, path, default=None):
|
||||||
|
return self._data.get(path, default)
|
||||||
|
|
||||||
|
def set(self, user_id, path, value):
|
||||||
|
self._data[path] = value
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetVoicePref:
    """get_voice_pref resolution order: explicit store → env var → built-in default."""

    def test_returns_default_when_nothing_set(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_ELCOR", raising=False)
        val = get_voice_pref(PREF_ELCOR_MODE, store=_DictStore())
        assert val is False

    def test_explicit_store_takes_priority(self):
        store = _DictStore({PREF_ELCOR_MODE: True})
        assert get_voice_pref(PREF_ELCOR_MODE, store=store) is True

    def test_env_fallback_bool(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_ELCOR", "1")
        assert get_voice_pref(PREF_ELCOR_MODE, store=_DictStore()) is True

    def test_env_fallback_false(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_ELCOR", "0")
        assert get_voice_pref(PREF_ELCOR_MODE, store=_DictStore()) is False

    def test_env_fallback_float(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_CONFIDENCE_THRESHOLD", "0.7")
        val = get_voice_pref(PREF_CONFIDENCE_THRESHOLD, store=_DictStore())
        assert abs(val - 0.7) < 1e-9

    def test_env_fallback_int(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_ELCOR_PRIOR_FRAMES", "6")
        val = get_voice_pref(PREF_ELCOR_PRIOR_FRAMES, store=_DictStore())
        assert val == 6

    def test_env_fallback_str(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_WHISPER_MODEL", "medium")
        val = get_voice_pref(PREF_WHISPER_MODEL, store=_DictStore())
        assert val == "medium"

    def test_store_beats_env(self, monkeypatch):
        # The env and store values must DISAGREE, otherwise the test cannot
        # tell which source won. Env says disabled; the store says enabled —
        # the store value must take priority.
        monkeypatch.setenv("CF_VOICE_ELCOR", "0")
        store = _DictStore({PREF_ELCOR_MODE: True})
        assert get_voice_pref(PREF_ELCOR_MODE, store=store) is True

    def test_unknown_key_returns_none(self):
        val = get_voice_pref("voice.nonexistent", store=_DictStore())
        assert val is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSetVoicePref:
    """set_voice_pref writes through to the given store, or raises without one."""

    def test_sets_in_store(self):
        target = _DictStore()
        set_voice_pref(PREF_ELCOR_MODE, True, store=target)
        assert target._data[PREF_ELCOR_MODE] is True

    def test_no_store_raises(self, monkeypatch):
        # Simulate cf-core being unavailable so no default writable store exists.
        import cf_voice.prefs as prefs_mod

        monkeypatch.setattr(prefs_mod, "_cf_core_store", lambda: None)
        with pytest.raises(RuntimeError, match="No writable preference store"):
            set_voice_pref(PREF_ELCOR_MODE, True)
|
||||||
|
|
||||||
|
|
||||||
|
class TestConvenienceHelpers:
    """Typed wrapper helpers around get_voice_pref and their defaults."""

    def test_is_elcor_enabled_false_default(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_ELCOR", raising=False)
        assert is_elcor_enabled(store=_DictStore()) is False

    def test_is_elcor_enabled_true_from_store(self):
        enabled_store = _DictStore({PREF_ELCOR_MODE: True})
        assert is_elcor_enabled(store=enabled_store) is True

    def test_get_confidence_threshold_default(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_CONFIDENCE_THRESHOLD", raising=False)
        assert get_confidence_threshold(store=_DictStore()) == pytest.approx(0.55)

    def test_get_whisper_model_default(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_WHISPER_MODEL", raising=False)
        assert get_whisper_model(store=_DictStore()) == "small"

    def test_get_elcor_prior_frames_default(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_ELCOR_PRIOR_FRAMES", raising=False)
        assert get_elcor_prior_frames(store=_DictStore()) == 4
|
||||||
141
tests/test_telephony.py
Normal file
141
tests/test_telephony.py
Normal file
|
|
@ -0,0 +1,141 @@
|
||||||
|
import asyncio
|
||||||
|
import pytest
|
||||||
|
from cf_voice.telephony import (
|
||||||
|
CallSession,
|
||||||
|
MockTelephonyBackend,
|
||||||
|
TelephonyBackend,
|
||||||
|
make_telephony,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCallSession:
    """CallSession dataclass defaults and plain attribute mutation."""

    def test_defaults(self):
        sess = CallSession(call_sid="sid_1", to="+15551234567", from_="+18005550000")
        # Freshly-dialed sessions start in the "dialing" state with no result yet.
        assert sess.state == "dialing"
        assert sess.amd_result == "unknown"
        assert sess.duration_s == 0.0
        assert sess.error is None

    def test_state_mutation(self):
        sess = CallSession(call_sid="sid_2", to="+1", from_="+2", state="in_progress")
        sess.state = "completed"
        assert sess.state == "completed"
|
||||||
|
|
||||||
|
|
||||||
|
class TestMockTelephonyBackend:
    """Behavioral tests for the in-memory mock telephony backend."""

    @pytest.mark.asyncio
    async def test_dial_returns_session(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+15551234567", "+18005550000", "https://example.com/wh")
        assert isinstance(sess, CallSession)
        assert sess.call_sid.startswith("mock_sid_")
        assert sess.to == "+15551234567"
        assert sess.from_ == "+18005550000"

    @pytest.mark.asyncio
    async def test_dial_transitions_to_in_progress(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+15551234567", "+18005550000", "https://x.com")
        # Give the background task a moment to flip the state.
        await asyncio.sleep(0.1)
        assert sess.state == "in_progress"

    @pytest.mark.asyncio
    async def test_amd_resolves_human(self):
        backend = MockTelephonyBackend(amd_delay_s=0.05)
        sess = await backend.dial("+1555", "+1800", "https://x.com", amd=True)
        await asyncio.sleep(0.2)
        assert sess.amd_result == "human"

    @pytest.mark.asyncio
    async def test_send_dtmf(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+1", "+2", "https://x.com")
        # Must complete without raising for a known call SID.
        await backend.send_dtmf(sess.call_sid, "1234#")

    @pytest.mark.asyncio
    async def test_send_dtmf_unknown_sid_raises(self):
        backend = MockTelephonyBackend()
        with pytest.raises(KeyError):
            await backend.send_dtmf("nonexistent_sid", "1")

    @pytest.mark.asyncio
    async def test_bridge_updates_state(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+1", "+2", "https://x.com")
        await backend.bridge(sess.call_sid, "+15559999999")
        assert sess.state == "bridged"

    @pytest.mark.asyncio
    async def test_hangup_sets_completed(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+1", "+2", "https://x.com")
        await backend.hangup(sess.call_sid)
        assert sess.state == "completed"

    @pytest.mark.asyncio
    async def test_hangup_idempotent(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+1", "+2", "https://x.com")
        await backend.hangup(sess.call_sid)
        # A second hangup on the same call is a no-op, not an error.
        await backend.hangup(sess.call_sid)
        assert sess.state == "completed"

    @pytest.mark.asyncio
    async def test_announce_does_not_raise(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+1", "+2", "https://x.com")
        await backend.announce(sess.call_sid, "Hello, this is an automated assistant.")

    @pytest.mark.asyncio
    async def test_get_state(self):
        backend = MockTelephonyBackend()
        sess = await backend.dial("+1", "+2", "https://x.com")
        # State depends on background-task timing; any pre-terminal state is valid.
        assert await backend.get_state(sess.call_sid) in ("ringing", "in_progress", "dialing")

    @pytest.mark.asyncio
    async def test_multiple_calls_unique_sids(self):
        backend = MockTelephonyBackend()
        first = await backend.dial("+1", "+2", "https://x.com")
        second = await backend.dial("+3", "+4", "https://x.com")
        assert first.call_sid != second.call_sid

    def test_isinstance_protocol(self):
        # The mock must satisfy the TelephonyBackend protocol at runtime.
        assert isinstance(MockTelephonyBackend(), TelephonyBackend)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeTelephony:
    """Factory selection: explicit mock flag, env flag, SignalWire, FreeSWITCH."""

    def test_mock_flag(self):
        assert isinstance(make_telephony(mock=True), MockTelephonyBackend)

    def test_mock_env(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        assert isinstance(make_telephony(), MockTelephonyBackend)

    def test_no_config_raises(self, monkeypatch):
        # Strip every selector so no backend can be chosen.
        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        monkeypatch.delenv("CF_SW_PROJECT_ID", raising=False)
        monkeypatch.delenv("CF_ESL_PASSWORD", raising=False)
        with pytest.raises(RuntimeError, match="No telephony backend configured"):
            make_telephony()

    def test_signalwire_selected_by_env(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        monkeypatch.setenv("CF_SW_PROJECT_ID", "proj_123")
        # SignalWireBackend raises ImportError at instantiation when the SDK is
        # absent — reaching that point proves the factory selected it.
        with pytest.raises((ImportError, RuntimeError)):
            make_telephony()

    def test_freeswitch_selected_by_env(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        monkeypatch.delenv("CF_SW_PROJECT_ID", raising=False)
        monkeypatch.setenv("CF_ESL_PASSWORD", "s3cret")
        # FreeSWITCHBackend raises ImportError when the ESL bindings are missing.
        with pytest.raises((ImportError, RuntimeError)):
            make_telephony()
|
||||||
Loading…
Reference in a new issue