From 24f04b67db04c395fb22aa8e03bbdcd9d4ce545b Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Sat, 18 Apr 2026 22:36:58 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20full=20voice=20pipeline=20=E2=80=94=20A?= =?UTF-8?q?ST=20acoustic,=20accent,=20privacy,=20prosody,=20dimensional,?= =?UTF-8?q?=20trajectory,=20telephony,=20FastAPI=20app?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap 
now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3. --- .env.example | 40 +- README.md | 72 ++++ cf_voice/accent.py | 152 ++++++++ cf_voice/acoustic.py | 366 +++++++++++++++++++ cf_voice/app.py | 197 ++++++++++ cf_voice/classify.py | 72 +++- cf_voice/context.py | 686 ++++++++++++++++++++++++++++++++--- cf_voice/diarize.py | 142 +++++++- cf_voice/dimensional.py | 190 ++++++++++ cf_voice/events.py | 56 ++- cf_voice/io.py | 11 +- cf_voice/models.py | 26 +- cf_voice/prefs.py | 181 +++++++++ cf_voice/privacy.py | 115 ++++++ cf_voice/prosody.py | 208 +++++++++++ cf_voice/stt.py | 83 ++++- cf_voice/telephony.py | 500 +++++++++++++++++++++++++ cf_voice/trajectory.py | 288 +++++++++++++++ pyproject.toml | 10 + scripts/test_classify_e2e.py | 69 ++++ scripts/test_diarize_real.py | 65 ++++ tests/test_acoustic.py | 119 ++++++ tests/test_diarize.py | 131 +++++++ tests/test_models.py | 56 ++- tests/test_prefs.py | 109 ++++++ tests/test_telephony.py | 141 +++++++ 26 files changed, 3974 insertions(+), 111 deletions(-) create mode 100644 cf_voice/accent.py create mode 100644 cf_voice/acoustic.py create mode 100644 cf_voice/app.py create mode 100644 cf_voice/dimensional.py create mode 100644 cf_voice/prefs.py create mode 100644 cf_voice/privacy.py create mode 100644 cf_voice/prosody.py create mode 100644 cf_voice/telephony.py create mode 100644 cf_voice/trajectory.py create mode 100644 scripts/test_classify_e2e.py create mode 100644 scripts/test_diarize_real.py create mode 100644 tests/test_acoustic.py create mode 100644 tests/test_diarize.py create mode 100644 tests/test_prefs.py create mode 100644 tests/test_telephony.py diff --git a/.env.example b/.env.example index 2e58c43..96497bc 100644 --- a/.env.example +++ b/.env.example @@ -3,14 +3,29 @@ # load it via python-dotenv in their own startup. For standalone cf-voice # dev/testing, source this file manually or install python-dotenv. 
-# ── HuggingFace ─────────────────────────────────────────────────────────────── -# Required for pyannote.audio speaker diarization model download. -# Get a free token at https://huggingface.co/settings/tokens -# Also accept the gated model terms at: -# https://huggingface.co/pyannote/speaker-diarization-3.1 -# https://huggingface.co/pyannote/segmentation-3.0 +# ── HuggingFace — free tier / local use ────────────────────────────────────── +# Used by the local diarization path (free tier, user's own machine). +# Each user must: +# 1. Create a free account at huggingface.co +# 2. Accept the gated model terms at: +# https://huggingface.co/pyannote/speaker-diarization-3.1 +# https://huggingface.co/pyannote/segmentation-3.0 +# 3. Generate a read token at huggingface.co/settings/tokens HF_TOKEN= +# ── HuggingFace — paid tier / cf-orch backend ───────────────────────────────── +# Used by cf-orch when running diarization as a managed service on Heimdall. +# This is a CircuitForge org token — NOT the user's personal token. +# +# Prerequisites (one-time, manual — tracked in circuitforge-orch#27): +# 1. Create CircuitForge org on huggingface.co +# 2. Accept pyannote/speaker-diarization-3.1 terms under the org account +# 3. Accept pyannote/segmentation-3.0 terms under the org account +# 4. Generate a read-only org token and set it here +# +# Leave blank on local installs — HF_TOKEN above is used instead. +CF_HF_TOKEN= + # ── Whisper STT ─────────────────────────────────────────────────────────────── # Model size: tiny | base | small | medium | large-v2 | large-v3 # Smaller = faster / less VRAM; larger = more accurate. @@ -29,3 +44,16 @@ CF_VOICE_MOCK= # ── Tone classifier ─────────────────────────────────────────────────────────── # Minimum confidence to emit a VoiceFrame (below this = frame skipped). CF_VOICE_CONFIDENCE_THRESHOLD=0.55 + +# ── Elcor annotation mode ───────────────────────────────────────────────────── +# Accessibility feature for autistic and ND users. 
Switches tone subtext from +# generic format ("Tone: Frustrated") to Elcor-style prefix format +# ("With barely concealed frustration:"). Opt-in, local-only. +# Overridden by cf-core preferences store when circuitforge_core is installed. +# 1 = enabled, 0 or unset = disabled (default). +CF_VOICE_ELCOR=0 + +# Number of prior VoiceFrames to include as context for Elcor label generation. +# Larger windows = more contextually aware annotations, higher LLM prompt cost. +# Default: 4 frames (~10 seconds of rolling context at 2.5s intervals). +CF_VOICE_ELCOR_PRIOR_FRAMES=4 diff --git a/README.md b/README.md index 1e14e8a..cc4595f 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,64 @@ Host apps subscribing via `` receive `MessageEvent` with `type = --- +--- + +## Telephony + +`cf_voice.telephony` provides the outbound call abstraction for Osprey, Harrier, Ibis, and Kestrel. + +### Quick start + +```python +from cf_voice.telephony import make_telephony + +# Mock mode — no real calls placed (CF_VOICE_MOCK=1 or mock=True) +backend = make_telephony(mock=True) + +session = await backend.dial( + to="+15551234567", + from_="+18005550000", + webhook_url="https://yourapp.example.com/voice/events", + amd=True, # answering machine detection +) + +# Adaptive service identification (osprey#21) +await backend.announce(session.call_sid, "This is an automated assistant.") + +# Navigate IVR +await backend.send_dtmf(session.call_sid, "2") # Press 2 for billing + +# Bridge to user's phone once human agent answers +await backend.bridge(session.call_sid, "+14155550100") + +await backend.hangup(session.call_sid) +``` + +### Backend selection + +`make_telephony()` resolves the backend in this order: + +| Condition | Backend | +|---|---| +| `CF_VOICE_MOCK=1` or `mock=True` | `MockTelephonyBackend` (dev/CI) | +| `CF_SW_PROJECT_ID` env set | `SignalWireBackend` (paid tier) | +| `CF_ESL_PASSWORD` env set | `FreeSWITCHBackend` (free tier, self-hosted) | +| none | `RuntimeError` | + +### 
Installing real backends + +```bash +# Paid tier — SignalWire managed telephony +pip install cf-voice[signalwire] + +# Free tier — self-hosted FreeSWITCH (requires compiled ESL bindings) +pip install cf-voice[freeswitch] +``` + +Set credentials in `.env` (see `.env.example`). + +--- + ## Mock mode Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `VoiceFrame` objects on a timer. No GPU, microphone, or `HF_TOKEN` required. All API surface is identical to real mode. @@ -139,6 +197,7 @@ Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `Voice | `cf_voice.models` | MIT | `VoiceFrame` dataclass | | `cf_voice.events` | MIT | `AudioEvent`, `ToneEvent`, wire format types | | `cf_voice.io` | MIT | `VoiceIO` base, `MockVoiceIO`, `make_io()` factory | +| `cf_voice.telephony` | MIT (Protocol + Mock), BSL (backends) | `TelephonyBackend` Protocol, `MockTelephonyBackend`, `SignalWireBackend`, `FreeSWITCHBackend`, `make_telephony()` | | `cf_voice.capture` | BSL 1.1 | `MicVoiceIO` — real mic capture, 2s windowing | | `cf_voice.stt` | BSL 1.1 | `WhisperSTT` — faster-whisper async wrapper | | `cf_voice.classify` | BSL 1.1 | `ToneClassifier` — wav2vec2 SER + librosa prosody | @@ -149,6 +208,19 @@ BSL applies to inference modules. IO + types + wire format = MIT. --- +--- + +## Attribution + +Speaker diarization uses [pyannote.audio](https://github.com/pyannote/pyannote-audio) (MIT) and the following gated HuggingFace models (CC BY 4.0): + +- `pyannote/speaker-diarization-3.1` — Hervé Bredin et al. +- `pyannote/segmentation-3.0` — Hervé Bredin et al. + +CC BY 4.0 requires attribution in any distributed product. The models are gated: each user must accept the license terms on HuggingFace before their `HF_TOKEN` will authorize a download. 
+ +--- + ## Consumed by - `Circuit-Forge/linnet` — real-time tone annotation PWA (primary consumer) diff --git a/cf_voice/accent.py b/cf_voice/accent.py new file mode 100644 index 0000000..5882510 --- /dev/null +++ b/cf_voice/accent.py @@ -0,0 +1,152 @@ +# cf_voice/accent.py — accent / language identification classifier +# +# MIT licensed (AccentResult dataclass + mock). BSL 1.1 (real inference). +# Gated by CF_VOICE_ACCENT=1 — off by default (GPU cost + privacy sensitivity). +# +# Accent alone is not high-risk, but combined with birdsong or a quiet rural +# background it becomes location-identifying. The privacy scorer accounts for +# this compound signal. +# +# Real backend: facebook/mms-lid-126 for language detection, wav2vec2 accent +# fine-tune for region. Lazy-loaded to keep startup fast. +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + + +@dataclass +class AccentResult: + """ + Language + regional accent classification for the primary speaker. + + language: BCP-47 language tag (e.g. "en", "fr", "zh") + region: cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other") + confidence: float in [0, 1] + """ + language: str + region: str + confidence: float + + +class MockAccentClassifier: + """ + Synthetic accent classifier for development and CI. + + Returns a fixed result so the privacy scorer can exercise all code paths + without loading a real model. + """ + + def classify(self, audio: "list[float] | bytes") -> AccentResult | None: + return AccentResult(language="en", region="en_gb", confidence=0.72) + + +class AccentClassifier: + """ + Real accent / language classifier. + + BSL 1.1 — requires [inference] extras. + + Language detection: facebook/mms-lid-126 (126 languages, MIT licensed). + Accent region: maps language tag to a regional ACCENT_LABEL. + + VRAM: ~500 MB on CUDA. 
+ """ + + _LANG_MODEL_ID = "facebook/mms-lid-126" + + def __init__(self) -> None: + try: + from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor + except ImportError as exc: + raise ImportError( + "transformers is required for accent classification. " + "Install with: pip install cf-voice[inference]" + ) from exc + + import torch + + self._device = "cuda" if torch.cuda.is_available() else "cpu" + logger.info("Loading language ID model %s on %s", self._LANG_MODEL_ID, self._device) + self._extractor = AutoFeatureExtractor.from_pretrained(self._LANG_MODEL_ID) + self._model = Wav2Vec2ForSequenceClassification.from_pretrained( + self._LANG_MODEL_ID + ).to(self._device) + self._model.eval() + + def classify(self, audio: "list[float] | bytes") -> AccentResult | None: + import numpy as np + import torch + + if isinstance(audio, bytes): + audio_np = np.frombuffer(audio, dtype=np.float32) + else: + audio_np = np.asarray(audio, dtype=np.float32) + + if len(audio_np) < 1600: # need at least 100ms at 16kHz + return None + + inputs = self._extractor( + audio_np, sampling_rate=16_000, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self._device) for k, v in inputs.items()} + + with torch.no_grad(): + logits = self._model(**inputs).logits + probs = torch.softmax(logits, dim=-1)[0] + + top_idx = int(probs.argmax()) + confidence = float(probs[top_idx]) + language = self._model.config.id2label.get(top_idx, "other") + + region = _lang_to_region(language) + return AccentResult(language=language, region=region, confidence=confidence) + + +def _lang_to_region(lang: str) -> str: + """Map a BCP-47 / ISO 639-3 language tag to a cf-voice ACCENT_LABEL.""" + _MAP: dict[str, str] = { + "eng": "en_us", # MMS uses ISO 639-3; sub-regional accent needs fine-tune + "fra": "fr", + "spa": "es", + "deu": "de", + "zho": "zh", + "jpn": "ja", + "en": "en_us", + "en-GB": "en_gb", + "en-AU": "en_au", + "en-CA": "en_ca", + "en-IN": "en_in", + "fr": "fr", + "de": "de", + 
"es": "es", + "zh": "zh", + "ja": "ja", + } + return _MAP.get(lang, "other") + + +def make_accent_classifier( + mock: bool | None = None, +) -> "MockAccentClassifier | AccentClassifier | None": + """ + Factory: return an AccentClassifier if CF_VOICE_ACCENT=1, else None. + + Callers must check for None before invoking classify(). + """ + enabled = os.environ.get("CF_VOICE_ACCENT", "") == "1" + if not enabled: + return None + + use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1" + if use_mock: + return MockAccentClassifier() + + try: + return AccentClassifier() + except (ImportError, Exception) as exc: + logger.warning("AccentClassifier unavailable (%s) — using mock", exc) + return MockAccentClassifier() diff --git a/cf_voice/acoustic.py b/cf_voice/acoustic.py new file mode 100644 index 0000000..575bf87 --- /dev/null +++ b/cf_voice/acoustic.py @@ -0,0 +1,366 @@ +# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier +# +# MIT licensed (Protocol + mock). BSL 1.1 (real YAMNet inference). +# Requires [inference] extras for real mode. +# +# This module is the AMD (answering machine detection) backbone for Osprey. +# It runs in parallel with the STT pipeline — it never processes words, +# only acoustic features (pitch, timbre, background, DTMF tones, ringback). +# +# Navigation v0.2.x wires the real YAMNet model. +# Current: mock emits a plausible call-lifecycle sequence. 
+from __future__ import annotations + +import asyncio +import logging +import random +import time +from dataclasses import dataclass +from typing import AsyncIterator, Protocol, Sequence, runtime_checkable + +from cf_voice.events import AudioEvent, QUEUE_LABELS, SPEAKER_LABELS, ENVIRON_LABELS, SCENE_LABELS + +logger = logging.getLogger(__name__) + +_SAMPLE_RATE = 16_000 + + +@dataclass +class AcousticResult: + """Batch of AudioEvents produced from a single audio window.""" + queue: AudioEvent | None + speaker: AudioEvent | None + environ: AudioEvent | None + scene: AudioEvent | None + timestamp: float + + +@runtime_checkable +class AcousticBackend(Protocol): + """ + Interface for acoustic event classifiers. + + classify_window() takes a PCM float32 buffer (mono, 16kHz) and returns an + AcousticResult covering one analysis window (~2s). It is synchronous and + runs in a thread pool when called from async code. + """ + + def classify_window( + self, + audio: "list[float] | bytes", + timestamp: float = 0.0, + ) -> AcousticResult: + ... + + +@runtime_checkable +class SceneBackend(Protocol): + """ + Interface for dedicated acoustic scene classifiers. + + Separate from AcousticBackend to allow future swapping to a specialised + scene model (e.g. AudioSet acoustic-scene subset) without touching the + telephony event classifier. + """ + + def classify_scene( + self, + audio: "list[float] | bytes", + timestamp: float = 0.0, + ) -> AudioEvent | None: + ... + + +# ── Call lifecycle for mock mode ────────────────────────────────────────────── +# Approximates what a real outbound call looks like acoustically. 
+# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center + +_MOCK_LIFECYCLE: list[dict] = [ + # (min_s, max_s): how long to stay in this phase + {"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)}, + {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)}, + {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)}, + {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)}, + {"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)}, + {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)}, + {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)}, + # AMD moment: background_shift is the primary signal + {"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)}, + {"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)}, +] + + +class MockAcousticBackend: + """ + Synthetic acoustic classifier for development and CI. + + Cycles through a plausible call lifecycle so Osprey's IVR state machine + can be tested without real telephony. The AMD signal (background_shift → + human_single) is emitted at the right point in the sequence. + + Usage: + backend = MockAcousticBackend(seed=42) + result = backend.classify_window(b"", timestamp=4.5) + print(result.environ.label) # → "hold_music", "background_shift", etc. 
+ """ + + def __init__(self, seed: int | None = None) -> None: + self._rng = random.Random(seed) + self._phase_idx = 0 + self._phase_start = time.monotonic() + self._phase_dur = self._draw_phase_dur(0) + + def _draw_phase_dur(self, idx: int) -> float: + lo, hi = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"] + return self._rng.uniform(lo, hi) + + def _current_phase(self) -> dict: + now = time.monotonic() + elapsed = now - self._phase_start + if elapsed >= self._phase_dur: + self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE) + self._phase_start = now + self._phase_dur = self._draw_phase_dur(self._phase_idx) + return _MOCK_LIFECYCLE[self._phase_idx] + + def _make_event( + self, + event_type: str, + label: str, + timestamp: float, + ) -> AudioEvent: + return AudioEvent( + timestamp=timestamp, + event_type=event_type, # type: ignore[arg-type] + label=label, + confidence=self._rng.uniform(0.72, 0.97), + ) + + def classify_window( + self, + audio: "list[float] | bytes", + timestamp: float = 0.0, + ) -> AcousticResult: + phase = self._current_phase() + return AcousticResult( + queue=self._make_event("queue", phase["queue"], timestamp), + speaker=self._make_event("speaker", phase["speaker"], timestamp), + environ=self._make_event("environ", phase["environ"], timestamp), + scene=self._make_event("scene", phase["scene"], timestamp), + timestamp=timestamp, + ) + + +# ── AST acoustic backend (BSL 1.1) ─────────────────────────────────────────── + + +class ASTAcousticBackend: + """ + Audio Spectrogram Transformer acoustic event classifier. + + BSL 1.1 — requires [inference] extras. + + Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to + classify queue state, speaker type, and background environment from a + single forward pass. Top-15 predictions are scanned; the highest-confidence + match per event category is emitted. 
+ + Model: MIT/ast-finetuned-audioset-10-10-0.4593 + VRAM: ~300 MB on CUDA (fp32) + Input: float32 16kHz mono audio (any length; feature extractor pads/truncates) + + Replaces the YAMNet stub. Synchronous — run from a thread pool executor + when called from async code. + """ + + _MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593" + _SAMPLE_RATE = 16_000 + _TOP_K = 20 # scan more classes — many relevant ones are in the 10-20 range + + # Minimum confidence below which an event is suppressed even if it's the + # top match in its category. + _MIN_CONFIDENCE: dict[str, float] = { + "queue": 0.10, + "speaker": 0.08, + "environ": 0.12, + "scene": 0.08, # scenes fire reliably — lower bar is fine + } + + # AudioSet class name → (event_type, cf-voice label). + # Top-K predictions are scanned; highest confidence per category wins. + # "call_center" requires dedicated call-centre acoustics, not generic indoor. + # "Music" was previously duplicated (queue + environ) — Python dicts keep the + # last entry, silently losing the queue mapping. Fixed: use the specific + # "Musical instrument" AudioSet parent for hold_music; "Music" maps to environ. 
+ _LABEL_MAP: dict[str, tuple[str, str]] = { + # ── Queue / call-state labels ────────────────────────────────────────── + "Ringtone": ("queue", "ringback"), + "Telephone bell ringing": ("queue", "ringback"), + "Busy signal": ("queue", "busy"), + "Dial tone": ("queue", "dtmf_tone"), + "DTMF": ("queue", "dtmf_tone"), + "Silence": ("queue", "silence"), + # ── Speaker type labels ──────────────────────────────────────────────── + "Speech": ("speaker", "human_single"), + "Male speech, man speaking": ("speaker", "human_single"), + "Female speech, woman speaking": ("speaker", "human_single"), + "Child speech, kid speaking": ("speaker", "human_single"), + "Crowd": ("speaker", "human_multi"), + "Hubbub, speech noise, speech babble": ("speaker", "human_multi"), + "Laughter": ("speaker", "human_multi"), + "Chuckle, chortle": ("speaker", "human_multi"), + "Speech synthesizer": ("speaker", "ivr_synth"), + # ── Environmental labels ─────────────────────────────────────────────── + # Telephony — requires specific call-centre acoustics, not generic indoor + "Telephone": ("environ", "call_center"), + "Telephone dialing, DTMF": ("environ", "call_center"), + "Reverberation": ("environ", "background_shift"), + "Echo": ("environ", "background_shift"), + "Background noise": ("environ", "noise_floor_change"), + "Noise": ("environ", "noise_floor_change"), + "White noise": ("environ", "noise_floor_change"), + "Pink noise": ("environ", "noise_floor_change"), + "Static": ("environ", "noise_floor_change"), + "Music": ("environ", "music"), + # Nature + "Bird": ("environ", "birdsong"), + "Bird vocalization, bird call, bird song": ("environ", "birdsong"), + "Chirp, tweet": ("environ", "birdsong"), + "Wind": ("environ", "wind"), + "Wind noise (microphone)": ("environ", "wind"), + "Rain": ("environ", "rain"), + "Rain on surface": ("environ", "rain"), + "Water": ("environ", "water"), + "Stream": ("environ", "water"), + # Urban + "Traffic noise, roadway noise": ("environ", "traffic"), + "Vehicle": 
("environ", "traffic"), + "Crowd": ("environ", "crowd_chatter"), + "Chatter": ("environ", "crowd_chatter"), + "Construction": ("environ", "construction"), + "Drill": ("environ", "construction"), + # Indoor + "Air conditioning": ("environ", "hvac"), + "Mechanical fan": ("environ", "hvac"), + "Computer keyboard": ("environ", "keyboard_typing"), + "Typing": ("environ", "keyboard_typing"), + "Restaurant": ("environ", "restaurant"), + "Dishes, pots, and pans": ("environ", "restaurant"), + # ── Acoustic scene labels ────────────────────────────────────────────── + # "Inside, small/large room" moved from environ to scene — they correctly + # describe the acoustic scene but are NOT specific enough for call_center. + "Inside, small room": ("scene", "indoor_quiet"), + "Inside, large room or hall": ("scene", "indoor_crowd"), + "Outside, urban or manmade": ("scene", "outdoor_urban"), + "Field recording": ("scene", "outdoor_nature"), + "Rail transport": ("scene", "public_transit"), + "Bus": ("scene", "public_transit"), + "Train": ("scene", "public_transit"), + "Car": ("scene", "vehicle"), + "Truck": ("scene", "vehicle"), + "Motorcycle": ("scene", "vehicle"), + # Music in the queue sense — "Musical instrument" is more specific + # than the ambiguous top-level "Music" class + "Musical instrument": ("queue", "hold_music"), + "Piano": ("queue", "hold_music"), + "Guitar": ("queue", "hold_music"), + } + + def __init__(self) -> None: + try: + from transformers import ASTFeatureExtractor, ASTForAudioClassification + except ImportError as exc: + raise ImportError( + "transformers is required for AST acoustic classification. 
" + "Install with: pip install cf-voice[inference]" + ) from exc + + import torch + + self._device = "cuda" if torch.cuda.is_available() else "cpu" + logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device) + self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID) + self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to( + self._device + ) + self._model.eval() + + def classify_window( + self, + audio: "list[float] | bytes", + timestamp: float = 0.0, + ) -> AcousticResult: + import numpy as np + import torch + + if isinstance(audio, bytes): + audio_np = np.frombuffer(audio, dtype=np.float32) + else: + audio_np = np.asarray(audio, dtype=np.float32) + + if len(audio_np) == 0: + return AcousticResult(queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp) + + inputs = self._extractor( + audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt" + ) + inputs = {k: v.to(self._device) for k, v in inputs.items()} + + with torch.no_grad(): + logits = self._model(**inputs).logits + probs = torch.softmax(logits, dim=-1)[0] + id2label = self._model.config.id2label + + top_k = min(self._TOP_K, len(probs)) + top_indices = probs.topk(top_k).indices.tolist() + predictions = [(id2label[i], float(probs[i])) for i in top_indices] + + # Take highest-confidence match per category + best: dict[str, tuple[str, float]] = {} # event_type → (label, conf) + for ast_label, conf in predictions: + mapping = self._LABEL_MAP.get(ast_label) + if mapping is None: + continue + etype, cf_label = mapping + if etype not in best or conf > best[etype][1]: + best[etype] = (cf_label, conf) + + def _make_event(etype: str, label: str, conf: float) -> AudioEvent: + return AudioEvent( + timestamp=timestamp, + event_type=etype, # type: ignore[arg-type] + label=label, + confidence=round(conf, 4), + ) + + def _above_threshold(etype: str) -> bool: + if etype not in best: + return False + _, conf = best[etype] + return conf >= 
self._MIN_CONFIDENCE.get(etype, 0.10) + + return AcousticResult( + queue=_make_event("queue", *best["queue"]) if _above_threshold("queue") else None, + speaker=_make_event("speaker", *best["speaker"]) if _above_threshold("speaker") else None, + environ=_make_event("environ", *best["environ"]) if _above_threshold("environ") else None, + scene=_make_event("scene", *best["scene"]) if _above_threshold("scene") else None, + timestamp=timestamp, + ) + + +def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend": + """ + Factory: return an AcousticBackend for the current environment. + + mock=True or CF_VOICE_MOCK=1 → MockAcousticBackend + Otherwise → ASTAcousticBackend (falls back to mock on import error) + """ + import os + use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1" + if use_mock: + return MockAcousticBackend() + try: + return ASTAcousticBackend() + except (ImportError, Exception) as exc: + logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc) + return MockAcousticBackend() diff --git a/cf_voice/app.py b/cf_voice/app.py new file mode 100644 index 0000000..2aac11a --- /dev/null +++ b/cf_voice/app.py @@ -0,0 +1,197 @@ +""" +cf-voice FastAPI service — managed by cf-orch. + +Tone/affect classification sidecar for Linnet and any product that needs +real-time audio context annotation. Wraps ContextClassifier so it runs as an +independent managed process rather than embedded in the consumer's process. 
+ +Endpoints: + GET /health → {"status": "ok", "mode": "mock"|"real"} + POST /classify → ClassifyResponse + +Usage: + python -m cf_voice.app --port 8007 --gpu-id 0 + +Mock mode (no GPU, no audio hardware required): + CF_VOICE_MOCK=1 python -m cf_voice.app --port 8007 +""" +from __future__ import annotations + +import argparse +import logging +import os + +import uvicorn +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +from cf_voice.context import ContextClassifier, model_status + +logger = logging.getLogger(__name__) + +_classifier: ContextClassifier | None = None +_mock_mode: bool = False + + +# ── Request / response models ───────────────────────────────────────────────── + + +class ClassifyRequest(BaseModel): + audio_chunk: str | None = None # base64-encoded PCM int16 mono 16kHz; None in mock mode + timestamp: float = 0.0 + elcor: bool | None = None + prior_frames: int | None = None + session_id: str = "" + language: str | None = None # BCP-47 hint for Whisper ("en", "es", …); None = auto-detect + num_speakers: int | None = None # pyannote hint: None = auto; 1–8 = fixed min+max + + +class AudioEventOut(BaseModel): + event_type: str + label: str + confidence: float + timestamp: float + speaker_id: str = "speaker_a" + subtext: str | None = None + affect: str | None = None + shift_magnitude: float | None = None + shift_direction: str | None = None + prosody_flags: list[str] = [] + # Dimensional emotion (audeering model) — None when classifier disabled + valence: float | None = None + arousal: float | None = None + dominance: float | None = None + # Prosodic signals (openSMILE) — None when extractor disabled + sarcasm_risk: float | None = None + flat_f0_score: float | None = None + # Trajectory signals — None until BASELINE_MIN frames buffered per speaker + arousal_delta: float | None = None + valence_delta: float | None = None + trend: str | None = None + # Coherence signals (SER vs VAD) + coherence_score: float | None = None + 
suppression_flag: bool | None = None + reframe_type: str | None = None + affect_divergence: float | None = None + + +class ClassifyResponse(BaseModel): + events: list[AudioEventOut] + + +# ── App factory ─────────────────────────────────────────────────────────────── + + +def create_app(gpu_id: int = 0, mock: bool = False) -> FastAPI: + global _classifier, _mock_mode + + # Signal GPU to the inference backends (wav2vec2 loads via transformers pipeline) + if not mock: + os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id)) + + _mock_mode = mock or os.environ.get("CF_VOICE_MOCK", "") == "1" + _classifier = ContextClassifier.mock() if _mock_mode else ContextClassifier.from_env() + logger.info("cf-voice ready: mode=%s", "mock" if _mock_mode else "real") + + app = FastAPI(title="cf-voice", version="0.1.0") + + @app.on_event("startup") + async def _startup_prewarm() -> None: + """Pre-warm all configured models so downloads happen at startup, not + on the first classify call (which has a hard 9-second timeout).""" + if _classifier is not None: + import asyncio as _asyncio + _asyncio.create_task(_classifier.prewarm()) + + @app.get("/health") + def health() -> dict: + result: dict = { + "status": "ok", + "mode": "mock" if _mock_mode else "real", + "models": dict(model_status), + } + # Surface misconfigured-but-silent diarizer so Linnet can warn the user. + # Check env vars only — no model loading needed at health-check time. + warnings: list[str] = [] + if os.environ.get("CF_VOICE_DIARIZE", "0") == "1": + token = os.environ.get("HF_TOKEN", "").strip() + if not token: + warnings.append( + "Diarization is enabled (CF_VOICE_DIARIZE=1) but HF_TOKEN is not set. " + "Speaker identity badges will not appear. " + "Set HF_TOKEN in your .env and accept pyannote model terms at huggingface.co." 
+ ) + if warnings: + result["warnings"] = warnings + return result + + @app.post("/classify") + async def classify(req: ClassifyRequest) -> ClassifyResponse: + if _classifier is None: + raise HTTPException(503, detail="classifier not initialised") + try: + events = await _classifier.classify_chunk_async( + audio_b64=req.audio_chunk, + timestamp=req.timestamp, + prior_frames=req.prior_frames, + elcor=req.elcor, + session_id=req.session_id, + language=req.language, + num_speakers=req.num_speakers, + ) + except NotImplementedError as exc: + raise HTTPException(501, detail=str(exc)) + + from cf_voice.events import ToneEvent + + out: list[AudioEventOut] = [] + for e in events: + is_tone = isinstance(e, ToneEvent) + out.append(AudioEventOut( + event_type=e.event_type, + label=e.label, + confidence=round(e.confidence, 4), + timestamp=e.timestamp, + speaker_id=getattr(e, "speaker_id", "speaker_a") or "speaker_a", + subtext=getattr(e, "subtext", None), + affect=getattr(e, "affect", None) if is_tone else None, + shift_magnitude=getattr(e, "shift_magnitude", None) if is_tone else None, + shift_direction=getattr(e, "shift_direction", None) if is_tone else None, + prosody_flags=getattr(e, "prosody_flags", []) if is_tone else [], + valence=getattr(e, "valence", None) if is_tone else None, + arousal=getattr(e, "arousal", None) if is_tone else None, + dominance=getattr(e, "dominance", None) if is_tone else None, + sarcasm_risk=getattr(e, "sarcasm_risk", None) if is_tone else None, + flat_f0_score=getattr(e, "flat_f0_score", None) if is_tone else None, + arousal_delta=getattr(e, "arousal_delta", None) if is_tone else None, + valence_delta=getattr(e, "valence_delta", None) if is_tone else None, + trend=getattr(e, "trend", None) if is_tone else None, + coherence_score=getattr(e, "coherence_score", None) if is_tone else None, + suppression_flag=getattr(e, "suppression_flag", None) if is_tone else None, + reframe_type=getattr(e, "reframe_type", None) if is_tone else None, + 
affect_divergence=getattr(e, "affect_divergence", None) if is_tone else None, + )) + return ClassifyResponse(events=out) + + return app + + +# ── CLI entrypoint ──────────────────────────────────────────────────────────── + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="cf-voice tone classification server") + parser.add_argument("--port", type=int, default=8007) + parser.add_argument("--host", default="0.0.0.0") + parser.add_argument("--gpu-id", type=int, default=0) + parser.add_argument("--mock", action="store_true", + help="Run in mock mode (no GPU, no audio hardware needed)") + return parser.parse_args() + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s — %(message)s") + args = _parse_args() + app = create_app(gpu_id=args.gpu_id, mock=args.mock) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/cf_voice/classify.py b/cf_voice/classify.py index f5dff6a..256d011 100644 --- a/cf_voice/classify.py +++ b/cf_voice/classify.py @@ -82,13 +82,21 @@ class ToneClassifier: Tone/affect classifier: wav2vec2 SER + librosa prosody. Loads the model lazily on first call to avoid import-time GPU allocation. - Thread-safe for concurrent classify() calls — the pipeline is stateless + Thread-safe for concurrent classify() calls — the model is stateless per-call; session state lives in the caller (ContextClassifier). + + Uses AutoFeatureExtractor + AutoModelForAudioClassification directly + rather than hf_pipeline to avoid torchcodec audio backend initialization. + torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x + systems. Calling the model directly bypasses the pipeline's audio backend + selection entirely since we already have float32 at 16kHz. 
""" def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None: self._threshold = threshold - self._pipeline = None # lazy-loaded + self._feature_extractor = None # lazy-loaded + self._model = None # lazy-loaded + self._device: str = "cpu" @classmethod def from_env(cls) -> "ToneClassifier": @@ -96,23 +104,41 @@ class ToneClassifier: return cls(threshold=threshold) def _load_pipeline(self) -> None: - if self._pipeline is not None: + if self._model is not None: return try: - from transformers import pipeline as hf_pipeline + from transformers import ( + AutoFeatureExtractor, + AutoModelForAudioClassification, + ) except ImportError as exc: raise ImportError( "transformers is required for tone classification. " "Install with: pip install cf-voice[inference]" ) from exc - device = 0 if _cuda_available() else -1 - logger.info("Loading SER model %s on device %s", _SER_MODEL_ID, device) - self._pipeline = hf_pipeline( - "audio-classification", - model=_SER_MODEL_ID, - device=device, + import torch + + if _cuda_available(): + self._device = "cuda:0" + # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000. + # Only supported on CUDA — CPU must stay float32. + torch_dtype = torch.float16 + else: + self._device = "cpu" + torch_dtype = torch.float32 + + logger.info( + "Loading SER model %s on device=%s dtype=%s", + _SER_MODEL_ID, self._device, torch_dtype, ) + self._feature_extractor = AutoFeatureExtractor.from_pretrained(_SER_MODEL_ID) + self._model = AutoModelForAudioClassification.from_pretrained( + _SER_MODEL_ID, + torch_dtype=torch_dtype, + ).to(self._device) + # Switch to inference mode — disables dropout, batch norm tracks running stats + self._model.train(False) def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult: """ @@ -121,13 +147,33 @@ class ToneClassifier: transcript is used as a weak signal for ambiguous cases (e.g. words like "unfortunately" bias toward apologetic even on a neutral voice). 
""" + import torch + self._load_pipeline() # Ensure the model sees float32 at the right rate assert audio_float32.dtype == np.float32, "audio must be float32" - # Run SER - preds = self._pipeline({"raw": audio_float32, "sampling_rate": _SAMPLE_RATE}) + # Run SER — call feature extractor + model directly to bypass the + # hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency). + inputs = self._feature_extractor( + audio_float32, + sampling_rate=_SAMPLE_RATE, + return_tensors="pt", + ) + inputs = {k: v.to(self._device) for k, v in inputs.items()} + if self._model.dtype == torch.float16: + inputs = {k: v.to(torch.float16) for k, v in inputs.items()} + + with torch.no_grad(): + logits = self._model(**inputs).logits + probs = torch.softmax(logits, dim=-1)[0] + id2label = self._model.config.id2label + preds = [ + {"label": id2label[i], "score": float(probs[i])} + for i in range(len(probs)) + ] + best = max(preds, key=lambda p: p["score"]) emotion = best["label"].lower() confidence = float(best["score"]) @@ -158,7 +204,7 @@ class ToneClassifier: self, audio_float32: np.ndarray, transcript: str = "" ) -> ToneResult: """classify() without blocking the event loop.""" - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() fn = partial(self.classify, audio_float32, transcript) return await loop.run_in_executor(None, fn) diff --git a/cf_voice/context.py b/cf_voice/context.py index 7b450f4..27d5951 100644 --- a/cf_voice/context.py +++ b/cf_voice/context.py @@ -1,99 +1,289 @@ -# cf_voice/context.py — tone classification and context enrichment +# cf_voice/context.py — parallel audio context classifier (orchestrator) # # BSL 1.1 when real inference models are integrated. -# Currently a passthrough stub: wraps a VoiceIO source and forwards frames. +# Mock mode: MIT licensed (no real inference). 
# -# Real implementation (Notation v0.1.x) will: -# - Run YAMNet acoustic event detection on the audio buffer -# - Run wav2vec2-based SER (speech emotion recognition) -# - Run librosa prosody extraction (pitch, energy, rate) -# - Combine into enriched VoiceFrame label + confidence -# - Support pyannote.audio speaker diarization (Navigation v0.2.x) +# Runs three classifiers in parallel against the same audio window: +# 1. Tone/affect (classify.py) — wav2vec2 SER + librosa prosody +# 2. Queue/environ (acoustic.py) — YAMNet acoustic event detection +# 3. Speaker type/VAD (diarize.py) — pyannote.audio (Navigation v0.2.x) +# +# Combined output is a list[AudioEvent] per window, merged into VoiceFrame +# for the streaming path. +# +# Elcor mode reads from cf-core preferences (cf_voice.prefs) so that the +# annotation format is user-configurable without per-request flags. from __future__ import annotations +import asyncio +import logging import os from typing import AsyncIterator +from cf_voice.acoustic import MockAcousticBackend, make_acoustic from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame from cf_voice.io import MockVoiceIO, VoiceIO, make_io from cf_voice.models import VoiceFrame +from cf_voice.prefs import get_elcor_prior_frames, is_elcor_enabled + +logger = logging.getLogger(__name__) + +# ── Per-model download/load status registry ─────────────────────────────────── +# Written by _load_* methods; read by the /health endpoint in app.py. +# Values: "disabled" | "loading" | "ready" | "error" +# Thread-safe: individual str assignment is atomic in CPython. 
+model_status: dict[str, str] = {} + + +# ── No-op coroutines for disabled/unavailable classifiers ───────────────────── + +async def _noop_stt() -> None: + """Placeholder when STT is disabled or unavailable.""" + return None + + +async def _noop_diarize() -> list: + """Placeholder when diarization is disabled or unavailable.""" + return [] + + +# ───────────────────────────────────────────────────────────────────────────── class ContextClassifier: """ High-level voice context classifier. - Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation. - In stub mode the frames pass through unchanged — the enrichment pipeline - (YAMNet + wav2vec2 + librosa) is filled in incrementally. + Wraps a VoiceIO source and runs three parallel classifiers on each audio + window: tone (SER), queue/environ (YAMNet), and speaker (pyannote). + + In mock mode all classifiers produce synthetic events — no GPU, microphone, + or HuggingFace token required. Usage ----- classifier = ContextClassifier.from_env() async for frame in classifier.stream(): print(frame.label, frame.confidence) + + For the full multi-class event list (queue + speaker + tone): + events = classifier.classify_chunk(audio_b64, timestamp=4.5) """ - def __init__(self, io: VoiceIO) -> None: + def __init__( + self, + io: VoiceIO, + user_id: str | None = None, + store=None, + ) -> None: self._io = io + self._user_id = user_id + self._store = store + self._acoustic = make_acoustic( + mock=isinstance(io, MockVoiceIO) + or os.environ.get("CF_VOICE_MOCK", "") == "1" + ) + # Lazy — loaded on first real classify call, then reused. + self._tone: "ToneClassifier | None" = None + # STT: loaded if faster-whisper is installed. Controlled by CF_VOICE_STT (default: 1). + self._stt: "WhisperSTT | None" = None + self._stt_loaded: bool = False # False = not yet attempted + # Diarizer: optional — requires HF_TOKEN and CF_VOICE_DIARIZE=1. 
+ self._diarizer: "Diarizer | None" = None + self._diarizer_loaded: bool = False + # Per-session speaker label tracker — maps pyannote IDs → "Speaker A/B/..." + # Reset at session end (when the ContextClassifier is stopped). + from cf_voice.diarize import SpeakerTracker + self._speaker_tracker: SpeakerTracker = SpeakerTracker() + # One-at-a-time GPU classify gate. All three models share the same GPU; + # running them "in parallel" just serializes at the CUDA level while + # filling the thread pool. Drop incoming frames when a classify is + # already in flight — freshness beats completeness for real-time audio. + self._classify_lock: asyncio.Lock = asyncio.Lock() + # Dimensional classifier (audeering) — lazy, CF_VOICE_DIMENSIONAL=1 + self._dimensional: "DimensionalClassifier | None" = None + self._dimensional_loaded: bool = False + # Prosodic extractor (openSMILE) — lazy, CF_VOICE_PROSODY=1 + self._prosodic: "ProsodicExtractor | None" = None + self._prosodic_loaded: bool = False + # Per-speaker rolling dimensional buffers for trajectory/coherence signals. + # Keys are speaker_id strings; values are deques of DimensionalResult. + # Reset at session end alongside SpeakerTracker. + from collections import deque as _deque + from cf_voice.trajectory import BUFFER_WINDOW + self._dim_buffer: dict[str, "_deque"] = {} + self._last_ser_affect: dict[str, str] = {} + self._buffer_window = BUFFER_WINDOW + # Accent classifier — lazy, gated by CF_VOICE_ACCENT=1 + self._accent: "MockAccentClassifier | AccentClassifier | None" = None + self._accent_loaded: bool = False @classmethod - def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier": + def from_env( + cls, + interval_s: float = 2.5, + user_id: str | None = None, + store=None, + ) -> "ContextClassifier": """ Create a ContextClassifier from environment. - CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed). + + CF_VOICE_MOCK=1 activates full mock mode (no GPU, no audio hardware). 
+ If real audio hardware is unavailable (faster-whisper not installed), + falls back to mock mode automatically. + user_id + store are forwarded to cf-core preferences for Elcor/threshold + lookups. """ - io = make_io(interval_s=interval_s) - return cls(io=io) + if os.environ.get("CF_VOICE_MOCK", "") == "1": + return cls.mock(interval_s=interval_s, user_id=user_id, store=store) + try: + io = make_io(interval_s=interval_s) + except (NotImplementedError, ImportError): + # Real audio hardware or inference extras unavailable — fall back to + # mock mode so the coordinator starts cleanly on headless nodes. + return cls.mock(interval_s=interval_s, user_id=user_id, store=store) + return cls(io=io, user_id=user_id, store=store) @classmethod - def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier": + def mock( + cls, + interval_s: float = 2.5, + seed: int | None = None, + user_id: str | None = None, + store=None, + ) -> "ContextClassifier": """Create a ContextClassifier backed by MockVoiceIO. Useful in tests.""" - from cf_voice.io import MockVoiceIO - return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed)) + return cls( + io=MockVoiceIO(interval_s=interval_s, seed=seed), + user_id=user_id, + store=store, + ) async def stream(self) -> AsyncIterator[VoiceFrame]: """ Yield enriched VoiceFrames continuously. Stub: frames from the IO layer pass through unchanged. - Real: enrichment pipeline runs here before yield. + Real (Navigation v0.2.x): acoustic + diarization enrichment runs here. 
""" async for frame in self._io.stream(): yield self._enrich(frame) async def stop(self) -> None: await self._io.stop() + self._speaker_tracker.reset() + self._dim_buffer.clear() + self._last_ser_affect.clear() def classify_chunk( self, - audio_b64: str, + audio_b64: str | None = None, timestamp: float = 0.0, - prior_frames: int = 0, - elcor: bool = False, + prior_frames: int | None = None, + elcor: bool | None = None, + session_id: str = "", ) -> list[AudioEvent]: """ - Classify a single audio chunk and return AudioEvents. + Classify a single audio window and return all AudioEvents. - This is the request-response path used by the cf-orch endpoint. + Returns a heterogeneous list containing zero or one of each: + - ToneEvent (event_type="tone") + - AudioEvent (event_type="queue") + - AudioEvent (event_type="speaker") + - AudioEvent (event_type="environ") + + This is the request-response path used by the cf-orch SSE endpoint. The streaming path (async generator) is for continuous consumers. - elcor=True switches subtext format to Mass Effect Elcor prefix style. - Generic tone annotation is always present regardless of elcor flag. + audio_b64 Base64-encoded PCM int16 mono 16kHz bytes. + Pass None in mock mode (ignored). + timestamp Session-relative seconds since capture started. + prior_frames Rolling context window size for Elcor LLM. + Defaults to user preference (PREF_ELCOR_PRIOR_FRAMES). + elcor Override Elcor mode for this request. + None = read from user preference (PREF_ELCOR_MODE). + session_id Caller-assigned correlation ID for the session. 
""" - if isinstance(self._io, MockVoiceIO): - return self._classify_chunk_mock(timestamp, prior_frames, elcor) + use_elcor = elcor if elcor is not None else is_elcor_enabled( + user_id=self._user_id, store=self._store + ) + context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames( + user_id=self._user_id, store=self._store + ) - return self._classify_chunk_real(audio_b64, timestamp, elcor) + if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1": + return self._classify_mock(timestamp, context_frames, use_elcor, session_id) - def _classify_chunk_mock( - self, timestamp: float, prior_frames: int, elcor: bool + if not audio_b64: + return [] + + return self._classify_real(audio_b64, timestamp, use_elcor, session_id) + + async def classify_chunk_async( + self, + audio_b64: str | None = None, + timestamp: float = 0.0, + prior_frames: int | None = None, + elcor: bool | None = None, + session_id: str = "", + language: str | None = None, + num_speakers: int | None = None, ) -> list[AudioEvent]: - """Synthetic path — used in mock mode and CI.""" + """ + Async variant of classify_chunk. + + Runs tone, STT, diarization, and acoustic classification in parallel + using asyncio.gather(). Use this from async contexts (FastAPI routes) + to get true concurrency across all four inference paths. + """ + use_elcor = elcor if elcor is not None else is_elcor_enabled( + user_id=self._user_id, store=self._store + ) + context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames( + user_id=self._user_id, store=self._store + ) + + if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1": + return self._classify_mock(timestamp, context_frames, use_elcor, session_id) + + if not audio_b64: + return [] + + # Drop frame if a classify is already in flight — GPU models serialize + # anyway, so queuing just adds latency without improving output. 
+ if self._classify_lock.locked(): + logger.debug("classify busy — dropping frame at t=%.2f", timestamp) + return [] + + async with self._classify_lock: + # Diarization (pyannote) can take 3–8 s on first invocations even with GPU. + # 25 s gives enough headroom without stalling the stream for too long. + try: + return await asyncio.wait_for( + self._classify_real_async(audio_b64, timestamp, use_elcor, session_id, language, num_speakers), + timeout=25.0, + ) + except asyncio.TimeoutError: + logger.warning("classify_real_async timed out at t=%.2f — dropping frame", timestamp) + return [] + + def _classify_mock( + self, + timestamp: float, + prior_frames: int, + elcor: bool, + session_id: str, + ) -> list[AudioEvent]: + """ + Synthetic multi-class event batch. + + Tone event comes from the MockVoiceIO RNG (consistent seed behaviour). + Queue/speaker/environ come from MockAcousticBackend (call lifecycle simulation). + """ rng = self._io._rng # type: ignore[attr-defined] - import time as _time label = rng.choice(self._io._labels) # type: ignore[attr-defined] shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0 + frame = VoiceFrame( label=label, confidence=rng.uniform(0.6, 0.97), @@ -101,30 +291,54 @@ class ContextClassifier: shift_magnitude=round(shift, 3), timestamp=timestamp, ) - tone = tone_event_from_voice_frame( + tone: ToneEvent = tone_event_from_voice_frame( frame_label=frame.label, frame_confidence=frame.confidence, shift_magnitude=frame.shift_magnitude, timestamp=frame.timestamp, elcor=elcor, ) - return [tone] + tone.session_id = session_id - def _classify_chunk_real( - self, audio_b64: str, timestamp: float, elcor: bool + acoustic = self._acoustic.classify_window(b"", timestamp=timestamp) + + events: list[AudioEvent] = [tone] + if acoustic.queue: + events.append(acoustic.queue) + if acoustic.speaker: + events.append(acoustic.speaker) + if acoustic.environ: + events.append(acoustic.environ) + if acoustic.scene: + events.append(acoustic.scene) + return 
events + + def _classify_real( + self, + audio_b64: str, + timestamp: float, + elcor: bool, + session_id: str, ) -> list[AudioEvent]: - """Real inference path — used when CF_VOICE_MOCK is unset.""" - import asyncio + """ + Real inference path — used when CF_VOICE_MOCK is unset. + + Tone: wav2vec2 SER via ToneClassifier (classify.py). + Acoustic: YAMNet via YAMNetAcousticBackend (Navigation v0.2.x stub). + Speaker: pyannote VAD (diarize.py) — merged in ContextClassifier, not here. + """ import base64 + import numpy as np + from cf_voice.classify import ToneClassifier pcm = base64.b64decode(audio_b64) audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0 - # ToneClassifier is stateless per-call, safe to instantiate inline - classifier = ToneClassifier.from_env() - tone_result = classifier.classify(audio) + if self._tone is None: + self._tone = ToneClassifier.from_env() + tone_result = self._tone.classify(audio) frame = VoiceFrame( label=tone_result.label, @@ -133,20 +347,398 @@ class ContextClassifier: shift_magnitude=0.0, timestamp=timestamp, ) - event = tone_event_from_voice_frame( + tone: ToneEvent = tone_event_from_voice_frame( frame_label=frame.label, frame_confidence=frame.confidence, shift_magnitude=frame.shift_magnitude, timestamp=frame.timestamp, elcor=elcor, ) - return [event] + tone.session_id = session_id + + events: list[AudioEvent] = [tone] + + # Acoustic events: Navigation v0.2.x (YAMNet not yet implemented) + # YAMNetAcousticBackend raises NotImplementedError at construction — + # we catch and log rather than failing the entire classify call. 
+ try: + acoustic = self._acoustic.classify_window(audio.tobytes(), timestamp=timestamp) + if acoustic.queue: + events.append(acoustic.queue) + if acoustic.speaker: + events.append(acoustic.speaker) + if acoustic.environ: + events.append(acoustic.environ) + if acoustic.scene: + events.append(acoustic.scene) + except NotImplementedError: + pass + + return events + + def _load_stt(self) -> "WhisperSTT | None": + """Lazy-load WhisperSTT once. Returns None if unavailable or disabled.""" + if self._stt_loaded: + return self._stt + self._stt_loaded = True + if os.environ.get("CF_VOICE_STT", "1") != "1": + model_status["stt"] = "disabled" + return None + model_status["stt"] = "loading" + try: + from cf_voice.stt import WhisperSTT + self._stt = WhisperSTT.from_env() + model_status["stt"] = "ready" + logger.info("WhisperSTT loaded (model=%s)", os.environ.get("CF_VOICE_WHISPER_MODEL", "small")) + except Exception as exc: + model_status["stt"] = "error" + logger.warning("WhisperSTT unavailable: %s", exc) + return self._stt + + def _load_diarizer(self) -> "Diarizer | None": + """Lazy-load Diarizer once. Returns None if HF_TOKEN absent or CF_VOICE_DIARIZE!=1.""" + if self._diarizer_loaded: + return self._diarizer + self._diarizer_loaded = True + if os.environ.get("CF_VOICE_DIARIZE", "0") != "1": + model_status["diarizer"] = "disabled" + return None + model_status["diarizer"] = "loading" + try: + from cf_voice.diarize import Diarizer + self._diarizer = Diarizer.from_env() + model_status["diarizer"] = "ready" + logger.info("Diarizer loaded") + except Exception as exc: + model_status["diarizer"] = "error" + logger.warning("Diarizer unavailable: %s", exc) + return self._diarizer + + def _load_dimensional(self) -> "DimensionalClassifier | None": + """Lazy-load DimensionalClassifier once. 
Returns None if CF_VOICE_DIMENSIONAL!=1.""" + if self._dimensional_loaded: + return self._dimensional + self._dimensional_loaded = True + if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1": + model_status["dimensional"] = "disabled" + return None + model_status["dimensional"] = "loading" + try: + from cf_voice.dimensional import DimensionalClassifier + self._dimensional = DimensionalClassifier() + model_status["dimensional"] = "ready" + logger.info("DimensionalClassifier loaded (audeering VAD model)") + except Exception as exc: + model_status["dimensional"] = "error" + logger.warning("DimensionalClassifier unavailable: %s", exc) + return self._dimensional + + def _load_accent(self) -> "MockAccentClassifier | AccentClassifier | None": + """Lazy-load AccentClassifier once. Returns None if CF_VOICE_ACCENT!=1.""" + if self._accent_loaded: + return self._accent + self._accent_loaded = True + from cf_voice.accent import make_accent_classifier + result = make_accent_classifier( + mock=isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1" + ) + self._accent = result + if result is None: + model_status["accent"] = "disabled" + else: + model_status["accent"] = "ready" + logger.info("AccentClassifier loaded (mock=%s)", isinstance(result, type(result).__mro__[0])) + return self._accent + + def _load_prosodic(self) -> "ProsodicExtractor | None": + """Lazy-load ProsodicExtractor once. 
Returns None if CF_VOICE_PROSODY!=1.""" + if self._prosodic_loaded: + return self._prosodic + self._prosodic_loaded = True + if os.environ.get("CF_VOICE_PROSODY", "0") != "1": + model_status["prosody"] = "disabled" + return None + model_status["prosody"] = "loading" + try: + from cf_voice.prosody import ProsodicExtractor + self._prosodic = ProsodicExtractor() + model_status["prosody"] = "ready" + logger.info("ProsodicExtractor loaded (openSMILE eGeMAPS)") + except Exception as exc: + model_status["prosody"] = "error" + logger.warning("ProsodicExtractor unavailable: %s", exc) + return self._prosodic + + async def prewarm(self) -> None: + """Pre-load all configured models in a thread-pool so downloads happen at + startup rather than on the first classify call. Safe to call multiple times + (each _load_* method is idempotent after the first call).""" + if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1": + return + loop = asyncio.get_running_loop() + # Load each model in its own executor slot so status updates are visible + # as each one completes rather than all at once. + await loop.run_in_executor(None, self._load_stt) + await loop.run_in_executor(None, self._load_diarizer) + await loop.run_in_executor(None, self._load_dimensional) + await loop.run_in_executor(None, self._load_prosodic) + logger.info("cf-voice prewarm complete: %s", model_status) + + async def _classify_real_async( + self, + audio_b64: str, + timestamp: float, + elcor: bool, + session_id: str, + language: str | None = None, + num_speakers: int | None = None, + ) -> list[AudioEvent]: + """ + Real inference path running all classifiers in parallel. + + Tone (wav2vec2) + STT (Whisper) + Diarization (pyannote, optional) + + Acoustic (AST) all run concurrently via asyncio.gather(). Each result + is type-checked after gather — a single classifier failure does not + abort the call. + + Transcript text is fed back to ToneClassifier as a weak signal (e.g. 
+ "unfortunately" biases toward apologetic). Diarizer output sets the + speaker_id on the VoiceFrame. + """ + import base64 + from functools import partial + + import numpy as np + + from cf_voice.classify import ToneClassifier, _apply_transcript_hints, _AFFECT_TO_LABEL + + pcm = base64.b64decode(audio_b64) + audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0 + + # Lazy-load models on first real call + if self._tone is None: + self._tone = ToneClassifier.from_env() + stt = self._load_stt() + diarizer = self._load_diarizer() + dimensional = self._load_dimensional() + prosodic = self._load_prosodic() + accent_clf = self._load_accent() + + # Build coroutines — all run in thread pool executors internally. + # Dimensional, prosodic, and accent run in parallel with SER/STT/diarization. + tone_coro = self._tone.classify_async(audio) + stt_coro = stt.transcribe_chunk_async(pcm, language=language) if stt else _noop_stt() + diarize_coro = diarizer.diarize_async(audio, num_speakers=num_speakers) if diarizer else _noop_diarize() + loop = asyncio.get_running_loop() + acoustic_coro = loop.run_in_executor( + None, partial(self._acoustic.classify_window, audio.tobytes(), timestamp) + ) + dimensional_coro = dimensional.classify_async(audio) if dimensional else _noop_stt() + prosodic_coro = prosodic.extract_async(audio) if prosodic else _noop_stt() + accent_coro = loop.run_in_executor( + None, partial(accent_clf.classify, audio.tobytes()) + ) if accent_clf else _noop_stt() + + ( + tone_result, stt_result, diarize_segs, acoustic, + dimensional_result, prosodic_result, accent_result, + ) = await asyncio.gather( + tone_coro, stt_coro, diarize_coro, acoustic_coro, + dimensional_coro, prosodic_coro, accent_coro, + return_exceptions=True, + ) + + # Extract transcript text (STT optional) + transcript = "" + if stt_result and not isinstance(stt_result, BaseException): + transcript = stt_result.text # type: ignore[union-attr] + + # Apply transcript weak signal to affect 
if STT produced text + if transcript and not isinstance(tone_result, BaseException): + new_affect = _apply_transcript_hints(tone_result.affect, transcript) # type: ignore[union-attr] + if new_affect != tone_result.affect: # type: ignore[union-attr] + from cf_voice.classify import ToneResult + tone_result = ToneResult( # type: ignore[assignment] + label=_AFFECT_TO_LABEL.get(new_affect, tone_result.label), # type: ignore[union-attr] + affect=new_affect, + confidence=tone_result.confidence, # type: ignore[union-attr] + prosody_flags=tone_result.prosody_flags, # type: ignore[union-attr] + ) + + # Get speaker_id from diarization (falls back to "speaker_a") + speaker_id = "speaker_a" + if isinstance(diarize_segs, BaseException): + logger.warning("Diarizer failed in gather: %s", diarize_segs) + elif diarizer and diarize_segs is not None: + window_mid = len(audio) / 2.0 / 16_000.0 + speaker_id = diarizer.speaker_at( # type: ignore[arg-type] + diarize_segs, window_mid, tracker=self._speaker_tracker + ) + logger.debug("diarize: segs=%d speaker=%s mid=%.3f", len(diarize_segs), speaker_id, window_mid) + + if isinstance(tone_result, BaseException): + logger.warning("Tone classifier failed: %s", tone_result) + return [] + + # Unpack dimensional result (None when classifier is disabled or failed) + dim = None + if dimensional_result and not isinstance(dimensional_result, BaseException): + dim = dimensional_result + + # Unpack prosodic result. If dimensional is also available, pass the + # calm-positive score so sarcasm_risk benefits from both signals. 
+ pros = None + if prosodic_result and not isinstance(prosodic_result, BaseException): + if dim is not None: + # Re-compute sarcasm_risk with dimensional context + from cf_voice.prosody import _compute_sarcasm_risk + calm_pos = dim.calm_positive_score() + updated_risk = _compute_sarcasm_risk( + flat_f0=prosodic_result.flat_f0_score, # type: ignore[union-attr] + calm_positive=calm_pos, + ) + from cf_voice.prosody import ProsodicSignal + pros = ProsodicSignal( + f0_mean=prosodic_result.f0_mean, # type: ignore[union-attr] + f0_std=prosodic_result.f0_std, # type: ignore[union-attr] + jitter=prosodic_result.jitter, # type: ignore[union-attr] + shimmer=prosodic_result.shimmer, # type: ignore[union-attr] + loudness=prosodic_result.loudness, # type: ignore[union-attr] + flat_f0_score=prosodic_result.flat_f0_score, # type: ignore[union-attr] + sarcasm_risk=updated_risk, + ) + else: + pros = prosodic_result + + frame = VoiceFrame( + label=tone_result.label, # type: ignore[union-attr] + confidence=tone_result.confidence, # type: ignore[union-attr] + speaker_id=speaker_id, + shift_magnitude=0.0, + timestamp=timestamp, + valence=dim.valence if dim else None, + arousal=dim.arousal if dim else None, + dominance=dim.dominance if dim else None, + sarcasm_risk=pros.sarcasm_risk if pros else None, + flat_f0_score=pros.flat_f0_score if pros else None, + ) + tone_event: ToneEvent = tone_event_from_voice_frame( + frame_label=frame.label, + frame_confidence=frame.confidence, + shift_magnitude=frame.shift_magnitude, + timestamp=frame.timestamp, + elcor=elcor, + ) + tone_event.session_id = session_id + tone_event.speaker_id = speaker_id + # Attach dimensional and prosodic results to the wire event + tone_event.valence = frame.valence + tone_event.arousal = frame.arousal + tone_event.dominance = frame.dominance + tone_event.sarcasm_risk = frame.sarcasm_risk + tone_event.flat_f0_score = frame.flat_f0_score + + # Trajectory and coherence signals — only when dimensional is running + if dim: + 
from collections import deque as _deque + from cf_voice.trajectory import compute_trajectory + + spk_buffer = self._dim_buffer.setdefault( + speaker_id, _deque(maxlen=self._buffer_window) + ) + prior_affect = self._last_ser_affect.get(speaker_id) + traj, coher = compute_trajectory( + spk_buffer, dim, tone_result.affect, prior_affect # type: ignore[union-attr] + ) + # Update buffer and affect history after computing (not before) + spk_buffer.append(dim) + self._last_ser_affect[speaker_id] = tone_result.affect # type: ignore[union-attr] + + tone_event.arousal_delta = traj.arousal_delta if traj.baseline_established else None + tone_event.valence_delta = traj.valence_delta if traj.baseline_established else None + tone_event.trend = traj.trend if traj.baseline_established else None + tone_event.coherence_score = coher.coherence_score + tone_event.suppression_flag = coher.suppression_flag + tone_event.reframe_type = coher.reframe_type if coher.reframe_type != "none" else None + tone_event.affect_divergence = coher.affect_divergence + + logger.debug( + "Dimensional: valence=%.3f arousal=%.3f dominance=%.3f quadrant=%s " + "trend=%s coherence=%.2f suppressed=%s reframe=%s", + dim.valence, dim.arousal, dim.dominance, dim.affect_quadrant(), + traj.trend, coher.coherence_score, coher.suppression_flag, coher.reframe_type, + ) + + if pros: + logger.debug( + "Prosodic: flat_f0=%.3f sarcasm_risk=%.3f", + pros.flat_f0_score, pros.sarcasm_risk, + ) + + events: list[AudioEvent] = [tone_event] + + # Emit transcript event so consumers can display live STT + if transcript: + events.append(AudioEvent( + timestamp=timestamp, + event_type="transcript", # type: ignore[arg-type] + label=transcript, + confidence=1.0, + speaker_id=speaker_id, + )) + + # Acoustic events (queue / speaker type / environ / scene) + scene_label: str | None = None + environ_labels: list[str] = [] + speaker_label: str | None = None + if not isinstance(acoustic, BaseException): + if acoustic.queue: # type: 
ignore[union-attr] + events.append(acoustic.queue) # type: ignore[union-attr] + if acoustic.speaker: # type: ignore[union-attr] + events.append(acoustic.speaker) # type: ignore[union-attr] + speaker_label = acoustic.speaker.label # type: ignore[union-attr] + if acoustic.environ: # type: ignore[union-attr] + events.append(acoustic.environ) # type: ignore[union-attr] + environ_labels = [acoustic.environ.label] # type: ignore[union-attr] + if acoustic.scene: # type: ignore[union-attr] + events.append(acoustic.scene) # type: ignore[union-attr] + scene_label = acoustic.scene.label # type: ignore[union-attr] + + # Accent event (optional — gated by CF_VOICE_ACCENT=1) + accent_region: str | None = None + if accent_result and not isinstance(accent_result, BaseException): + accent_region = accent_result.region # type: ignore[union-attr] + events.append(AudioEvent( + timestamp=timestamp, + event_type="accent", # type: ignore[arg-type] + label=accent_region, + confidence=accent_result.confidence, # type: ignore[union-attr] + speaker_id=speaker_id, + )) + + # Privacy risk scoring — local only, never transmitted + from cf_voice.privacy import score_privacy_risk + risk = score_privacy_risk( + scene=scene_label, + environ_labels=environ_labels, + speaker=speaker_label, + accent=accent_region, + ) + if risk.level != "low": + logger.info( + "privacy_risk=%s flags=%s session=%s", + risk.level, risk.flags, session_id, + ) + # Attach risk to the tone event so Linnet can surface the gate + tone_event.prosody_flags = list(tone_event.prosody_flags) + [f"privacy:{risk.level}"] + + return events def _enrich(self, frame: VoiceFrame) -> VoiceFrame: """ - Apply tone classification to a raw frame. + Apply tone classification to a raw frame (streaming path). Stub: identity transform — returns frame unchanged. - Real: replace label + confidence with classifier output. + Real (Navigation v0.2.x): replace label + confidence with classifier output. 
""" return frame diff --git a/cf_voice/diarize.py b/cf_voice/diarize.py index 217dd51..4d8ed18 100644 --- a/cf_voice/diarize.py +++ b/cf_voice/diarize.py @@ -7,12 +7,16 @@ # Requires accepting gated model terms at: # https://huggingface.co/pyannote/speaker-diarization-3.1 # https://huggingface.co/pyannote/segmentation-3.0 +# +# Enable with: CF_VOICE_DIARIZE=1 (default off) +# Requires: HF_TOKEN set in environment from __future__ import annotations import asyncio import logging import os -from dataclasses import dataclass +import string +from dataclasses import dataclass, field import numpy as np @@ -21,11 +25,16 @@ logger = logging.getLogger(__name__) _DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1" _SAMPLE_RATE = 16_000 +# Label returned when two speakers overlap in the same window +SPEAKER_MULTIPLE = "Multiple" +# Label returned when no speaker segment covers the timestamp (silence / VAD miss) +SPEAKER_UNKNOWN = "speaker_a" + @dataclass class SpeakerSegment: """A speaker-labelled time range within an audio window.""" - speaker_id: str # ephemeral local label, e.g. "SPEAKER_00" + speaker_id: str # raw pyannote label, e.g. "SPEAKER_00" start_s: float end_s: float @@ -34,6 +43,51 @@ class SpeakerSegment: return self.end_s - self.start_s +class SpeakerTracker: + """ + Maps ephemeral pyannote speaker IDs to stable per-session friendly labels. + + pyannote returns IDs like "SPEAKER_00", "SPEAKER_01" which are opaque and + may differ across audio windows. SpeakerTracker assigns a consistent + friendly label ("Speaker A", "Speaker B", ...) for the lifetime of one + session, based on first-seen order. + + Speaker embeddings are never stored — only the raw_id → label string map, + which contains no biometric information. Call reset() at session end to + discard the map. + + For sessions with more than 26 speakers, labels wrap to "Speaker AA", + "Speaker AB", etc. (unlikely in practice but handled gracefully). 
+ """ + + def __init__(self) -> None: + self._map: dict[str, str] = {} + self._counter: int = 0 + + def label(self, raw_id: str) -> str: + """Return the friendly label for a pyannote speaker ID.""" + if raw_id not in self._map: + self._map[raw_id] = self._next_label() + return self._map[raw_id] + + def reset(self) -> None: + """Discard all label mappings. Call at session end.""" + self._map.clear() + self._counter = 0 + + def _next_label(self) -> str: + idx = self._counter + self._counter += 1 + letters = string.ascii_uppercase + n = len(letters) + if idx < n: + return f"Speaker {letters[idx]}" + # Two-letter suffix for >26 speakers + outer = idx // n + inner = idx % n + return f"Speaker {letters[outer - 1]}{letters[inner]}" + + class Diarizer: """ Async wrapper around pyannote.audio speaker diarization pipeline. @@ -47,9 +101,9 @@ class Diarizer: Usage ----- diarizer = Diarizer.from_env() + tracker = SpeakerTracker() segments = await diarizer.diarize_async(audio_float32) - for seg in segments: - print(seg.speaker_id, seg.start_s, seg.end_s) + label = diarizer.speaker_at(segments, timestamp_s=1.0, tracker=tracker) Navigation v0.2.x wires this into ContextClassifier so that each VoiceFrame carries the correct speaker_id from diarization output. @@ -67,7 +121,7 @@ class Diarizer: logger.info("Loading diarization pipeline %s", _DIARIZATION_MODEL) self._pipeline = Pipeline.from_pretrained( _DIARIZATION_MODEL, - use_auth_token=hf_token, + token=hf_token, ) # Move to GPU if available @@ -92,16 +146,29 @@ class Diarizer: return cls(hf_token=token) def _diarize_sync( - self, audio_float32: np.ndarray, sample_rate: int = _SAMPLE_RATE + self, + audio_float32: np.ndarray, + sample_rate: int = _SAMPLE_RATE, + num_speakers: int | None = None, ) -> list[SpeakerSegment]: - """Synchronous diarization — always call via diarize_async.""" + """Synchronous diarization — always call via diarize_async. 
+ + num_speakers: when set, passed as min_speakers=max_speakers to pyannote, + which skips the agglomeration heuristic and improves boundary accuracy + for known-size conversations (e.g. 2-person call). + """ import torch # pyannote expects (channels, samples) float32 tensor waveform = torch.from_numpy(audio_float32[np.newaxis, :].astype(np.float32)) - diarization = self._pipeline( - {"waveform": waveform, "sample_rate": sample_rate} - ) + pipeline_kwargs: dict = {"waveform": waveform, "sample_rate": sample_rate} + if num_speakers and num_speakers > 0: + pipeline_kwargs["min_speakers"] = num_speakers + pipeline_kwargs["max_speakers"] = num_speakers + output = self._pipeline(pipeline_kwargs) + # pyannote >= 3.3 wraps results in DiarizeOutput; earlier versions return + # Annotation directly. Normalise to Annotation before iterating. + diarization = getattr(output, "speaker_diarization", output) segments: list[SpeakerSegment] = [] for turn, _, speaker in diarization.itertracks(yield_label=True): @@ -118,6 +185,7 @@ class Diarizer: self, audio_float32: np.ndarray, sample_rate: int = _SAMPLE_RATE, + num_speakers: int | None = None, ) -> list[SpeakerSegment]: """ Diarize an audio window without blocking the event loop. @@ -125,22 +193,58 @@ class Diarizer: audio_float32 should be 16kHz mono float32. Typical input is a 2-second window from MicVoiceIO (32000 samples). Returns segments ordered by start_s. + + num_speakers: passed through to pyannote as min_speakers=max_speakers + when set and > 0. Improves accuracy for known speaker counts. 
""" - loop = asyncio.get_event_loop() + from functools import partial + loop = asyncio.get_running_loop() return await loop.run_in_executor( - None, self._diarize_sync, audio_float32, sample_rate + None, + partial(self._diarize_sync, audio_float32, sample_rate, num_speakers), ) def speaker_at( - self, segments: list[SpeakerSegment], timestamp_s: float + self, + segments: list[SpeakerSegment], + timestamp_s: float, + tracker: SpeakerTracker | None = None, + window_s: float = 1.0, ) -> str: """ - Return the speaker_id active at a given timestamp within the window. + Return the friendly speaker label dominating a window around timestamp_s. - Falls back to "speaker_a" if no segment covers the timestamp - (e.g. during silence or at window boundaries). + Strategy (in order): + 1. If segments directly cover timestamp_s: use majority rule among them. + 2. If timestamp_s falls in a silence gap: use the speaker with the most + total speaking time across the whole window [0, window_s]. This handles + pauses between pyannote segments without falling back to "speaker_a". + 3. No segments at all: SPEAKER_UNKNOWN. + + tracker is optional; if omitted, raw pyannote IDs are returned as-is. """ + if not segments: + return SPEAKER_UNKNOWN + + covering = [seg for seg in segments if seg.start_s <= timestamp_s <= seg.end_s] + + if len(covering) >= 2: + return SPEAKER_MULTIPLE + + if len(covering) == 1: + raw_id = covering[0].speaker_id + return tracker.label(raw_id) if tracker else raw_id + + # Midpoint fell in a silence gap — find dominant speaker over the window. 
+ from collections import defaultdict + duration_by_speaker: dict[str, float] = defaultdict(float) + win_start = max(0.0, timestamp_s - window_s / 2) + win_end = timestamp_s + window_s / 2 for seg in segments: - if seg.start_s <= timestamp_s <= seg.end_s: - return seg.speaker_id - return "speaker_a" + overlap = min(seg.end_s, win_end) - max(seg.start_s, win_start) + if overlap > 0: + duration_by_speaker[seg.speaker_id] += overlap + if not duration_by_speaker: + return SPEAKER_UNKNOWN + raw_id = max(duration_by_speaker, key=lambda k: duration_by_speaker[k]) + return tracker.label(raw_id) if tracker else raw_id diff --git a/cf_voice/dimensional.py b/cf_voice/dimensional.py new file mode 100644 index 0000000..26ace1d --- /dev/null +++ b/cf_voice/dimensional.py @@ -0,0 +1,190 @@ +# cf_voice/dimensional.py — audeering dimensional emotion model +# +# BSL 1.1: real inference. Requires [inference] extras. +# +# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim +# Outputs three continuous 0-1 scores: +# valence: negative (0) to positive (1) +# arousal: low energy (0) to high energy (1) +# dominance: submissive (0) to dominant (1) +# +# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech. +# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP. +# +# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is +# downloaded — ~1.5GB, adds ~800MB GPU VRAM) +# +# HuggingFace model page: +# https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim +from __future__ import annotations + +import asyncio +import logging +import os +from dataclasses import dataclass +from functools import partial + +import numpy as np + +logger = logging.getLogger(__name__) + +_SAMPLE_RATE = 16_000 +_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim" + + +@dataclass +class DimensionalResult: + """ + Output of the audeering dimensional emotion model. 
@dataclass
class DimensionalResult:
    """
    Output of the audeering dimensional emotion model.

    All scores are continuous values in [0.0, 1.0]:
        valence:   negative affect (0) to positive affect (1)
        arousal:   low energy / calm (0) to high energy / excited (1)
        dominance: submissive / uncertain (0) to dominant / assertive (1)

    Sarcasm signal: low arousal + higher valence is the "calm-positive"
    profile; combined with flat F0 (prosody.py) and text divergence
    (linnet#22) it forms the full multi-signal sarcasm heuristic.
    """
    valence: float
    arousal: float
    dominance: float

    # Quadrant names keyed by (valence >= 0.5, arousal >= 0.5). These are
    # reference labels for logging/debugging, not user-facing; the
    # annotation layer (Elcor) handles user-facing interpretation.
    _QUADRANTS = {
        (True, True): "enthusiastic",
        (True, False): "calm_positive",  # sarcasm candidate with flat F0
        (False, True): "frustrated_urgent",
        (False, False): "sad_resigned",
    }

    def affect_quadrant(self) -> str:
        """Map the VAD position to a descriptive quadrant label."""
        return self._QUADRANTS[(self.valence >= 0.5, self.arousal >= 0.5)]

    def calm_positive_score(self) -> float:
        """
        0-1 score for how strongly the VAD position matches the calm-positive
        sarcasm candidate profile (low arousal, higher valence). One component
        of the combined sarcasm heuristic.
        """
        positivity = max(0.0, self.valence - 0.5) * 2.0  # how positive
        calmness = 1.0 - self.arousal  # how calm
        return positivity * 0.5 + calmness * 0.5
class DimensionalClassifier:
    """
    Async wrapper around the audeering wav2vec2 dimensional emotion model.

    The model runs in a thread pool executor to avoid blocking asyncio.
    It is loaded lazily on the first inference call and reused; the
    underlying wav2vec2 model lands on CUDA when available (same device as
    the SER model in classify.py).

    Usage
    -----
    clf = DimensionalClassifier.from_env()
    result = await clf.classify_async(audio_float32)
    print(result.valence, result.arousal, result.dominance)
    """

    def __init__(self) -> None:
        self._model = None
        self._processor = None
        self._loaded = False

    def _ensure_loaded(self) -> None:
        """
        Load model and processor on the first inference call.

        Fix: ``_loaded`` is now set only after the load fully succeeds.
        The previous version flipped the flag before importing transformers,
        so a failed first load left ``_model`` as None while every later
        call skipped loading and crashed with an AttributeError instead of
        the intended ImportError.
        """
        if self._loaded:
            return

        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
        except ImportError as exc:
            raise ImportError(
                "transformers is required for dimensional emotion classification. "
                "Install with: pip install cf-voice[inference]"
            ) from exc

        logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
        self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
        self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)

        try:
            import torch
            if torch.cuda.is_available():
                self._model = self._model.to(torch.device("cuda"))
                logger.info("Dimensional model on CUDA")
        except ImportError:
            pass

        self._model.eval()
        self._loaded = True  # only mark loaded after everything above succeeded

    def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Synchronous inference. Always call via classify_async.

        The audeering model emits [valence, arousal, dominance] as a single
        (1, 3) tensor of per-dimension regression scores (sigmoid heads, not
        softmax — the model was fine-tuned on MSP-Podcast with per-dimension
        regression). Scores are clipped to [0, 1] defensively before rounding.
        """
        self._ensure_loaded()

        try:
            import torch
        except ImportError as exc:
            raise ImportError("torch is required for dimensional inference") from exc

        inputs = self._processor(
            audio_float32,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )

        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits

        # Model outputs [valence, arousal, dominance] in a single (1, 3) tensor.
        scores = logits[0].cpu().float().numpy()
        valence = float(np.clip(scores[0], 0.0, 1.0))
        arousal = float(np.clip(scores[1], 0.0, 1.0))
        dominance = float(np.clip(scores[2], 0.0, 1.0))

        return DimensionalResult(
            valence=round(valence, 4),
            arousal=round(arousal, 4),
            dominance=round(dominance, 4),
        )

    async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
        """
        Classify audio without blocking the event loop.

        Runs in a thread pool executor. Designed to be gathered alongside
        the SER and diarization coroutines in context._classify_real_async().
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self._classify_sync, audio_float32)
        )

    @classmethod
    def from_env(cls) -> "DimensionalClassifier":
        """Construct from environment. Raises if CF_VOICE_DIMENSIONAL is not set."""
        if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
            raise EnvironmentError(
                "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
                "Add it to your .env file. The model requires ~800MB GPU VRAM."
            )
        return cls()
+SCENE_LABELS = Literal[ + "indoor_quiet", "indoor_crowd", "outdoor_urban", "outdoor_nature", + "vehicle", "public_transit", +] + +# ── Accent / language labels ────────────────────────────────────────────────── +# Regional accent of primary speaker. Gated by CF_VOICE_ACCENT=1. +ACCENT_LABELS = Literal[ + "en_gb", "en_us", "en_au", "en_ca", "en_in", + "fr", "es", "de", "zh", "ja", "other", ] # ── Tone / affect labels ────────────────────────────────────────────────────── @@ -86,12 +109,35 @@ class ToneEvent(AudioEvent): The subtext field carries the human-readable annotation. Format is controlled by the caller (elcor flag in the classify request). + + Dimensional emotion (Navigation v0.2.x — audeering model): + valence / arousal / dominance are None when the dimensional classifier + is not enabled (CF_VOICE_DIMENSIONAL != "1"). + + Prosodic signals (Navigation v0.2.x — openSMILE): + sarcasm_risk / flat_f0_score are None when extractor is not enabled. """ affect: str = "neutral" shift_magnitude: float = 0.0 shift_direction: str = "stable" # "warmer" | "colder" | "more_urgent" | "stable" prosody_flags: list[str] = field(default_factory=list) session_id: str = "" # caller-assigned; correlates events to a session + # Dimensional emotion scores (audeering, optional) + valence: float | None = None + arousal: float | None = None + dominance: float | None = None + # Prosodic signals (openSMILE, optional) + sarcasm_risk: float | None = None + flat_f0_score: float | None = None + # Trajectory signals (rolling buffer — activates after BASELINE_MIN frames) + arousal_delta: float | None = None + valence_delta: float | None = None + trend: str | None = None # "stable"|"escalating"|"suppressed"|… + # Coherence signals (SER vs VAD cross-comparison) + coherence_score: float | None = None + suppression_flag: bool | None = None + reframe_type: str | None = None # "none"|"genuine"|"surface" + affect_divergence: float | None = None def __post_init__(self) -> None: # Force event_type to 
"tone" regardless of what the caller passed. diff --git a/cf_voice/io.py b/cf_voice/io.py index 0257afd..03ad7d8 100644 --- a/cf_voice/io.py +++ b/cf_voice/io.py @@ -118,5 +118,12 @@ def make_io( if use_mock: return MockVoiceIO(interval_s=interval_s) - from cf_voice.capture import MicVoiceIO - return MicVoiceIO(device_index=device_index) + try: + from cf_voice.capture import MicVoiceIO + return MicVoiceIO(device_index=device_index) + except ImportError as exc: + raise NotImplementedError( + "Real audio capture requires [inference] extras. " + "Install with: pip install cf-voice[inference]\n" + f"Missing: {exc}" + ) from exc diff --git a/cf_voice/models.py b/cf_voice/models.py index 8cd6535..f456347 100644 --- a/cf_voice/models.py +++ b/cf_voice/models.py @@ -13,19 +13,30 @@ class VoiceFrame: A single annotated moment in a voice stream. Produced by cf_voice.io (audio capture) and enriched by cf_voice.context - (tone classification, speaker diarization). + (tone classification, speaker diarization, dimensional emotion). Fields ------ label Tone annotation, e.g. "Warmly impatient" or "Deflecting". Generic by default; Elcor-style prefix format is an easter egg surfaced by the product UI, not set here. - confidence 0.0–1.0. Below ~0.5 the annotation is speculative. + confidence 0.0-1.0. Below ~0.5 the annotation is speculative. speaker_id Ephemeral local label ("speaker_a", "speaker_b"). Not tied to identity — resets each session. - shift_magnitude Delta from the previous frame's tone, 0.0–1.0. + shift_magnitude Delta from the previous frame's tone, 0.0-1.0. High values indicate a meaningful register shift. timestamp Session-relative seconds since capture started. + + Dimensional emotion (audeering model — Navigation v0.2.x, optional): + valence 0.0-1.0. Negative affect (0) to positive affect (1). + arousal 0.0-1.0. Low energy / calm (0) to high energy / excited (1). + dominance 0.0-1.0. Submissive / uncertain (0) to assertive / dominant (1). 
+ + Prosodic features (openSMILE eGeMAPS — Navigation v0.2.x, optional): + sarcasm_risk 0.0-1.0 heuristic score: flat F0 + calm-positive VAD + + text divergence (linnet#22). All three signals required for + high confidence — audio-only signals are weak priors. + flat_f0_score Normalised F0 flatness: 1.0 = maximally flat pitch. """ label: str @@ -34,6 +45,15 @@ class VoiceFrame: shift_magnitude: float timestamp: float + # Dimensional emotion scores — None when dimensional classifier is disabled + valence: float | None = None + arousal: float | None = None + dominance: float | None = None + + # Prosodic signals — None when prosodic extractor is disabled + sarcasm_risk: float | None = None + flat_f0_score: float | None = None + def is_reliable(self, threshold: float = 0.6) -> bool: """Return True when confidence meets the given threshold.""" return self.confidence >= threshold diff --git a/cf_voice/prefs.py b/cf_voice/prefs.py new file mode 100644 index 0000000..a4b7420 --- /dev/null +++ b/cf_voice/prefs.py @@ -0,0 +1,181 @@ +# cf_voice/prefs.py — user preference hooks for cf-core preferences module +# +# MIT licensed. Provides voice-specific preference keys and helpers. +# +# When circuitforge_core is installed, reads/writes from the shared preference +# store (LocalFileStore or cloud backend). When it is not installed (standalone +# cf-voice use), falls back to environment variables only. 
+# +# Preference paths use dot-separated notation (cf-core convention): +# "voice.elcor_mode" bool — Elcor-style tone annotations +# "voice.confidence_threshold" float — minimum confidence to emit a frame +# "voice.whisper_model" str — faster-whisper model size +# "voice.elcor_prior_frames" int — rolling context window for Elcor LLM +from __future__ import annotations + +import logging +import os +from typing import Any + +logger = logging.getLogger(__name__) + +# ── Preference key constants ────────────────────────────────────────────────── + +PREF_ELCOR_MODE = "voice.elcor_mode" +PREF_CONFIDENCE_THRESHOLD = "voice.confidence_threshold" +PREF_WHISPER_MODEL = "voice.whisper_model" +PREF_ELCOR_PRIOR_FRAMES = "voice.elcor_prior_frames" + +# Defaults used when neither preference store nor environment has a value +_DEFAULTS: dict[str, Any] = { + PREF_ELCOR_MODE: False, + PREF_CONFIDENCE_THRESHOLD: 0.55, + PREF_WHISPER_MODEL: "small", + PREF_ELCOR_PRIOR_FRAMES: 4, +} + +# ── Environment variable fallbacks ──────────────────────────────────────────── + +_ENV_KEYS: dict[str, str] = { + PREF_ELCOR_MODE: "CF_VOICE_ELCOR", + PREF_CONFIDENCE_THRESHOLD: "CF_VOICE_CONFIDENCE_THRESHOLD", + PREF_WHISPER_MODEL: "CF_VOICE_WHISPER_MODEL", + PREF_ELCOR_PRIOR_FRAMES: "CF_VOICE_ELCOR_PRIOR_FRAMES", +} + +_COERCE: dict[str, type] = { + PREF_ELCOR_MODE: bool, + PREF_CONFIDENCE_THRESHOLD: float, + PREF_WHISPER_MODEL: str, + PREF_ELCOR_PRIOR_FRAMES: int, +} + + +def _from_env(pref_path: str) -> Any: + """Read a preference from its environment variable fallback.""" + env_key = _ENV_KEYS.get(pref_path) + if env_key is None: + return None + raw = os.environ.get(env_key) + if raw is None: + return None + coerce = _COERCE.get(pref_path, str) + try: + if coerce is bool: + return raw.strip().lower() in ("1", "true", "yes") + return coerce(raw) + except (ValueError, TypeError): + logger.warning("prefs: could not parse env %s=%r as %s", env_key, raw, coerce) + return None + + +def _cf_core_store(): 
+ """Return the cf-core default preference store, or None if not available.""" + try: + from circuitforge_core.preferences import store as _store_mod + return _store_mod._DEFAULT_STORE + except ImportError: + return None + + +# ── Public API ──────────────────────────────────────────────────────────────── + + +def get_voice_pref( + pref_path: str, + user_id: str | None = None, + store=None, +) -> Any: + """ + Read a voice preference value. + + Resolution order: + 1. Explicit store (passed in by caller — used for testing or cloud backends) + 2. cf-core LocalFileStore (if circuitforge_core is installed) + 3. Environment variable fallback + 4. Built-in default + + pref_path One of the PREF_* constants, e.g. PREF_ELCOR_MODE. + user_id Passed to the store for cloud backends; local store ignores it. + """ + # 1. Explicit store + if store is not None: + val = store.get(user_id=user_id, path=pref_path, default=None) + if val is not None: + return val + + # 2. cf-core default store + cf_store = _cf_core_store() + if cf_store is not None: + val = cf_store.get(user_id=user_id, path=pref_path, default=None) + if val is not None: + return val + + # 3. Environment variable + env_val = _from_env(pref_path) + if env_val is not None: + return env_val + + # 4. Built-in default + return _DEFAULTS.get(pref_path) + + +def set_voice_pref( + pref_path: str, + value: Any, + user_id: str | None = None, + store=None, +) -> None: + """ + Write a voice preference value. + + Writes to the explicit store if provided, otherwise to the cf-core default + store. Raises RuntimeError if neither is available (env-only mode has no + writable persistence). + """ + target = store or _cf_core_store() + if target is None: + raise RuntimeError( + "No writable preference store available. " + "Install circuitforge_core or pass a store explicitly." 
+ ) + target.set(user_id=user_id, path=pref_path, value=value) + + +def is_elcor_enabled(user_id: str | None = None, store=None) -> bool: + """ + Convenience: return True if the user has Elcor annotation mode enabled. + + Elcor mode switches tone subtext from generic format ("Tone: Frustrated") + to the Mass Effect Elcor prefix format ("With barely concealed frustration:"). + It is an accessibility feature for autistic and ND users who benefit from + explicit tonal annotation. Opt-in, local-only — no data leaves the device. + + Defaults to False. + """ + return bool(get_voice_pref(PREF_ELCOR_MODE, user_id=user_id, store=store)) + + +def get_confidence_threshold(user_id: str | None = None, store=None) -> float: + """Return the minimum confidence threshold for emitting VoiceFrames (0.0–1.0).""" + return float( + get_voice_pref(PREF_CONFIDENCE_THRESHOLD, user_id=user_id, store=store) + ) + + +def get_whisper_model(user_id: str | None = None, store=None) -> str: + """Return the faster-whisper model name to use (e.g. "small", "medium").""" + return str(get_voice_pref(PREF_WHISPER_MODEL, user_id=user_id, store=store)) + + +def get_elcor_prior_frames(user_id: str | None = None, store=None) -> int: + """ + Return the number of prior VoiceFrames to include as context for Elcor + label generation. Larger windows produce more contextually aware annotations + but increase LLM prompt length and latency. + + Default: 4 frames (~8–10 seconds of rolling context at 2s intervals). + """ + return int( + get_voice_pref(PREF_ELCOR_PRIOR_FRAMES, user_id=user_id, store=store) + ) diff --git a/cf_voice/privacy.py b/cf_voice/privacy.py new file mode 100644 index 0000000..4613749 --- /dev/null +++ b/cf_voice/privacy.py @@ -0,0 +1,115 @@ +# cf_voice/privacy.py — local acoustic privacy risk scoring +# +# MIT licensed. Never transmitted to cloud. Never logged server-side. 
+# +# Derives a privacy_risk level (low / moderate / high) from the combined +# acoustic fingerprint: scene + environ labels + speaker type + accent. +# +# Design rationale (#20): +# - "outdoor_urban" + "crowd_chatter" + "traffic" → low: clearly public +# - "indoor_quiet" + "background_voices" → moderate: conversation overheard +# - "outdoor_nature" + "birdsong" + regional accent → moderate-high: location-identifying compound +# - "indoor_quiet" + no background voices → low +# +# Risk gates (Linnet): +# high: warn before sending audio chunk to cloud STT; offer local-only fallback +# moderate: attach privacy_flags to session state, no blocking action +# low: proceed normally +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal + +PrivacyLevel = Literal["low", "moderate", "high"] + + +@dataclass +class PrivacyRisk: + """ + Locally-computed privacy risk for a single audio window. + + level: aggregate risk level + flags: ordered list of contributing signal descriptions + """ + level: PrivacyLevel + flags: list[str] = field(default_factory=list) + + +# ── Signal sets ─────────────────────────────────────────────────────────────── + +_PUBLIC_SCENES = {"outdoor_urban", "public_transit"} +_NATURE_SCENES = {"outdoor_nature"} +_QUIET_SCENES = {"indoor_quiet"} + +_LOCATION_ENVIRON = {"birdsong", "wind", "rain", "water"} +_URBAN_ENVIRON = {"traffic", "crowd_chatter", "street_signal", "construction"} + + +def score_privacy_risk( + scene: str | None, + environ_labels: list[str], + speaker: str | None, + accent: str | None, +) -> PrivacyRisk: + """ + Derive a PrivacyRisk from the current acoustic fingerprint. + + All inputs are nullable — this function handles partial signals gracefully. + Called per audio window; results are never persisted or transmitted. 
+ + Args: + scene: SCENE_LABEL string or None + environ_labels: list of ENVIRON_LABEL strings active in this window + speaker: SPEAKER_LABEL string or None + accent: ACCENT_LABEL string or None (None when CF_VOICE_ACCENT disabled) + """ + flags: list[str] = [] + score = 0 # internal accumulator; maps to level at the end + + environ_set = set(environ_labels) + + # ── Clearly public environments → reduce risk ───────────────────────────── + if scene in _PUBLIC_SCENES or environ_set & _URBAN_ENVIRON: + flags.append("public_environment") + score -= 1 + + # ── Background voices: conversation may be overheard ───────────────────── + if speaker == "background_voices": + flags.append("background_voices_detected") + score += 2 + + # ── Quiet indoor: no background noise reduces identifiability ──────────── + if scene in _QUIET_SCENES and speaker not in ("background_voices", "human_multi"): + flags.append("controlled_environment") + # No score change — neutral + + # ── Nature sounds: alone they suggest a quiet, potentially identifiable location + nature_match = environ_set & _LOCATION_ENVIRON + if nature_match: + flags.append(f"location_signal: {', '.join(sorted(nature_match))}") + score += 1 + + # ── Nature scene + nature sounds: compound location-identifying signal ──── + if scene in _NATURE_SCENES and nature_match: + flags.append("compound_location_signal") + score += 1 + + # ── Regional accent + nature: narrows location to region + environment ──── + if accent and accent not in ("en_us", "other") and nature_match: + flags.append(f"accent_plus_location: {accent}") + score += 1 + + # ── Quiet indoor + background voices: overheard conversation ───────────── + if scene in _QUIET_SCENES and speaker == "background_voices": + flags.append("overheard_conversation") + score += 1 + + # ── Map score to level ──────────────────────────────────────────────────── + if score <= 0: + level: PrivacyLevel = "low" + elif score <= 2: + level = "moderate" + else: + level = "high" + + return 
PrivacyRisk(level=level, flags=flags) diff --git a/cf_voice/prosody.py b/cf_voice/prosody.py new file mode 100644 index 0000000..f897116 --- /dev/null +++ b/cf_voice/prosody.py @@ -0,0 +1,208 @@ +# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction +# +# MIT licensed (opensmile-python package is MIT). +# +# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set: +# F0 mean / std / percentiles (pitch) +# Jitter / Shimmer (cycle-to-cycle variation — vocal tension) +# Energy / loudness envelope +# MFCCs, spectral centroid +# Speaking rate, pause ratio +# +# Runs on CPU in a thread pool executor — no GPU required. Designed to run +# in parallel with the GPU classifiers in context._classify_real_async() via +# asyncio.gather(). +# +# Enable with: CF_VOICE_PROSODY=1 (default off) +# Install: pip install opensmile +# +# openSMILE docs: https://audeering.github.io/opensmile-python/ +from __future__ import annotations + +import asyncio +import logging +import os +from dataclasses import dataclass +from functools import partial + +import numpy as np + +logger = logging.getLogger(__name__) + +_SAMPLE_RATE = 16_000 + +# F0 std normalisation constant: values below this threshold indicate flat prosody. +# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm". +# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat. +_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm" +_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean" +_LOUDNESS_FEATURE = "loudness_sma3_amean" +_JITTER_FEATURE = "jitterLocal_sma3nz_amean" +_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean" +_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec" + + +@dataclass +class ProsodicSignal: + """ + Summary prosodic features for a single audio window. + + These are derived from the openSMILE eGeMAPS v02 feature set. + All values are raw feature magnitudes unless noted otherwise. 
+ + f0_mean: Mean F0 in semitones from 27.5Hz reference + f0_std: Normalised F0 standard deviation (flatness indicator) + jitter: Cycle-to-cycle pitch variation (vocal tension) + shimmer: Cycle-to-cycle amplitude variation (vocal stress) + loudness: Mean loudness (energy proxy) + sarcasm_risk: 0-1 heuristic score combining flat F0, calm-positive + audio (from DimensionalResult if available), and optional + text-audio divergence (linnet#22 signal, not yet wired). + flat_f0_score: Normalised flatness: 1.0 = maximally flat, 0.0 = varied. + """ + f0_mean: float + f0_std: float + jitter: float + shimmer: float + loudness: float + flat_f0_score: float + sarcasm_risk: float + + +def _compute_sarcasm_risk( + flat_f0: float, + calm_positive: float = 0.0, + text_divergence: float = 0.0, +) -> float: + """ + Heuristic sarcasm indicator. Not a trained model — a signal to combine + with text divergence (linnet#22) for the final confidence score. + + flat_f0: Normalised F0 flatness (1.0 = flat, 0.0 = varied). + calm_positive: DimensionalResult.calm_positive_score() when available. + text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22. + Pass 0.0 until the parallel text classifier is wired. + + Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%). + """ + return min(1.0, flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3) + + +class ProsodicExtractor: + """ + openSMILE eGeMAPS feature extractor for a single audio window. + + CPU-bound inference — uses thread pool executor to avoid blocking asyncio. + Lazy-loads opensmile on first call so import cost is deferred. 
+ + Usage + ----- + extractor = ProsodicExtractor() + signal = await extractor.extract_async(audio_float32) + print(signal.flat_f0_score, signal.sarcasm_risk) + """ + + def __init__(self) -> None: + self._smile = None + + def _ensure_loaded(self) -> None: + """Lazy-load opensmile on first extract call.""" + if self._smile is not None: + return + + try: + import opensmile + except ImportError as exc: + raise ImportError( + "opensmile is required for prosodic feature extraction. " + "Install with: pip install opensmile" + ) from exc + + self._smile = opensmile.Smile( + feature_set=opensmile.FeatureSet.eGeMAPSv02, + feature_level=opensmile.FeatureLevel.Functionals, + ) + logger.info("openSMILE eGeMAPS loaded") + + def _extract_sync( + self, + audio_float32: np.ndarray, + calm_positive: float = 0.0, + text_divergence: float = 0.0, + ) -> ProsodicSignal: + """ + Synchronous feature extraction. Always call via extract_async. + + Returns a ProsodicSignal with eGeMAPS features and a sarcasm risk score. + If opensmile raises (e.g. audio too short, no voiced frames), returns a + zero-filled ProsodicSignal so the caller does not need to handle exceptions. + """ + self._ensure_loaded() + + try: + feats = self._smile.process_signal(audio_float32, _SAMPLE_RATE) + row = feats.iloc[0] + + f0_mean = float(row.get(_F0_MEAN_FEATURE, 0.0)) + f0_std = float(row.get(_F0_STD_NORM_FEATURE, 0.0)) + jitter = float(row.get(_JITTER_FEATURE, 0.0)) + shimmer = float(row.get(_SHIMMER_FEATURE, 0.0)) + loudness = float(row.get(_LOUDNESS_FEATURE, 0.0)) + + except Exception as exc: + logger.debug("openSMILE extraction failed (likely silent window): %s", exc) + return ProsodicSignal( + f0_mean=0.0, f0_std=0.0, jitter=0.0, + shimmer=0.0, loudness=0.0, flat_f0_score=0.0, sarcasm_risk=0.0, + ) + + # Normalise F0 variance to a flatness score. 
+ # f0_std of 0.4 semitones = neutral baseline → flat_f0 = 0.0 + # f0_std of 0.0 = maximally flat → flat_f0 = 1.0 + flat_f0 = 1.0 - min(f0_std / 0.4, 1.0) + + sarcasm = _compute_sarcasm_risk( + flat_f0=flat_f0, + calm_positive=calm_positive, + text_divergence=text_divergence, + ) + + return ProsodicSignal( + f0_mean=round(f0_mean, 4), + f0_std=round(f0_std, 4), + jitter=round(jitter, 6), + shimmer=round(shimmer, 6), + loudness=round(loudness, 4), + flat_f0_score=round(flat_f0, 4), + sarcasm_risk=round(sarcasm, 4), + ) + + async def extract_async( + self, + audio_float32: np.ndarray, + calm_positive: float = 0.0, + text_divergence: float = 0.0, + ) -> ProsodicSignal: + """ + Extract prosodic features without blocking the event loop. + + calm_positive: Pass DimensionalResult.calm_positive_score() when + dimensional classification has already run. + text_divergence: Pass abs(transcript_sentiment - valence) when the + parallel text classifier (linnet#22) is wired. + """ + loop = asyncio.get_running_loop() + return await loop.run_in_executor( + None, + partial(self._extract_sync, audio_float32, calm_positive, text_divergence), + ) + + @classmethod + def from_env(cls) -> "ProsodicExtractor": + """Construct from environment. Raises if CF_VOICE_PROSODY is not set.""" + if os.environ.get("CF_VOICE_PROSODY", "0") != "1": + raise EnvironmentError( + "CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. " + "Add it to your .env and install opensmile: pip install opensmile" + ) + return cls() diff --git a/cf_voice/stt.py b/cf_voice/stt.py index 7bb3685..2b62c81 100644 --- a/cf_voice/stt.py +++ b/cf_voice/stt.py @@ -46,6 +46,17 @@ class WhisperSTT: print(result.text) """ + # Known single-token hallucinations that Whisper emits on music/noise with + # low no_speech_prob (i.e. Whisper thinks it heard speech). These are too + # short to be real utterances in any supported language context. 
+ _HALLUCINATION_TOKENS: frozenset[str] = frozenset({ + "ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um", + }) + + # Suppress a transcript if it repeats unchanged across this many consecutive + # windows — indicates Whisper is locked into a hallucination loop. + _MAX_REPEATS = 2 + def __init__( self, model_name: str = "small", @@ -77,6 +88,8 @@ class WhisperSTT: self._device = device self._model_name = model_name self._session_prompt: str = "" + self._last_text: str = "" + self._repeat_count: int = 0 @classmethod def from_env(cls) -> "WhisperSTT": @@ -91,7 +104,14 @@ class WhisperSTT: """Estimated VRAM usage in MB for this model/compute_type combination.""" return _VRAM_ESTIMATES_MB.get(self._model_name, 1500) - def _transcribe_sync(self, audio_float32: np.ndarray) -> STTResult: + # Segments above this no_speech_prob are hallucinations (silence/music/noise). + # faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks + # for watching" family without cutting off genuine low-energy speech. + _NO_SPEECH_THRESHOLD = 0.6 + + def _transcribe_sync( + self, audio_float32: np.ndarray, language: str | None = None + ) -> STTResult: """Synchronous transcription — always call via transcribe_chunk_async.""" duration = len(audio_float32) / 16_000.0 @@ -100,22 +120,49 @@ class WhisperSTT: text="", language="en", duration_s=duration, is_final=False ) + # Energy gate: skip Whisper entirely on silent/near-silent audio. + # In the sidecar path there is no upstream MicVoiceIO silence gate, + # so we must check here. RMS < 0.005 is inaudible; Whisper will + # hallucinate "thank you" or "thanks for watching" on silence. 
+ rms = float(np.sqrt(np.mean(audio_float32 ** 2))) + if rms < 0.005: + return STTResult(text="", language="en", duration_s=duration, is_final=False) + segments, info = self._model.transcribe( audio_float32, - language=None, - initial_prompt=self._session_prompt or None, - vad_filter=False, # silence gating happens upstream in MicVoiceIO + language=language or None, # None = Whisper auto-detect + initial_prompt=None, # No session prompt — on 1s windows it causes + # phrase lock-in (model anchors on prior text + # rather than fresh audio). Reset via reset_session() + # at conversation boundaries instead. + vad_filter=True, # Silero VAD — skips non-speech frames word_timestamps=False, beam_size=3, temperature=0.0, ) - text = " ".join(s.text.strip() for s in segments).strip() + # Filter hallucinated segments: discard any segment where Whisper itself + # says there is likely no speech (no_speech_prob > threshold). This is + # the correct defense against "thank you" / music hallucinations — VAD + # alone is insufficient because music harmonics look speech-like to Silero. + text = " ".join( + s.text.strip() + for s in segments + if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD + ).strip() - # Rolling context: keep last ~50 words so the next chunk has prior text - if text: - words = (self._session_prompt + " " + text).split() - self._session_prompt = " ".join(words[-50:]) + # Gate 1: single-token hallucinations that slip past no_speech_prob. + if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS: + text = "" + + # Gate 2: repetition lock — same non-empty text N windows in a row. 
+ if text and text == self._last_text: + self._repeat_count += 1 + if self._repeat_count >= self._MAX_REPEATS: + text = "" + else: + self._last_text = text + self._repeat_count = 0 return STTResult( text=text, @@ -124,19 +171,29 @@ class WhisperSTT: is_final=duration >= 1.0 and info.language_probability > 0.5, ) - async def transcribe_chunk_async(self, pcm_int16: bytes) -> STTResult: + async def transcribe_chunk_async( + self, pcm_int16: bytes, language: str | None = None + ) -> STTResult: """ Transcribe a raw PCM Int16 chunk, non-blocking. pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms chunks accumulated by MicVoiceIO (2-second window = 64000 bytes). + + language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects, + which is slower and more hallucination-prone on short clips. """ + from functools import partial audio = ( np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0 ) - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, self._transcribe_sync, audio) + loop = asyncio.get_running_loop() + return await loop.run_in_executor( + None, partial(self._transcribe_sync, audio, language) + ) def reset_session(self) -> None: - """Clear the rolling prompt. Call at the start of each new conversation.""" + """Clear rolling state. Call at the start of each new conversation.""" self._session_prompt = "" + self._last_text = "" + self._repeat_count = 0 diff --git a/cf_voice/telephony.py b/cf_voice/telephony.py new file mode 100644 index 0000000..84d824f --- /dev/null +++ b/cf_voice/telephony.py @@ -0,0 +1,500 @@ +# cf_voice/telephony.py — outbound telephony abstraction +# +# Protocol + mock backend: MIT licensed. +# SignalWireBackend, FreeSWITCHBackend: BSL 1.1 (real telephony, cloud credentials). +# +# Consumers (Osprey, Harrier, Ibis, Kestrel) depend only on TelephonyBackend +# and CallSession — both MIT. The concrete backends are selected by make_telephony() +# based on the tier and available credentials. 
+# +# Requires optional extras for real backends: +# pip install cf-voice[signalwire] — SignalWire (paid tier, CF-provisioned) +# pip install cf-voice[freeswitch] — FreeSWITCH ESL (free tier, self-hosted) +from __future__ import annotations + +import asyncio +import logging +import os +from dataclasses import dataclass, field +from typing import Literal, Protocol, runtime_checkable + +logger = logging.getLogger(__name__) + +CallState = Literal[ + "dialing", + "ringing", + "in_progress", + "hold", + "bridged", + "completed", + "failed", + "no_answer", + "busy", +] + + +@dataclass +class CallSession: + """ + Represents an active or completed outbound call. + + call_sid is the backend-assigned identifier — for SignalWire this is a + Twilio-compatible SID string; for FreeSWITCH it is the UUID. + + state is updated by the backend as the call progresses. Consumers should + poll via backend.get_state() or subscribe to webhook events. + """ + call_sid: str + to: str + from_: str + state: CallState = "dialing" + duration_s: float = 0.0 + # AMD result: "human" | "machine" | "unknown" + # Populated once the backend resolves answering machine detection. + amd_result: str = "unknown" + error: str | None = None + + +@runtime_checkable +class TelephonyBackend(Protocol): + """ + Abstract telephony backend interface. + + All methods are async. Implementations must be safe to call from an + asyncio event loop. Long-running network operations run in a thread pool + (not the caller's responsibility). + + Field names are stable as of cf-voice v0.1.0. + """ + + async def dial( + self, + to: str, + from_: str, + webhook_url: str, + *, + amd: bool = False, + ) -> CallSession: + """ + Initiate an outbound call. + + to / from_ E.164 numbers ("+15551234567"). + webhook_url URL the backend will POST call events to (SignalWire/TwiML style). + amd If True, request answering machine detection. Result lands in + CallSession.amd_result once the backend resolves it. 
+ + Returns a CallSession with state="dialing". + """ + ... + + async def send_dtmf(self, call_sid: str, digits: str) -> None: + """ + Send DTMF (dual-tone multi-frequency) tones mid-call. + + digits String of 0-9, *, #, A-D. Each character is one tone. + Pauses may be represented as 'w' (0.5s) or 'W' (1s) if the backend + supports them. + """ + ... + + async def bridge(self, call_sid: str, target: str) -> None: + """ + Bridge the active call to a second E.164 number or SIP URI. + + Used to connect the user directly to a human agent after Osprey has + navigated the IVR. The original call leg remains connected. + """ + ... + + async def hangup(self, call_sid: str) -> None: + """Terminate the call. Idempotent — safe to call on already-ended calls.""" + ... + + async def announce( + self, + call_sid: str, + text: str, + voice: str = "default", + ) -> None: + """ + Play synthesised speech into the call. + + Implements the adaptive service identification requirement (osprey#21): + Osprey announces its identity before navigating an IVR so that the + other party can consent to automated interaction. + + voice Backend-specific voice identifier. "default" uses the backend's + default TTS voice. + """ + ... + + async def get_state(self, call_sid: str) -> CallState: + """Fetch the current state of a call from the backend.""" + ... + + +# ── Mock backend (MIT) ──────────────────────────────────────────────────────── + + +class MockTelephonyBackend: + """ + Synthetic telephony backend for development and CI. + + No real calls are placed. Operations log to cf_voice.telephony and update + in-memory CallSession objects. AMD resolves to "human" after a simulated + delay. 
+ + Usage: + backend = MockTelephonyBackend() + session = await backend.dial("+15551234567", "+18005550000", "https://...") + await backend.send_dtmf(session.call_sid, "1") + await backend.hangup(session.call_sid) + """ + + def __init__(self, amd_delay_s: float = 0.5) -> None: + self._sessions: dict[str, CallSession] = {} + self._amd_delay_s = amd_delay_s + self._call_counter = 0 + + def _next_sid(self) -> str: + self._call_counter += 1 + return f"mock_sid_{self._call_counter:04d}" + + async def dial( + self, + to: str, + from_: str, + webhook_url: str, + *, + amd: bool = False, + ) -> CallSession: + sid = self._next_sid() + session = CallSession(call_sid=sid, to=to, from_=from_, state="ringing") + self._sessions[sid] = session + logger.debug("MockTelephony: dial %s → %s (sid=%s)", from_, to, sid) + + async def _progress() -> None: + await asyncio.sleep(0.05) + session.state = "in_progress" + if amd: + await asyncio.sleep(self._amd_delay_s) + session.amd_result = "human" + logger.debug("MockTelephony: AMD resolved human (sid=%s)", sid) + + asyncio.create_task(_progress()) + return session + + async def send_dtmf(self, call_sid: str, digits: str) -> None: + self._sessions[call_sid] # KeyError if unknown — intentional + logger.debug("MockTelephony: DTMF %r (sid=%s)", digits, call_sid) + + async def bridge(self, call_sid: str, target: str) -> None: + session = self._sessions[call_sid] + session.state = "bridged" + logger.debug("MockTelephony: bridge → %s (sid=%s)", target, call_sid) + + async def hangup(self, call_sid: str) -> None: + session = self._sessions.get(call_sid) + if session: + session.state = "completed" + logger.debug("MockTelephony: hangup (sid=%s)", call_sid) + + async def announce( + self, + call_sid: str, + text: str, + voice: str = "default", + ) -> None: + self._sessions[call_sid] # KeyError if unknown — intentional + logger.debug( + "MockTelephony: announce voice=%s text=%r (sid=%s)", voice, text, call_sid + ) + + async def get_state(self, 
call_sid: str) -> CallState: + return self._sessions[call_sid].state + + +# ── SignalWire backend (BSL 1.1) ────────────────────────────────────────────── + + +class SignalWireBackend: + """ + SignalWire outbound telephony (Twilio-compatible REST API). + + BSL 1.1 — requires paid tier or self-hosted CF SignalWire project. + + Credentials sourced from environment: + CF_SW_PROJECT_ID — SignalWire project ID + CF_SW_AUTH_TOKEN — SignalWire auth token + CF_SW_SPACE_URL — space URL, e.g. "yourspace.signalwire.com" + + Requires: pip install cf-voice[signalwire] + """ + + def __init__( + self, + project_id: str | None = None, + auth_token: str | None = None, + space_url: str | None = None, + ) -> None: + try: + from signalwire.rest import Client as SWClient # type: ignore[import] + except ImportError as exc: + raise ImportError( + "SignalWire SDK is required for SignalWireBackend. " + "Install with: pip install cf-voice[signalwire]" + ) from exc + + self._project_id = project_id or os.environ["CF_SW_PROJECT_ID"] + self._auth_token = auth_token or os.environ["CF_SW_AUTH_TOKEN"] + self._space_url = space_url or os.environ["CF_SW_SPACE_URL"] + self._client = SWClient( + self._project_id, + self._auth_token, + signalwire_space_url=self._space_url, + ) + self._loop = asyncio.get_event_loop() + + async def dial( + self, + to: str, + from_: str, + webhook_url: str, + *, + amd: bool = False, + ) -> CallSession: + call_kwargs: dict = dict( + to=to, + from_=from_, + url=webhook_url, + status_callback=webhook_url, + ) + if amd: + call_kwargs["machine_detection"] = "Enable" + call_kwargs["async_amd"] = True + + call = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self._client.calls.create(**call_kwargs), + ) + return CallSession( + call_sid=call.sid, + to=to, + from_=from_, + state="dialing", + ) + + async def send_dtmf(self, call_sid: str, digits: str) -> None: + await asyncio.get_event_loop().run_in_executor( + None, + lambda: 
self._client.calls(call_sid).update( + twiml=f"<Response><Play digits='{digits}'/></Response>" + ), + ) + + async def bridge(self, call_sid: str, target: str) -> None: + await asyncio.get_event_loop().run_in_executor( + None, + lambda: self._client.calls(call_sid).update( + twiml=( + f"<Response><Dial>{target}</Dial></Response>" + ) + ), + ) + + async def hangup(self, call_sid: str) -> None: + await asyncio.get_event_loop().run_in_executor( + None, + lambda: self._client.calls(call_sid).update(status="completed"), + ) + + async def announce( + self, + call_sid: str, + text: str, + voice: str = "alice", + ) -> None: + await asyncio.get_event_loop().run_in_executor( + None, + lambda: self._client.calls(call_sid).update( + twiml=f"<Response><Say voice='{voice}'>{text}</Say></Response>" + ), + ) + + async def get_state(self, call_sid: str) -> CallState: + call = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self._client.calls(call_sid).fetch(), + ) + _sw_map: dict[str, CallState] = { + "queued": "dialing", "ringing": "ringing", "in-progress": "in_progress", + "completed": "completed", "failed": "failed", "busy": "busy", + "no-answer": "no_answer", + } + return _sw_map.get(call.status, "failed") + + +# ── FreeSWITCH backend (BSL 1.1) ───────────────────────────────────────────── + + +class FreeSWITCHBackend: + """ + Self-hosted FreeSWITCH outbound telephony via ESL (event socket layer). + + BSL 1.1 — requires free tier + user-provisioned FreeSWITCH + VoIP.ms SIP trunk. + + Credentials sourced from environment: + CF_ESL_HOST — FreeSWITCH ESL host (default: 127.0.0.1) + CF_ESL_PORT — FreeSWITCH ESL port (default: 8021) + CF_ESL_PASSWORD — FreeSWITCH ESL password + + Requires: pip install cf-voice[freeswitch] + + Note: FreeSWITCH AMD (mod_vad + custom heuristic or Whisper pipe) is not + yet implemented. The amd parameter is accepted but amd_result stays "unknown". 
+ """ + + def __init__( + self, + host: str | None = None, + port: int | None = None, + password: str | None = None, + ) -> None: + try: + import ESL # type: ignore[import] + except ImportError as exc: + raise ImportError( + "FreeSWITCH ESL bindings are required for FreeSWITCHBackend. " + "Install with: pip install cf-voice[freeswitch]" + ) from exc + + self._host = host or os.environ.get("CF_ESL_HOST", "127.0.0.1") + self._port = int(port or os.environ.get("CF_ESL_PORT", 8021)) + self._password = password or os.environ["CF_ESL_PASSWORD"] + self._esl = ESL + + def _connect(self): + conn = self._esl.ESLconnection(self._host, str(self._port), self._password) + if not conn.connected(): + raise RuntimeError( + f"Could not connect to FreeSWITCH ESL at {self._host}:{self._port}" + ) + return conn + + async def dial( + self, + to: str, + from_: str, + webhook_url: str, + *, + amd: bool = False, + ) -> CallSession: + def _originate() -> str: + conn = self._connect() + # ESL originate: sofia/gateway/voipms/{to} {from_} XML default + cmd = ( + f"originate {{origination_caller_id_number={from_}," + f"origination_caller_id_name=CircuitForge}}" + f"sofia/gateway/voipms/{to.lstrip('+')} &park()" + ) + result = conn.api("originate", cmd) + return result.getBody().strip() + + body = await asyncio.get_event_loop().run_in_executor(None, _originate) + # FreeSWITCH returns "+OK " on success + if not body.startswith("+OK"): + raise RuntimeError(f"FreeSWITCH originate failed: {body}") + uuid = body.removeprefix("+OK").strip() + return CallSession(call_sid=uuid, to=to, from_=from_, state="dialing") + + async def send_dtmf(self, call_sid: str, digits: str) -> None: + def _dtmf() -> None: + conn = self._connect() + conn.api("uuid_send_dtmf", f"{call_sid} {digits}") + + await asyncio.get_event_loop().run_in_executor(None, _dtmf) + + async def bridge(self, call_sid: str, target: str) -> None: + def _bridge() -> None: + conn = self._connect() + conn.api( + "uuid_bridge", + f"{call_sid} 
sofia/gateway/voipms/{target.lstrip('+')}", + ) + + await asyncio.get_event_loop().run_in_executor(None, _bridge) + + async def hangup(self, call_sid: str) -> None: + def _hangup() -> None: + conn = self._connect() + conn.api("uuid_kill", call_sid) + + await asyncio.get_event_loop().run_in_executor(None, _hangup) + + async def announce( + self, + call_sid: str, + text: str, + voice: str = "default", + ) -> None: + # FreeSWITCH TTS via mod_tts_commandline or Piper pipe + def _say() -> None: + conn = self._connect() + conn.api("uuid_broadcast", f"{call_sid} say::en CHAT SPOKEN {text}") + + await asyncio.get_event_loop().run_in_executor(None, _say) + + async def get_state(self, call_sid: str) -> CallState: + def _fetch() -> str: + conn = self._connect() + return conn.api("uuid_getvar", f"{call_sid} call_state").getBody().strip() + + raw = await asyncio.get_event_loop().run_in_executor(None, _fetch) + _fs_map: dict[str, CallState] = { + "CS_INIT": "dialing", "CS_ROUTING": "ringing", + "CS_EXECUTE": "in_progress", "CS_HANGUP": "completed", + "CS_DESTROY": "completed", + } + return _fs_map.get(raw, "failed") + + +# ── Factory ─────────────────────────────────────────────────────────────────── + + +def make_telephony( + mock: bool | None = None, + backend: str | None = None, +) -> MockTelephonyBackend | SignalWireBackend | FreeSWITCHBackend: + """ + Factory: return a TelephonyBackend appropriate for the current environment. + + Resolution order: + 1. mock=True or CF_VOICE_MOCK=1 → MockTelephonyBackend + 2. backend="signalwire" or CF_SW_PROJECT_ID present → SignalWireBackend + 3. backend="freeswitch" or CF_ESL_PASSWORD present → FreeSWITCHBackend + 4. 
Raises RuntimeError — no usable backend configured + + In production, backend selection is driven by the tier system: + Free tier → FreeSWITCHBackend (BYOK VoIP) + Paid tier → SignalWireBackend (CF-provisioned) + """ + use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1" + if use_mock: + return MockTelephonyBackend() + + resolved_backend = backend or ( + "signalwire" if os.environ.get("CF_SW_PROJECT_ID") else + "freeswitch" if os.environ.get("CF_ESL_PASSWORD") else + None + ) + + if resolved_backend == "signalwire": + return SignalWireBackend() + + if resolved_backend == "freeswitch": + return FreeSWITCHBackend() + + raise RuntimeError( + "No telephony backend configured. " + "Set CF_VOICE_MOCK=1 for mock mode, or provide SignalWire / FreeSWITCH credentials." + ) diff --git a/cf_voice/trajectory.py b/cf_voice/trajectory.py new file mode 100644 index 0000000..3b2608c --- /dev/null +++ b/cf_voice/trajectory.py @@ -0,0 +1,288 @@ +# cf_voice/trajectory.py — affect trajectory and SER/VAD coherence signals +# +# MIT licensed — derived computation only, no inference models. +# +# Two signal families: +# +# 1. TrajectorySignal — rolling arousal/valence trend across the last N windows. +# Detects escalation, de-escalation, suppression, worsening, improving. +# +# 2. CoherenceSignal — cross-model comparison between SER (categorical affect) +# and VAD (continuous dimensional valence). Disagreement indicates affect +# suppression, controlled presentation, or surface-only semantic reframe. +# +# Both signals activate only after BASELINE_MIN windows per speaker are buffered. +# All thresholds are relative to the per-speaker rolling mean, not absolute — +# this is required for ND/neurodivergent speaker safety (see design doc). +# +# Safety note: these signals must never be labelled "deception" in any +# user-facing context. Use: "affect divergence", "controlled presentation", +# "framing shift". The user interprets; the system observes. 
from __future__ import annotations

from collections import deque
from dataclasses import dataclass
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Annotation-only import: nothing in this module instantiates
    # DimensionalResult, so deferring the import keeps this module loadable
    # without the dimensional model stack and avoids import cycles.
    from cf_voice.dimensional import DimensionalResult

# Rolling window depth per speaker
BUFFER_WINDOW = 5

# Minimum frames before signals activate (relative baseline requirement)
BASELINE_MIN = 3

# Minimum arousal/valence delta per window to count as directional movement
_DELTA_THRESHOLD = 0.05

# Arousal threshold above which "neutral SER + high arousal" = suppression candidate
_SUPPRESSION_AROUSAL_MIN = 0.65

# SER affects that imply low arousal presentation (used for suppression detection)
_LOW_PRESENTATION_AFFECTS = frozenset({"neutral", "scripted", "tired", "apologetic"})

# Expected valence ranges derived from MSP-Podcast emotion distribution.
# Used to determine whether SER affect label and dimensional valence agree.
_AFFECT_VALENCE_PRIOR: dict[str, tuple[float, float]] = {
    "warm": (0.60, 1.00),
    "genuine": (0.55, 1.00),
    "optimistic": (0.55, 0.90),
    "neutral": (0.35, 0.65),
    "confused": (0.30, 0.60),
    "scripted": (0.30, 0.65),
    "apologetic": (0.20, 0.55),
    "tired": (0.10, 0.50),
    "frustrated": (0.10, 0.45),
    "dismissive": (0.15, 0.50),
    "condescending": (0.10, 0.45),
    "urgent": (0.15, 0.55),
}

# Ordinal positivity for reframe direction detection.
# Higher = more positive presentation.
_AFFECT_POSITIVITY: dict[str, int] = {
    "urgent": 1,
    "frustrated": 1,
    "condescending": 1,
    "dismissive": 2,
    "tired": 2,
    "apologetic": 3,
    "confused": 3,
    "scripted": 4,
    "neutral": 4,
    "optimistic": 5,
    "genuine": 5,
    "warm": 6,
}


@dataclass
class TrajectorySignal:
    """
    Rolling trend across recent dimensional frames for one speaker.

    All delta values: current_frame_value - mean(buffer_values).
    Positive arousal_delta = current frame is more activated than baseline.
    Negative valence_delta = current frame is more negative than baseline.

    trend values:
      "calibrating"    not enough frames yet (< BASELINE_MIN)
      "stable"         no significant directional movement
      "escalating"     arousal rising: current > mean by DELTA_THRESHOLD, consecutive
      "de-escalating"  arousal falling after elevated period
      "worsening"      valence falling: current < mean, consecutive
      "improving"      valence rising after depressed period
      "suppressed"     SER affect is calm/neutral, VAD arousal is elevated
    """
    arousal_delta: float
    valence_delta: float
    dominance_delta: float
    arousal_trend: str  # "rising" | "falling" | "flat"
    valence_trend: str  # "rising" | "falling" | "flat"
    trend: str
    frames_in_buffer: int
    baseline_established: bool


@dataclass
class CoherenceSignal:
    """
    Cross-signal comparison: SER categorical affect vs. VAD dimensional valence.

    coherence_score:
      1.0 = SER label and VAD valence are fully consistent.
      0.0 = maximum disagreement.

    suppression_flag:
      True when the speaker is presenting as calm/neutral (SER) but VAD arousal
      is elevated. Indicates controlled presentation with activation underneath.
      This is relative to a per-session threshold — not a universal claim.

    reframe_type:
      "none"     no SER category shift this window
      "genuine"  SER shifted toward more positive AND dimensional valence also
                 improved (>= DELTA_THRESHOLD in this window)
      "surface"  SER shifted toward more positive BUT dimensional valence
                 continued its prior trajectory unchanged or worsening

    affect_divergence:
      Signed: VAD-implied valence minus SER-implied valence midpoint.
      Negative = VAD more negative than SER label implies (masking candidate).
      Positive = VAD more positive than SER label implies (unusual).
    """
    coherence_score: float
    suppression_flag: bool
    reframe_type: str  # "none" | "genuine" | "surface"
    affect_divergence: float


# ── Public helpers ─────────────────────────────────────────────────────────────


def affect_coherence(affect: str, valence: float) -> float:
    """
    Compute coherence between a SER affect category and a VAD valence score.

    Returns 1.0 when valence falls inside the expected range for the affect.
    Returns 0.0 when the gap between valence and the nearest range boundary
    exceeds 0.40 (the full range of a typical incoherence gap).
    """
    lo, hi = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    if lo <= valence <= hi:
        return 1.0
    gap = min(abs(valence - lo), abs(valence - hi))
    return round(max(0.0, 1.0 - (gap / 0.40)), 3)


def affect_divergence_score(affect: str, valence: float) -> float:
    """
    Signed divergence: actual VAD valence minus the midpoint of the expected range.

    Negative = VAD more negative than SER label implies.
    Positive = VAD more positive than SER label implies.
    """
    lo, hi = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
    midpoint = (lo + hi) / 2.0
    return round(valence - midpoint, 3)


def compute_trajectory(
    buffer: deque[DimensionalResult],
    current: DimensionalResult,
    ser_affect: str,
    prior_ser_affect: str | None,
) -> tuple[TrajectorySignal, CoherenceSignal]:
    """
    Compute trajectory and coherence signals for one speaker at one window.

    buffer            Rolling deque of prior DimensionalResult for this speaker.
                      Must be updated AFTER this call (append current to buffer).
    current           DimensionalResult for the window being classified.
    ser_affect        SER affect label for this window (from ToneClassifier).
    prior_ser_affect  SER affect label from the previous window, for reframe detection.
                      Pass None on the first window or when not tracking.

    Returns (TrajectorySignal, CoherenceSignal). The TrajectorySignal has
    baseline_established=False and trend="calibrating" when buffer has fewer
    than BASELINE_MIN entries; the CoherenceSignal is always fully computed.
    """
    n = len(buffer)

    # Coherence can be computed without a buffer
    coh_score = affect_coherence(ser_affect, current.valence)
    div_score = affect_divergence_score(ser_affect, current.valence)

    suppression = (
        ser_affect in _LOW_PRESENTATION_AFFECTS
        and current.arousal > _SUPPRESSION_AROUSAL_MIN
        and current.valence < 0.50
    )

    reframe = "none"
    if prior_ser_affect and prior_ser_affect != ser_affect:
        if _is_more_positive(ser_affect, prior_ser_affect):
            # Valence actually improved in this window vs. single prior frame
            if n >= 1:
                # deque supports O(1) indexed access at the ends — no list copy
                prev_valence = buffer[-1].valence
                dim_improved = (current.valence - prev_valence) >= _DELTA_THRESHOLD
            else:
                dim_improved = False
            reframe = "genuine" if dim_improved else "surface"

    coher = CoherenceSignal(
        coherence_score=coh_score,
        suppression_flag=suppression,
        reframe_type=reframe,
        affect_divergence=div_score,
    )

    if n < BASELINE_MIN:
        traj = TrajectorySignal(
            arousal_delta=0.0,
            valence_delta=0.0,
            dominance_delta=0.0,
            arousal_trend="flat",
            valence_trend="flat",
            trend="calibrating",
            frames_in_buffer=n,
            baseline_established=False,
        )
        return traj, coher

    mean_arousal = sum(f.arousal for f in buffer) / n
    mean_valence = sum(f.valence for f in buffer) / n
    mean_dominance = sum(f.dominance for f in buffer) / n

    a_delta = current.arousal - mean_arousal
    v_delta = current.valence - mean_valence
    d_delta = current.dominance - mean_dominance

    a_trend = (
        "rising" if a_delta > _DELTA_THRESHOLD else
        "falling" if a_delta < -_DELTA_THRESHOLD else
        "flat"
    )
    v_trend = (
        "rising" if v_delta > _DELTA_THRESHOLD else
        "falling" if v_delta < -_DELTA_THRESHOLD else
        "flat"
    )

    # "Consecutive" movement: the current frame also moved (by > 0.03) in the
    # trend direction relative to the single most recent buffered frame, i.e.
    # the window-vs-baseline trend is backed by frame-to-frame movement.
    prev = buffer[-1]
    a_consecutive = a_trend == "rising" and (current.arousal - prev.arousal) > 0.03
    v_consecutive = v_trend == "falling" and (current.valence - prev.valence) < -0.03

    # Composite trend label (suppression takes priority over direction labels)
    if suppression:
        trend = "suppressed"
    elif a_trend == "rising" and a_consecutive:
        trend = "escalating"
    elif a_trend == "falling" and mean_arousal > 0.55:
        trend = "de-escalating"
    elif v_trend == "falling" and v_consecutive:
        trend = "worsening"
    elif v_trend == "rising" and mean_valence < 0.45:
        trend = "improving"
    else:
        trend = "stable"

    traj = TrajectorySignal(
        arousal_delta=round(a_delta, 3),
        valence_delta=round(v_delta, 3),
        dominance_delta=round(d_delta, 3),
        arousal_trend=a_trend,
        valence_trend=v_trend,
        trend=trend,
        frames_in_buffer=n,
        baseline_established=True,
    )
    return traj, coher


# ── Internal helpers ───────────────────────────────────────────────────────────


def _is_more_positive(current: str, prior: str) -> bool:
    """True when the current SER affect is ranked more positive than prior."""
    return _AFFECT_POSITIVITY.get(current, 4) > _AFFECT_POSITIVITY.get(prior, 4)


# ── NOTE(review): relocated patch scaffolding ─────────────────────────────────
# The original patch chunk continued here with the pyproject.toml hunk; its
# text is preserved verbatim below (commented) so no content is lost:
#   diff --git a/pyproject.toml b/pyproject.toml
#   index 0aa0c6c..5c67341 100644
#   --- a/pyproject.toml
#   +++ b/pyproject.toml
#   @@ -11,6 +11,8 @@ requires-python = ">=3.11"
#    license = {text = "MIT"}
#    dependencies = [
#        "pydantic>=2.0",
#   +    "fastapi>=0.111",
#   +    "uvicorn[standard]>=0.29",
#    ]
#   @@ -26,6 +28,14 @@ inference = [
#        "pyannote.audio>=3.1",
#        "python-dotenv>=1.0",
#    ]
#   +signalwire = [
#   +    "signalwire>=2.0",
#   +]
#   +freeswitch = [
#   +    # ESL Python bindings are compiled from FreeSWITCH source.
+    # See: https://developer.signalwire.com/freeswitch/FreeSWITCH-Explained/Client-and-Developer-Interfaces/Event-Socket-Library/
+    "python-ESL",
+]
 dev = [
     "pytest>=8.0",
     "pytest-asyncio>=0.23",
diff --git a/scripts/test_classify_e2e.py b/scripts/test_classify_e2e.py
new file mode 100644
index 0000000..a16c962
--- /dev/null
+++ b/scripts/test_classify_e2e.py
@@ -0,0 +1,69 @@
+"""
+End-to-end integration test for the cf-voice /classify endpoint.
+
+Extracts a 2-second window from a local media file, base64-encodes the
+raw PCM, and POSTs it to the running cf-voice service at localhost:8009.
+Prints each returned AudioEvent for quick inspection.
+
+Requires:
+  - cf-voice running at localhost:8009 (CF_VOICE_DIARIZE=1 for speaker labels)
+  - ffmpeg on PATH
+  - A local audio/video file (edit MEDIA_FILE below)
+
+Run:
+  python scripts/test_classify_e2e.py
+"""
+from __future__ import annotations
+
+import base64
+import json
+import subprocess
+import urllib.request
+
+import numpy as np
+
+MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
+START_S = 120
+DURATION_S = 2
+SAMPLE_RATE = 16_000
+CF_VOICE_URL = "http://localhost:8009"
+
+proc = subprocess.run(
+    [
+        "ffmpeg", "-i", MEDIA_FILE,
+        "-ss", str(START_S),
+        "-t", str(DURATION_S),
+        "-ar", str(SAMPLE_RATE),
+        "-ac", "1",
+        "-f", "s16le",
+        "-",
+    ],
+    capture_output=True,
+    check=True,
+)
+
+pcm = proc.stdout
+audio = np.frombuffer(pcm, dtype=np.int16)
+print(f"audio samples: {len(audio)}, duration: {len(audio) / SAMPLE_RATE:.2f}s")
+
+payload = json.dumps({
+    "audio_chunk": base64.b64encode(pcm).decode(),
+    "timestamp": float(START_S),
+    "session_id": "test",
+}).encode()
+
+req = urllib.request.Request(
+    f"{CF_VOICE_URL}/classify",
+    data=payload,
+    headers={"Content-Type": "application/json"},
+    method="POST",
+)
+with urllib.request.urlopen(req, timeout=30) as resp:
+    result = json.loads(resp.read())
+
+for ev in result["events"]:
+    print(
+        f"  {ev['event_type']:10}"
+        f" speaker_id={ev.get('speaker_id', 'N/A'):14}"
+        f" label={ev.get('label', '')}"
+    )
# NOTE(review): both manual scripts below target localhost:8009, while the
# commit message says app.py serves on port 8007 — confirm the intended port.
# NOTE(review): MEDIA_FILE is a hard-coded personal library path; fine for a
# manual script, but consider reading it from argv or an env var.
diff --git a/scripts/test_diarize_real.py b/scripts/test_diarize_real.py
new file mode 100644
index 0000000..4f38183
--- /dev/null
+++ b/scripts/test_diarize_real.py
@@ -0,0 +1,65 @@
+"""
+Manual integration test for speaker diarization via pyannote.
+
+Requires:
+  - HF_TOKEN env var (or set below)
+  - CF_VOICE_DIARIZE=1
+  - ffmpeg on PATH
+  - A local audio/video file (edit MEDIA_FILE below)
+  - pip install cf-voice[inference]
+
+Run:
+  HF_TOKEN=hf_... CF_VOICE_DIARIZE=1 python scripts/test_diarize_real.py
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import subprocess
+
+import numpy as np
+
+# Override if not in env
+if not os.environ.get("HF_TOKEN"):
+    raise SystemExit("Set HF_TOKEN in env before running this script.")
+os.environ.setdefault("CF_VOICE_DIARIZE", "1")
+
+MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
+START_S = 120
+DURATION_S = 2
+SAMPLE_RATE = 16_000
+
+from cf_voice.diarize import Diarizer, SpeakerTracker  # noqa: E402
+
+
+async def main() -> None:
+    d = Diarizer.from_env()
+    tracker = SpeakerTracker()
+
+    proc = subprocess.run(
+        [
+            "ffmpeg", "-i", MEDIA_FILE,
+            "-ss", str(START_S),
+            "-t", str(DURATION_S),
+            "-ar", str(SAMPLE_RATE),
+            "-ac", "1",
+            "-f", "s16le",
+            "-",
+        ],
+        capture_output=True,
+        check=True,
+    )
+    audio = np.frombuffer(proc.stdout, dtype=np.int16).astype(np.float32) / 32768.0
+    rms = float(np.sqrt(np.mean(audio**2)))
+    print(f"audio: {len(audio)} samples, {len(audio) / SAMPLE_RATE:.2f}s, rms={rms:.4f}")
+
+    segs = await d.diarize_async(audio)
+    print(f"segments ({len(segs)}): {segs}")
+
+    mid = len(audio) / 2.0 / SAMPLE_RATE
+    label = d.speaker_at(segs, mid, tracker)
+    print(f"speaker_at({mid:.2f}s): {label}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests/test_acoustic.py b/tests/test_acoustic.py
new file mode
100644
index 0000000..07e60bc
--- /dev/null
+++ b/tests/test_acoustic.py
@@ -0,0 +1,119 @@
+import pytest
+from cf_voice.acoustic import (
+    AcousticBackend,
+    AcousticResult,
+    ASTAcousticBackend,
+    MockAcousticBackend,
+    make_acoustic,
+)
+from cf_voice.events import AudioEvent
+
+
+class TestAcousticResult:
+    def test_fields(self):
+        evt = AudioEvent(timestamp=1.0, event_type="queue", label="ringback", confidence=0.9)
+        result = AcousticResult(queue=evt, speaker=None, environ=None, scene=None, timestamp=1.0)
+        assert result.queue.label == "ringback"
+        assert result.speaker is None
+        assert result.environ is None
+        assert result.scene is None
+
+
+class TestMockAcousticBackend:
+    def test_classify_returns_result(self):
+        backend = MockAcousticBackend(seed=0)
+        result = backend.classify_window(b"", timestamp=0.0)
+        assert isinstance(result, AcousticResult)
+        assert result.timestamp == 0.0
+
+    def test_all_events_present(self):
+        backend = MockAcousticBackend(seed=1)
+        result = backend.classify_window(b"", timestamp=1.0)
+        assert result.queue is not None
+        assert result.speaker is not None
+        assert result.environ is not None
+        assert result.scene is not None
+
+    def test_event_types_correct(self):
+        backend = MockAcousticBackend(seed=2)
+        result = backend.classify_window(b"", timestamp=2.0)
+        assert result.queue.event_type == "queue"
+        assert result.speaker.event_type == "speaker"
+        assert result.environ.event_type == "environ"
+        assert result.scene.event_type == "scene"
+
+    def test_confidence_in_range(self):
+        backend = MockAcousticBackend(seed=3)
+        for _ in range(5):
+            result = backend.classify_window(b"", timestamp=0.0)
+            assert 0.0 <= result.queue.confidence <= 1.0
+            assert 0.0 <= result.speaker.confidence <= 1.0
+            assert 0.0 <= result.environ.confidence <= 1.0
+            assert 0.0 <= result.scene.confidence <= 1.0
+
+    def test_lifecycle_advances(self):
+        """Phases should change after their duration elapses."""
+        import time
+        backend = MockAcousticBackend(seed=42)
+        # Force phase to advance by manipulating phase_start
+        backend._phase_start -= 1000  # pretend 1000s elapsed
+        result = backend.classify_window(b"", timestamp=0.0)
+        # Should have advanced — just verify it doesn't crash and returns valid
+        assert result.queue.label in (
+            "hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone"
+        )
+
+    def test_isinstance_protocol(self):
+        backend = MockAcousticBackend()
+        assert isinstance(backend, AcousticBackend)
+
+    def test_deterministic_with_seed(self):
+        b1 = MockAcousticBackend(seed=99)
+        b2 = MockAcousticBackend(seed=99)
+        r1 = b1.classify_window(b"", timestamp=0.0)
+        r2 = b2.classify_window(b"", timestamp=0.0)
+        assert r1.queue.label == r2.queue.label
+        assert r1.queue.confidence == r2.queue.confidence
+
+
+class TestASTAcousticBackend:
+    def test_raises_import_error_without_deps(self, monkeypatch):
+        """ASTAcousticBackend should raise ImportError when transformers is unavailable."""
+        import builtins
+        real_import = builtins.__import__
+
+        def mock_import(name, *args, **kwargs):
+            if name in ("transformers",):
+                raise ImportError(f"Mocked: {name} not available")
+            return real_import(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", mock_import)
+        with pytest.raises(ImportError, match="transformers"):
+            ASTAcousticBackend()
+
+
+class TestMakeAcoustic:
+    def test_mock_flag(self):
+        backend = make_acoustic(mock=True)
+        assert isinstance(backend, MockAcousticBackend)
+
+    def test_mock_env(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_MOCK", "1")
+        backend = make_acoustic()
+        assert isinstance(backend, MockAcousticBackend)
+
+    def test_real_falls_back_to_mock_without_deps(self, monkeypatch, capsys):
+        """make_acoustic(mock=False) falls back to mock when deps are missing."""
+        import builtins
+        real_import = builtins.__import__
+
+        def mock_import(name, *args, **kwargs):
+            if name in ("transformers",):
+                raise ImportError(f"Mocked: {name} not available")
+            return real_import(name, *args, **kwargs)
+
+        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+        monkeypatch.setattr(builtins, "__import__", mock_import)
+        backend = make_acoustic(mock=False)
+        # Should fall back gracefully, never raise
+        assert isinstance(backend, MockAcousticBackend)
# NOTE(review): `import time` inside test_lifecycle_advances above is unused —
# candidate for removal. In tests/test_diarize.py below, top-level `import os`
# is likewise unused (only monkeypatch touches the environment).
diff --git a/tests/test_diarize.py b/tests/test_diarize.py
new file mode 100644
index 0000000..0bbc4fc
--- /dev/null
+++ b/tests/test_diarize.py
@@ -0,0 +1,131 @@
+# tests/test_diarize.py — SpeakerTracker and speaker_at() diarization logic
+#
+# All tests are pure Python — no GPU, no pyannote, no HF_TOKEN required.
+# The Diarizer class itself is only tested for its from_env() guard and the
+# speaker_at() method, both of which run without loading the model.
+from __future__ import annotations
+
+import os
+import pytest
+
+from cf_voice.diarize import (
+    Diarizer,
+    SpeakerSegment,
+    SpeakerTracker,
+    SPEAKER_MULTIPLE,
+    SPEAKER_UNKNOWN,
+)
+
+
+# ── SpeakerTracker ────────────────────────────────────────────────────────────
+
+def test_tracker_first_speaker_is_a():
+    t = SpeakerTracker()
+    assert t.label("SPEAKER_00") == "Speaker A"
+
+
+def test_tracker_second_speaker_is_b():
+    t = SpeakerTracker()
+    t.label("SPEAKER_00")
+    assert t.label("SPEAKER_01") == "Speaker B"
+
+
+def test_tracker_same_id_returns_same_label():
+    t = SpeakerTracker()
+    first = t.label("SPEAKER_00")
+    second = t.label("SPEAKER_00")
+    assert first == second == "Speaker A"
+
+
+def test_tracker_26_speakers():
+    t = SpeakerTracker()
+    labels = [t.label(f"SPEAKER_{i:02d}") for i in range(26)]
+    assert labels[0] == "Speaker A"
+    assert labels[25] == "Speaker Z"
+
+
+def test_tracker_27th_speaker_wraps():
+    t = SpeakerTracker()
+    for i in range(26):
+        t.label(f"SPEAKER_{i:02d}")
+    label_27 = t.label("SPEAKER_26")
+    assert label_27 == "Speaker AA"
+
+
+def test_tracker_reset_clears_map():
+    t = SpeakerTracker()
+    t.label("SPEAKER_00")
+    t.label("SPEAKER_01")
+    t.reset()
+    # After reset, SPEAKER_01 is seen as new and maps to "Speaker A" again
+    assert t.label("SPEAKER_01") == "Speaker A"
+
+
+# ── Diarizer.speaker_at() ─────────────────────────────────────────────────────
+
+def _segs(*items: tuple[str, float, float]) -> list[SpeakerSegment]:
+    return [SpeakerSegment(speaker_id=s, start_s=st, end_s=en) for s, st, en in items]
+
+
+def test_speaker_at_single_speaker():
+    d = object.__new__(Diarizer)  # bypass __init__ (no GPU needed)
+    segs = _segs(("SPEAKER_00", 0.0, 2.0))
+    t = SpeakerTracker()
+    assert d.speaker_at(segs, 1.0, tracker=t) == "Speaker A"
+
+
+def test_speaker_at_no_coverage_returns_unknown():
+    d = object.__new__(Diarizer)
+    segs = _segs(("SPEAKER_00", 0.0, 1.0))
+    assert d.speaker_at(segs, 1.5) == SPEAKER_UNKNOWN
+
+
+def test_speaker_at_empty_segments_returns_unknown():
+    d = object.__new__(Diarizer)
+    assert d.speaker_at([], 1.0) == SPEAKER_UNKNOWN
+
+
+def test_speaker_at_overlap_returns_multiple():
+    d = object.__new__(Diarizer)
+    segs = _segs(
+        ("SPEAKER_00", 0.0, 2.0),
+        ("SPEAKER_01", 0.5, 2.0),  # overlaps SPEAKER_00 from 0.5s
+    )
+    assert d.speaker_at(segs, 1.0) == SPEAKER_MULTIPLE
+
+
+def test_speaker_at_boundary_inclusive():
+    d = object.__new__(Diarizer)
+    segs = _segs(("SPEAKER_00", 1.0, 2.0))
+    t = SpeakerTracker()
+    # Exact boundary timestamps are included
+    assert d.speaker_at(segs, 1.0, tracker=t) == "Speaker A"
+    assert d.speaker_at(segs, 2.0, tracker=t) == "Speaker A"
+
+
+def test_speaker_at_without_tracker_returns_raw_id():
+    d = object.__new__(Diarizer)
+    segs = _segs(("SPEAKER_00", 0.0, 2.0))
+    assert d.speaker_at(segs, 1.0) == "SPEAKER_00"
+
+
+def test_speaker_at_two_speakers_no_overlap():
+    d = object.__new__(Diarizer)
+    t = SpeakerTracker()
+    segs = _segs(
+        ("SPEAKER_00", 0.0, 1.0),
+        ("SPEAKER_01", 1.5, 2.5),
+    )
+    assert d.speaker_at(segs, 0.5, tracker=t) == "Speaker A"
+    assert d.speaker_at(segs, 2.0, tracker=t) == "Speaker B"
+    # Gap at 1.2s: window [0.7, 1.7] → SPEAKER_00 has 0.3s, SPEAKER_01 has 0.2s
+    # Dominant speaker (SPEAKER_00 = "Speaker A") is returned, not SPEAKER_UNKNOWN.
+    assert d.speaker_at(segs, 1.2, tracker=t) == "Speaker A"
+
+
+# ── Diarizer.from_env() guard ─────────────────────────────────────────────────
+
+def test_from_env_raises_without_hf_token(monkeypatch):
+    monkeypatch.delenv("HF_TOKEN", raising=False)
+    with pytest.raises(EnvironmentError, match="HF_TOKEN"):
+        Diarizer.from_env()
diff --git a/tests/test_models.py b/tests/test_models.py
index 5743df0..e057456 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -75,10 +75,60 @@ class TestMockVoiceIO:
         io = make_io()
         assert isinstance(io, MockVoiceIO)
 
-    def test_make_io_real_raises(self, monkeypatch):
+    def test_make_io_real_returns_mic_io(self, monkeypatch):
+        """make_io(mock=False) returns MicVoiceIO when sounddevice/numpy are installed."""
+        from cf_voice.capture import MicVoiceIO
         monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
-        with pytest.raises(NotImplementedError):
-            make_io(mock=False)
+        io = make_io(mock=False)
+        assert isinstance(io, MicVoiceIO)
+
+
+class TestContextClassifierChunk:
+    """Tests for classify_chunk() — multi-class event output."""
+
+    def test_mock_returns_four_event_types(self):
+        classifier = ContextClassifier.mock(interval_s=0.05, seed=10)
+        events = classifier.classify_chunk(timestamp=1.0)
+        types = {e.event_type for e in events}
+        # In mock mode all four event types should be present
+        assert "tone" in types
+        assert "queue" in types
+        assert "speaker" in types
+        assert "environ" in types
+
+    def test_mock_tone_event_has_subtext(self):
+        classifier = ContextClassifier.mock(interval_s=0.05, seed=11)
+        events = classifier.classify_chunk(timestamp=0.0)
+        tone_events = [e for e in events if e.event_type == "tone"]
+        assert len(tone_events) == 1
+        assert tone_events[0].subtext is not None
+
+    def test_elcor_override_flag(self):
+        classifier = ContextClassifier.mock(interval_s=0.05, seed=12)
+        events_generic = classifier.classify_chunk(timestamp=0.0, elcor=False)
+        events_elcor =
classifier.classify_chunk(timestamp=0.0, elcor=True)
+
+        def subtext(evs):
+            return next(e.subtext for e in evs if e.event_type == "tone")
+
+        generic_sub = subtext(events_generic)
+        elcor_sub = subtext(events_elcor)
+        # Generic format: "Tone: X". Elcor format: "With X:" or "Warmly:" etc.
+        assert generic_sub.startswith("Tone:") or not generic_sub.endswith(":")
+        # Elcor format ends with ":"
+        assert elcor_sub.endswith(":")
+
+    def test_session_id_propagates(self):
+        classifier = ContextClassifier.mock(interval_s=0.05, seed=13)
+        events = classifier.classify_chunk(timestamp=0.0, session_id="ses_test")
+        tone_events = [e for e in events if e.event_type == "tone"]
+        assert tone_events[0].session_id == "ses_test"
+
+    def test_prior_frames_zero_means_no_shift(self):
+        classifier = ContextClassifier.mock(interval_s=0.05, seed=14)
+        events = classifier.classify_chunk(timestamp=0.0, prior_frames=0)
+        tone_events = [e for e in events if e.event_type == "tone"]
+        assert tone_events[0].shift_magnitude == 0.0
 
 
 class TestContextClassifier:
# NOTE(review): in tests/test_prefs.py below, top-level `import os` is unused.
# Also, the comment inside test_store_beats_env contradicts the code: the store
# is first seeded with False and then overwritten to True before the assertion,
# so the test never exercises the "explicit falsy store value beats env" case.
diff --git a/tests/test_prefs.py b/tests/test_prefs.py
new file mode 100644
index 0000000..42b588b
--- /dev/null
+++ b/tests/test_prefs.py
@@ -0,0 +1,109 @@
+import os
+import pytest
+from cf_voice.prefs import (
+    PREF_CONFIDENCE_THRESHOLD,
+    PREF_ELCOR_MODE,
+    PREF_ELCOR_PRIOR_FRAMES,
+    PREF_WHISPER_MODEL,
+    get_confidence_threshold,
+    get_elcor_prior_frames,
+    get_voice_pref,
+    get_whisper_model,
+    is_elcor_enabled,
+    set_voice_pref,
+)
+
+
+class _DictStore:
+    """In-memory preference store for testing."""
+
+    def __init__(self, data: dict | None = None) -> None:
+        self._data: dict = data or {}
+
+    def get(self, user_id, path, default=None):
+        return self._data.get(path, default)
+
+    def set(self, user_id, path, value):
+        self._data[path] = value
+
+
+class TestGetVoicePref:
+    def test_returns_default_when_nothing_set(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_ELCOR", raising=False)
+        val = get_voice_pref(PREF_ELCOR_MODE, store=_DictStore())
+        assert val is False
+
+    def test_explicit_store_takes_priority(self):
+        store = _DictStore({PREF_ELCOR_MODE: True})
+        assert get_voice_pref(PREF_ELCOR_MODE, store=store) is True
+
+    def test_env_fallback_bool(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_ELCOR", "1")
+        assert get_voice_pref(PREF_ELCOR_MODE, store=_DictStore()) is True
+
+    def test_env_fallback_false(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_ELCOR", "0")
+        assert get_voice_pref(PREF_ELCOR_MODE, store=_DictStore()) is False
+
+    def test_env_fallback_float(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_CONFIDENCE_THRESHOLD", "0.7")
+        val = get_voice_pref(PREF_CONFIDENCE_THRESHOLD, store=_DictStore())
+        assert abs(val - 0.7) < 1e-9
+
+    def test_env_fallback_int(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_ELCOR_PRIOR_FRAMES", "6")
+        val = get_voice_pref(PREF_ELCOR_PRIOR_FRAMES, store=_DictStore())
+        assert val == 6
+
+    def test_env_fallback_str(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_WHISPER_MODEL", "medium")
+        val = get_voice_pref(PREF_WHISPER_MODEL, store=_DictStore())
+        assert val == "medium"
+
+    def test_store_beats_env(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_ELCOR", "1")
+        store = _DictStore({PREF_ELCOR_MODE: False})
+        # store has explicit False — but store.get returns None for falsy values
+        # only if the key is absent; here key IS set so store wins
+        store._data[PREF_ELCOR_MODE] = True
+        assert get_voice_pref(PREF_ELCOR_MODE, store=store) is True
+
+    def test_unknown_key_returns_none(self):
+        val = get_voice_pref("voice.nonexistent", store=_DictStore())
+        assert val is None
+
+
+class TestSetVoicePref:
+    def test_sets_in_store(self):
+        store = _DictStore()
+        set_voice_pref(PREF_ELCOR_MODE, True, store=store)
+        assert store._data[PREF_ELCOR_MODE] is True
+
+    def test_no_store_raises(self, monkeypatch):
+        # Patch _cf_core_store to return None (simulates no cf-core installed)
+        import cf_voice.prefs as prefs_mod
+        monkeypatch.setattr(prefs_mod, "_cf_core_store", lambda: None)
+        with pytest.raises(RuntimeError, match="No writable preference store"):
+            set_voice_pref(PREF_ELCOR_MODE, True)
+
+
+class TestConvenienceHelpers:
+    def test_is_elcor_enabled_false_default(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_ELCOR", raising=False)
+        assert is_elcor_enabled(store=_DictStore()) is False
+
+    def test_is_elcor_enabled_true_from_store(self):
+        store = _DictStore({PREF_ELCOR_MODE: True})
+        assert is_elcor_enabled(store=store) is True
+
+    def test_get_confidence_threshold_default(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_CONFIDENCE_THRESHOLD", raising=False)
+        assert get_confidence_threshold(store=_DictStore()) == pytest.approx(0.55)
+
+    def test_get_whisper_model_default(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_WHISPER_MODEL", raising=False)
+        assert get_whisper_model(store=_DictStore()) == "small"
+
+    def test_get_elcor_prior_frames_default(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_ELCOR_PRIOR_FRAMES", raising=False)
+        assert get_elcor_prior_frames(store=_DictStore()) == 4
diff --git a/tests/test_telephony.py b/tests/test_telephony.py
new file mode 100644
index 0000000..d83dbaf
--- /dev/null
+++ b/tests/test_telephony.py
@@ -0,0 +1,141 @@
+import asyncio
+import pytest
+from cf_voice.telephony import (
+    CallSession,
+    MockTelephonyBackend,
+    TelephonyBackend,
+    make_telephony,
+)
+
+
+class TestCallSession:
+    def test_defaults(self):
+        s = CallSession(call_sid="sid_1", to="+15551234567", from_="+18005550000")
+        assert s.state == "dialing"
+        assert s.amd_result == "unknown"
+        assert s.duration_s == 0.0
+        assert s.error is None
+
+    def test_state_mutation(self):
+        s = CallSession(call_sid="sid_2", to="+1", from_="+2", state="in_progress")
+        s.state = "completed"
+        assert s.state == "completed"
+
+
+class TestMockTelephonyBackend:
+    @pytest.mark.asyncio
+    async def test_dial_returns_session(self):
+        backend = MockTelephonyBackend()
+        session = await
backend.dial("+15551234567", "+18005550000", "https://example.com/wh")
+        assert isinstance(session, CallSession)
+        assert session.call_sid.startswith("mock_sid_")
+        assert session.to == "+15551234567"
+        assert session.from_ == "+18005550000"
+
+    @pytest.mark.asyncio
+    async def test_dial_transitions_to_in_progress(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+15551234567", "+18005550000", "https://x.com")
+        # give the background task a moment to transition
+        await asyncio.sleep(0.1)
+        assert session.state == "in_progress"
+
+    @pytest.mark.asyncio
+    async def test_amd_resolves_human(self):
+        backend = MockTelephonyBackend(amd_delay_s=0.05)
+        session = await backend.dial("+1555", "+1800", "https://x.com", amd=True)
+        await asyncio.sleep(0.2)
+        assert session.amd_result == "human"
+
+    @pytest.mark.asyncio
+    async def test_send_dtmf(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+1", "+2", "https://x.com")
+        # should not raise
+        await backend.send_dtmf(session.call_sid, "1234#")
+
+    @pytest.mark.asyncio
+    async def test_send_dtmf_unknown_sid_raises(self):
+        backend = MockTelephonyBackend()
+        with pytest.raises(KeyError):
+            await backend.send_dtmf("nonexistent_sid", "1")
+
+    @pytest.mark.asyncio
+    async def test_bridge_updates_state(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+1", "+2", "https://x.com")
+        await backend.bridge(session.call_sid, "+15559999999")
+        assert session.state == "bridged"
+
+    @pytest.mark.asyncio
+    async def test_hangup_sets_completed(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+1", "+2", "https://x.com")
+        await backend.hangup(session.call_sid)
+        assert session.state == "completed"
+
+    @pytest.mark.asyncio
+    async def test_hangup_idempotent(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+1", "+2", "https://x.com")
+        await backend.hangup(session.call_sid)
+        await backend.hangup(session.call_sid)
+        assert session.state == "completed"
+
+    @pytest.mark.asyncio
+    async def test_announce_does_not_raise(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+1", "+2", "https://x.com")
+        await backend.announce(session.call_sid, "Hello, this is an automated assistant.")
+
+    @pytest.mark.asyncio
+    async def test_get_state(self):
+        backend = MockTelephonyBackend()
+        session = await backend.dial("+1", "+2", "https://x.com")
+        state = await backend.get_state(session.call_sid)
+        assert state in ("ringing", "in_progress", "dialing")
+
+    @pytest.mark.asyncio
+    async def test_multiple_calls_unique_sids(self):
+        backend = MockTelephonyBackend()
+        s1 = await backend.dial("+1", "+2", "https://x.com")
+        s2 = await backend.dial("+3", "+4", "https://x.com")
+        assert s1.call_sid != s2.call_sid
+
+    def test_isinstance_protocol(self):
+        backend = MockTelephonyBackend()
+        assert isinstance(backend, TelephonyBackend)
+
+
+class TestMakeTelephony:
+    def test_mock_flag(self):
+        backend = make_telephony(mock=True)
+        assert isinstance(backend, MockTelephonyBackend)
+
+    def test_mock_env(self, monkeypatch):
+        monkeypatch.setenv("CF_VOICE_MOCK", "1")
+        backend = make_telephony()
+        assert isinstance(backend, MockTelephonyBackend)
+
+    def test_no_config_raises(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+        monkeypatch.delenv("CF_SW_PROJECT_ID", raising=False)
+        monkeypatch.delenv("CF_ESL_PASSWORD", raising=False)
+        with pytest.raises(RuntimeError, match="No telephony backend configured"):
+            make_telephony()
+
+    def test_signalwire_selected_by_env(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+        monkeypatch.setenv("CF_SW_PROJECT_ID", "proj_123")
+        # SignalWireBackend will raise ImportError (signalwire SDK not installed)
+        # but only at instantiation — make_telephony should call the constructor
+        with pytest.raises((ImportError, RuntimeError)):
+            make_telephony()
+
+    def test_freeswitch_selected_by_env(self, monkeypatch):
+        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+        monkeypatch.delenv("CF_SW_PROJECT_ID", raising=False)
+        monkeypatch.setenv("CF_ESL_PASSWORD", "s3cret")
+        # FreeSWITCHBackend will raise ImportError (ESL not installed)
+        with pytest.raises((ImportError, RuntimeError)):
+            make_telephony()
# NOTE(review): test_dial_transitions_to_in_progress and test_amd_resolves_human
# gate on fixed asyncio.sleep() delays before asserting state — flake-prone on
# slow CI; consider polling with a deadline instead of a single sleep.