diff --git a/.env.example b/.env.example
index 2e58c43..96497bc 100644
--- a/.env.example
+++ b/.env.example
@@ -3,14 +3,29 @@
# load it via python-dotenv in their own startup. For standalone cf-voice
# dev/testing, source this file manually or install python-dotenv.
-# ── HuggingFace ───────────────────────────────────────────────────────────────
-# Required for pyannote.audio speaker diarization model download.
-# Get a free token at https://huggingface.co/settings/tokens
-# Also accept the gated model terms at:
-# https://huggingface.co/pyannote/speaker-diarization-3.1
-# https://huggingface.co/pyannote/segmentation-3.0
+# ── HuggingFace — free tier / local use ──────────────────────────────────────
+# Used by the local diarization path (free tier, user's own machine).
+# Each user must:
+# 1. Create a free account at huggingface.co
+# 2. Accept the gated model terms at:
+# https://huggingface.co/pyannote/speaker-diarization-3.1
+# https://huggingface.co/pyannote/segmentation-3.0
+# 3. Generate a read token at huggingface.co/settings/tokens
HF_TOKEN=
+# ── HuggingFace — paid tier / cf-orch backend ─────────────────────────────────
+# Used by cf-orch when running diarization as a managed service on Heimdall.
+# This is a CircuitForge org token — NOT the user's personal token.
+#
+# Prerequisites (one-time, manual — tracked in circuitforge-orch#27):
+# 1. Create CircuitForge org on huggingface.co
+# 2. Accept pyannote/speaker-diarization-3.1 terms under the org account
+# 3. Accept pyannote/segmentation-3.0 terms under the org account
+# 4. Generate a read-only org token and set it here
+#
+# Leave blank on local installs — HF_TOKEN above is used instead.
+CF_HF_TOKEN=
+
# ── Whisper STT ───────────────────────────────────────────────────────────────
# Model size: tiny | base | small | medium | large-v2 | large-v3
# Smaller = faster / less VRAM; larger = more accurate.
@@ -29,3 +44,16 @@ CF_VOICE_MOCK=
# ── Tone classifier ───────────────────────────────────────────────────────────
# Minimum confidence to emit a VoiceFrame (below this = frame skipped).
CF_VOICE_CONFIDENCE_THRESHOLD=0.55
+
+# ── Elcor annotation mode ─────────────────────────────────────────────────────
+# Accessibility feature for autistic and ND users. Switches tone subtext from
+# generic format ("Tone: Frustrated") to Elcor-style prefix format
+# ("With barely concealed frustration:"). Opt-in, local-only.
+# Overridden by cf-core preferences store when circuitforge_core is installed.
+# 1 = enabled, 0 or unset = disabled (default).
+CF_VOICE_ELCOR=0
+
+# Number of prior VoiceFrames to include as context for Elcor label generation.
+# Larger windows = more contextually aware annotations, higher LLM prompt cost.
+# Default: 4 frames (~10 seconds of rolling context at 2.5s intervals).
+CF_VOICE_ELCOR_PRIOR_FRAMES=4
diff --git a/README.md b/README.md
index 1e14e8a..cc4595f 100644
--- a/README.md
+++ b/README.md
@@ -126,6 +126,64 @@ Host apps subscribing via `` receive `MessageEvent` with `type =
---
+---
+
+## Telephony
+
+`cf_voice.telephony` provides the outbound call abstraction for Osprey, Harrier, Ibis, and Kestrel.
+
+### Quick start
+
+```python
+from cf_voice.telephony import make_telephony
+
+# Mock mode — no real calls placed (CF_VOICE_MOCK=1 or mock=True)
+backend = make_telephony(mock=True)
+
+session = await backend.dial(
+ to="+15551234567",
+ from_="+18005550000",
+ webhook_url="https://yourapp.example.com/voice/events",
+ amd=True, # answering machine detection
+)
+
+# Adaptive service identification (osprey#21)
+await backend.announce(session.call_sid, "This is an automated assistant.")
+
+# Navigate IVR
+await backend.send_dtmf(session.call_sid, "2") # Press 2 for billing
+
+# Bridge to user's phone once human agent answers
+await backend.bridge(session.call_sid, "+14155550100")
+
+await backend.hangup(session.call_sid)
+```
+
+### Backend selection
+
+`make_telephony()` resolves the backend in this order:
+
+| Condition | Backend |
+|---|---|
+| `CF_VOICE_MOCK=1` or `mock=True` | `MockTelephonyBackend` (dev/CI) |
+| `CF_SW_PROJECT_ID` env set | `SignalWireBackend` (paid tier) |
+| `CF_ESL_PASSWORD` env set | `FreeSWITCHBackend` (free tier, self-hosted) |
+| none | `RuntimeError` |
+
+### Installing real backends
+
+```bash
+# Paid tier — SignalWire managed telephony
+pip install cf-voice[signalwire]
+
+# Free tier — self-hosted FreeSWITCH (requires compiled ESL bindings)
+pip install cf-voice[freeswitch]
+```
+
+Set credentials in `.env` (see `.env.example`).
+
+---
+
## Mock mode
Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `VoiceFrame` objects on a timer. No GPU, microphone, or `HF_TOKEN` required. All API surface is identical to real mode.
@@ -139,6 +197,7 @@ Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `Voice
| `cf_voice.models` | MIT | `VoiceFrame` dataclass |
| `cf_voice.events` | MIT | `AudioEvent`, `ToneEvent`, wire format types |
| `cf_voice.io` | MIT | `VoiceIO` base, `MockVoiceIO`, `make_io()` factory |
+| `cf_voice.telephony` | MIT (Protocol + Mock), BSL (backends) | `TelephonyBackend` Protocol, `MockTelephonyBackend`, `SignalWireBackend`, `FreeSWITCHBackend`, `make_telephony()` |
| `cf_voice.capture` | BSL 1.1 | `MicVoiceIO` — real mic capture, 2s windowing |
| `cf_voice.stt` | BSL 1.1 | `WhisperSTT` — faster-whisper async wrapper |
| `cf_voice.classify` | BSL 1.1 | `ToneClassifier` — wav2vec2 SER + librosa prosody |
@@ -149,6 +208,19 @@ BSL applies to inference modules. IO + types + wire format = MIT.
---
+---
+
+## Attribution
+
+Speaker diarization uses [pyannote.audio](https://github.com/pyannote/pyannote-audio) (MIT) and the following gated HuggingFace models (CC BY 4.0):
+
+- `pyannote/speaker-diarization-3.1` — Hervé Bredin et al.
+- `pyannote/segmentation-3.0` — Hervé Bredin et al.
+
+CC BY 4.0 requires attribution in any distributed product. The models are gated: each user must accept the license terms on HuggingFace before their `HF_TOKEN` will authorize a download.
+
+---
+
## Consumed by
- `Circuit-Forge/linnet` — real-time tone annotation PWA (primary consumer)
diff --git a/cf_voice/accent.py b/cf_voice/accent.py
new file mode 100644
index 0000000..5882510
--- /dev/null
+++ b/cf_voice/accent.py
@@ -0,0 +1,152 @@
+# cf_voice/accent.py — accent / language identification classifier
+#
+# MIT licensed (AccentResult dataclass + mock). BSL 1.1 (real inference).
+# Gated by CF_VOICE_ACCENT=1 — off by default (GPU cost + privacy sensitivity).
+#
+# Accent alone is not high-risk, but combined with birdsong or a quiet rural
+# background it becomes location-identifying. The privacy scorer accounts for
+# this compound signal.
+#
+# Real backend: facebook/mms-lid-126 for language detection, wav2vec2 accent
+# fine-tune for region. Lazy-loaded to keep startup fast.
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class AccentResult:
+ """
+ Language + regional accent classification for the primary speaker.
+
+ language: BCP-47 language tag (e.g. "en", "fr", "zh")
+ region: cf-voice ACCENT_LABEL string (e.g. "en_gb", "en_us", "other")
+ confidence: float in [0, 1]
+ """
+ language: str
+ region: str
+ confidence: float
+
+
+class MockAccentClassifier:
+ """
+ Synthetic accent classifier for development and CI.
+
+ Returns a fixed result so the privacy scorer can exercise all code paths
+ without loading a real model.
+ """
+
+ def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
+ return AccentResult(language="en", region="en_gb", confidence=0.72)
+
+
+class AccentClassifier:
+ """
+ Real accent / language classifier.
+
+ BSL 1.1 — requires [inference] extras.
+
+ Language detection: facebook/mms-lid-126 (126 languages, MIT licensed).
+ Accent region: maps language tag to a regional ACCENT_LABEL.
+
+ VRAM: ~500 MB on CUDA.
+ """
+
+ _LANG_MODEL_ID = "facebook/mms-lid-126"
+
+ def __init__(self) -> None:
+ try:
+ from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
+ except ImportError as exc:
+ raise ImportError(
+ "transformers is required for accent classification. "
+ "Install with: pip install cf-voice[inference]"
+ ) from exc
+
+ import torch
+
+ self._device = "cuda" if torch.cuda.is_available() else "cpu"
+ logger.info("Loading language ID model %s on %s", self._LANG_MODEL_ID, self._device)
+ self._extractor = AutoFeatureExtractor.from_pretrained(self._LANG_MODEL_ID)
+ self._model = Wav2Vec2ForSequenceClassification.from_pretrained(
+ self._LANG_MODEL_ID
+ ).to(self._device)
+ self._model.eval()
+
+ def classify(self, audio: "list[float] | bytes") -> AccentResult | None:
+ import numpy as np
+ import torch
+
+ if isinstance(audio, bytes):
+ audio_np = np.frombuffer(audio, dtype=np.float32)
+ else:
+ audio_np = np.asarray(audio, dtype=np.float32)
+
+ if len(audio_np) < 1600: # need at least 100ms at 16kHz
+ return None
+
+ inputs = self._extractor(
+ audio_np, sampling_rate=16_000, return_tensors="pt", padding=True
+ )
+ inputs = {k: v.to(self._device) for k, v in inputs.items()}
+
+ with torch.no_grad():
+ logits = self._model(**inputs).logits
+ probs = torch.softmax(logits, dim=-1)[0]
+
+ top_idx = int(probs.argmax())
+ confidence = float(probs[top_idx])
+ language = self._model.config.id2label.get(top_idx, "other")
+
+ region = _lang_to_region(language)
+ return AccentResult(language=language, region=region, confidence=confidence)
+
+
+def _lang_to_region(lang: str) -> str:
+ """Map a BCP-47 / ISO 639-3 language tag to a cf-voice ACCENT_LABEL."""
+ _MAP: dict[str, str] = {
+ "eng": "en_us", # MMS uses ISO 639-3; sub-regional accent needs fine-tune
+ "fra": "fr",
+ "spa": "es",
+ "deu": "de",
+ "zho": "zh",
+ "jpn": "ja",
+ "en": "en_us",
+ "en-GB": "en_gb",
+ "en-AU": "en_au",
+ "en-CA": "en_ca",
+ "en-IN": "en_in",
+ "fr": "fr",
+ "de": "de",
+ "es": "es",
+ "zh": "zh",
+ "ja": "ja",
+ }
+ return _MAP.get(lang, "other")
+
+
+def make_accent_classifier(
+ mock: bool | None = None,
+) -> "MockAccentClassifier | AccentClassifier | None":
+ """
+ Factory: return an AccentClassifier if CF_VOICE_ACCENT=1, else None.
+
+ Callers must check for None before invoking classify().
+ """
+ enabled = os.environ.get("CF_VOICE_ACCENT", "") == "1"
+ if not enabled:
+ return None
+
+ use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
+ if use_mock:
+ return MockAccentClassifier()
+
+ try:
+ return AccentClassifier()
+ except (ImportError, Exception) as exc:
+ logger.warning("AccentClassifier unavailable (%s) — using mock", exc)
+ return MockAccentClassifier()
diff --git a/cf_voice/acoustic.py b/cf_voice/acoustic.py
new file mode 100644
index 0000000..575bf87
--- /dev/null
+++ b/cf_voice/acoustic.py
@@ -0,0 +1,366 @@
+# cf_voice/acoustic.py — queue / environ / speaker acoustic event classifier
+#
+# MIT licensed (Protocol + mock). BSL 1.1 (real YAMNet inference).
+# Requires [inference] extras for real mode.
+#
+# This module is the AMD (answering machine detection) backbone for Osprey.
+# It runs in parallel with the STT pipeline — it never processes words,
+# only acoustic features (pitch, timbre, background, DTMF tones, ringback).
+#
+# Navigation v0.2.x wires the real YAMNet model.
+# Current: mock emits a plausible call-lifecycle sequence.
+from __future__ import annotations
+
+import asyncio
+import logging
+import random
+import time
+from dataclasses import dataclass
+from typing import AsyncIterator, Protocol, Sequence, runtime_checkable
+
+from cf_voice.events import AudioEvent, QUEUE_LABELS, SPEAKER_LABELS, ENVIRON_LABELS, SCENE_LABELS
+
+logger = logging.getLogger(__name__)
+
+_SAMPLE_RATE = 16_000
+
+
+@dataclass
+class AcousticResult:
+ """Batch of AudioEvents produced from a single audio window."""
+ queue: AudioEvent | None
+ speaker: AudioEvent | None
+ environ: AudioEvent | None
+ scene: AudioEvent | None
+ timestamp: float
+
+
+@runtime_checkable
+class AcousticBackend(Protocol):
+ """
+ Interface for acoustic event classifiers.
+
+ classify_window() takes a PCM float32 buffer (mono, 16kHz) and returns an
+ AcousticResult covering one analysis window (~2s). It is synchronous and
+ runs in a thread pool when called from async code.
+ """
+
+ def classify_window(
+ self,
+ audio: "list[float] | bytes",
+ timestamp: float = 0.0,
+ ) -> AcousticResult:
+ ...
+
+
+@runtime_checkable
+class SceneBackend(Protocol):
+ """
+ Interface for dedicated acoustic scene classifiers.
+
+ Separate from AcousticBackend to allow future swapping to a specialised
+ scene model (e.g. AudioSet acoustic-scene subset) without touching the
+ telephony event classifier.
+ """
+
+ def classify_scene(
+ self,
+ audio: "list[float] | bytes",
+ timestamp: float = 0.0,
+ ) -> AudioEvent | None:
+ ...
+
+
+# ── Call lifecycle for mock mode ──────────────────────────────────────────────
+# Approximates what a real outbound call looks like acoustically.
+# Phases: ringing → ivr_greeting → ivr_navigation → human_answer → call_center
+
+_MOCK_LIFECYCLE: list[dict] = [
+ # (min_s, max_s): how long to stay in this phase
+ {"queue": "ringback", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (2, 5)},
+ {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 2)},
+ {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (2, 8)},
+ {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (1, 3)},
+ {"queue": "dtmf_tone", "speaker": "no_speaker", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
+ {"queue": "silence", "speaker": "ivr_synth", "environ": "quiet", "scene": "indoor_quiet", "dur": (0.5, 1)},
+ {"queue": "hold_music", "speaker": "no_speaker", "environ": "music", "scene": "indoor_quiet", "dur": (3, 12)},
+ # AMD moment: background_shift is the primary signal
+ {"queue": "silence", "speaker": "no_speaker", "environ": "background_shift", "scene": "indoor_crowd", "dur": (0.5, 1)},
+ {"queue": "silence", "speaker": "human_single", "environ": "call_center", "scene": "indoor_crowd", "dur": (30, 60)},
+]
+
+
+class MockAcousticBackend:
+ """
+ Synthetic acoustic classifier for development and CI.
+
+ Cycles through a plausible call lifecycle so Osprey's IVR state machine
+ can be tested without real telephony. The AMD signal (background_shift →
+ human_single) is emitted at the right point in the sequence.
+
+ Usage:
+ backend = MockAcousticBackend(seed=42)
+ result = backend.classify_window(b"", timestamp=4.5)
+ print(result.environ.label) # → "hold_music", "background_shift", etc.
+ """
+
+ def __init__(self, seed: int | None = None) -> None:
+ self._rng = random.Random(seed)
+ self._phase_idx = 0
+ self._phase_start = time.monotonic()
+ self._phase_dur = self._draw_phase_dur(0)
+
+ def _draw_phase_dur(self, idx: int) -> float:
+ lo, hi = _MOCK_LIFECYCLE[idx % len(_MOCK_LIFECYCLE)]["dur"]
+ return self._rng.uniform(lo, hi)
+
+ def _current_phase(self) -> dict:
+ now = time.monotonic()
+ elapsed = now - self._phase_start
+ if elapsed >= self._phase_dur:
+ self._phase_idx = (self._phase_idx + 1) % len(_MOCK_LIFECYCLE)
+ self._phase_start = now
+ self._phase_dur = self._draw_phase_dur(self._phase_idx)
+ return _MOCK_LIFECYCLE[self._phase_idx]
+
+ def _make_event(
+ self,
+ event_type: str,
+ label: str,
+ timestamp: float,
+ ) -> AudioEvent:
+ return AudioEvent(
+ timestamp=timestamp,
+ event_type=event_type, # type: ignore[arg-type]
+ label=label,
+ confidence=self._rng.uniform(0.72, 0.97),
+ )
+
+ def classify_window(
+ self,
+ audio: "list[float] | bytes",
+ timestamp: float = 0.0,
+ ) -> AcousticResult:
+ phase = self._current_phase()
+ return AcousticResult(
+ queue=self._make_event("queue", phase["queue"], timestamp),
+ speaker=self._make_event("speaker", phase["speaker"], timestamp),
+ environ=self._make_event("environ", phase["environ"], timestamp),
+ scene=self._make_event("scene", phase["scene"], timestamp),
+ timestamp=timestamp,
+ )
+
+
+# ── AST acoustic backend (BSL 1.1) ───────────────────────────────────────────
+
+
+class ASTAcousticBackend:
+ """
+ Audio Spectrogram Transformer acoustic event classifier.
+
+ BSL 1.1 — requires [inference] extras.
+
+ Uses MIT/ast-finetuned-audioset-10-10-0.4593 (527 AudioSet classes) to
+ classify queue state, speaker type, and background environment from a
+ single forward pass. Top-15 predictions are scanned; the highest-confidence
+ match per event category is emitted.
+
+ Model: MIT/ast-finetuned-audioset-10-10-0.4593
+ VRAM: ~300 MB on CUDA (fp32)
+ Input: float32 16kHz mono audio (any length; feature extractor pads/truncates)
+
+ Replaces the YAMNet stub. Synchronous — run from a thread pool executor
+ when called from async code.
+ """
+
+ _MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
+ _SAMPLE_RATE = 16_000
+ _TOP_K = 20 # scan more classes — many relevant ones are in the 10-20 range
+
+ # Minimum confidence below which an event is suppressed even if it's the
+ # top match in its category.
+ _MIN_CONFIDENCE: dict[str, float] = {
+ "queue": 0.10,
+ "speaker": 0.08,
+ "environ": 0.12,
+ "scene": 0.08, # scenes fire reliably — lower bar is fine
+ }
+
+ # AudioSet class name → (event_type, cf-voice label).
+ # Top-K predictions are scanned; highest confidence per category wins.
+ # "call_center" requires dedicated call-centre acoustics, not generic indoor.
+ # "Music" was previously duplicated (queue + environ) — Python dicts keep the
+ # last entry, silently losing the queue mapping. Fixed: use the specific
+ # "Musical instrument" AudioSet parent for hold_music; "Music" maps to environ.
+ _LABEL_MAP: dict[str, tuple[str, str]] = {
+ # ── Queue / call-state labels ──────────────────────────────────────────
+ "Ringtone": ("queue", "ringback"),
+ "Telephone bell ringing": ("queue", "ringback"),
+ "Busy signal": ("queue", "busy"),
+ "Dial tone": ("queue", "dtmf_tone"),
+ "DTMF": ("queue", "dtmf_tone"),
+ "Silence": ("queue", "silence"),
+ # ── Speaker type labels ────────────────────────────────────────────────
+ "Speech": ("speaker", "human_single"),
+ "Male speech, man speaking": ("speaker", "human_single"),
+ "Female speech, woman speaking": ("speaker", "human_single"),
+ "Child speech, kid speaking": ("speaker", "human_single"),
+ "Crowd": ("speaker", "human_multi"),
+ "Hubbub, speech noise, speech babble": ("speaker", "human_multi"),
+ "Laughter": ("speaker", "human_multi"),
+ "Chuckle, chortle": ("speaker", "human_multi"),
+ "Speech synthesizer": ("speaker", "ivr_synth"),
+ # ── Environmental labels ───────────────────────────────────────────────
+ # Telephony — requires specific call-centre acoustics, not generic indoor
+ "Telephone": ("environ", "call_center"),
+ "Telephone dialing, DTMF": ("environ", "call_center"),
+ "Reverberation": ("environ", "background_shift"),
+ "Echo": ("environ", "background_shift"),
+ "Background noise": ("environ", "noise_floor_change"),
+ "Noise": ("environ", "noise_floor_change"),
+ "White noise": ("environ", "noise_floor_change"),
+ "Pink noise": ("environ", "noise_floor_change"),
+ "Static": ("environ", "noise_floor_change"),
+ "Music": ("environ", "music"),
+ # Nature
+ "Bird": ("environ", "birdsong"),
+ "Bird vocalization, bird call, bird song": ("environ", "birdsong"),
+ "Chirp, tweet": ("environ", "birdsong"),
+ "Wind": ("environ", "wind"),
+ "Wind noise (microphone)": ("environ", "wind"),
+ "Rain": ("environ", "rain"),
+ "Rain on surface": ("environ", "rain"),
+ "Water": ("environ", "water"),
+ "Stream": ("environ", "water"),
+ # Urban
+ "Traffic noise, roadway noise": ("environ", "traffic"),
+ "Vehicle": ("environ", "traffic"),
+ "Crowd": ("environ", "crowd_chatter"),
+ "Chatter": ("environ", "crowd_chatter"),
+ "Construction": ("environ", "construction"),
+ "Drill": ("environ", "construction"),
+ # Indoor
+ "Air conditioning": ("environ", "hvac"),
+ "Mechanical fan": ("environ", "hvac"),
+ "Computer keyboard": ("environ", "keyboard_typing"),
+ "Typing": ("environ", "keyboard_typing"),
+ "Restaurant": ("environ", "restaurant"),
+ "Dishes, pots, and pans": ("environ", "restaurant"),
+ # ── Acoustic scene labels ──────────────────────────────────────────────
+ # "Inside, small/large room" moved from environ to scene — they correctly
+ # describe the acoustic scene but are NOT specific enough for call_center.
+ "Inside, small room": ("scene", "indoor_quiet"),
+ "Inside, large room or hall": ("scene", "indoor_crowd"),
+ "Outside, urban or manmade": ("scene", "outdoor_urban"),
+ "Field recording": ("scene", "outdoor_nature"),
+ "Rail transport": ("scene", "public_transit"),
+ "Bus": ("scene", "public_transit"),
+ "Train": ("scene", "public_transit"),
+ "Car": ("scene", "vehicle"),
+ "Truck": ("scene", "vehicle"),
+ "Motorcycle": ("scene", "vehicle"),
+ # Music in the queue sense — "Musical instrument" is more specific
+ # than the ambiguous top-level "Music" class
+ "Musical instrument": ("queue", "hold_music"),
+ "Piano": ("queue", "hold_music"),
+ "Guitar": ("queue", "hold_music"),
+ }
+
+ def __init__(self) -> None:
+ try:
+ from transformers import ASTFeatureExtractor, ASTForAudioClassification
+ except ImportError as exc:
+ raise ImportError(
+ "transformers is required for AST acoustic classification. "
+ "Install with: pip install cf-voice[inference]"
+ ) from exc
+
+ import torch
+
+ self._device = "cuda" if torch.cuda.is_available() else "cpu"
+ logger.info("Loading AST acoustic model %s on %s", self._MODEL_ID, self._device)
+ self._extractor = ASTFeatureExtractor.from_pretrained(self._MODEL_ID)
+ self._model = ASTForAudioClassification.from_pretrained(self._MODEL_ID).to(
+ self._device
+ )
+ self._model.eval()
+
+ def classify_window(
+ self,
+ audio: "list[float] | bytes",
+ timestamp: float = 0.0,
+ ) -> AcousticResult:
+ import numpy as np
+ import torch
+
+ if isinstance(audio, bytes):
+ audio_np = np.frombuffer(audio, dtype=np.float32)
+ else:
+ audio_np = np.asarray(audio, dtype=np.float32)
+
+ if len(audio_np) == 0:
+ return AcousticResult(queue=None, speaker=None, environ=None, scene=None, timestamp=timestamp)
+
+ inputs = self._extractor(
+ audio_np, sampling_rate=self._SAMPLE_RATE, return_tensors="pt"
+ )
+ inputs = {k: v.to(self._device) for k, v in inputs.items()}
+
+ with torch.no_grad():
+ logits = self._model(**inputs).logits
+ probs = torch.softmax(logits, dim=-1)[0]
+ id2label = self._model.config.id2label
+
+ top_k = min(self._TOP_K, len(probs))
+ top_indices = probs.topk(top_k).indices.tolist()
+ predictions = [(id2label[i], float(probs[i])) for i in top_indices]
+
+ # Take highest-confidence match per category
+ best: dict[str, tuple[str, float]] = {} # event_type → (label, conf)
+ for ast_label, conf in predictions:
+ mapping = self._LABEL_MAP.get(ast_label)
+ if mapping is None:
+ continue
+ etype, cf_label = mapping
+ if etype not in best or conf > best[etype][1]:
+ best[etype] = (cf_label, conf)
+
+ def _make_event(etype: str, label: str, conf: float) -> AudioEvent:
+ return AudioEvent(
+ timestamp=timestamp,
+ event_type=etype, # type: ignore[arg-type]
+ label=label,
+ confidence=round(conf, 4),
+ )
+
+ def _above_threshold(etype: str) -> bool:
+ if etype not in best:
+ return False
+ _, conf = best[etype]
+ return conf >= self._MIN_CONFIDENCE.get(etype, 0.10)
+
+ return AcousticResult(
+ queue=_make_event("queue", *best["queue"]) if _above_threshold("queue") else None,
+ speaker=_make_event("speaker", *best["speaker"]) if _above_threshold("speaker") else None,
+ environ=_make_event("environ", *best["environ"]) if _above_threshold("environ") else None,
+ scene=_make_event("scene", *best["scene"]) if _above_threshold("scene") else None,
+ timestamp=timestamp,
+ )
+
+
+def make_acoustic(mock: bool | None = None) -> "MockAcousticBackend | ASTAcousticBackend":
+ """
+ Factory: return an AcousticBackend for the current environment.
+
+ mock=True or CF_VOICE_MOCK=1 → MockAcousticBackend
+ Otherwise → ASTAcousticBackend (falls back to mock on import error)
+ """
+ import os
+ use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
+ if use_mock:
+ return MockAcousticBackend()
+ try:
+ return ASTAcousticBackend()
+ except (ImportError, Exception) as exc:
+ logger.warning("ASTAcousticBackend unavailable (%s) — using mock", exc)
+ return MockAcousticBackend()
diff --git a/cf_voice/app.py b/cf_voice/app.py
new file mode 100644
index 0000000..2aac11a
--- /dev/null
+++ b/cf_voice/app.py
@@ -0,0 +1,197 @@
+"""
+cf-voice FastAPI service — managed by cf-orch.
+
+Tone/affect classification sidecar for Linnet and any product that needs
+real-time audio context annotation. Wraps ContextClassifier so it runs as an
+independent managed process rather than embedded in the consumer's process.
+
+Endpoints:
+ GET /health → {"status": "ok", "mode": "mock"|"real"}
+ POST /classify → ClassifyResponse
+
+Usage:
+ python -m cf_voice.app --port 8007 --gpu-id 0
+
+Mock mode (no GPU, no audio hardware required):
+ CF_VOICE_MOCK=1 python -m cf_voice.app --port 8007
+"""
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from cf_voice.context import ContextClassifier, model_status
+
+logger = logging.getLogger(__name__)
+
+_classifier: ContextClassifier | None = None
+_mock_mode: bool = False
+
+
+# ── Request / response models ─────────────────────────────────────────────────
+
+
+class ClassifyRequest(BaseModel):
+ audio_chunk: str | None = None # base64-encoded PCM int16 mono 16kHz; None in mock mode
+ timestamp: float = 0.0
+ elcor: bool | None = None
+ prior_frames: int | None = None
+ session_id: str = ""
+ language: str | None = None # BCP-47 hint for Whisper ("en", "es", …); None = auto-detect
+ num_speakers: int | None = None # pyannote hint: None = auto; 1–8 = fixed min+max
+
+
+class AudioEventOut(BaseModel):
+ event_type: str
+ label: str
+ confidence: float
+ timestamp: float
+ speaker_id: str = "speaker_a"
+ subtext: str | None = None
+ affect: str | None = None
+ shift_magnitude: float | None = None
+ shift_direction: str | None = None
+ prosody_flags: list[str] = []
+ # Dimensional emotion (audeering model) — None when classifier disabled
+ valence: float | None = None
+ arousal: float | None = None
+ dominance: float | None = None
+ # Prosodic signals (openSMILE) — None when extractor disabled
+ sarcasm_risk: float | None = None
+ flat_f0_score: float | None = None
+ # Trajectory signals — None until BASELINE_MIN frames buffered per speaker
+ arousal_delta: float | None = None
+ valence_delta: float | None = None
+ trend: str | None = None
+ # Coherence signals (SER vs VAD)
+ coherence_score: float | None = None
+ suppression_flag: bool | None = None
+ reframe_type: str | None = None
+ affect_divergence: float | None = None
+
+
+class ClassifyResponse(BaseModel):
+ events: list[AudioEventOut]
+
+
+# ── App factory ───────────────────────────────────────────────────────────────
+
+
+def create_app(gpu_id: int = 0, mock: bool = False) -> FastAPI:
+ global _classifier, _mock_mode
+
+ # Signal GPU to the inference backends (wav2vec2 loads via transformers pipeline)
+ if not mock:
+ os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))
+
+ _mock_mode = mock or os.environ.get("CF_VOICE_MOCK", "") == "1"
+ _classifier = ContextClassifier.mock() if _mock_mode else ContextClassifier.from_env()
+ logger.info("cf-voice ready: mode=%s", "mock" if _mock_mode else "real")
+
+ app = FastAPI(title="cf-voice", version="0.1.0")
+
+ @app.on_event("startup")
+ async def _startup_prewarm() -> None:
+ """Pre-warm all configured models so downloads happen at startup, not
+ on the first classify call (which has a hard 9-second timeout)."""
+ if _classifier is not None:
+ import asyncio as _asyncio
+ _asyncio.create_task(_classifier.prewarm())
+
+ @app.get("/health")
+ def health() -> dict:
+ result: dict = {
+ "status": "ok",
+ "mode": "mock" if _mock_mode else "real",
+ "models": dict(model_status),
+ }
+ # Surface misconfigured-but-silent diarizer so Linnet can warn the user.
+ # Check env vars only — no model loading needed at health-check time.
+ warnings: list[str] = []
+ if os.environ.get("CF_VOICE_DIARIZE", "0") == "1":
+ token = os.environ.get("HF_TOKEN", "").strip()
+ if not token:
+ warnings.append(
+ "Diarization is enabled (CF_VOICE_DIARIZE=1) but HF_TOKEN is not set. "
+ "Speaker identity badges will not appear. "
+ "Set HF_TOKEN in your .env and accept pyannote model terms at huggingface.co."
+ )
+ if warnings:
+ result["warnings"] = warnings
+ return result
+
+ @app.post("/classify")
+ async def classify(req: ClassifyRequest) -> ClassifyResponse:
+ if _classifier is None:
+ raise HTTPException(503, detail="classifier not initialised")
+ try:
+ events = await _classifier.classify_chunk_async(
+ audio_b64=req.audio_chunk,
+ timestamp=req.timestamp,
+ prior_frames=req.prior_frames,
+ elcor=req.elcor,
+ session_id=req.session_id,
+ language=req.language,
+ num_speakers=req.num_speakers,
+ )
+ except NotImplementedError as exc:
+ raise HTTPException(501, detail=str(exc))
+
+ from cf_voice.events import ToneEvent
+
+ out: list[AudioEventOut] = []
+ for e in events:
+ is_tone = isinstance(e, ToneEvent)
+ out.append(AudioEventOut(
+ event_type=e.event_type,
+ label=e.label,
+ confidence=round(e.confidence, 4),
+ timestamp=e.timestamp,
+ speaker_id=getattr(e, "speaker_id", "speaker_a") or "speaker_a",
+ subtext=getattr(e, "subtext", None),
+ affect=getattr(e, "affect", None) if is_tone else None,
+ shift_magnitude=getattr(e, "shift_magnitude", None) if is_tone else None,
+ shift_direction=getattr(e, "shift_direction", None) if is_tone else None,
+ prosody_flags=getattr(e, "prosody_flags", []) if is_tone else [],
+ valence=getattr(e, "valence", None) if is_tone else None,
+ arousal=getattr(e, "arousal", None) if is_tone else None,
+ dominance=getattr(e, "dominance", None) if is_tone else None,
+ sarcasm_risk=getattr(e, "sarcasm_risk", None) if is_tone else None,
+ flat_f0_score=getattr(e, "flat_f0_score", None) if is_tone else None,
+ arousal_delta=getattr(e, "arousal_delta", None) if is_tone else None,
+ valence_delta=getattr(e, "valence_delta", None) if is_tone else None,
+ trend=getattr(e, "trend", None) if is_tone else None,
+ coherence_score=getattr(e, "coherence_score", None) if is_tone else None,
+ suppression_flag=getattr(e, "suppression_flag", None) if is_tone else None,
+ reframe_type=getattr(e, "reframe_type", None) if is_tone else None,
+ affect_divergence=getattr(e, "affect_divergence", None) if is_tone else None,
+ ))
+ return ClassifyResponse(events=out)
+
+ return app
+
+
+# ── CLI entrypoint ────────────────────────────────────────────────────────────
+
+
+def _parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(description="cf-voice tone classification server")
+ parser.add_argument("--port", type=int, default=8007)
+ parser.add_argument("--host", default="0.0.0.0")
+ parser.add_argument("--gpu-id", type=int, default=0)
+ parser.add_argument("--mock", action="store_true",
+ help="Run in mock mode (no GPU, no audio hardware needed)")
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.INFO,
+ format="%(asctime)s %(levelname)s %(name)s — %(message)s")
+ args = _parse_args()
+ app = create_app(gpu_id=args.gpu_id, mock=args.mock)
+ uvicorn.run(app, host=args.host, port=args.port, log_level="info")
diff --git a/cf_voice/classify.py b/cf_voice/classify.py
index f5dff6a..256d011 100644
--- a/cf_voice/classify.py
+++ b/cf_voice/classify.py
@@ -82,13 +82,21 @@ class ToneClassifier:
Tone/affect classifier: wav2vec2 SER + librosa prosody.
Loads the model lazily on first call to avoid import-time GPU allocation.
- Thread-safe for concurrent classify() calls — the pipeline is stateless
+ Thread-safe for concurrent classify() calls — the model is stateless
per-call; session state lives in the caller (ContextClassifier).
+
+ Uses AutoFeatureExtractor + AutoModelForAudioClassification directly
+ rather than hf_pipeline to avoid torchcodec audio backend initialization.
+ torchcodec 0.11.0 requires libnvrtc.so.13, which is absent on CUDA 12.x
+ systems. Calling the model directly bypasses the pipeline's audio backend
+ selection entirely since we already have float32 at 16kHz.
"""
def __init__(self, threshold: float = _DEFAULT_THRESHOLD) -> None:
self._threshold = threshold
- self._pipeline = None # lazy-loaded
+ self._feature_extractor = None # lazy-loaded
+ self._model = None # lazy-loaded
+ self._device: str = "cpu"
@classmethod
def from_env(cls) -> "ToneClassifier":
@@ -96,23 +104,41 @@ class ToneClassifier:
return cls(threshold=threshold)
def _load_pipeline(self) -> None:
- if self._pipeline is not None:
+ if self._model is not None:
return
try:
- from transformers import pipeline as hf_pipeline
+ from transformers import (
+ AutoFeatureExtractor,
+ AutoModelForAudioClassification,
+ )
except ImportError as exc:
raise ImportError(
"transformers is required for tone classification. "
"Install with: pip install cf-voice[inference]"
) from exc
- device = 0 if _cuda_available() else -1
- logger.info("Loading SER model %s on device %s", _SER_MODEL_ID, device)
- self._pipeline = hf_pipeline(
- "audio-classification",
- model=_SER_MODEL_ID,
- device=device,
+ import torch
+
+ if _cuda_available():
+ self._device = "cuda:0"
+ # fp16 halves VRAM from ~6.7 GB to ~3.3 GB on RTX 4000.
+ # Only supported on CUDA — CPU must stay float32.
+ torch_dtype = torch.float16
+ else:
+ self._device = "cpu"
+ torch_dtype = torch.float32
+
+ logger.info(
+ "Loading SER model %s on device=%s dtype=%s",
+ _SER_MODEL_ID, self._device, torch_dtype,
)
+ self._feature_extractor = AutoFeatureExtractor.from_pretrained(_SER_MODEL_ID)
+ self._model = AutoModelForAudioClassification.from_pretrained(
+ _SER_MODEL_ID,
+ torch_dtype=torch_dtype,
+ ).to(self._device)
+ # Switch to inference mode — disables dropout, batch norm tracks running stats
+ self._model.train(False)
def classify(self, audio_float32: np.ndarray, transcript: str = "") -> ToneResult:
"""
@@ -121,13 +147,33 @@ class ToneClassifier:
transcript is used as a weak signal for ambiguous cases (e.g. words
like "unfortunately" bias toward apologetic even on a neutral voice).
"""
+ import torch
+
self._load_pipeline()
# Ensure the model sees float32 at the right rate
assert audio_float32.dtype == np.float32, "audio must be float32"
- # Run SER
- preds = self._pipeline({"raw": audio_float32, "sampling_rate": _SAMPLE_RATE})
+ # Run SER — call feature extractor + model directly to bypass the
+ # hf_pipeline audio backend (avoids torchcodec / libnvrtc dependency).
+ inputs = self._feature_extractor(
+ audio_float32,
+ sampling_rate=_SAMPLE_RATE,
+ return_tensors="pt",
+ )
+ inputs = {k: v.to(self._device) for k, v in inputs.items()}
+ if self._model.dtype == torch.float16:
+ inputs = {k: v.to(torch.float16) for k, v in inputs.items()}
+
+ with torch.no_grad():
+ logits = self._model(**inputs).logits
+ probs = torch.softmax(logits, dim=-1)[0]
+ id2label = self._model.config.id2label
+ preds = [
+ {"label": id2label[i], "score": float(probs[i])}
+ for i in range(len(probs))
+ ]
+
best = max(preds, key=lambda p: p["score"])
emotion = best["label"].lower()
confidence = float(best["score"])
@@ -158,7 +204,7 @@ class ToneClassifier:
self, audio_float32: np.ndarray, transcript: str = ""
) -> ToneResult:
"""classify() without blocking the event loop."""
- loop = asyncio.get_event_loop()
+ loop = asyncio.get_running_loop()
fn = partial(self.classify, audio_float32, transcript)
return await loop.run_in_executor(None, fn)
diff --git a/cf_voice/context.py b/cf_voice/context.py
index 7b450f4..27d5951 100644
--- a/cf_voice/context.py
+++ b/cf_voice/context.py
@@ -1,99 +1,289 @@
-# cf_voice/context.py — tone classification and context enrichment
+# cf_voice/context.py — parallel audio context classifier (orchestrator)
#
# BSL 1.1 when real inference models are integrated.
-# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
+# Mock mode: MIT licensed (no real inference).
#
-# Real implementation (Notation v0.1.x) will:
-# - Run YAMNet acoustic event detection on the audio buffer
-# - Run wav2vec2-based SER (speech emotion recognition)
-# - Run librosa prosody extraction (pitch, energy, rate)
-# - Combine into enriched VoiceFrame label + confidence
-# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
+# Runs three classifiers in parallel against the same audio window:
+# 1. Tone/affect (classify.py) — wav2vec2 SER + librosa prosody
+# 2. Queue/environ (acoustic.py) — YAMNet acoustic event detection
+# 3. Speaker type/VAD (diarize.py) — pyannote.audio (Navigation v0.2.x)
+#
+# Combined output is a list[AudioEvent] per window, merged into VoiceFrame
+# for the streaming path.
+#
+# Elcor mode reads from cf-core preferences (cf_voice.prefs) so that the
+# annotation format is user-configurable without per-request flags.
from __future__ import annotations
+import asyncio
+import logging
import os
from typing import AsyncIterator
+from cf_voice.acoustic import MockAcousticBackend, make_acoustic
from cf_voice.events import AudioEvent, ToneEvent, tone_event_from_voice_frame
from cf_voice.io import MockVoiceIO, VoiceIO, make_io
from cf_voice.models import VoiceFrame
+from cf_voice.prefs import get_elcor_prior_frames, is_elcor_enabled
+
+logger = logging.getLogger(__name__)
+
+# ── Per-model download/load status registry ───────────────────────────────────
+# Written by _load_* methods; read by the /health endpoint in app.py.
+# Values: "disabled" | "loading" | "ready" | "error"
+# Thread-safe: individual str assignment is atomic in CPython.
+model_status: dict[str, str] = {}
+
+
+# ── No-op coroutines for disabled/unavailable classifiers ─────────────────────
+
+async def _noop_stt() -> None:
+ """Placeholder when STT is disabled or unavailable."""
+ return None
+
+
+async def _noop_diarize() -> list:
+ """Placeholder when diarization is disabled or unavailable."""
+ return []
+
+
+# ─────────────────────────────────────────────────────────────────────────────
class ContextClassifier:
"""
High-level voice context classifier.
- Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation.
- In stub mode the frames pass through unchanged — the enrichment pipeline
- (YAMNet + wav2vec2 + librosa) is filled in incrementally.
+ Wraps a VoiceIO source and runs three parallel classifiers on each audio
+ window: tone (SER), queue/environ (YAMNet), and speaker (pyannote).
+
+ In mock mode all classifiers produce synthetic events — no GPU, microphone,
+ or HuggingFace token required.
Usage
-----
classifier = ContextClassifier.from_env()
async for frame in classifier.stream():
print(frame.label, frame.confidence)
+
+ For the full multi-class event list (queue + speaker + tone):
+ events = classifier.classify_chunk(audio_b64, timestamp=4.5)
"""
- def __init__(self, io: VoiceIO) -> None:
+ def __init__(
+ self,
+ io: VoiceIO,
+ user_id: str | None = None,
+ store=None,
+ ) -> None:
self._io = io
+ self._user_id = user_id
+ self._store = store
+ self._acoustic = make_acoustic(
+ mock=isinstance(io, MockVoiceIO)
+ or os.environ.get("CF_VOICE_MOCK", "") == "1"
+ )
+ # Lazy — loaded on first real classify call, then reused.
+ self._tone: "ToneClassifier | None" = None
+ # STT: loaded if faster-whisper is installed. Controlled by CF_VOICE_STT (default: 1).
+ self._stt: "WhisperSTT | None" = None
+ self._stt_loaded: bool = False # False = not yet attempted
+ # Diarizer: optional — requires HF_TOKEN and CF_VOICE_DIARIZE=1.
+ self._diarizer: "Diarizer | None" = None
+ self._diarizer_loaded: bool = False
+ # Per-session speaker label tracker — maps pyannote IDs → "Speaker A/B/..."
+ # Reset at session end (when the ContextClassifier is stopped).
+ from cf_voice.diarize import SpeakerTracker
+ self._speaker_tracker: SpeakerTracker = SpeakerTracker()
+ # One-at-a-time GPU classify gate. All three models share the same GPU;
+ # running them "in parallel" just serializes at the CUDA level while
+ # filling the thread pool. Drop incoming frames when a classify is
+ # already in flight — freshness beats completeness for real-time audio.
+ self._classify_lock: asyncio.Lock = asyncio.Lock()
+ # Dimensional classifier (audeering) — lazy, CF_VOICE_DIMENSIONAL=1
+ self._dimensional: "DimensionalClassifier | None" = None
+ self._dimensional_loaded: bool = False
+ # Prosodic extractor (openSMILE) — lazy, CF_VOICE_PROSODY=1
+ self._prosodic: "ProsodicExtractor | None" = None
+ self._prosodic_loaded: bool = False
+ # Per-speaker rolling dimensional buffers for trajectory/coherence signals.
+ # Keys are speaker_id strings; values are deques of DimensionalResult.
+ # Reset at session end alongside SpeakerTracker.
+ from collections import deque as _deque
+ from cf_voice.trajectory import BUFFER_WINDOW
+ self._dim_buffer: dict[str, "_deque"] = {}
+ self._last_ser_affect: dict[str, str] = {}
+ self._buffer_window = BUFFER_WINDOW
+ # Accent classifier — lazy, gated by CF_VOICE_ACCENT=1
+ self._accent: "MockAccentClassifier | AccentClassifier | None" = None
+ self._accent_loaded: bool = False
@classmethod
- def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
+ def from_env(
+ cls,
+ interval_s: float = 2.5,
+ user_id: str | None = None,
+ store=None,
+ ) -> "ContextClassifier":
"""
Create a ContextClassifier from environment.
- CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed).
+
+ CF_VOICE_MOCK=1 activates full mock mode (no GPU, no audio hardware).
+ If real audio hardware is unavailable (faster-whisper not installed),
+ falls back to mock mode automatically.
+ user_id + store are forwarded to cf-core preferences for Elcor/threshold
+ lookups.
"""
- io = make_io(interval_s=interval_s)
- return cls(io=io)
+ if os.environ.get("CF_VOICE_MOCK", "") == "1":
+ return cls.mock(interval_s=interval_s, user_id=user_id, store=store)
+ try:
+ io = make_io(interval_s=interval_s)
+ except (NotImplementedError, ImportError):
+ # Real audio hardware or inference extras unavailable — fall back to
+ # mock mode so the coordinator starts cleanly on headless nodes.
+ return cls.mock(interval_s=interval_s, user_id=user_id, store=store)
+ return cls(io=io, user_id=user_id, store=store)
@classmethod
- def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
+ def mock(
+ cls,
+ interval_s: float = 2.5,
+ seed: int | None = None,
+ user_id: str | None = None,
+ store=None,
+ ) -> "ContextClassifier":
"""Create a ContextClassifier backed by MockVoiceIO. Useful in tests."""
- from cf_voice.io import MockVoiceIO
- return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))
+ return cls(
+ io=MockVoiceIO(interval_s=interval_s, seed=seed),
+ user_id=user_id,
+ store=store,
+ )
async def stream(self) -> AsyncIterator[VoiceFrame]:
"""
Yield enriched VoiceFrames continuously.
Stub: frames from the IO layer pass through unchanged.
- Real: enrichment pipeline runs here before yield.
+ Real (Navigation v0.2.x): acoustic + diarization enrichment runs here.
"""
async for frame in self._io.stream():
yield self._enrich(frame)
async def stop(self) -> None:
await self._io.stop()
+ self._speaker_tracker.reset()
+ self._dim_buffer.clear()
+ self._last_ser_affect.clear()
def classify_chunk(
self,
- audio_b64: str,
+ audio_b64: str | None = None,
timestamp: float = 0.0,
- prior_frames: int = 0,
- elcor: bool = False,
+ prior_frames: int | None = None,
+ elcor: bool | None = None,
+ session_id: str = "",
) -> list[AudioEvent]:
"""
- Classify a single audio chunk and return AudioEvents.
+ Classify a single audio window and return all AudioEvents.
- This is the request-response path used by the cf-orch endpoint.
+ Returns a heterogeneous list containing zero or one of each:
+ - ToneEvent (event_type="tone")
+ - AudioEvent (event_type="queue")
+ - AudioEvent (event_type="speaker")
+ - AudioEvent (event_type="environ")
+
+ This is the request-response path used by the cf-orch SSE endpoint.
The streaming path (async generator) is for continuous consumers.
- elcor=True switches subtext format to Mass Effect Elcor prefix style.
- Generic tone annotation is always present regardless of elcor flag.
+ audio_b64 Base64-encoded PCM int16 mono 16kHz bytes.
+ Pass None in mock mode (ignored).
+ timestamp Session-relative seconds since capture started.
+ prior_frames Rolling context window size for Elcor LLM.
+ Defaults to user preference (PREF_ELCOR_PRIOR_FRAMES).
+ elcor Override Elcor mode for this request.
+ None = read from user preference (PREF_ELCOR_MODE).
+ session_id Caller-assigned correlation ID for the session.
"""
- if isinstance(self._io, MockVoiceIO):
- return self._classify_chunk_mock(timestamp, prior_frames, elcor)
+ use_elcor = elcor if elcor is not None else is_elcor_enabled(
+ user_id=self._user_id, store=self._store
+ )
+ context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames(
+ user_id=self._user_id, store=self._store
+ )
- return self._classify_chunk_real(audio_b64, timestamp, elcor)
+ if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
+ return self._classify_mock(timestamp, context_frames, use_elcor, session_id)
- def _classify_chunk_mock(
- self, timestamp: float, prior_frames: int, elcor: bool
+ if not audio_b64:
+ return []
+
+ return self._classify_real(audio_b64, timestamp, use_elcor, session_id)
+
+ async def classify_chunk_async(
+ self,
+ audio_b64: str | None = None,
+ timestamp: float = 0.0,
+ prior_frames: int | None = None,
+ elcor: bool | None = None,
+ session_id: str = "",
+ language: str | None = None,
+ num_speakers: int | None = None,
) -> list[AudioEvent]:
- """Synthetic path — used in mock mode and CI."""
+ """
+ Async variant of classify_chunk.
+
+ Runs tone, STT, diarization, and acoustic classification in parallel
+ using asyncio.gather(). Use this from async contexts (FastAPI routes)
+ to get true concurrency across all four inference paths.
+ """
+ use_elcor = elcor if elcor is not None else is_elcor_enabled(
+ user_id=self._user_id, store=self._store
+ )
+ context_frames = prior_frames if prior_frames is not None else get_elcor_prior_frames(
+ user_id=self._user_id, store=self._store
+ )
+
+ if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
+ return self._classify_mock(timestamp, context_frames, use_elcor, session_id)
+
+ if not audio_b64:
+ return []
+
+ # Drop frame if a classify is already in flight — GPU models serialize
+ # anyway, so queuing just adds latency without improving output.
+ if self._classify_lock.locked():
+ logger.debug("classify busy — dropping frame at t=%.2f", timestamp)
+ return []
+
+ async with self._classify_lock:
+ # Diarization (pyannote) can take 3–8 s on first invocations even with GPU.
+ # 25 s gives enough headroom without stalling the stream for too long.
+ try:
+ return await asyncio.wait_for(
+ self._classify_real_async(audio_b64, timestamp, use_elcor, session_id, language, num_speakers),
+ timeout=25.0,
+ )
+ except asyncio.TimeoutError:
+ logger.warning("classify_real_async timed out at t=%.2f — dropping frame", timestamp)
+ return []
+
+ def _classify_mock(
+ self,
+ timestamp: float,
+ prior_frames: int,
+ elcor: bool,
+ session_id: str,
+ ) -> list[AudioEvent]:
+ """
+ Synthetic multi-class event batch.
+
+ Tone event comes from the MockVoiceIO RNG (consistent seed behaviour).
+ Queue/speaker/environ come from MockAcousticBackend (call lifecycle simulation).
+ """
rng = self._io._rng # type: ignore[attr-defined]
- import time as _time
label = rng.choice(self._io._labels) # type: ignore[attr-defined]
shift = rng.uniform(0.1, 0.7) if prior_frames > 0 else 0.0
+
frame = VoiceFrame(
label=label,
confidence=rng.uniform(0.6, 0.97),
@@ -101,30 +291,54 @@ class ContextClassifier:
shift_magnitude=round(shift, 3),
timestamp=timestamp,
)
- tone = tone_event_from_voice_frame(
+ tone: ToneEvent = tone_event_from_voice_frame(
frame_label=frame.label,
frame_confidence=frame.confidence,
shift_magnitude=frame.shift_magnitude,
timestamp=frame.timestamp,
elcor=elcor,
)
- return [tone]
+ tone.session_id = session_id
- def _classify_chunk_real(
- self, audio_b64: str, timestamp: float, elcor: bool
+ acoustic = self._acoustic.classify_window(b"", timestamp=timestamp)
+
+ events: list[AudioEvent] = [tone]
+ if acoustic.queue:
+ events.append(acoustic.queue)
+ if acoustic.speaker:
+ events.append(acoustic.speaker)
+ if acoustic.environ:
+ events.append(acoustic.environ)
+ if acoustic.scene:
+ events.append(acoustic.scene)
+ return events
+
+ def _classify_real(
+ self,
+ audio_b64: str,
+ timestamp: float,
+ elcor: bool,
+ session_id: str,
) -> list[AudioEvent]:
- """Real inference path — used when CF_VOICE_MOCK is unset."""
- import asyncio
+ """
+ Real inference path — used when CF_VOICE_MOCK is unset.
+
+ Tone: wav2vec2 SER via ToneClassifier (classify.py).
+ Acoustic: YAMNet via YAMNetAcousticBackend (Navigation v0.2.x stub).
+ Speaker: pyannote VAD (diarize.py) — merged in ContextClassifier, not here.
+ """
import base64
+
import numpy as np
+
from cf_voice.classify import ToneClassifier
pcm = base64.b64decode(audio_b64)
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
- # ToneClassifier is stateless per-call, safe to instantiate inline
- classifier = ToneClassifier.from_env()
- tone_result = classifier.classify(audio)
+ if self._tone is None:
+ self._tone = ToneClassifier.from_env()
+ tone_result = self._tone.classify(audio)
frame = VoiceFrame(
label=tone_result.label,
@@ -133,20 +347,398 @@ class ContextClassifier:
shift_magnitude=0.0,
timestamp=timestamp,
)
- event = tone_event_from_voice_frame(
+ tone: ToneEvent = tone_event_from_voice_frame(
frame_label=frame.label,
frame_confidence=frame.confidence,
shift_magnitude=frame.shift_magnitude,
timestamp=frame.timestamp,
elcor=elcor,
)
- return [event]
+ tone.session_id = session_id
+
+ events: list[AudioEvent] = [tone]
+
+ # Acoustic events: Navigation v0.2.x (YAMNet not yet implemented)
+ # YAMNetAcousticBackend raises NotImplementedError at construction —
+ # we catch and log rather than failing the entire classify call.
+ try:
+ acoustic = self._acoustic.classify_window(audio.tobytes(), timestamp=timestamp)
+ if acoustic.queue:
+ events.append(acoustic.queue)
+ if acoustic.speaker:
+ events.append(acoustic.speaker)
+ if acoustic.environ:
+ events.append(acoustic.environ)
+ if acoustic.scene:
+ events.append(acoustic.scene)
+ except NotImplementedError:
+ pass
+
+ return events
+
+ def _load_stt(self) -> "WhisperSTT | None":
+ """Lazy-load WhisperSTT once. Returns None if unavailable or disabled."""
+ if self._stt_loaded:
+ return self._stt
+ self._stt_loaded = True
+ if os.environ.get("CF_VOICE_STT", "1") != "1":
+ model_status["stt"] = "disabled"
+ return None
+ model_status["stt"] = "loading"
+ try:
+ from cf_voice.stt import WhisperSTT
+ self._stt = WhisperSTT.from_env()
+ model_status["stt"] = "ready"
+ logger.info("WhisperSTT loaded (model=%s)", os.environ.get("CF_VOICE_WHISPER_MODEL", "small"))
+ except Exception as exc:
+ model_status["stt"] = "error"
+ logger.warning("WhisperSTT unavailable: %s", exc)
+ return self._stt
+
+ def _load_diarizer(self) -> "Diarizer | None":
+ """Lazy-load Diarizer once. Returns None if HF_TOKEN absent or CF_VOICE_DIARIZE!=1."""
+ if self._diarizer_loaded:
+ return self._diarizer
+ self._diarizer_loaded = True
+ if os.environ.get("CF_VOICE_DIARIZE", "0") != "1":
+ model_status["diarizer"] = "disabled"
+ return None
+ model_status["diarizer"] = "loading"
+ try:
+ from cf_voice.diarize import Diarizer
+ self._diarizer = Diarizer.from_env()
+ model_status["diarizer"] = "ready"
+ logger.info("Diarizer loaded")
+ except Exception as exc:
+ model_status["diarizer"] = "error"
+ logger.warning("Diarizer unavailable: %s", exc)
+ return self._diarizer
+
+ def _load_dimensional(self) -> "DimensionalClassifier | None":
+ """Lazy-load DimensionalClassifier once. Returns None if CF_VOICE_DIMENSIONAL!=1."""
+ if self._dimensional_loaded:
+ return self._dimensional
+ self._dimensional_loaded = True
+ if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
+ model_status["dimensional"] = "disabled"
+ return None
+ model_status["dimensional"] = "loading"
+ try:
+ from cf_voice.dimensional import DimensionalClassifier
+ self._dimensional = DimensionalClassifier()
+ model_status["dimensional"] = "ready"
+ logger.info("DimensionalClassifier loaded (audeering VAD model)")
+ except Exception as exc:
+ model_status["dimensional"] = "error"
+ logger.warning("DimensionalClassifier unavailable: %s", exc)
+ return self._dimensional
+
+ def _load_accent(self) -> "MockAccentClassifier | AccentClassifier | None":
+ """Lazy-load AccentClassifier once. Returns None if CF_VOICE_ACCENT!=1."""
+ if self._accent_loaded:
+ return self._accent
+ self._accent_loaded = True
+ from cf_voice.accent import make_accent_classifier
+ result = make_accent_classifier(
+ mock=isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1"
+ )
+ self._accent = result
+ if result is None:
+ model_status["accent"] = "disabled"
+ else:
+ model_status["accent"] = "ready"
+ logger.info("AccentClassifier loaded (mock=%s)", isinstance(result, type(result).__mro__[0]))
+ return self._accent
+
+ def _load_prosodic(self) -> "ProsodicExtractor | None":
+ """Lazy-load ProsodicExtractor once. Returns None if CF_VOICE_PROSODY!=1."""
+ if self._prosodic_loaded:
+ return self._prosodic
+ self._prosodic_loaded = True
+ if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
+ model_status["prosody"] = "disabled"
+ return None
+ model_status["prosody"] = "loading"
+ try:
+ from cf_voice.prosody import ProsodicExtractor
+ self._prosodic = ProsodicExtractor()
+ model_status["prosody"] = "ready"
+ logger.info("ProsodicExtractor loaded (openSMILE eGeMAPS)")
+ except Exception as exc:
+ model_status["prosody"] = "error"
+ logger.warning("ProsodicExtractor unavailable: %s", exc)
+ return self._prosodic
+
+ async def prewarm(self) -> None:
+ """Pre-load all configured models in a thread-pool so downloads happen at
+ startup rather than on the first classify call. Safe to call multiple times
+ (each _load_* method is idempotent after the first call)."""
+ if isinstance(self._io, MockVoiceIO) or os.environ.get("CF_VOICE_MOCK", "") == "1":
+ return
+ loop = asyncio.get_running_loop()
+ # Load each model in its own executor slot so status updates are visible
+ # as each one completes rather than all at once.
+ await loop.run_in_executor(None, self._load_stt)
+ await loop.run_in_executor(None, self._load_diarizer)
+ await loop.run_in_executor(None, self._load_dimensional)
+ await loop.run_in_executor(None, self._load_prosodic)
+ logger.info("cf-voice prewarm complete: %s", model_status)
+
+ async def _classify_real_async(
+ self,
+ audio_b64: str,
+ timestamp: float,
+ elcor: bool,
+ session_id: str,
+ language: str | None = None,
+ num_speakers: int | None = None,
+ ) -> list[AudioEvent]:
+ """
+ Real inference path running all classifiers in parallel.
+
+ Tone (wav2vec2) + STT (Whisper) + Diarization (pyannote, optional) +
+ Acoustic (AST) all run concurrently via asyncio.gather(). Each result
+ is type-checked after gather — a single classifier failure does not
+ abort the call.
+
+ Transcript text is fed back to ToneClassifier as a weak signal (e.g.
+ "unfortunately" biases toward apologetic). Diarizer output sets the
+ speaker_id on the VoiceFrame.
+ """
+ import base64
+ from functools import partial
+
+ import numpy as np
+
+ from cf_voice.classify import ToneClassifier, _apply_transcript_hints, _AFFECT_TO_LABEL
+
+ pcm = base64.b64decode(audio_b64)
+ audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32_768.0
+
+ # Lazy-load models on first real call
+ if self._tone is None:
+ self._tone = ToneClassifier.from_env()
+ stt = self._load_stt()
+ diarizer = self._load_diarizer()
+ dimensional = self._load_dimensional()
+ prosodic = self._load_prosodic()
+ accent_clf = self._load_accent()
+
+ # Build coroutines — all run in thread pool executors internally.
+ # Dimensional, prosodic, and accent run in parallel with SER/STT/diarization.
+ tone_coro = self._tone.classify_async(audio)
+ stt_coro = stt.transcribe_chunk_async(pcm, language=language) if stt else _noop_stt()
+ diarize_coro = diarizer.diarize_async(audio, num_speakers=num_speakers) if diarizer else _noop_diarize()
+ loop = asyncio.get_running_loop()
+ acoustic_coro = loop.run_in_executor(
+ None, partial(self._acoustic.classify_window, audio.tobytes(), timestamp)
+ )
+ dimensional_coro = dimensional.classify_async(audio) if dimensional else _noop_stt()
+ prosodic_coro = prosodic.extract_async(audio) if prosodic else _noop_stt()
+ accent_coro = loop.run_in_executor(
+ None, partial(accent_clf.classify, audio.tobytes())
+ ) if accent_clf else _noop_stt()
+
+ (
+ tone_result, stt_result, diarize_segs, acoustic,
+ dimensional_result, prosodic_result, accent_result,
+ ) = await asyncio.gather(
+ tone_coro, stt_coro, diarize_coro, acoustic_coro,
+ dimensional_coro, prosodic_coro, accent_coro,
+ return_exceptions=True,
+ )
+
+ # Extract transcript text (STT optional)
+ transcript = ""
+ if stt_result and not isinstance(stt_result, BaseException):
+ transcript = stt_result.text # type: ignore[union-attr]
+
+ # Apply transcript weak signal to affect if STT produced text
+ if transcript and not isinstance(tone_result, BaseException):
+ new_affect = _apply_transcript_hints(tone_result.affect, transcript) # type: ignore[union-attr]
+ if new_affect != tone_result.affect: # type: ignore[union-attr]
+ from cf_voice.classify import ToneResult
+ tone_result = ToneResult( # type: ignore[assignment]
+ label=_AFFECT_TO_LABEL.get(new_affect, tone_result.label), # type: ignore[union-attr]
+ affect=new_affect,
+ confidence=tone_result.confidence, # type: ignore[union-attr]
+ prosody_flags=tone_result.prosody_flags, # type: ignore[union-attr]
+ )
+
+ # Get speaker_id from diarization (falls back to "speaker_a")
+ speaker_id = "speaker_a"
+ if isinstance(diarize_segs, BaseException):
+ logger.warning("Diarizer failed in gather: %s", diarize_segs)
+ elif diarizer and diarize_segs is not None:
+ window_mid = len(audio) / 2.0 / 16_000.0
+ speaker_id = diarizer.speaker_at( # type: ignore[arg-type]
+ diarize_segs, window_mid, tracker=self._speaker_tracker
+ )
+ logger.debug("diarize: segs=%d speaker=%s mid=%.3f", len(diarize_segs), speaker_id, window_mid)
+
+ if isinstance(tone_result, BaseException):
+ logger.warning("Tone classifier failed: %s", tone_result)
+ return []
+
+ # Unpack dimensional result (None when classifier is disabled or failed)
+ dim = None
+ if dimensional_result and not isinstance(dimensional_result, BaseException):
+ dim = dimensional_result
+
+ # Unpack prosodic result. If dimensional is also available, pass the
+ # calm-positive score so sarcasm_risk benefits from both signals.
+ pros = None
+ if prosodic_result and not isinstance(prosodic_result, BaseException):
+ if dim is not None:
+ # Re-compute sarcasm_risk with dimensional context
+ from cf_voice.prosody import _compute_sarcasm_risk
+ calm_pos = dim.calm_positive_score()
+ updated_risk = _compute_sarcasm_risk(
+ flat_f0=prosodic_result.flat_f0_score, # type: ignore[union-attr]
+ calm_positive=calm_pos,
+ )
+ from cf_voice.prosody import ProsodicSignal
+ pros = ProsodicSignal(
+ f0_mean=prosodic_result.f0_mean, # type: ignore[union-attr]
+ f0_std=prosodic_result.f0_std, # type: ignore[union-attr]
+ jitter=prosodic_result.jitter, # type: ignore[union-attr]
+ shimmer=prosodic_result.shimmer, # type: ignore[union-attr]
+ loudness=prosodic_result.loudness, # type: ignore[union-attr]
+ flat_f0_score=prosodic_result.flat_f0_score, # type: ignore[union-attr]
+ sarcasm_risk=updated_risk,
+ )
+ else:
+ pros = prosodic_result
+
+ frame = VoiceFrame(
+ label=tone_result.label, # type: ignore[union-attr]
+ confidence=tone_result.confidence, # type: ignore[union-attr]
+ speaker_id=speaker_id,
+ shift_magnitude=0.0,
+ timestamp=timestamp,
+ valence=dim.valence if dim else None,
+ arousal=dim.arousal if dim else None,
+ dominance=dim.dominance if dim else None,
+ sarcasm_risk=pros.sarcasm_risk if pros else None,
+ flat_f0_score=pros.flat_f0_score if pros else None,
+ )
+ tone_event: ToneEvent = tone_event_from_voice_frame(
+ frame_label=frame.label,
+ frame_confidence=frame.confidence,
+ shift_magnitude=frame.shift_magnitude,
+ timestamp=frame.timestamp,
+ elcor=elcor,
+ )
+ tone_event.session_id = session_id
+ tone_event.speaker_id = speaker_id
+ # Attach dimensional and prosodic results to the wire event
+ tone_event.valence = frame.valence
+ tone_event.arousal = frame.arousal
+ tone_event.dominance = frame.dominance
+ tone_event.sarcasm_risk = frame.sarcasm_risk
+ tone_event.flat_f0_score = frame.flat_f0_score
+
+ # Trajectory and coherence signals — only when dimensional is running
+ if dim:
+ from collections import deque as _deque
+ from cf_voice.trajectory import compute_trajectory
+
+ spk_buffer = self._dim_buffer.setdefault(
+ speaker_id, _deque(maxlen=self._buffer_window)
+ )
+ prior_affect = self._last_ser_affect.get(speaker_id)
+ traj, coher = compute_trajectory(
+ spk_buffer, dim, tone_result.affect, prior_affect # type: ignore[union-attr]
+ )
+ # Update buffer and affect history after computing (not before)
+ spk_buffer.append(dim)
+ self._last_ser_affect[speaker_id] = tone_result.affect # type: ignore[union-attr]
+
+ tone_event.arousal_delta = traj.arousal_delta if traj.baseline_established else None
+ tone_event.valence_delta = traj.valence_delta if traj.baseline_established else None
+ tone_event.trend = traj.trend if traj.baseline_established else None
+ tone_event.coherence_score = coher.coherence_score
+ tone_event.suppression_flag = coher.suppression_flag
+ tone_event.reframe_type = coher.reframe_type if coher.reframe_type != "none" else None
+ tone_event.affect_divergence = coher.affect_divergence
+
+ logger.debug(
+ "Dimensional: valence=%.3f arousal=%.3f dominance=%.3f quadrant=%s "
+ "trend=%s coherence=%.2f suppressed=%s reframe=%s",
+ dim.valence, dim.arousal, dim.dominance, dim.affect_quadrant(),
+ traj.trend, coher.coherence_score, coher.suppression_flag, coher.reframe_type,
+ )
+
+ if pros:
+ logger.debug(
+ "Prosodic: flat_f0=%.3f sarcasm_risk=%.3f",
+ pros.flat_f0_score, pros.sarcasm_risk,
+ )
+
+ events: list[AudioEvent] = [tone_event]
+
+ # Emit transcript event so consumers can display live STT
+ if transcript:
+ events.append(AudioEvent(
+ timestamp=timestamp,
+ event_type="transcript", # type: ignore[arg-type]
+ label=transcript,
+ confidence=1.0,
+ speaker_id=speaker_id,
+ ))
+
+ # Acoustic events (queue / speaker type / environ / scene)
+ scene_label: str | None = None
+ environ_labels: list[str] = []
+ speaker_label: str | None = None
+ if not isinstance(acoustic, BaseException):
+ if acoustic.queue: # type: ignore[union-attr]
+ events.append(acoustic.queue) # type: ignore[union-attr]
+ if acoustic.speaker: # type: ignore[union-attr]
+ events.append(acoustic.speaker) # type: ignore[union-attr]
+ speaker_label = acoustic.speaker.label # type: ignore[union-attr]
+ if acoustic.environ: # type: ignore[union-attr]
+ events.append(acoustic.environ) # type: ignore[union-attr]
+ environ_labels = [acoustic.environ.label] # type: ignore[union-attr]
+ if acoustic.scene: # type: ignore[union-attr]
+ events.append(acoustic.scene) # type: ignore[union-attr]
+ scene_label = acoustic.scene.label # type: ignore[union-attr]
+
+ # Accent event (optional — gated by CF_VOICE_ACCENT=1)
+ accent_region: str | None = None
+ if accent_result and not isinstance(accent_result, BaseException):
+ accent_region = accent_result.region # type: ignore[union-attr]
+ events.append(AudioEvent(
+ timestamp=timestamp,
+ event_type="accent", # type: ignore[arg-type]
+ label=accent_region,
+ confidence=accent_result.confidence, # type: ignore[union-attr]
+ speaker_id=speaker_id,
+ ))
+
+ # Privacy risk scoring — local only, never transmitted
+ from cf_voice.privacy import score_privacy_risk
+ risk = score_privacy_risk(
+ scene=scene_label,
+ environ_labels=environ_labels,
+ speaker=speaker_label,
+ accent=accent_region,
+ )
+ if risk.level != "low":
+ logger.info(
+ "privacy_risk=%s flags=%s session=%s",
+ risk.level, risk.flags, session_id,
+ )
+ # Attach risk to the tone event so Linnet can surface the gate
+ tone_event.prosody_flags = list(tone_event.prosody_flags) + [f"privacy:{risk.level}"]
+
+ return events
def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
"""
- Apply tone classification to a raw frame.
+ Apply tone classification to a raw frame (streaming path).
Stub: identity transform — returns frame unchanged.
- Real: replace label + confidence with classifier output.
+ Real (Navigation v0.2.x): replace label + confidence with classifier output.
"""
return frame
diff --git a/cf_voice/diarize.py b/cf_voice/diarize.py
index 217dd51..4d8ed18 100644
--- a/cf_voice/diarize.py
+++ b/cf_voice/diarize.py
@@ -7,12 +7,16 @@
# Requires accepting gated model terms at:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
+#
+# Enable with: CF_VOICE_DIARIZE=1 (default off)
+# Requires: HF_TOKEN set in environment
from __future__ import annotations
import asyncio
import logging
import os
-from dataclasses import dataclass
+import string
+from dataclasses import dataclass, field
import numpy as np
@@ -21,11 +25,16 @@ logger = logging.getLogger(__name__)
_DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
_SAMPLE_RATE = 16_000
+# Label returned when two speakers overlap in the same window
+SPEAKER_MULTIPLE = "Multiple"
+# Label returned when no speaker segment covers the timestamp (silence / VAD miss)
+SPEAKER_UNKNOWN = "speaker_a"
+
@dataclass
class SpeakerSegment:
"""A speaker-labelled time range within an audio window."""
- speaker_id: str # ephemeral local label, e.g. "SPEAKER_00"
+ speaker_id: str # raw pyannote label, e.g. "SPEAKER_00"
start_s: float
end_s: float
@@ -34,6 +43,51 @@ class SpeakerSegment:
return self.end_s - self.start_s
+class SpeakerTracker:
+ """
+ Maps ephemeral pyannote speaker IDs to stable per-session friendly labels.
+
+ pyannote returns IDs like "SPEAKER_00", "SPEAKER_01" which are opaque and
+ may differ across audio windows. SpeakerTracker assigns a consistent
+ friendly label ("Speaker A", "Speaker B", ...) for the lifetime of one
+ session, based on first-seen order.
+
+ Speaker embeddings are never stored — only the raw_id → label string map,
+ which contains no biometric information. Call reset() at session end to
+ discard the map.
+
+ For sessions with more than 26 speakers, labels wrap to "Speaker AA",
+ "Speaker AB", etc. (unlikely in practice but handled gracefully).
+ """
+
+ def __init__(self) -> None:
+ self._map: dict[str, str] = {}
+ self._counter: int = 0
+
+ def label(self, raw_id: str) -> str:
+ """Return the friendly label for a pyannote speaker ID."""
+ if raw_id not in self._map:
+ self._map[raw_id] = self._next_label()
+ return self._map[raw_id]
+
+ def reset(self) -> None:
+ """Discard all label mappings. Call at session end."""
+ self._map.clear()
+ self._counter = 0
+
+ def _next_label(self) -> str:
+ idx = self._counter
+ self._counter += 1
+ letters = string.ascii_uppercase
+ n = len(letters)
+ if idx < n:
+ return f"Speaker {letters[idx]}"
+ # Two-letter suffix for >26 speakers
+ outer = idx // n
+ inner = idx % n
+ return f"Speaker {letters[outer - 1]}{letters[inner]}"
+
+
class Diarizer:
"""
Async wrapper around pyannote.audio speaker diarization pipeline.
@@ -47,9 +101,9 @@ class Diarizer:
Usage
-----
diarizer = Diarizer.from_env()
+ tracker = SpeakerTracker()
segments = await diarizer.diarize_async(audio_float32)
- for seg in segments:
- print(seg.speaker_id, seg.start_s, seg.end_s)
+ label = diarizer.speaker_at(segments, timestamp_s=1.0, tracker=tracker)
Navigation v0.2.x wires this into ContextClassifier so that each
VoiceFrame carries the correct speaker_id from diarization output.
@@ -67,7 +121,7 @@ class Diarizer:
logger.info("Loading diarization pipeline %s", _DIARIZATION_MODEL)
self._pipeline = Pipeline.from_pretrained(
_DIARIZATION_MODEL,
- use_auth_token=hf_token,
+ token=hf_token,
)
# Move to GPU if available
@@ -92,16 +146,29 @@ class Diarizer:
return cls(hf_token=token)
def _diarize_sync(
- self, audio_float32: np.ndarray, sample_rate: int = _SAMPLE_RATE
+ self,
+ audio_float32: np.ndarray,
+ sample_rate: int = _SAMPLE_RATE,
+ num_speakers: int | None = None,
) -> list[SpeakerSegment]:
- """Synchronous diarization — always call via diarize_async."""
+ """Synchronous diarization — always call via diarize_async.
+
+ num_speakers: when set, passed as min_speakers=max_speakers to pyannote,
+ which skips the agglomeration heuristic and improves boundary accuracy
+ for known-size conversations (e.g. 2-person call).
+ """
import torch
# pyannote expects (channels, samples) float32 tensor
waveform = torch.from_numpy(audio_float32[np.newaxis, :].astype(np.float32))
- diarization = self._pipeline(
- {"waveform": waveform, "sample_rate": sample_rate}
- )
+ pipeline_kwargs: dict = {"waveform": waveform, "sample_rate": sample_rate}
+ if num_speakers and num_speakers > 0:
+ pipeline_kwargs["min_speakers"] = num_speakers
+ pipeline_kwargs["max_speakers"] = num_speakers
+ output = self._pipeline(pipeline_kwargs)
+ # pyannote >= 3.3 wraps results in DiarizeOutput; earlier versions return
+ # Annotation directly. Normalise to Annotation before iterating.
+ diarization = getattr(output, "speaker_diarization", output)
segments: list[SpeakerSegment] = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
@@ -118,6 +185,7 @@ class Diarizer:
self,
audio_float32: np.ndarray,
sample_rate: int = _SAMPLE_RATE,
+ num_speakers: int | None = None,
) -> list[SpeakerSegment]:
"""
Diarize an audio window without blocking the event loop.
@@ -125,22 +193,58 @@ class Diarizer:
audio_float32 should be 16kHz mono float32.
Typical input is a 2-second window from MicVoiceIO (32000 samples).
Returns segments ordered by start_s.
+
+ num_speakers: passed through to pyannote as min_speakers=max_speakers
+ when set and > 0. Improves accuracy for known speaker counts.
"""
- loop = asyncio.get_event_loop()
+ from functools import partial
+ loop = asyncio.get_running_loop()
return await loop.run_in_executor(
- None, self._diarize_sync, audio_float32, sample_rate
+ None,
+ partial(self._diarize_sync, audio_float32, sample_rate, num_speakers),
)
def speaker_at(
- self, segments: list[SpeakerSegment], timestamp_s: float
+ self,
+ segments: list[SpeakerSegment],
+ timestamp_s: float,
+ tracker: SpeakerTracker | None = None,
+ window_s: float = 1.0,
) -> str:
"""
- Return the speaker_id active at a given timestamp within the window.
+ Return the friendly speaker label dominating a window around timestamp_s.
- Falls back to "speaker_a" if no segment covers the timestamp
- (e.g. during silence or at window boundaries).
+ Strategy (in order):
+ 1. If segments directly cover timestamp_s: use majority rule among them.
+ 2. If timestamp_s falls in a silence gap: use the speaker with the most
+ total speaking time across the whole window [0, window_s]. This handles
+ pauses between pyannote segments without falling back to "speaker_a".
+ 3. No segments at all: SPEAKER_UNKNOWN.
+
+ tracker is optional; if omitted, raw pyannote IDs are returned as-is.
"""
+ if not segments:
+ return SPEAKER_UNKNOWN
+
+ covering = [seg for seg in segments if seg.start_s <= timestamp_s <= seg.end_s]
+
+ if len(covering) >= 2:
+ return SPEAKER_MULTIPLE
+
+ if len(covering) == 1:
+ raw_id = covering[0].speaker_id
+ return tracker.label(raw_id) if tracker else raw_id
+
+ # Midpoint fell in a silence gap — find dominant speaker over the window.
+ from collections import defaultdict
+ duration_by_speaker: dict[str, float] = defaultdict(float)
+ win_start = max(0.0, timestamp_s - window_s / 2)
+ win_end = timestamp_s + window_s / 2
for seg in segments:
- if seg.start_s <= timestamp_s <= seg.end_s:
- return seg.speaker_id
- return "speaker_a"
+ overlap = min(seg.end_s, win_end) - max(seg.start_s, win_start)
+ if overlap > 0:
+ duration_by_speaker[seg.speaker_id] += overlap
+ if not duration_by_speaker:
+ return SPEAKER_UNKNOWN
+ raw_id = max(duration_by_speaker, key=lambda k: duration_by_speaker[k])
+ return tracker.label(raw_id) if tracker else raw_id
diff --git a/cf_voice/dimensional.py b/cf_voice/dimensional.py
new file mode 100644
index 0000000..26ace1d
--- /dev/null
+++ b/cf_voice/dimensional.py
@@ -0,0 +1,190 @@
+# cf_voice/dimensional.py — audeering dimensional emotion model
+#
+# BSL 1.1: real inference. Requires [inference] extras.
+#
+# Model: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
+# Outputs three continuous 0-1 scores:
+# valence: negative (0) to positive (1)
+# arousal: low energy (0) to high energy (1)
+# dominance: submissive (0) to dominant (1)
+#
+# Trained on MSP-Podcast (in-the-wild conversational speech), not acted speech.
+# This is the key differentiator from SER models trained on RAVDESS/IEMOCAP.
+#
+# Enable with: CF_VOICE_DIMENSIONAL=1 (default off until audeering model is
+# downloaded — ~1.5GB, adds ~800MB GPU VRAM)
+#
+# HuggingFace model page:
+# https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+from dataclasses import dataclass
+from functools import partial
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+_SAMPLE_RATE = 16_000
+_DIMENSIONAL_MODEL_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
+
+
+@dataclass
+class DimensionalResult:
+ """
+ Output of the audeering dimensional emotion model.
+
+ All scores are 0.0-1.0 continuous values:
+ valence: negative affect (0) to positive affect (1)
+ arousal: low energy / calm (0) to high energy / excited (1)
+ dominance: submissive / uncertain (0) to dominant / assertive (1)
+
+ Sarcasm signal: low arousal + higher valence = "calm-positive" profile.
+ Combined with flat F0 (prosody.py) and text divergence (linnet#22) for
+ the full multi-signal sarcasm heuristic.
+ """
+ valence: float
+ arousal: float
+ dominance: float
+
+ def affect_quadrant(self) -> str:
+ """
+ Map VAD position to a descriptive quadrant label.
+
+ These are reference labels for logging and debugging, not user-facing.
+ The annotation layer (Elcor) handles user-facing interpretation.
+ """
+ v_high = self.valence >= 0.5
+ a_high = self.arousal >= 0.5
+ if v_high and a_high:
+ return "enthusiastic"
+ if v_high and not a_high:
+ return "calm_positive" # sarcasm candidate when paired with flat F0
+ if not v_high and a_high:
+ return "frustrated_urgent"
+ return "sad_resigned"
+
+ def calm_positive_score(self) -> float:
+ """
+ 0-1 score indicating how strongly the VAD position matches the
+ calm-positive sarcasm candidate profile (low arousal, higher valence).
+
+ Used as one component of the combined sarcasm heuristic.
+ """
+ valence_pos = max(0.0, self.valence - 0.5) * 2.0 # how positive
+ arousal_low = 1.0 - self.arousal # how calm
+ return (valence_pos * 0.5 + arousal_low * 0.5)
+
+
+class DimensionalClassifier:
+ """
+ Async wrapper around the audeering wav2vec2 dimensional emotion model.
+
+ The model runs in a thread pool executor to avoid blocking asyncio.
+ Loaded once on first call and reused; the underlying wav2vec2 model
+ lands on CUDA when available (same device as the SER model in classify.py).
+
+ Usage
+ -----
+ clf = DimensionalClassifier.from_env()
+ result = await clf.classify_async(audio_float32)
+ print(result.valence, result.arousal, result.dominance)
+ """
+
+ def __init__(self) -> None:
+ self._model = None
+ self._processor = None
+ self._loaded = False
+
+ def _ensure_loaded(self) -> None:
+ """Load model and processor on first inference call (not at construction)."""
+ if self._loaded:
+ return
+ self._loaded = True
+
+ try:
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
+ except ImportError as exc:
+ raise ImportError(
+ "transformers is required for dimensional emotion classification. "
+ "Install with: pip install cf-voice[inference]"
+ ) from exc
+
+ logger.info("Loading dimensional emotion model %s", _DIMENSIONAL_MODEL_ID)
+ self._processor = Wav2Vec2Processor.from_pretrained(_DIMENSIONAL_MODEL_ID)
+ self._model = Wav2Vec2ForSequenceClassification.from_pretrained(_DIMENSIONAL_MODEL_ID)
+
+ try:
+ import torch
+ if torch.cuda.is_available():
+ self._model = self._model.to(torch.device("cuda"))
+ logger.info("Dimensional model on CUDA")
+ except ImportError:
+ pass
+
+ self._model.eval()
+
+ def _classify_sync(self, audio_float32: np.ndarray) -> DimensionalResult:
+ """
+ Synchronous inference. Always call via classify_async.
+
+ The audeering model outputs [valence, arousal, dominance] as logits
+ in the range 0-1 (sigmoid regression heads, not softmax). The model was
+ fine-tuned on MSP-Podcast with per-dimension regression, not classification.
+ """
+ self._ensure_loaded()
+
+ try:
+ import torch
+ except ImportError as exc:
+ raise ImportError("torch is required for dimensional inference") from exc
+
+ inputs = self._processor(
+ audio_float32,
+ sampling_rate=_SAMPLE_RATE,
+ return_tensors="pt",
+ padding=True,
+ )
+
+ if torch.cuda.is_available():
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+ with torch.no_grad():
+ logits = self._model(**inputs).logits
+
+ # Model outputs [valence, arousal, dominance] in a single (1, 3) tensor
+ scores = logits[0].cpu().float().numpy()
+ valence = float(np.clip(scores[0], 0.0, 1.0))
+ arousal = float(np.clip(scores[1], 0.0, 1.0))
+ dominance = float(np.clip(scores[2], 0.0, 1.0))
+
+ return DimensionalResult(
+ valence=round(valence, 4),
+ arousal=round(arousal, 4),
+ dominance=round(dominance, 4),
+ )
+
+ async def classify_async(self, audio_float32: np.ndarray) -> DimensionalResult:
+ """
+ Classify audio without blocking the event loop.
+
+ Runs in a thread pool executor. Designed to be gathered alongside
+ the SER and diarization coroutines in context._classify_real_async().
+ """
+ loop = asyncio.get_running_loop()
+ return await loop.run_in_executor(
+ None, partial(self._classify_sync, audio_float32)
+ )
+
+ @classmethod
+ def from_env(cls) -> "DimensionalClassifier":
+ """Construct from environment. Raises if CF_VOICE_DIMENSIONAL is not set."""
+ if os.environ.get("CF_VOICE_DIMENSIONAL", "0") != "1":
+ raise EnvironmentError(
+ "CF_VOICE_DIMENSIONAL=1 is required to enable the audeering dimensional model. "
+ "Add it to your .env file. The model requires ~800MB GPU VRAM."
+ )
+ return cls()
diff --git a/cf_voice/events.py b/cf_voice/events.py
index eb4a578..3148391 100644
--- a/cf_voice/events.py
+++ b/cf_voice/events.py
@@ -10,10 +10,10 @@ from __future__ import annotations
from dataclasses import dataclass, field
from typing import Literal
-EventType = Literal["queue", "speaker", "environ", "tone"]
+EventType = Literal["queue", "speaker", "environ", "tone", "transcript", "scene", "accent"]
# ── Queue state labels ────────────────────────────────────────────────────────
-# Detected from YAMNet acoustic event classification
+# Detected from AST acoustic event classification
QUEUE_LABELS = Literal[
"hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone"
]
@@ -21,13 +21,36 @@ QUEUE_LABELS = Literal[
# ── Speaker type labels ───────────────────────────────────────────────────────
# Detected from pyannote VAD + custom IVR-vs-human head
SPEAKER_LABELS = Literal[
- "ivr_synth", "human_single", "human_multi", "transfer", "no_speaker"
+ "ivr_synth", "human_single", "human_multi", "transfer", "no_speaker",
+ "background_voices",
]
# ── Environmental labels ──────────────────────────────────────────────────────
-# Background shift is the primary AMD (answering machine detection) signal
+# Background shift is the primary AMD (answering machine detection) signal.
+# Telephony labels + general-purpose acoustic scene labels.
ENVIRON_LABELS = Literal[
- "call_center", "music", "background_shift", "noise_floor_change", "quiet"
+ # Telephony
+ "call_center", "music", "background_shift", "noise_floor_change", "quiet",
+ # Nature
+ "birdsong", "wind", "rain", "water",
+ # Urban
+ "traffic", "crowd_chatter", "street_signal", "construction",
+ # Indoor
+ "hvac", "keyboard_typing", "restaurant",
+]
+
+# ── Acoustic scene labels ─────────────────────────────────────────────────────
+# Broad scene category — primary input to privacy risk scoring.
+SCENE_LABELS = Literal[
+ "indoor_quiet", "indoor_crowd", "outdoor_urban", "outdoor_nature",
+ "vehicle", "public_transit",
+]
+
+# ── Accent / language labels ──────────────────────────────────────────────────
+# Regional accent of primary speaker. Gated by CF_VOICE_ACCENT=1.
+ACCENT_LABELS = Literal[
+ "en_gb", "en_us", "en_au", "en_ca", "en_in",
+ "fr", "es", "de", "zh", "ja", "other",
]
# ── Tone / affect labels ──────────────────────────────────────────────────────
@@ -86,12 +109,35 @@ class ToneEvent(AudioEvent):
The subtext field carries the human-readable annotation.
Format is controlled by the caller (elcor flag in the classify request).
+
+ Dimensional emotion (Navigation v0.2.x — audeering model):
+ valence / arousal / dominance are None when the dimensional classifier
+ is not enabled (CF_VOICE_DIMENSIONAL != "1").
+
+ Prosodic signals (Navigation v0.2.x — openSMILE):
+ sarcasm_risk / flat_f0_score are None when extractor is not enabled.
"""
affect: str = "neutral"
shift_magnitude: float = 0.0
shift_direction: str = "stable" # "warmer" | "colder" | "more_urgent" | "stable"
prosody_flags: list[str] = field(default_factory=list)
session_id: str = "" # caller-assigned; correlates events to a session
+ # Dimensional emotion scores (audeering, optional)
+ valence: float | None = None
+ arousal: float | None = None
+ dominance: float | None = None
+ # Prosodic signals (openSMILE, optional)
+ sarcasm_risk: float | None = None
+ flat_f0_score: float | None = None
+ # Trajectory signals (rolling buffer — activates after BASELINE_MIN frames)
+ arousal_delta: float | None = None
+ valence_delta: float | None = None
+ trend: str | None = None # "stable"|"escalating"|"suppressed"|…
+ # Coherence signals (SER vs VAD cross-comparison)
+ coherence_score: float | None = None
+ suppression_flag: bool | None = None
+ reframe_type: str | None = None # "none"|"genuine"|"surface"
+ affect_divergence: float | None = None
def __post_init__(self) -> None:
# Force event_type to "tone" regardless of what the caller passed.
diff --git a/cf_voice/io.py b/cf_voice/io.py
index 0257afd..03ad7d8 100644
--- a/cf_voice/io.py
+++ b/cf_voice/io.py
@@ -118,5 +118,12 @@ def make_io(
if use_mock:
return MockVoiceIO(interval_s=interval_s)
- from cf_voice.capture import MicVoiceIO
- return MicVoiceIO(device_index=device_index)
+ try:
+ from cf_voice.capture import MicVoiceIO
+ return MicVoiceIO(device_index=device_index)
+ except ImportError as exc:
+ raise NotImplementedError(
+ "Real audio capture requires [inference] extras. "
+ "Install with: pip install cf-voice[inference]\n"
+ f"Missing: {exc}"
+ ) from exc
diff --git a/cf_voice/models.py b/cf_voice/models.py
index 8cd6535..f456347 100644
--- a/cf_voice/models.py
+++ b/cf_voice/models.py
@@ -13,19 +13,30 @@ class VoiceFrame:
A single annotated moment in a voice stream.
Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
- (tone classification, speaker diarization).
+ (tone classification, speaker diarization, dimensional emotion).
Fields
------
label Tone annotation, e.g. "Warmly impatient" or "Deflecting".
Generic by default; Elcor-style prefix format is an
easter egg surfaced by the product UI, not set here.
- confidence 0.0–1.0. Below ~0.5 the annotation is speculative.
+ confidence 0.0-1.0. Below ~0.5 the annotation is speculative.
speaker_id Ephemeral local label ("speaker_a", "speaker_b").
Not tied to identity — resets each session.
- shift_magnitude Delta from the previous frame's tone, 0.0–1.0.
+ shift_magnitude Delta from the previous frame's tone, 0.0-1.0.
High values indicate a meaningful register shift.
timestamp Session-relative seconds since capture started.
+
+ Dimensional emotion (audeering model — Navigation v0.2.x, optional):
+ valence 0.0-1.0. Negative affect (0) to positive affect (1).
+ arousal 0.0-1.0. Low energy / calm (0) to high energy / excited (1).
+ dominance 0.0-1.0. Submissive / uncertain (0) to assertive / dominant (1).
+
+ Prosodic features (openSMILE eGeMAPS — Navigation v0.2.x, optional):
+ sarcasm_risk 0.0-1.0 heuristic score: flat F0 + calm-positive VAD +
+ text divergence (linnet#22). All three signals required for
+ high confidence — audio-only signals are weak priors.
+ flat_f0_score Normalised F0 flatness: 1.0 = maximally flat pitch.
"""
label: str
@@ -34,6 +45,15 @@ class VoiceFrame:
shift_magnitude: float
timestamp: float
+ # Dimensional emotion scores — None when dimensional classifier is disabled
+ valence: float | None = None
+ arousal: float | None = None
+ dominance: float | None = None
+
+ # Prosodic signals — None when prosodic extractor is disabled
+ sarcasm_risk: float | None = None
+ flat_f0_score: float | None = None
+
def is_reliable(self, threshold: float = 0.6) -> bool:
"""Return True when confidence meets the given threshold."""
return self.confidence >= threshold
diff --git a/cf_voice/prefs.py b/cf_voice/prefs.py
new file mode 100644
index 0000000..a4b7420
--- /dev/null
+++ b/cf_voice/prefs.py
@@ -0,0 +1,181 @@
+# cf_voice/prefs.py — user preference hooks for cf-core preferences module
+#
+# MIT licensed. Provides voice-specific preference keys and helpers.
+#
+# When circuitforge_core is installed, reads/writes from the shared preference
+# store (LocalFileStore or cloud backend). When it is not installed (standalone
+# cf-voice use), falls back to environment variables only.
+#
+# Preference paths use dot-separated notation (cf-core convention):
+# "voice.elcor_mode" bool — Elcor-style tone annotations
+# "voice.confidence_threshold" float — minimum confidence to emit a frame
+# "voice.whisper_model" str — faster-whisper model size
+# "voice.elcor_prior_frames" int — rolling context window for Elcor LLM
+from __future__ import annotations
+
+import logging
+import os
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# ── Preference key constants ──────────────────────────────────────────────────
+
+PREF_ELCOR_MODE = "voice.elcor_mode"
+PREF_CONFIDENCE_THRESHOLD = "voice.confidence_threshold"
+PREF_WHISPER_MODEL = "voice.whisper_model"
+PREF_ELCOR_PRIOR_FRAMES = "voice.elcor_prior_frames"
+
+# Defaults used when neither preference store nor environment has a value
+_DEFAULTS: dict[str, Any] = {
+ PREF_ELCOR_MODE: False,
+ PREF_CONFIDENCE_THRESHOLD: 0.55,
+ PREF_WHISPER_MODEL: "small",
+ PREF_ELCOR_PRIOR_FRAMES: 4,
+}
+
+# ── Environment variable fallbacks ────────────────────────────────────────────
+
+_ENV_KEYS: dict[str, str] = {
+ PREF_ELCOR_MODE: "CF_VOICE_ELCOR",
+ PREF_CONFIDENCE_THRESHOLD: "CF_VOICE_CONFIDENCE_THRESHOLD",
+ PREF_WHISPER_MODEL: "CF_VOICE_WHISPER_MODEL",
+ PREF_ELCOR_PRIOR_FRAMES: "CF_VOICE_ELCOR_PRIOR_FRAMES",
+}
+
+_COERCE: dict[str, type] = {
+ PREF_ELCOR_MODE: bool,
+ PREF_CONFIDENCE_THRESHOLD: float,
+ PREF_WHISPER_MODEL: str,
+ PREF_ELCOR_PRIOR_FRAMES: int,
+}
+
+
+def _from_env(pref_path: str) -> Any:
+ """Read a preference from its environment variable fallback."""
+ env_key = _ENV_KEYS.get(pref_path)
+ if env_key is None:
+ return None
+ raw = os.environ.get(env_key)
+ if raw is None:
+ return None
+ coerce = _COERCE.get(pref_path, str)
+ try:
+ if coerce is bool:
+ return raw.strip().lower() in ("1", "true", "yes")
+ return coerce(raw)
+ except (ValueError, TypeError):
+ logger.warning("prefs: could not parse env %s=%r as %s", env_key, raw, coerce)
+ return None
+
+
+def _cf_core_store():
+ """Return the cf-core default preference store, or None if not available."""
+ try:
+ from circuitforge_core.preferences import store as _store_mod
+ return _store_mod._DEFAULT_STORE
+ except ImportError:
+ return None
+
+
+# ── Public API ────────────────────────────────────────────────────────────────
+
+
+def get_voice_pref(
+ pref_path: str,
+ user_id: str | None = None,
+ store=None,
+) -> Any:
+ """
+ Read a voice preference value.
+
+ Resolution order:
+ 1. Explicit store (passed in by caller — used for testing or cloud backends)
+ 2. cf-core LocalFileStore (if circuitforge_core is installed)
+ 3. Environment variable fallback
+ 4. Built-in default
+
+ pref_path One of the PREF_* constants, e.g. PREF_ELCOR_MODE.
+ user_id Passed to the store for cloud backends; local store ignores it.
+ """
+ # 1. Explicit store
+ if store is not None:
+ val = store.get(user_id=user_id, path=pref_path, default=None)
+ if val is not None:
+ return val
+
+ # 2. cf-core default store
+ cf_store = _cf_core_store()
+ if cf_store is not None:
+ val = cf_store.get(user_id=user_id, path=pref_path, default=None)
+ if val is not None:
+ return val
+
+ # 3. Environment variable
+ env_val = _from_env(pref_path)
+ if env_val is not None:
+ return env_val
+
+ # 4. Built-in default
+ return _DEFAULTS.get(pref_path)
+
+
+def set_voice_pref(
+ pref_path: str,
+ value: Any,
+ user_id: str | None = None,
+ store=None,
+) -> None:
+ """
+ Write a voice preference value.
+
+ Writes to the explicit store if provided, otherwise to the cf-core default
+ store. Raises RuntimeError if neither is available (env-only mode has no
+ writable persistence).
+ """
+ target = store or _cf_core_store()
+ if target is None:
+ raise RuntimeError(
+ "No writable preference store available. "
+ "Install circuitforge_core or pass a store explicitly."
+ )
+ target.set(user_id=user_id, path=pref_path, value=value)
+
+
+def is_elcor_enabled(user_id: str | None = None, store=None) -> bool:
+ """
+ Convenience: return True if the user has Elcor annotation mode enabled.
+
+ Elcor mode switches tone subtext from generic format ("Tone: Frustrated")
+ to the Mass Effect Elcor prefix format ("With barely concealed frustration:").
+ It is an accessibility feature for autistic and ND users who benefit from
+ explicit tonal annotation. Opt-in, local-only — no data leaves the device.
+
+ Defaults to False.
+ """
+ return bool(get_voice_pref(PREF_ELCOR_MODE, user_id=user_id, store=store))
+
+
+def get_confidence_threshold(user_id: str | None = None, store=None) -> float:
+ """Return the minimum confidence threshold for emitting VoiceFrames (0.0–1.0)."""
+ return float(
+ get_voice_pref(PREF_CONFIDENCE_THRESHOLD, user_id=user_id, store=store)
+ )
+
+
+def get_whisper_model(user_id: str | None = None, store=None) -> str:
+ """Return the faster-whisper model name to use (e.g. "small", "medium")."""
+ return str(get_voice_pref(PREF_WHISPER_MODEL, user_id=user_id, store=store))
+
+
+def get_elcor_prior_frames(user_id: str | None = None, store=None) -> int:
+ """
+ Return the number of prior VoiceFrames to include as context for Elcor
+ label generation. Larger windows produce more contextually aware annotations
+ but increase LLM prompt length and latency.
+
+ Default: 4 frames (~8–10 seconds of rolling context at 2s intervals).
+ """
+ return int(
+ get_voice_pref(PREF_ELCOR_PRIOR_FRAMES, user_id=user_id, store=store)
+ )
diff --git a/cf_voice/privacy.py b/cf_voice/privacy.py
new file mode 100644
index 0000000..4613749
--- /dev/null
+++ b/cf_voice/privacy.py
@@ -0,0 +1,115 @@
+# cf_voice/privacy.py — local acoustic privacy risk scoring
+#
+# MIT licensed. Never transmitted to cloud. Never logged server-side.
+#
+# Derives a privacy_risk level (low / moderate / high) from the combined
+# acoustic fingerprint: scene + environ labels + speaker type + accent.
+#
+# Design rationale (#20):
+# - "outdoor_urban" + "crowd_chatter" + "traffic" → low: clearly public
+# - "indoor_quiet" + "background_voices" → moderate: conversation overheard
+# - "outdoor_nature" + "birdsong" + regional accent → moderate-high: location-identifying compound
+# - "indoor_quiet" + no background voices → low
+#
+# Risk gates (Linnet):
+# high: warn before sending audio chunk to cloud STT; offer local-only fallback
+# moderate: attach privacy_flags to session state, no blocking action
+# low: proceed normally
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Literal
+
+PrivacyLevel = Literal["low", "moderate", "high"]
+
+
+@dataclass
+class PrivacyRisk:
+ """
+ Locally-computed privacy risk for a single audio window.
+
+ level: aggregate risk level
+ flags: ordered list of contributing signal descriptions
+ """
+ level: PrivacyLevel
+ flags: list[str] = field(default_factory=list)
+
+
+# ── Signal sets ───────────────────────────────────────────────────────────────
+
+_PUBLIC_SCENES = {"outdoor_urban", "public_transit"}
+_NATURE_SCENES = {"outdoor_nature"}
+_QUIET_SCENES = {"indoor_quiet"}
+
+_LOCATION_ENVIRON = {"birdsong", "wind", "rain", "water"}
+_URBAN_ENVIRON = {"traffic", "crowd_chatter", "street_signal", "construction"}
+
+
+def score_privacy_risk(
+ scene: str | None,
+ environ_labels: list[str],
+ speaker: str | None,
+ accent: str | None,
+) -> PrivacyRisk:
+ """
+ Derive a PrivacyRisk from the current acoustic fingerprint.
+
+ All inputs are nullable — this function handles partial signals gracefully.
+ Called per audio window; results are never persisted or transmitted.
+
+ Args:
+ scene: SCENE_LABEL string or None
+ environ_labels: list of ENVIRON_LABEL strings active in this window
+ speaker: SPEAKER_LABEL string or None
+ accent: ACCENT_LABEL string or None (None when CF_VOICE_ACCENT disabled)
+ """
+ flags: list[str] = []
+ score = 0 # internal accumulator; maps to level at the end
+
+ environ_set = set(environ_labels)
+
+ # ── Clearly public environments → reduce risk ─────────────────────────────
+ if scene in _PUBLIC_SCENES or environ_set & _URBAN_ENVIRON:
+ flags.append("public_environment")
+ score -= 1
+
+ # ── Background voices: conversation may be overheard ─────────────────────
+ if speaker == "background_voices":
+ flags.append("background_voices_detected")
+ score += 2
+
+ # ── Quiet indoor: no background noise reduces identifiability ────────────
+ if scene in _QUIET_SCENES and speaker not in ("background_voices", "human_multi"):
+ flags.append("controlled_environment")
+ # No score change — neutral
+
+ # ── Nature sounds: alone they suggest a quiet, potentially identifiable location
+ nature_match = environ_set & _LOCATION_ENVIRON
+ if nature_match:
+ flags.append(f"location_signal: {', '.join(sorted(nature_match))}")
+ score += 1
+
+ # ── Nature scene + nature sounds: compound location-identifying signal ────
+ if scene in _NATURE_SCENES and nature_match:
+ flags.append("compound_location_signal")
+ score += 1
+
+ # ── Regional accent + nature: narrows location to region + environment ────
+ if accent and accent not in ("en_us", "other") and nature_match:
+ flags.append(f"accent_plus_location: {accent}")
+ score += 1
+
+ # ── Quiet indoor + background voices: overheard conversation ─────────────
+ if scene in _QUIET_SCENES and speaker == "background_voices":
+ flags.append("overheard_conversation")
+ score += 1
+
+ # ── Map score to level ────────────────────────────────────────────────────
+ if score <= 0:
+ level: PrivacyLevel = "low"
+ elif score <= 2:
+ level = "moderate"
+ else:
+ level = "high"
+
+ return PrivacyRisk(level=level, flags=flags)
diff --git a/cf_voice/prosody.py b/cf_voice/prosody.py
new file mode 100644
index 0000000..f897116
--- /dev/null
+++ b/cf_voice/prosody.py
@@ -0,0 +1,208 @@
+# cf_voice/prosody.py — openSMILE eGeMAPS prosodic feature extraction
+#
+# MIT licensed (opensmile-python package is MIT).
+#
+# Extracts 88 hand-crafted acoustic features from the eGeMAPS v02 feature set:
+# F0 mean / std / percentiles (pitch)
+# Jitter / Shimmer (cycle-to-cycle variation — vocal tension)
+# Energy / loudness envelope
+# MFCCs, spectral centroid
+# Speaking rate, pause ratio
+#
+# Runs on CPU in a thread pool executor — no GPU required. Designed to run
+# in parallel with the GPU classifiers in context._classify_real_async() via
+# asyncio.gather().
+#
+# Enable with: CF_VOICE_PROSODY=1 (default off)
+# Install: pip install opensmile
+#
+# openSMILE docs: https://audeering.github.io/opensmile-python/
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+from dataclasses import dataclass
+from functools import partial
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+_SAMPLE_RATE = 16_000
+
+# F0 std normalisation constant: values below this threshold indicate flat prosody.
+# Derived from eGeMAPS feature "F0semitoneFrom27.5Hz_sma3nz_stddevNorm".
+# A typical conversational F0 std is ~0.3-0.5 semitones. Values under 0.2 are flat.
+_F0_STD_NORM_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_stddevNorm"
+_F0_MEAN_FEATURE = "F0semitoneFrom27.5Hz_sma3nz_amean"
+_LOUDNESS_FEATURE = "loudness_sma3_amean"
+_JITTER_FEATURE = "jitterLocal_sma3nz_amean"
+_SHIMMER_FEATURE = "shimmerLocaldB_sma3nz_amean"
+_SPEECH_RATE_FEATURE = "VoicedSegmentsPerSec"
+
+
+@dataclass
+class ProsodicSignal:
+ """
+ Summary prosodic features for a single audio window.
+
+ These are derived from the openSMILE eGeMAPS v02 feature set.
+ All values are raw feature magnitudes unless noted otherwise.
+
+ f0_mean: Mean F0 in semitones from 27.5Hz reference
+ f0_std: Normalised F0 standard deviation (flatness indicator)
+ jitter: Cycle-to-cycle pitch variation (vocal tension)
+ shimmer: Cycle-to-cycle amplitude variation (vocal stress)
+ loudness: Mean loudness (energy proxy)
+ sarcasm_risk: 0-1 heuristic score combining flat F0, calm-positive
+ audio (from DimensionalResult if available), and optional
+ text-audio divergence (linnet#22 signal, not yet wired).
+ flat_f0_score: Normalised flatness: 1.0 = maximally flat, 0.0 = varied.
+ """
+ f0_mean: float
+ f0_std: float
+ jitter: float
+ shimmer: float
+ loudness: float
+ flat_f0_score: float
+ sarcasm_risk: float
+
+
+def _compute_sarcasm_risk(
+ flat_f0: float,
+ calm_positive: float = 0.0,
+ text_divergence: float = 0.0,
+) -> float:
+ """
+ Heuristic sarcasm indicator. Not a trained model — a signal to combine
+ with text divergence (linnet#22) for the final confidence score.
+
+ flat_f0: Normalised F0 flatness (1.0 = flat, 0.0 = varied).
+ calm_positive: DimensionalResult.calm_positive_score() when available.
+ text_divergence: abs(transcript_sentiment - audio_valence) from linnet#22.
+ Pass 0.0 until the parallel text classifier is wired.
+
+ Weights: flat_f0 (40%), calm_positive (30%), text_divergence (30%).
+ """
+ return min(1.0, flat_f0 * 0.4 + calm_positive * 0.3 + text_divergence * 0.3)
+
+
+class ProsodicExtractor:
+ """
+ openSMILE eGeMAPS feature extractor for a single audio window.
+
+ CPU-bound inference — uses thread pool executor to avoid blocking asyncio.
+ Lazy-loads opensmile on first call so import cost is deferred.
+
+ Usage
+ -----
+ extractor = ProsodicExtractor()
+ signal = await extractor.extract_async(audio_float32)
+ print(signal.flat_f0_score, signal.sarcasm_risk)
+ """
+
+ def __init__(self) -> None:
+ self._smile = None
+
+ def _ensure_loaded(self) -> None:
+ """Lazy-load opensmile on first extract call."""
+ if self._smile is not None:
+ return
+
+ try:
+ import opensmile
+ except ImportError as exc:
+ raise ImportError(
+ "opensmile is required for prosodic feature extraction. "
+ "Install with: pip install opensmile"
+ ) from exc
+
+ self._smile = opensmile.Smile(
+ feature_set=opensmile.FeatureSet.eGeMAPSv02,
+ feature_level=opensmile.FeatureLevel.Functionals,
+ )
+ logger.info("openSMILE eGeMAPS loaded")
+
+ def _extract_sync(
+ self,
+ audio_float32: np.ndarray,
+ calm_positive: float = 0.0,
+ text_divergence: float = 0.0,
+ ) -> ProsodicSignal:
+ """
+ Synchronous feature extraction. Always call via extract_async.
+
+ Returns a ProsodicSignal with eGeMAPS features and a sarcasm risk score.
+ If opensmile raises (e.g. audio too short, no voiced frames), returns a
+ zero-filled ProsodicSignal so the caller does not need to handle exceptions.
+ """
+ self._ensure_loaded()
+
+ try:
+ feats = self._smile.process_signal(audio_float32, _SAMPLE_RATE)
+ row = feats.iloc[0]
+
+ f0_mean = float(row.get(_F0_MEAN_FEATURE, 0.0))
+ f0_std = float(row.get(_F0_STD_NORM_FEATURE, 0.0))
+ jitter = float(row.get(_JITTER_FEATURE, 0.0))
+ shimmer = float(row.get(_SHIMMER_FEATURE, 0.0))
+ loudness = float(row.get(_LOUDNESS_FEATURE, 0.0))
+
+ except Exception as exc:
+ logger.debug("openSMILE extraction failed (likely silent window): %s", exc)
+ return ProsodicSignal(
+ f0_mean=0.0, f0_std=0.0, jitter=0.0,
+ shimmer=0.0, loudness=0.0, flat_f0_score=0.0, sarcasm_risk=0.0,
+ )
+
+ # Normalise F0 variance to a flatness score.
+ # f0_std of 0.4 semitones = neutral baseline → flat_f0 = 0.0
+ # f0_std of 0.0 = maximally flat → flat_f0 = 1.0
+ flat_f0 = 1.0 - min(f0_std / 0.4, 1.0)
+
+ sarcasm = _compute_sarcasm_risk(
+ flat_f0=flat_f0,
+ calm_positive=calm_positive,
+ text_divergence=text_divergence,
+ )
+
+ return ProsodicSignal(
+ f0_mean=round(f0_mean, 4),
+ f0_std=round(f0_std, 4),
+ jitter=round(jitter, 6),
+ shimmer=round(shimmer, 6),
+ loudness=round(loudness, 4),
+ flat_f0_score=round(flat_f0, 4),
+ sarcasm_risk=round(sarcasm, 4),
+ )
+
+ async def extract_async(
+ self,
+ audio_float32: np.ndarray,
+ calm_positive: float = 0.0,
+ text_divergence: float = 0.0,
+ ) -> ProsodicSignal:
+ """
+ Extract prosodic features without blocking the event loop.
+
+ calm_positive: Pass DimensionalResult.calm_positive_score() when
+ dimensional classification has already run.
+ text_divergence: Pass abs(transcript_sentiment - valence) when the
+ parallel text classifier (linnet#22) is wired.
+ """
+ loop = asyncio.get_running_loop()
+ return await loop.run_in_executor(
+ None,
+ partial(self._extract_sync, audio_float32, calm_positive, text_divergence),
+ )
+
+ @classmethod
+ def from_env(cls) -> "ProsodicExtractor":
+ """Construct from environment. Raises if CF_VOICE_PROSODY is not set."""
+ if os.environ.get("CF_VOICE_PROSODY", "0") != "1":
+ raise EnvironmentError(
+ "CF_VOICE_PROSODY=1 is required to enable openSMILE eGeMAPS extraction. "
+ "Add it to your .env and install opensmile: pip install opensmile"
+ )
+ return cls()
diff --git a/cf_voice/stt.py b/cf_voice/stt.py
index 7bb3685..2b62c81 100644
--- a/cf_voice/stt.py
+++ b/cf_voice/stt.py
@@ -46,6 +46,17 @@ class WhisperSTT:
print(result.text)
"""
+ # Known single-token hallucinations that Whisper emits on music/noise with
+ # low no_speech_prob (i.e. Whisper thinks it heard speech). These are too
+ # short to be real utterances in any supported language context.
+ _HALLUCINATION_TOKENS: frozenset[str] = frozenset({
+ "ty", "t y", "bye", "hmm", "mm", "mhm", "uh", "um",
+ })
+
+ # Suppress a transcript if it repeats unchanged across this many consecutive
+ # windows — indicates Whisper is locked into a hallucination loop.
+ _MAX_REPEATS = 2
+
def __init__(
self,
model_name: str = "small",
@@ -77,6 +88,8 @@ class WhisperSTT:
self._device = device
self._model_name = model_name
self._session_prompt: str = ""
+ self._last_text: str = ""
+ self._repeat_count: int = 0
@classmethod
def from_env(cls) -> "WhisperSTT":
@@ -91,7 +104,14 @@ class WhisperSTT:
"""Estimated VRAM usage in MB for this model/compute_type combination."""
return _VRAM_ESTIMATES_MB.get(self._model_name, 1500)
- def _transcribe_sync(self, audio_float32: np.ndarray) -> STTResult:
+ # Segments above this no_speech_prob are hallucinations (silence/music/noise).
+ # faster-whisper sets this per-segment; 0.6 catches the "thank you" / "thanks
+ # for watching" family without cutting off genuine low-energy speech.
+ _NO_SPEECH_THRESHOLD = 0.6
+
+ def _transcribe_sync(
+ self, audio_float32: np.ndarray, language: str | None = None
+ ) -> STTResult:
"""Synchronous transcription — always call via transcribe_chunk_async."""
duration = len(audio_float32) / 16_000.0
@@ -100,22 +120,49 @@ class WhisperSTT:
text="", language="en", duration_s=duration, is_final=False
)
+ # Energy gate: skip Whisper entirely on silent/near-silent audio.
+ # In the sidecar path there is no upstream MicVoiceIO silence gate,
+ # so we must check here. RMS < 0.005 is inaudible; Whisper will
+ # hallucinate "thank you" or "thanks for watching" on silence.
+ rms = float(np.sqrt(np.mean(audio_float32 ** 2)))
+ if rms < 0.005:
+ return STTResult(text="", language="en", duration_s=duration, is_final=False)
+
segments, info = self._model.transcribe(
audio_float32,
- language=None,
- initial_prompt=self._session_prompt or None,
- vad_filter=False, # silence gating happens upstream in MicVoiceIO
+ language=language or None, # None = Whisper auto-detect
+ initial_prompt=None, # No session prompt — on 1s windows it causes
+ # phrase lock-in (model anchors on prior text
+ # rather than fresh audio). Reset via reset_session()
+ # at conversation boundaries instead.
+ vad_filter=True, # Silero VAD — skips non-speech frames
word_timestamps=False,
beam_size=3,
temperature=0.0,
)
- text = " ".join(s.text.strip() for s in segments).strip()
+ # Filter hallucinated segments: discard any segment where Whisper itself
+ # says there is likely no speech (no_speech_prob > threshold). This is
+ # the correct defense against "thank you" / music hallucinations — VAD
+ # alone is insufficient because music harmonics look speech-like to Silero.
+ text = " ".join(
+ s.text.strip()
+ for s in segments
+ if s.no_speech_prob <= self._NO_SPEECH_THRESHOLD
+ ).strip()
- # Rolling context: keep last ~50 words so the next chunk has prior text
- if text:
- words = (self._session_prompt + " " + text).split()
- self._session_prompt = " ".join(words[-50:])
+ # Gate 1: single-token hallucinations that slip past no_speech_prob.
+ if text.lower().rstrip(".,!?") in self._HALLUCINATION_TOKENS:
+ text = ""
+
+ # Gate 2: repetition lock — same non-empty text N windows in a row.
+ if text and text == self._last_text:
+ self._repeat_count += 1
+ if self._repeat_count >= self._MAX_REPEATS:
+ text = ""
+ else:
+ self._last_text = text
+ self._repeat_count = 0
return STTResult(
text=text,
@@ -124,19 +171,29 @@ class WhisperSTT:
is_final=duration >= 1.0 and info.language_probability > 0.5,
)
- async def transcribe_chunk_async(self, pcm_int16: bytes) -> STTResult:
+ async def transcribe_chunk_async(
+ self, pcm_int16: bytes, language: str | None = None
+ ) -> STTResult:
"""
Transcribe a raw PCM Int16 chunk, non-blocking.
pcm_int16 should be 16kHz mono bytes. Typical input is 20 × 100ms
chunks accumulated by MicVoiceIO (2-second window = 64000 bytes).
+
+ language: BCP-47 hint (e.g. "en", "es"). None = Whisper auto-detects,
+ which is slower and more hallucination-prone on short clips.
"""
+ from functools import partial
audio = (
np.frombuffer(pcm_int16, dtype=np.int16).astype(np.float32) / 32768.0
)
- loop = asyncio.get_event_loop()
- return await loop.run_in_executor(None, self._transcribe_sync, audio)
+ loop = asyncio.get_running_loop()
+ return await loop.run_in_executor(
+ None, partial(self._transcribe_sync, audio, language)
+ )
def reset_session(self) -> None:
- """Clear the rolling prompt. Call at the start of each new conversation."""
+ """Clear rolling state. Call at the start of each new conversation."""
self._session_prompt = ""
+ self._last_text = ""
+ self._repeat_count = 0
diff --git a/cf_voice/telephony.py b/cf_voice/telephony.py
new file mode 100644
index 0000000..84d824f
--- /dev/null
+++ b/cf_voice/telephony.py
@@ -0,0 +1,500 @@
+# cf_voice/telephony.py — outbound telephony abstraction
+#
+# Protocol + mock backend: MIT licensed.
+# SignalWireBackend, FreeSWITCHBackend: BSL 1.1 (real telephony, cloud credentials).
+#
+# Consumers (Osprey, Harrier, Ibis, Kestrel) depend only on TelephonyBackend
+# and CallSession — both MIT. The concrete backends are selected by make_telephony()
+# based on the tier and available credentials.
+#
+# Requires optional extras for real backends:
+# pip install cf-voice[signalwire] — SignalWire (paid tier, CF-provisioned)
+# pip install cf-voice[freeswitch] — FreeSWITCH ESL (free tier, self-hosted)
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Literal, Protocol, runtime_checkable
+
+logger = logging.getLogger(__name__)
+
+CallState = Literal[
+ "dialing",
+ "ringing",
+ "in_progress",
+ "hold",
+ "bridged",
+ "completed",
+ "failed",
+ "no_answer",
+ "busy",
+]
+
+
+@dataclass
+class CallSession:
+ """
+ Represents an active or completed outbound call.
+
+ call_sid is the backend-assigned identifier — for SignalWire this is a
+ Twilio-compatible SID string; for FreeSWITCH it is the UUID.
+
+ state is updated by the backend as the call progresses. Consumers should
+ poll via backend.get_state() or subscribe to webhook events.
+ """
+ call_sid: str
+ to: str
+ from_: str
+ state: CallState = "dialing"
+ duration_s: float = 0.0
+ # AMD result: "human" | "machine" | "unknown"
+ # Populated once the backend resolves answering machine detection.
+ amd_result: str = "unknown"
+ error: str | None = None
+
+
+@runtime_checkable
+class TelephonyBackend(Protocol):
+ """
+ Abstract telephony backend interface.
+
+ All methods are async. Implementations must be safe to call from an
+ asyncio event loop. Long-running network operations run in a thread pool
+ (not the caller's responsibility).
+
+ Field names are stable as of cf-voice v0.1.0.
+ """
+
+ async def dial(
+ self,
+ to: str,
+ from_: str,
+ webhook_url: str,
+ *,
+ amd: bool = False,
+ ) -> CallSession:
+ """
+ Initiate an outbound call.
+
+ to / from_ E.164 numbers ("+15551234567").
+ webhook_url URL the backend will POST call events to (SignalWire/TwiML style).
+ amd If True, request answering machine detection. Result lands in
+ CallSession.amd_result once the backend resolves it.
+
+ Returns a CallSession with state="dialing".
+ """
+ ...
+
+ async def send_dtmf(self, call_sid: str, digits: str) -> None:
+ """
+ Send DTMF (dual-tone multi-frequency) tones mid-call.
+
+ digits String of 0-9, *, #, A-D. Each character is one tone.
+ Pauses may be represented as 'w' (0.5s) or 'W' (1s) if the backend
+ supports them.
+ """
+ ...
+
+ async def bridge(self, call_sid: str, target: str) -> None:
+ """
+ Bridge the active call to a second E.164 number or SIP URI.
+
+ Used to connect the user directly to a human agent after Osprey has
+ navigated the IVR. The original call leg remains connected.
+ """
+ ...
+
+ async def hangup(self, call_sid: str) -> None:
+ """Terminate the call. Idempotent — safe to call on already-ended calls."""
+ ...
+
+ async def announce(
+ self,
+ call_sid: str,
+ text: str,
+ voice: str = "default",
+ ) -> None:
+ """
+ Play synthesised speech into the call.
+
+ Implements the adaptive service identification requirement (osprey#21):
+ Osprey announces its identity before navigating an IVR so that the
+ other party can consent to automated interaction.
+
+ voice Backend-specific voice identifier. "default" uses the backend's
+ default TTS voice.
+ """
+ ...
+
+ async def get_state(self, call_sid: str) -> CallState:
+ """Fetch the current state of a call from the backend."""
+ ...
+
+
+# ── Mock backend (MIT) ────────────────────────────────────────────────────────
+
+
+class MockTelephonyBackend:
+ """
+ Synthetic telephony backend for development and CI.
+
+ No real calls are placed. Operations log to cf_voice.telephony and update
+ in-memory CallSession objects. AMD resolves to "human" after a simulated
+ delay.
+
+ Usage:
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+15551234567", "+18005550000", "https://...")
+ await backend.send_dtmf(session.call_sid, "1")
+ await backend.hangup(session.call_sid)
+ """
+
+ def __init__(self, amd_delay_s: float = 0.5) -> None:
+ self._sessions: dict[str, CallSession] = {}
+ self._amd_delay_s = amd_delay_s
+ self._call_counter = 0
+
+ def _next_sid(self) -> str:
+ self._call_counter += 1
+ return f"mock_sid_{self._call_counter:04d}"
+
+ async def dial(
+ self,
+ to: str,
+ from_: str,
+ webhook_url: str,
+ *,
+ amd: bool = False,
+ ) -> CallSession:
+ sid = self._next_sid()
+ session = CallSession(call_sid=sid, to=to, from_=from_, state="ringing")
+ self._sessions[sid] = session
+ logger.debug("MockTelephony: dial %s → %s (sid=%s)", from_, to, sid)
+
+ async def _progress() -> None:
+ await asyncio.sleep(0.05)
+ session.state = "in_progress"
+ if amd:
+ await asyncio.sleep(self._amd_delay_s)
+ session.amd_result = "human"
+ logger.debug("MockTelephony: AMD resolved human (sid=%s)", sid)
+
+ asyncio.create_task(_progress())
+ return session
+
+ async def send_dtmf(self, call_sid: str, digits: str) -> None:
+ self._sessions[call_sid] # KeyError if unknown — intentional
+ logger.debug("MockTelephony: DTMF %r (sid=%s)", digits, call_sid)
+
+ async def bridge(self, call_sid: str, target: str) -> None:
+ session = self._sessions[call_sid]
+ session.state = "bridged"
+ logger.debug("MockTelephony: bridge → %s (sid=%s)", target, call_sid)
+
+ async def hangup(self, call_sid: str) -> None:
+ session = self._sessions.get(call_sid)
+ if session:
+ session.state = "completed"
+ logger.debug("MockTelephony: hangup (sid=%s)", call_sid)
+
+ async def announce(
+ self,
+ call_sid: str,
+ text: str,
+ voice: str = "default",
+ ) -> None:
+ self._sessions[call_sid] # KeyError if unknown — intentional
+ logger.debug(
+ "MockTelephony: announce voice=%s text=%r (sid=%s)", voice, text, call_sid
+ )
+
+ async def get_state(self, call_sid: str) -> CallState:
+ return self._sessions[call_sid].state
+
+
+# ── SignalWire backend (BSL 1.1) ──────────────────────────────────────────────
+
+
+class SignalWireBackend:
+ """
+ SignalWire outbound telephony (Twilio-compatible REST API).
+
+ BSL 1.1 — requires paid tier or self-hosted CF SignalWire project.
+
+ Credentials sourced from environment:
+ CF_SW_PROJECT_ID — SignalWire project ID
+ CF_SW_AUTH_TOKEN — SignalWire auth token
+ CF_SW_SPACE_URL — space URL, e.g. "yourspace.signalwire.com"
+
+ Requires: pip install cf-voice[signalwire]
+ """
+
+ def __init__(
+ self,
+ project_id: str | None = None,
+ auth_token: str | None = None,
+ space_url: str | None = None,
+ ) -> None:
+ try:
+ from signalwire.rest import Client as SWClient # type: ignore[import]
+ except ImportError as exc:
+ raise ImportError(
+ "SignalWire SDK is required for SignalWireBackend. "
+ "Install with: pip install cf-voice[signalwire]"
+ ) from exc
+
+ self._project_id = project_id or os.environ["CF_SW_PROJECT_ID"]
+ self._auth_token = auth_token or os.environ["CF_SW_AUTH_TOKEN"]
+ self._space_url = space_url or os.environ["CF_SW_SPACE_URL"]
+ self._client = SWClient(
+ self._project_id,
+ self._auth_token,
+ signalwire_space_url=self._space_url,
+ )
+ self._loop = asyncio.get_event_loop()
+
+ async def dial(
+ self,
+ to: str,
+ from_: str,
+ webhook_url: str,
+ *,
+ amd: bool = False,
+ ) -> CallSession:
+ call_kwargs: dict = dict(
+ to=to,
+ from_=from_,
+ url=webhook_url,
+ status_callback=webhook_url,
+ )
+ if amd:
+ call_kwargs["machine_detection"] = "Enable"
+ call_kwargs["async_amd"] = True
+
+ call = await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._client.calls.create(**call_kwargs),
+ )
+ return CallSession(
+ call_sid=call.sid,
+ to=to,
+ from_=from_,
+ state="dialing",
+ )
+
+ async def send_dtmf(self, call_sid: str, digits: str) -> None:
+ await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._client.calls(call_sid).update(
+ twiml=f""
+ ),
+ )
+
+ async def bridge(self, call_sid: str, target: str) -> None:
+ await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._client.calls(call_sid).update(
+ twiml=(
+ f"{target}"
+ )
+ ),
+ )
+
+ async def hangup(self, call_sid: str) -> None:
+ await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._client.calls(call_sid).update(status="completed"),
+ )
+
+ async def announce(
+ self,
+ call_sid: str,
+ text: str,
+ voice: str = "alice",
+ ) -> None:
+ await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._client.calls(call_sid).update(
+ twiml=f"{text}"
+ ),
+ )
+
+ async def get_state(self, call_sid: str) -> CallState:
+ call = await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._client.calls(call_sid).fetch(),
+ )
+ _sw_map: dict[str, CallState] = {
+ "queued": "dialing", "ringing": "ringing", "in-progress": "in_progress",
+ "completed": "completed", "failed": "failed", "busy": "busy",
+ "no-answer": "no_answer",
+ }
+ return _sw_map.get(call.status, "failed")
+
+
+# ── FreeSWITCH backend (BSL 1.1) ─────────────────────────────────────────────
+
+
+class FreeSWITCHBackend:
+ """
+ Self-hosted FreeSWITCH outbound telephony via ESL (event socket layer).
+
+ BSL 1.1 — requires free tier + user-provisioned FreeSWITCH + VoIP.ms SIP trunk.
+
+ Credentials sourced from environment:
+ CF_ESL_HOST — FreeSWITCH ESL host (default: 127.0.0.1)
+ CF_ESL_PORT — FreeSWITCH ESL port (default: 8021)
+ CF_ESL_PASSWORD — FreeSWITCH ESL password
+
+ Requires: pip install cf-voice[freeswitch]
+
+ Note: FreeSWITCH AMD (mod_vad + custom heuristic or Whisper pipe) is not
+ yet implemented. The amd parameter is accepted but amd_result stays "unknown".
+ """
+
+ def __init__(
+ self,
+ host: str | None = None,
+ port: int | None = None,
+ password: str | None = None,
+ ) -> None:
+ try:
+ import ESL # type: ignore[import]
+ except ImportError as exc:
+ raise ImportError(
+ "FreeSWITCH ESL bindings are required for FreeSWITCHBackend. "
+ "Install with: pip install cf-voice[freeswitch]"
+ ) from exc
+
+ self._host = host or os.environ.get("CF_ESL_HOST", "127.0.0.1")
+ self._port = int(port or os.environ.get("CF_ESL_PORT", 8021))
+ self._password = password or os.environ["CF_ESL_PASSWORD"]
+ self._esl = ESL
+
+ def _connect(self):
+ conn = self._esl.ESLconnection(self._host, str(self._port), self._password)
+ if not conn.connected():
+ raise RuntimeError(
+ f"Could not connect to FreeSWITCH ESL at {self._host}:{self._port}"
+ )
+ return conn
+
+ async def dial(
+ self,
+ to: str,
+ from_: str,
+ webhook_url: str,
+ *,
+ amd: bool = False,
+ ) -> CallSession:
+ def _originate() -> str:
+ conn = self._connect()
+ # ESL originate: sofia/gateway/voipms/{to} {from_} XML default
+ cmd = (
+ f"originate {{origination_caller_id_number={from_},"
+ f"origination_caller_id_name=CircuitForge}}"
+ f"sofia/gateway/voipms/{to.lstrip('+')} &park()"
+ )
+ result = conn.api("originate", cmd)
+ return result.getBody().strip()
+
+ body = await asyncio.get_event_loop().run_in_executor(None, _originate)
+ # FreeSWITCH returns "+OK " on success
+ if not body.startswith("+OK"):
+ raise RuntimeError(f"FreeSWITCH originate failed: {body}")
+ uuid = body.removeprefix("+OK").strip()
+ return CallSession(call_sid=uuid, to=to, from_=from_, state="dialing")
+
+ async def send_dtmf(self, call_sid: str, digits: str) -> None:
+ def _dtmf() -> None:
+ conn = self._connect()
+ conn.api("uuid_send_dtmf", f"{call_sid} {digits}")
+
+ await asyncio.get_event_loop().run_in_executor(None, _dtmf)
+
+ async def bridge(self, call_sid: str, target: str) -> None:
+ def _bridge() -> None:
+ conn = self._connect()
+ conn.api(
+ "uuid_bridge",
+ f"{call_sid} sofia/gateway/voipms/{target.lstrip('+')}",
+ )
+
+ await asyncio.get_event_loop().run_in_executor(None, _bridge)
+
+ async def hangup(self, call_sid: str) -> None:
+ def _hangup() -> None:
+ conn = self._connect()
+ conn.api("uuid_kill", call_sid)
+
+ await asyncio.get_event_loop().run_in_executor(None, _hangup)
+
+ async def announce(
+ self,
+ call_sid: str,
+ text: str,
+ voice: str = "default",
+ ) -> None:
+ # FreeSWITCH TTS via mod_tts_commandline or Piper pipe
+ def _say() -> None:
+ conn = self._connect()
+ conn.api("uuid_broadcast", f"{call_sid} say::en CHAT SPOKEN {text}")
+
+ await asyncio.get_event_loop().run_in_executor(None, _say)
+
+ async def get_state(self, call_sid: str) -> CallState:
+ def _fetch() -> str:
+ conn = self._connect()
+ return conn.api("uuid_getvar", f"{call_sid} call_state").getBody().strip()
+
+ raw = await asyncio.get_event_loop().run_in_executor(None, _fetch)
+ _fs_map: dict[str, CallState] = {
+ "CS_INIT": "dialing", "CS_ROUTING": "ringing",
+ "CS_EXECUTE": "in_progress", "CS_HANGUP": "completed",
+ "CS_DESTROY": "completed",
+ }
+ return _fs_map.get(raw, "failed")
+
+
+# ── Factory ───────────────────────────────────────────────────────────────────
+
+
+def make_telephony(
+ mock: bool | None = None,
+ backend: str | None = None,
+) -> MockTelephonyBackend | SignalWireBackend | FreeSWITCHBackend:
+ """
+ Factory: return a TelephonyBackend appropriate for the current environment.
+
+ Resolution order:
+ 1. mock=True or CF_VOICE_MOCK=1 → MockTelephonyBackend
+ 2. backend="signalwire" or CF_SW_PROJECT_ID present → SignalWireBackend
+ 3. backend="freeswitch" or CF_ESL_PASSWORD present → FreeSWITCHBackend
+ 4. Raises RuntimeError — no usable backend configured
+
+ In production, backend selection is driven by the tier system:
+ Free tier → FreeSWITCHBackend (BYOK VoIP)
+ Paid tier → SignalWireBackend (CF-provisioned)
+ """
+ use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
+ if use_mock:
+ return MockTelephonyBackend()
+
+ resolved_backend = backend or (
+ "signalwire" if os.environ.get("CF_SW_PROJECT_ID") else
+ "freeswitch" if os.environ.get("CF_ESL_PASSWORD") else
+ None
+ )
+
+ if resolved_backend == "signalwire":
+ return SignalWireBackend()
+
+ if resolved_backend == "freeswitch":
+ return FreeSWITCHBackend()
+
+ raise RuntimeError(
+ "No telephony backend configured. "
+ "Set CF_VOICE_MOCK=1 for mock mode, or provide SignalWire / FreeSWITCH credentials."
+ )
diff --git a/cf_voice/trajectory.py b/cf_voice/trajectory.py
new file mode 100644
index 0000000..3b2608c
--- /dev/null
+++ b/cf_voice/trajectory.py
@@ -0,0 +1,288 @@
+# cf_voice/trajectory.py — affect trajectory and SER/VAD coherence signals
+#
+# MIT licensed — derived computation only, no inference models.
+#
+# Two signal families:
+#
+# 1. TrajectorySignal — rolling arousal/valence trend across the last N windows.
+# Detects escalation, de-escalation, suppression, worsening, improving.
+#
+# 2. CoherenceSignal — cross-model comparison between SER (categorical affect)
+# and VAD (continuous dimensional valence). Disagreement indicates affect
+# suppression, controlled presentation, or surface-only semantic reframe.
+#
+# Both signals activate only after BASELINE_MIN windows per speaker are buffered.
+# All thresholds are relative to the per-speaker rolling mean, not absolute —
+# this is required for ND/neurodivergent speaker safety (see design doc).
+#
+# Safety note: these signals must never be labelled "deception" in any
+# user-facing context. Use: "affect divergence", "controlled presentation",
+# "framing shift". The user interprets; the system observes.
+from __future__ import annotations
+
+from collections import deque
+from dataclasses import dataclass
+
+from cf_voice.dimensional import DimensionalResult
+
+# Rolling window depth per speaker
+BUFFER_WINDOW = 5
+
+# Minimum frames before signals activate (relative baseline requirement)
+BASELINE_MIN = 3
+
+# Minimum arousal/valence delta per window to count as directional movement
+_DELTA_THRESHOLD = 0.05
+
+# Arousal threshold above which "neutral SER + high arousal" = suppression candidate
+_SUPPRESSION_AROUSAL_MIN = 0.65
+
+# SER affects that imply low arousal presentation (used for suppression detection)
+_LOW_PRESENTATION_AFFECTS = frozenset({"neutral", "scripted", "tired", "apologetic"})
+
+# Expected valence ranges derived from MSP-Podcast emotion distribution.
+# Used to determine whether SER affect label and dimensional valence agree.
+_AFFECT_VALENCE_PRIOR: dict[str, tuple[float, float]] = {
+ "warm": (0.60, 1.00),
+ "genuine": (0.55, 1.00),
+ "optimistic": (0.55, 0.90),
+ "neutral": (0.35, 0.65),
+ "confused": (0.30, 0.60),
+ "scripted": (0.30, 0.65),
+ "apologetic": (0.20, 0.55),
+ "tired": (0.10, 0.50),
+ "frustrated": (0.10, 0.45),
+ "dismissive": (0.15, 0.50),
+ "condescending": (0.10, 0.45),
+ "urgent": (0.15, 0.55),
+}
+
+# Ordinal positivity for reframe direction detection.
+# Higher = more positive presentation.
+_AFFECT_POSITIVITY: dict[str, int] = {
+ "urgent": 1,
+ "frustrated": 1,
+ "condescending": 1,
+ "dismissive": 2,
+ "tired": 2,
+ "apologetic": 3,
+ "confused": 3,
+ "scripted": 4,
+ "neutral": 4,
+ "optimistic": 5,
+ "genuine": 5,
+ "warm": 6,
+}
+
+
+@dataclass
+class TrajectorySignal:
+ """
+ Rolling trend across recent dimensional frames for one speaker.
+
+ All delta values: current_frame_value - mean(buffer_values).
+ Positive arousal_delta = current frame is more activated than baseline.
+ Negative valence_delta = current frame is more negative than baseline.
+
+ trend values:
+ "calibrating" not enough frames yet (< BASELINE_MIN)
+ "stable" no significant directional movement
+ "escalating" arousal rising: current > mean by DELTA_THRESHOLD, consecutive
+ "de-escalating" arousal falling after elevated period
+ "worsening" valence falling: current < mean, consecutive
+ "improving" valence rising after depressed period
+ "suppressed" SER affect is calm/neutral, VAD arousal is elevated
+ """
+ arousal_delta: float
+ valence_delta: float
+ dominance_delta: float
+ arousal_trend: str # "rising" | "falling" | "flat"
+ valence_trend: str # "rising" | "falling" | "flat"
+ trend: str
+ frames_in_buffer: int
+ baseline_established: bool
+
+
+@dataclass
+class CoherenceSignal:
+ """
+ Cross-signal comparison: SER categorical affect vs. VAD dimensional valence.
+
+ coherence_score:
+ 1.0 = SER label and VAD valence are fully consistent.
+ 0.0 = maximum disagreement.
+
+ suppression_flag:
+ True when the speaker is presenting as calm/neutral (SER) but VAD arousal
+ is elevated. Indicates controlled presentation with activation underneath.
+ This is relative to a per-session threshold — not a universal claim.
+
+ reframe_type:
+ "none" no SER category shift this window
+ "genuine" SER shifted toward more positive AND dimensional valence also
+ improved (>= DELTA_THRESHOLD in this window)
+ "surface" SER shifted toward more positive BUT dimensional valence
+ continued its prior trajectory unchanged or worsening
+
+ affect_divergence:
+ Signed: VAD-implied valence minus SER-implied valence midpoint.
+ Negative = VAD more negative than SER label implies (masking candidate).
+ Positive = VAD more positive than SER label implies (unusual).
+ """
+ coherence_score: float
+ suppression_flag: bool
+ reframe_type: str # "none" | "genuine" | "surface"
+ affect_divergence: float
+
+
+# ── Public helpers ─────────────────────────────────────────────────────────────
+
+
+def affect_coherence(affect: str, valence: float) -> float:
+ """
+ Compute coherence between a SER affect category and a VAD valence score.
+
+ Returns 1.0 when valence falls inside the expected range for the affect.
+ Returns 0.0 when the gap between valence and the nearest range boundary
+ exceeds 0.40 (the full range of a typical incoherence gap).
+ """
+ lo, hi = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
+ if lo <= valence <= hi:
+ return 1.0
+ gap = min(abs(valence - lo), abs(valence - hi))
+ return round(max(0.0, 1.0 - (gap / 0.40)), 3)
+
+
+def affect_divergence_score(affect: str, valence: float) -> float:
+ """
+ Signed divergence: actual VAD valence minus the midpoint of the expected range.
+
+ Negative = VAD more negative than SER label implies.
+ Positive = VAD more positive than SER label implies.
+ """
+ lo, hi = _AFFECT_VALENCE_PRIOR.get(affect, (0.30, 0.70))
+ midpoint = (lo + hi) / 2.0
+ return round(valence - midpoint, 3)
+
+
+def compute_trajectory(
+ buffer: deque,
+ current: DimensionalResult,
+ ser_affect: str,
+ prior_ser_affect: str | None,
+) -> tuple[TrajectorySignal, CoherenceSignal]:
+ """
+ Compute trajectory and coherence signals for one speaker at one window.
+
+ buffer Rolling deque of prior DimensionalResult for this speaker.
+ Must be updated AFTER this call (append current to buffer).
+ current DimensionalResult for the window being classified.
+ ser_affect SER affect label for this window (from ToneClassifier).
+ prior_ser_affect SER affect label from the previous window, for reframe detection.
+ Pass None on the first window or when not tracking.
+
+ Returns (TrajectorySignal, CoherenceSignal). Both have baseline_established=False
+ and trend="calibrating" when buffer has fewer than BASELINE_MIN entries.
+ """
+ n = len(buffer)
+
+ # Coherence can be computed without a buffer
+ coh_score = affect_coherence(ser_affect, current.valence)
+ div_score = affect_divergence_score(ser_affect, current.valence)
+
+ suppression = (
+ ser_affect in _LOW_PRESENTATION_AFFECTS
+ and current.arousal > _SUPPRESSION_AROUSAL_MIN
+ and current.valence < 0.50
+ )
+
+ reframe = "none"
+ if prior_ser_affect and prior_ser_affect != ser_affect:
+ if _is_more_positive(ser_affect, prior_ser_affect):
+ # Valence actually improved in this window vs. single prior frame
+ if n >= 1:
+ prev_valence = list(buffer)[-1].valence
+ dim_improved = (current.valence - prev_valence) >= _DELTA_THRESHOLD
+ else:
+ dim_improved = False
+ reframe = "genuine" if dim_improved else "surface"
+
+ coher = CoherenceSignal(
+ coherence_score=coh_score,
+ suppression_flag=suppression,
+ reframe_type=reframe,
+ affect_divergence=div_score,
+ )
+
+ if n < BASELINE_MIN:
+ traj = TrajectorySignal(
+ arousal_delta=0.0,
+ valence_delta=0.0,
+ dominance_delta=0.0,
+ arousal_trend="flat",
+ valence_trend="flat",
+ trend="calibrating",
+ frames_in_buffer=n,
+ baseline_established=False,
+ )
+ return traj, coher
+
+ mean_arousal = sum(f.arousal for f in buffer) / n
+ mean_valence = sum(f.valence for f in buffer) / n
+ mean_dominance = sum(f.dominance for f in buffer) / n
+
+ a_delta = current.arousal - mean_arousal
+ v_delta = current.valence - mean_valence
+ d_delta = current.dominance - mean_dominance
+
+ a_trend = (
+ "rising" if a_delta > _DELTA_THRESHOLD else
+ "falling" if a_delta < -_DELTA_THRESHOLD else
+ "flat"
+ )
+ v_trend = (
+ "rising" if v_delta > _DELTA_THRESHOLD else
+ "falling" if v_delta < -_DELTA_THRESHOLD else
+ "flat"
+ )
+
+ # Consecutive movement: check whether the most recent buffered frame
+ # was already moving in the same direction as the current frame.
+ buf_list = list(buffer)
+ prev = buf_list[-1]
+ a_consecutive = a_trend == "rising" and (current.arousal - prev.arousal) > 0.03
+ v_consecutive = v_trend == "falling" and (current.valence - prev.valence) < -0.03
+
+ # Composite trend label
+ if suppression:
+ trend = "suppressed"
+ elif a_trend == "rising" and a_consecutive:
+ trend = "escalating"
+ elif a_trend == "falling" and mean_arousal > 0.55:
+ trend = "de-escalating"
+ elif v_trend == "falling" and v_consecutive:
+ trend = "worsening"
+ elif v_trend == "rising" and mean_valence < 0.45:
+ trend = "improving"
+ else:
+ trend = "stable"
+
+ traj = TrajectorySignal(
+ arousal_delta=round(a_delta, 3),
+ valence_delta=round(v_delta, 3),
+ dominance_delta=round(d_delta, 3),
+ arousal_trend=a_trend,
+ valence_trend=v_trend,
+ trend=trend,
+ frames_in_buffer=n,
+ baseline_established=True,
+ )
+ return traj, coher
+
+
+# ── Internal helpers ───────────────────────────────────────────────────────────
+
+
+def _is_more_positive(current: str, prior: str) -> bool:
+ """True when the current SER affect is ranked more positive than prior."""
+ return _AFFECT_POSITIVITY.get(current, 4) > _AFFECT_POSITIVITY.get(prior, 4)
diff --git a/pyproject.toml b/pyproject.toml
index 0aa0c6c..5c67341 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,8 @@ requires-python = ">=3.11"
license = {text = "MIT"}
dependencies = [
"pydantic>=2.0",
+ "fastapi>=0.111",
+ "uvicorn[standard]>=0.29",
]
[project.optional-dependencies]
@@ -26,6 +28,14 @@ inference = [
"pyannote.audio>=3.1",
"python-dotenv>=1.0",
]
+signalwire = [
+ "signalwire>=2.0",
+]
+freeswitch = [
+ # ESL Python bindings are compiled from FreeSWITCH source.
+ # See: https://developer.signalwire.com/freeswitch/FreeSWITCH-Explained/Client-and-Developer-Interfaces/Event-Socket-Library/
+ "python-ESL",
+]
dev = [
"pytest>=8.0",
"pytest-asyncio>=0.23",
diff --git a/scripts/test_classify_e2e.py b/scripts/test_classify_e2e.py
new file mode 100644
index 0000000..a16c962
--- /dev/null
+++ b/scripts/test_classify_e2e.py
@@ -0,0 +1,69 @@
+"""
+End-to-end integration test for the cf-voice /classify endpoint.
+
+Extracts a 2-second window from a local media file, base64-encodes the
+raw PCM, and POSTs it to the running cf-voice service at localhost:8009.
+Prints each returned AudioEvent for quick inspection.
+
+Requires:
+ - cf-voice running at localhost:8009 (CF_VOICE_DIARIZE=1 for speaker labels)
+ - ffmpeg on PATH
+ - A local audio/video file (edit MEDIA_FILE below)
+
+Run:
+ python scripts/test_classify_e2e.py
+"""
+from __future__ import annotations
+
+import base64
+import json
+import subprocess
+import urllib.request
+
+import numpy as np
+
+MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
+START_S = 120
+DURATION_S = 2
+SAMPLE_RATE = 16_000
+CF_VOICE_URL = "http://localhost:8009"
+
+proc = subprocess.run(
+ [
+ "ffmpeg", "-i", MEDIA_FILE,
+ "-ss", str(START_S),
+ "-t", str(DURATION_S),
+ "-ar", str(SAMPLE_RATE),
+ "-ac", "1",
+ "-f", "s16le",
+ "-",
+ ],
+ capture_output=True,
+ check=True,
+)
+
+pcm = proc.stdout
+audio = np.frombuffer(pcm, dtype=np.int16)
+print(f"audio samples: {len(audio)}, duration: {len(audio) / SAMPLE_RATE:.2f}s")
+
+payload = json.dumps({
+ "audio_chunk": base64.b64encode(pcm).decode(),
+ "timestamp": float(START_S),
+ "session_id": "test",
+}).encode()
+
+req = urllib.request.Request(
+ f"{CF_VOICE_URL}/classify",
+ data=payload,
+ headers={"Content-Type": "application/json"},
+ method="POST",
+)
+with urllib.request.urlopen(req, timeout=30) as resp:
+ result = json.loads(resp.read())
+
+for ev in result["events"]:
+ print(
+ f" {ev['event_type']:10}"
+ f" speaker_id={ev.get('speaker_id', 'N/A'):14}"
+ f" label={ev.get('label', '')}"
+ )
diff --git a/scripts/test_diarize_real.py b/scripts/test_diarize_real.py
new file mode 100644
index 0000000..4f38183
--- /dev/null
+++ b/scripts/test_diarize_real.py
@@ -0,0 +1,65 @@
+"""
+Manual integration test for speaker diarization via pyannote.
+
+Requires:
+ - HF_TOKEN env var (or set below)
+ - CF_VOICE_DIARIZE=1
+ - ffmpeg on PATH
+ - A local audio/video file (edit MEDIA_FILE below)
+ - pip install cf-voice[inference]
+
+Run:
+ HF_TOKEN=hf_... CF_VOICE_DIARIZE=1 python scripts/test_diarize_real.py
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import subprocess
+
+import numpy as np
+
+# Override if not in env
+if not os.environ.get("HF_TOKEN"):
+ raise SystemExit("Set HF_TOKEN in env before running this script.")
+os.environ.setdefault("CF_VOICE_DIARIZE", "1")
+
+MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
+START_S = 120
+DURATION_S = 2
+SAMPLE_RATE = 16_000
+
+from cf_voice.diarize import Diarizer, SpeakerTracker # noqa: E402
+
+
+async def main() -> None:
+ d = Diarizer.from_env()
+ tracker = SpeakerTracker()
+
+ proc = subprocess.run(
+ [
+ "ffmpeg", "-i", MEDIA_FILE,
+ "-ss", str(START_S),
+ "-t", str(DURATION_S),
+ "-ar", str(SAMPLE_RATE),
+ "-ac", "1",
+ "-f", "s16le",
+ "-",
+ ],
+ capture_output=True,
+ check=True,
+ )
+ audio = np.frombuffer(proc.stdout, dtype=np.int16).astype(np.float32) / 32768.0
+ rms = float(np.sqrt(np.mean(audio**2)))
+ print(f"audio: {len(audio)} samples, {len(audio) / SAMPLE_RATE:.2f}s, rms={rms:.4f}")
+
+ segs = await d.diarize_async(audio)
+ print(f"segments ({len(segs)}): {segs}")
+
+ mid = len(audio) / 2.0 / SAMPLE_RATE
+ label = d.speaker_at(segs, mid, tracker)
+ print(f"speaker_at({mid:.2f}s): {label}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/test_acoustic.py b/tests/test_acoustic.py
new file mode 100644
index 0000000..07e60bc
--- /dev/null
+++ b/tests/test_acoustic.py
@@ -0,0 +1,119 @@
+import pytest
+from cf_voice.acoustic import (
+ AcousticBackend,
+ AcousticResult,
+ ASTAcousticBackend,
+ MockAcousticBackend,
+ make_acoustic,
+)
+from cf_voice.events import AudioEvent
+
+
+class TestAcousticResult:
+ def test_fields(self):
+ evt = AudioEvent(timestamp=1.0, event_type="queue", label="ringback", confidence=0.9)
+ result = AcousticResult(queue=evt, speaker=None, environ=None, scene=None, timestamp=1.0)
+ assert result.queue.label == "ringback"
+ assert result.speaker is None
+ assert result.environ is None
+ assert result.scene is None
+
+
+class TestMockAcousticBackend:
+ def test_classify_returns_result(self):
+ backend = MockAcousticBackend(seed=0)
+ result = backend.classify_window(b"", timestamp=0.0)
+ assert isinstance(result, AcousticResult)
+ assert result.timestamp == 0.0
+
+ def test_all_events_present(self):
+ backend = MockAcousticBackend(seed=1)
+ result = backend.classify_window(b"", timestamp=1.0)
+ assert result.queue is not None
+ assert result.speaker is not None
+ assert result.environ is not None
+ assert result.scene is not None
+
+ def test_event_types_correct(self):
+ backend = MockAcousticBackend(seed=2)
+ result = backend.classify_window(b"", timestamp=2.0)
+ assert result.queue.event_type == "queue"
+ assert result.speaker.event_type == "speaker"
+ assert result.environ.event_type == "environ"
+ assert result.scene.event_type == "scene"
+
+ def test_confidence_in_range(self):
+ backend = MockAcousticBackend(seed=3)
+ for _ in range(5):
+ result = backend.classify_window(b"", timestamp=0.0)
+ assert 0.0 <= result.queue.confidence <= 1.0
+ assert 0.0 <= result.speaker.confidence <= 1.0
+ assert 0.0 <= result.environ.confidence <= 1.0
+ assert 0.0 <= result.scene.confidence <= 1.0
+
+ def test_lifecycle_advances(self):
+ """Phases should change after their duration elapses."""
+ import time
+ backend = MockAcousticBackend(seed=42)
+ # Force phase to advance by manipulating phase_start
+ backend._phase_start -= 1000 # pretend 1000s elapsed
+ result = backend.classify_window(b"", timestamp=0.0)
+ # Should have advanced — just verify it doesn't crash and returns valid
+ assert result.queue.label in (
+ "hold_music", "silence", "ringback", "busy", "dead_air", "dtmf_tone"
+ )
+
+ def test_isinstance_protocol(self):
+ backend = MockAcousticBackend()
+ assert isinstance(backend, AcousticBackend)
+
+ def test_deterministic_with_seed(self):
+ b1 = MockAcousticBackend(seed=99)
+ b2 = MockAcousticBackend(seed=99)
+ r1 = b1.classify_window(b"", timestamp=0.0)
+ r2 = b2.classify_window(b"", timestamp=0.0)
+ assert r1.queue.label == r2.queue.label
+ assert r1.queue.confidence == r2.queue.confidence
+
+
+class TestASTAcousticBackend:
+ def test_raises_import_error_without_deps(self, monkeypatch):
+ """ASTAcousticBackend should raise ImportError when transformers is unavailable."""
+ import builtins
+ real_import = builtins.__import__
+
+ def mock_import(name, *args, **kwargs):
+ if name in ("transformers",):
+ raise ImportError(f"Mocked: {name} not available")
+ return real_import(name, *args, **kwargs)
+
+ monkeypatch.setattr(builtins, "__import__", mock_import)
+ with pytest.raises(ImportError, match="transformers"):
+ ASTAcousticBackend()
+
+
+class TestMakeAcoustic:
+ def test_mock_flag(self):
+ backend = make_acoustic(mock=True)
+ assert isinstance(backend, MockAcousticBackend)
+
+ def test_mock_env(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_MOCK", "1")
+ backend = make_acoustic()
+ assert isinstance(backend, MockAcousticBackend)
+
+ def test_real_falls_back_to_mock_without_deps(self, monkeypatch, capsys):
+ """make_acoustic(mock=False) falls back to mock when deps are missing."""
+ import builtins
+ real_import = builtins.__import__
+
+ def mock_import(name, *args, **kwargs):
+ if name in ("transformers",):
+ raise ImportError(f"Mocked: {name} not available")
+ return real_import(name, *args, **kwargs)
+
+ monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+ monkeypatch.setattr(builtins, "__import__", mock_import)
+ backend = make_acoustic(mock=False)
+ # Should fall back gracefully, never raise
+ assert isinstance(backend, MockAcousticBackend)
diff --git a/tests/test_diarize.py b/tests/test_diarize.py
new file mode 100644
index 0000000..0bbc4fc
--- /dev/null
+++ b/tests/test_diarize.py
@@ -0,0 +1,131 @@
+# tests/test_diarize.py — SpeakerTracker and speaker_at() diarization logic
+#
+# All tests are pure Python — no GPU, no pyannote, no HF_TOKEN required.
+# The Diarizer class itself is only tested for its from_env() guard and the
+# speaker_at() method, both of which run without loading the model.
+from __future__ import annotations
+
+import os
+import pytest
+
+from cf_voice.diarize import (
+ Diarizer,
+ SpeakerSegment,
+ SpeakerTracker,
+ SPEAKER_MULTIPLE,
+ SPEAKER_UNKNOWN,
+)
+
+
+# ── SpeakerTracker ────────────────────────────────────────────────────────────
+
+def test_tracker_first_speaker_is_a():
+ t = SpeakerTracker()
+ assert t.label("SPEAKER_00") == "Speaker A"
+
+
+def test_tracker_second_speaker_is_b():
+ t = SpeakerTracker()
+ t.label("SPEAKER_00")
+ assert t.label("SPEAKER_01") == "Speaker B"
+
+
+def test_tracker_same_id_returns_same_label():
+ t = SpeakerTracker()
+ first = t.label("SPEAKER_00")
+ second = t.label("SPEAKER_00")
+ assert first == second == "Speaker A"
+
+
+def test_tracker_26_speakers():
+ t = SpeakerTracker()
+ labels = [t.label(f"SPEAKER_{i:02d}") for i in range(26)]
+ assert labels[0] == "Speaker A"
+ assert labels[25] == "Speaker Z"
+
+
+def test_tracker_27th_speaker_wraps():
+ t = SpeakerTracker()
+ for i in range(26):
+ t.label(f"SPEAKER_{i:02d}")
+ label_27 = t.label("SPEAKER_26")
+ assert label_27 == "Speaker AA"
+
+
+def test_tracker_reset_clears_map():
+ t = SpeakerTracker()
+ t.label("SPEAKER_00")
+ t.label("SPEAKER_01")
+ t.reset()
+ # After reset, SPEAKER_01 is seen as new and maps to "Speaker A" again
+ assert t.label("SPEAKER_01") == "Speaker A"
+
+
+# ── Diarizer.speaker_at() ─────────────────────────────────────────────────────
+
+def _segs(*items: tuple[str, float, float]) -> list[SpeakerSegment]:
+ return [SpeakerSegment(speaker_id=s, start_s=st, end_s=en) for s, st, en in items]
+
+
+def test_speaker_at_single_speaker():
+ d = object.__new__(Diarizer) # bypass __init__ (no GPU needed)
+ segs = _segs(("SPEAKER_00", 0.0, 2.0))
+ t = SpeakerTracker()
+ assert d.speaker_at(segs, 1.0, tracker=t) == "Speaker A"
+
+
+def test_speaker_at_no_coverage_returns_unknown():
+ d = object.__new__(Diarizer)
+ segs = _segs(("SPEAKER_00", 0.0, 1.0))
+ assert d.speaker_at(segs, 1.5) == SPEAKER_UNKNOWN
+
+
+def test_speaker_at_empty_segments_returns_unknown():
+ d = object.__new__(Diarizer)
+ assert d.speaker_at([], 1.0) == SPEAKER_UNKNOWN
+
+
+def test_speaker_at_overlap_returns_multiple():
+ d = object.__new__(Diarizer)
+ segs = _segs(
+ ("SPEAKER_00", 0.0, 2.0),
+ ("SPEAKER_01", 0.5, 2.0), # overlaps SPEAKER_00 from 0.5s
+ )
+ assert d.speaker_at(segs, 1.0) == SPEAKER_MULTIPLE
+
+
+def test_speaker_at_boundary_inclusive():
+ d = object.__new__(Diarizer)
+ segs = _segs(("SPEAKER_00", 1.0, 2.0))
+ t = SpeakerTracker()
+ # Exact boundary timestamps are included
+ assert d.speaker_at(segs, 1.0, tracker=t) == "Speaker A"
+ assert d.speaker_at(segs, 2.0, tracker=t) == "Speaker A"
+
+
+def test_speaker_at_without_tracker_returns_raw_id():
+ d = object.__new__(Diarizer)
+ segs = _segs(("SPEAKER_00", 0.0, 2.0))
+ assert d.speaker_at(segs, 1.0) == "SPEAKER_00"
+
+
+def test_speaker_at_two_speakers_no_overlap():
+ d = object.__new__(Diarizer)
+ t = SpeakerTracker()
+ segs = _segs(
+ ("SPEAKER_00", 0.0, 1.0),
+ ("SPEAKER_01", 1.5, 2.5),
+ )
+ assert d.speaker_at(segs, 0.5, tracker=t) == "Speaker A"
+ assert d.speaker_at(segs, 2.0, tracker=t) == "Speaker B"
+ # Gap at 1.2s: window [0.7, 1.7] → SPEAKER_00 has 0.3s, SPEAKER_01 has 0.2s
+ # Dominant speaker (SPEAKER_00 = "Speaker A") is returned, not SPEAKER_UNKNOWN.
+ assert d.speaker_at(segs, 1.2, tracker=t) == "Speaker A"
+
+
+# ── Diarizer.from_env() guard ─────────────────────────────────────────────────
+
+def test_from_env_raises_without_hf_token(monkeypatch):
+ monkeypatch.delenv("HF_TOKEN", raising=False)
+ with pytest.raises(EnvironmentError, match="HF_TOKEN"):
+ Diarizer.from_env()
diff --git a/tests/test_models.py b/tests/test_models.py
index 5743df0..e057456 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -75,10 +75,60 @@ class TestMockVoiceIO:
io = make_io()
assert isinstance(io, MockVoiceIO)
- def test_make_io_real_raises(self, monkeypatch):
+ def test_make_io_real_returns_mic_io(self, monkeypatch):
+ """make_io(mock=False) returns MicVoiceIO when sounddevice/numpy are installed."""
+ from cf_voice.capture import MicVoiceIO
monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
- with pytest.raises(NotImplementedError):
- make_io(mock=False)
+ io = make_io(mock=False)
+ assert isinstance(io, MicVoiceIO)
+
+
+class TestContextClassifierChunk:
+ """Tests for classify_chunk() — multi-class event output."""
+
+ def test_mock_returns_four_event_types(self):
+ classifier = ContextClassifier.mock(interval_s=0.05, seed=10)
+ events = classifier.classify_chunk(timestamp=1.0)
+ types = {e.event_type for e in events}
+ # In mock mode all four event types should be present
+ assert "tone" in types
+ assert "queue" in types
+ assert "speaker" in types
+ assert "environ" in types
+
+ def test_mock_tone_event_has_subtext(self):
+ classifier = ContextClassifier.mock(interval_s=0.05, seed=11)
+ events = classifier.classify_chunk(timestamp=0.0)
+ tone_events = [e for e in events if e.event_type == "tone"]
+ assert len(tone_events) == 1
+ assert tone_events[0].subtext is not None
+
+ def test_elcor_override_flag(self):
+ classifier = ContextClassifier.mock(interval_s=0.05, seed=12)
+ events_generic = classifier.classify_chunk(timestamp=0.0, elcor=False)
+ events_elcor = classifier.classify_chunk(timestamp=0.0, elcor=True)
+
+ def subtext(evs):
+ return next(e.subtext for e in evs if e.event_type == "tone")
+
+ generic_sub = subtext(events_generic)
+ elcor_sub = subtext(events_elcor)
+ # Generic format: "Tone: X". Elcor format: "With X:" or "Warmly:" etc.
+ assert generic_sub.startswith("Tone:") or not generic_sub.endswith(":")
+ # Elcor format ends with ":"
+ assert elcor_sub.endswith(":")
+
+ def test_session_id_propagates(self):
+ classifier = ContextClassifier.mock(interval_s=0.05, seed=13)
+ events = classifier.classify_chunk(timestamp=0.0, session_id="ses_test")
+ tone_events = [e for e in events if e.event_type == "tone"]
+ assert tone_events[0].session_id == "ses_test"
+
+ def test_prior_frames_zero_means_no_shift(self):
+ classifier = ContextClassifier.mock(interval_s=0.05, seed=14)
+ events = classifier.classify_chunk(timestamp=0.0, prior_frames=0)
+ tone_events = [e for e in events if e.event_type == "tone"]
+ assert tone_events[0].shift_magnitude == 0.0
class TestContextClassifier:
diff --git a/tests/test_prefs.py b/tests/test_prefs.py
new file mode 100644
index 0000000..42b588b
--- /dev/null
+++ b/tests/test_prefs.py
@@ -0,0 +1,109 @@
+import os
+import pytest
+from cf_voice.prefs import (
+ PREF_CONFIDENCE_THRESHOLD,
+ PREF_ELCOR_MODE,
+ PREF_ELCOR_PRIOR_FRAMES,
+ PREF_WHISPER_MODEL,
+ get_confidence_threshold,
+ get_elcor_prior_frames,
+ get_voice_pref,
+ get_whisper_model,
+ is_elcor_enabled,
+ set_voice_pref,
+)
+
+
+class _DictStore:
+ """In-memory preference store for testing."""
+
+ def __init__(self, data: dict | None = None) -> None:
+ self._data: dict = data or {}
+
+ def get(self, user_id, path, default=None):
+ return self._data.get(path, default)
+
+ def set(self, user_id, path, value):
+ self._data[path] = value
+
+
+class TestGetVoicePref:
+ def test_returns_default_when_nothing_set(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_ELCOR", raising=False)
+ val = get_voice_pref(PREF_ELCOR_MODE, store=_DictStore())
+ assert val is False
+
+ def test_explicit_store_takes_priority(self):
+ store = _DictStore({PREF_ELCOR_MODE: True})
+ assert get_voice_pref(PREF_ELCOR_MODE, store=store) is True
+
+ def test_env_fallback_bool(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_ELCOR", "1")
+ assert get_voice_pref(PREF_ELCOR_MODE, store=_DictStore()) is True
+
+ def test_env_fallback_false(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_ELCOR", "0")
+ assert get_voice_pref(PREF_ELCOR_MODE, store=_DictStore()) is False
+
+ def test_env_fallback_float(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_CONFIDENCE_THRESHOLD", "0.7")
+ val = get_voice_pref(PREF_CONFIDENCE_THRESHOLD, store=_DictStore())
+ assert abs(val - 0.7) < 1e-9
+
+ def test_env_fallback_int(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_ELCOR_PRIOR_FRAMES", "6")
+ val = get_voice_pref(PREF_ELCOR_PRIOR_FRAMES, store=_DictStore())
+ assert val == 6
+
+ def test_env_fallback_str(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_WHISPER_MODEL", "medium")
+ val = get_voice_pref(PREF_WHISPER_MODEL, store=_DictStore())
+ assert val == "medium"
+
+ def test_store_beats_env(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_ELCOR", "1")
+ store = _DictStore({PREF_ELCOR_MODE: False})
+ # store has explicit False — but store.get returns None for falsy values
+ # only if the key is absent; here key IS set so store wins
+ store._data[PREF_ELCOR_MODE] = True
+ assert get_voice_pref(PREF_ELCOR_MODE, store=store) is True
+
+ def test_unknown_key_returns_none(self):
+ val = get_voice_pref("voice.nonexistent", store=_DictStore())
+ assert val is None
+
+
+class TestSetVoicePref:
+ def test_sets_in_store(self):
+ store = _DictStore()
+ set_voice_pref(PREF_ELCOR_MODE, True, store=store)
+ assert store._data[PREF_ELCOR_MODE] is True
+
+ def test_no_store_raises(self, monkeypatch):
+ # Patch _cf_core_store to return None (simulates no cf-core installed)
+ import cf_voice.prefs as prefs_mod
+ monkeypatch.setattr(prefs_mod, "_cf_core_store", lambda: None)
+ with pytest.raises(RuntimeError, match="No writable preference store"):
+ set_voice_pref(PREF_ELCOR_MODE, True)
+
+
+class TestConvenienceHelpers:
+ def test_is_elcor_enabled_false_default(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_ELCOR", raising=False)
+ assert is_elcor_enabled(store=_DictStore()) is False
+
+ def test_is_elcor_enabled_true_from_store(self):
+ store = _DictStore({PREF_ELCOR_MODE: True})
+ assert is_elcor_enabled(store=store) is True
+
+ def test_get_confidence_threshold_default(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_CONFIDENCE_THRESHOLD", raising=False)
+ assert get_confidence_threshold(store=_DictStore()) == pytest.approx(0.55)
+
+ def test_get_whisper_model_default(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_WHISPER_MODEL", raising=False)
+ assert get_whisper_model(store=_DictStore()) == "small"
+
+ def test_get_elcor_prior_frames_default(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_ELCOR_PRIOR_FRAMES", raising=False)
+ assert get_elcor_prior_frames(store=_DictStore()) == 4
diff --git a/tests/test_telephony.py b/tests/test_telephony.py
new file mode 100644
index 0000000..d83dbaf
--- /dev/null
+++ b/tests/test_telephony.py
@@ -0,0 +1,141 @@
+import asyncio
+import pytest
+from cf_voice.telephony import (
+ CallSession,
+ MockTelephonyBackend,
+ TelephonyBackend,
+ make_telephony,
+)
+
+
+class TestCallSession:
+ def test_defaults(self):
+ s = CallSession(call_sid="sid_1", to="+15551234567", from_="+18005550000")
+ assert s.state == "dialing"
+ assert s.amd_result == "unknown"
+ assert s.duration_s == 0.0
+ assert s.error is None
+
+ def test_state_mutation(self):
+ s = CallSession(call_sid="sid_2", to="+1", from_="+2", state="in_progress")
+ s.state = "completed"
+ assert s.state == "completed"
+
+
+class TestMockTelephonyBackend:
+ @pytest.mark.asyncio
+ async def test_dial_returns_session(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+15551234567", "+18005550000", "https://example.com/wh")
+ assert isinstance(session, CallSession)
+ assert session.call_sid.startswith("mock_sid_")
+ assert session.to == "+15551234567"
+ assert session.from_ == "+18005550000"
+
+ @pytest.mark.asyncio
+ async def test_dial_transitions_to_in_progress(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+15551234567", "+18005550000", "https://x.com")
+ # give the background task a moment to transition
+ await asyncio.sleep(0.1)
+ assert session.state == "in_progress"
+
+ @pytest.mark.asyncio
+ async def test_amd_resolves_human(self):
+ backend = MockTelephonyBackend(amd_delay_s=0.05)
+ session = await backend.dial("+1555", "+1800", "https://x.com", amd=True)
+ await asyncio.sleep(0.2)
+ assert session.amd_result == "human"
+
+ @pytest.mark.asyncio
+ async def test_send_dtmf(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+1", "+2", "https://x.com")
+ # should not raise
+ await backend.send_dtmf(session.call_sid, "1234#")
+
+ @pytest.mark.asyncio
+ async def test_send_dtmf_unknown_sid_raises(self):
+ backend = MockTelephonyBackend()
+ with pytest.raises(KeyError):
+ await backend.send_dtmf("nonexistent_sid", "1")
+
+ @pytest.mark.asyncio
+ async def test_bridge_updates_state(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+1", "+2", "https://x.com")
+ await backend.bridge(session.call_sid, "+15559999999")
+ assert session.state == "bridged"
+
+ @pytest.mark.asyncio
+ async def test_hangup_sets_completed(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+1", "+2", "https://x.com")
+ await backend.hangup(session.call_sid)
+ assert session.state == "completed"
+
+ @pytest.mark.asyncio
+ async def test_hangup_idempotent(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+1", "+2", "https://x.com")
+ await backend.hangup(session.call_sid)
+ await backend.hangup(session.call_sid)
+ assert session.state == "completed"
+
+ @pytest.mark.asyncio
+ async def test_announce_does_not_raise(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+1", "+2", "https://x.com")
+ await backend.announce(session.call_sid, "Hello, this is an automated assistant.")
+
+ @pytest.mark.asyncio
+ async def test_get_state(self):
+ backend = MockTelephonyBackend()
+ session = await backend.dial("+1", "+2", "https://x.com")
+ state = await backend.get_state(session.call_sid)
+ assert state in ("ringing", "in_progress", "dialing")
+
+ @pytest.mark.asyncio
+ async def test_multiple_calls_unique_sids(self):
+ backend = MockTelephonyBackend()
+ s1 = await backend.dial("+1", "+2", "https://x.com")
+ s2 = await backend.dial("+3", "+4", "https://x.com")
+ assert s1.call_sid != s2.call_sid
+
+ def test_isinstance_protocol(self):
+ backend = MockTelephonyBackend()
+ assert isinstance(backend, TelephonyBackend)
+
+
+class TestMakeTelephony:
+ def test_mock_flag(self):
+ backend = make_telephony(mock=True)
+ assert isinstance(backend, MockTelephonyBackend)
+
+ def test_mock_env(self, monkeypatch):
+ monkeypatch.setenv("CF_VOICE_MOCK", "1")
+ backend = make_telephony()
+ assert isinstance(backend, MockTelephonyBackend)
+
+ def test_no_config_raises(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+ monkeypatch.delenv("CF_SW_PROJECT_ID", raising=False)
+ monkeypatch.delenv("CF_ESL_PASSWORD", raising=False)
+ with pytest.raises(RuntimeError, match="No telephony backend configured"):
+ make_telephony()
+
+ def test_signalwire_selected_by_env(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+ monkeypatch.setenv("CF_SW_PROJECT_ID", "proj_123")
+ # SignalWireBackend will raise ImportError (signalwire SDK not installed)
+ # but only at instantiation — make_telephony should call the constructor
+ with pytest.raises((ImportError, RuntimeError)):
+ make_telephony()
+
+ def test_freeswitch_selected_by_env(self, monkeypatch):
+ monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
+ monkeypatch.delenv("CF_SW_PROJECT_ID", raising=False)
+ monkeypatch.setenv("CF_ESL_PASSWORD", "s3cret")
+ # FreeSWITCHBackend will raise ImportError (ESL not installed)
+ with pytest.raises((ImportError, RuntimeError)):
+ make_telephony()