From 335d51f02f2115bc449dd54c40b75b7a3561243b Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Mon, 6 Apr 2026 17:51:09 -0700 Subject: [PATCH] feat: lock ToneEvent SSE wire format (cf-core#40) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AudioEvent: add speaker_id field (was on VoiceFrame only; needed on all events) - ToneEvent: add session_id field for session correlation across embedded consumers - README: full wire format documentation — JSON shape, field reference table, SSE envelope, Elcor mode subtext table, module license map - ToneEvent docstring references cf-core#40 as the wire format spec Closes cf-core#40 --- README.md | 122 ++++++++++++++++++++++++++++++++++++++++----- cf_voice/events.py | 7 +++ 2 files changed, 117 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 61f2d56..1e14e8a 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,156 @@ # cf-voice -CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude. +CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude — and exposes `ToneEvent` as the stable SSE wire type for downstream consumers (Linnet, Osprey, Falcon). -**Status:** Notation v0.1.x stub — mock mode only. Real classifiers (YAMNet, wav2vec2, pyannote.audio) land incrementally. +**Status:** Notation v0.1.x — real inference pipeline live (faster-whisper STT, wav2vec2 SER, librosa prosody, pyannote diarization). Mock mode available for dev/CI without GPU or mic. 
## Install ```bash -pip install -e ../cf-voice # editable install alongside sibling repos +# Mock mode only (no GPU required) +pip install -e ../cf-voice + +# Real inference (STT + tone classifier + diarization) +pip install -e "../cf-voice[inference]" ``` +Copy `.env.example` to `.env` and fill in `HF_TOKEN` for diarization. + ## Quick start ```python from cf_voice.context import ContextClassifier -classifier = ContextClassifier.mock() # or from_env() with CF_VOICE_MOCK=1 +# Mock mode (no hardware needed) +classifier = ContextClassifier.mock() +async for frame in classifier.stream(): + print(frame.label, frame.confidence) + +# Real mic capture (requires [inference] extras + CF_VOICE_MOCK unset) +classifier = ContextClassifier.from_env() async for frame in classifier.stream(): print(frame.label, frame.confidence) ``` -Or run the demo CLI: +CLI smoke-test: ```bash CF_VOICE_MOCK=1 cf-voice-demo ``` +--- + ## VoiceFrame +Produced by `cf_voice.io` (audio capture layer). MIT licensed. + ```python @dataclass class VoiceFrame: - label: str # e.g. "Warmly impatient" + label: str # tone descriptor, e.g. "Warmly impatient" confidence: float # 0.0–1.0 speaker_id: str # ephemeral local label, e.g. "speaker_a" shift_magnitude: float # delta from previous frame, 0.0–1.0 timestamp: float # session-relative seconds + + def is_reliable(self, threshold=0.6) -> bool: ... + def is_shift(self, threshold=0.3) -> bool: ... ``` +--- + +## ToneEvent — SSE wire format + +`ToneEvent` is the stable SSE wire type emitted by Linnet's annotation stream +and consumed by Linnet embeds in Osprey, Falcon, and other products. + +**Field names are locked as of cf-voice v0.1.0** (cf-core#40). 
+ +### JSON shape + +```json +{ + "event_type": "tone", + "timestamp": 4.82, + "label": "Warmly impatient", + "confidence": 0.79, + "speaker_id": "speaker_a", + "subtext": "Tone: Frustrated", + "affect": "frustrated", + "shift_magnitude": 0.74, + "shift_direction": "more_urgent", + "prosody_flags": ["fast_rate", "rising"], + "session_id": "ses_abc123" +} +``` + +### Field reference + +| Field | Type | Stable | Description | +|---|---|---|---| +| `event_type` | `"tone"` | yes | Always `"tone"` for ToneEvent | +| `timestamp` | `float` | yes | Seconds since session start | +| `label` | `str` | yes | Human-readable tone descriptor ("Warmly impatient") | +| `confidence` | `float` | yes | 0.0–1.0. Below ~0.55 = speculative | +| `speaker_id` | `str` | yes | Ephemeral diarization label ("speaker_a"). Resets per session | +| `subtext` | `str \| null` | yes | Annotation text. Generic: `"Tone: Frustrated"`. Elcor: `"With barely concealed frustration:"` | +| `affect` | `str` | yes | AFFECT_LABELS key ("frustrated"). See `cf_voice.events.AFFECT_LABELS` | +| `shift_magnitude` | `float` | yes | 0.0–1.0. High = meaningful register change from previous frame | +| `shift_direction` | `str` | yes | `"warmer"` \| `"colder"` \| `"more_urgent"` \| `"stable"` | +| `prosody_flags` | `str[]` | no | Raw prosody signals ("fast_rate", "rising", "flat_pitch", "low_energy"). Subject to change | +| `session_id` | `str` | yes | Caller-assigned. Correlates events to a conversation session | + +### SSE envelope + +Linnet emits events in standard SSE format: + +``` +event: tone-event +data: {"event_type":"tone","timestamp":4.82,...} + +``` + +Host apps subscribing via the Linnet embed receive `MessageEvent` with `type === "tone-event"`. + +### Elcor mode + +`subtext` switches format when the session is in Elcor mode (easter egg, unlocked by cumulative session time). 
Generic is always available; Elcor is opt-in via the session flag: + +| Affect | Generic | Elcor | +|---|---|---| +| frustrated | `Tone: Frustrated` | `With barely concealed frustration:` | +| warm | `Tone: Warm` | `Warmly:` | +| scripted | `Tone: Scripted` | `Reading from a script:` | +| dismissive | `Tone: Dismissive` | `With polite dismissiveness:` | +| tired | `Tone: Tired` | `With audible fatigue:` | + +--- + ## Mock mode -Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. No GPU or microphone required. Useful for CI and frontend development. +Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Emits synthetic `VoiceFrame` objects on a timer. No GPU, microphone, or `HF_TOKEN` required. All API surface is identical to real mode. + +--- ## Module structure | Module | License | Purpose | |--------|---------|---------| | `cf_voice.models` | MIT | `VoiceFrame` dataclass | -| `cf_voice.io` | MIT | Audio capture, mock generator | -| `cf_voice.context` | BSL 1.1* | Tone classification, diarization | +| `cf_voice.events` | MIT | `AudioEvent`, `ToneEvent`, wire format types | +| `cf_voice.io` | MIT | `VoiceIO` base, `MockVoiceIO`, `make_io()` factory | +| `cf_voice.capture` | BSL 1.1 | `MicVoiceIO` — real mic capture, 2s windowing | +| `cf_voice.stt` | BSL 1.1 | `WhisperSTT` — faster-whisper async wrapper | +| `cf_voice.classify` | BSL 1.1 | `ToneClassifier` — wav2vec2 SER + librosa prosody | +| `cf_voice.diarize` | BSL 1.1 | `Diarizer` — pyannote.audio async wrapper | +| `cf_voice.context` | BSL 1.1 | `ContextClassifier` — high-level consumer API | -*BSL applies when real inference models are integrated. Currently stub = MIT. +BSL applies to inference modules. IO + types + wire format = MIT. 
+ +--- ## Consumed by -- `Circuit-Forge/linnet` — real-time tone annotation widget -- `Circuit-Forge/osprey` — telephony bridge voice context +- `Circuit-Forge/linnet` — real-time tone annotation PWA (primary consumer) +- `Circuit-Forge/osprey` — telephony bridge voice context (Navigation v0.2.x) +- `Circuit-Forge/falcon` (planned) — phone form-filling, IVR navigation diff --git a/cf_voice/events.py b/cf_voice/events.py index 1622b81..eb4a578 100644 --- a/cf_voice/events.py +++ b/cf_voice/events.py @@ -62,11 +62,13 @@ class AudioEvent: A single classified event from the parallel audio classifier. event_type determines how to interpret label and whether subtext is present. + speaker_id is the ephemeral local diarization label for this event's speaker. """ timestamp: float event_type: EventType label: str confidence: float + speaker_id: str = "speaker_a" # Tone annotation — present on ToneEvent only. # Generic format (default): "Tone: Frustrated" # Elcor format (easter egg): "With barely concealed frustration:" @@ -78,6 +80,10 @@ class ToneEvent(AudioEvent): """ Tone/affect classification event. + This is the SSE wire type for Linnet's annotation stream and the + embed protocol. Field names are stable as of cf-voice + v0.1.0 — see cf-core#40 for the wire format spec. + The subtext field carries the human-readable annotation. Format is controlled by the caller (elcor flag in the classify request). """ @@ -85,6 +91,7 @@ class ToneEvent(AudioEvent): shift_magnitude: float = 0.0 shift_direction: str = "stable" # "warmer" | "colder" | "more_urgent" | "stable" prosody_flags: list[str] = field(default_factory=list) + session_id: str = "" # caller-assigned; correlates events to a session def __post_init__(self) -> None: # Force event_type to "tone" regardless of what the caller passed.