From 335d51f02f2115bc449dd54c40b75b7a3561243b Mon Sep 17 00:00:00 2001
From: pyr0ball <pyr0ball@gmail.com>
Date: Mon, 6 Apr 2026 17:51:09 -0700
Subject: [PATCH] feat: lock ToneEvent SSE wire format (cf-core#40)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- AudioEvent: add speaker_id field (was on VoiceFrame only; needed on all events)
- ToneEvent: add session_id field for session correlation across embedded consumers
- README: full wire format documentation — JSON shape, field reference table,
  SSE envelope, Elcor mode subtext table, module license map
- ToneEvent docstring references cf-core#40 as the wire format spec

Closes cf-core#40
---
 README.md          | 122 ++++++++++++++++++++++++++++++++++++++++-----
 cf_voice/events.py |   7 +++
 2 files changed, 117 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index 61f2d56..1e14e8a 100644
--- a/README.md
+++ b/README.md
@@ -1,58 +1,156 @@
 # cf-voice
 
-CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude.
+CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude — and exposes `ToneEvent` as the stable SSE wire type for downstream consumers (Linnet, Osprey, Falcon).
 
-**Status:** Notation v0.1.x stub — mock mode only. Real classifiers (YAMNet, wav2vec2, pyannote.audio) land incrementally.
+**Status:** Notation v0.1.x — real inference pipeline live (faster-whisper STT, wav2vec2 SER, librosa prosody, pyannote diarization). Mock mode available for dev/CI without GPU or mic.
 
 ## Install
 
 ```bash
-pip install -e ../cf-voice   # editable install alongside sibling repos
+# Mock mode only (no GPU required)
+pip install -e ../cf-voice
+
+# Real inference (STT + tone classifier + diarization)
+pip install -e "../cf-voice[inference]"
 ```
 
+For real inference, copy `.env.example` to `.env` and set `HF_TOKEN` (needed for diarization). Mock mode does not need it.
+
 ## Quick start
 
 ```python
 from cf_voice.context import ContextClassifier
 
-classifier = ContextClassifier.mock()          # or from_env() with CF_VOICE_MOCK=1
+# Mock mode (no hardware needed). Note: `async for` must run inside an `async def`.
+classifier = ContextClassifier.mock()
+async for frame in classifier.stream():
+    print(frame.label, frame.confidence)
+
+# Real mic capture (requires [inference] extras + CF_VOICE_MOCK unset)
+classifier = ContextClassifier.from_env()
 async for frame in classifier.stream():
     print(frame.label, frame.confidence)
 ```
 
-Or run the demo CLI:
+CLI smoke-test:
 
 ```bash
 CF_VOICE_MOCK=1 cf-voice-demo
 ```
 
+---
+
 ## VoiceFrame
 
+Produced by `cf_voice.io` (audio capture layer). MIT licensed.
+
 ```python
 @dataclass
 class VoiceFrame:
-    label: str            # e.g. "Warmly impatient"
+    label: str            # tone descriptor, e.g. "Warmly impatient"
     confidence: float     # 0.0–1.0
     speaker_id: str       # ephemeral local label, e.g. "speaker_a"
     shift_magnitude: float  # delta from previous frame, 0.0–1.0
     timestamp: float      # session-relative seconds
+
+    def is_reliable(self, threshold=0.6) -> bool: ...
+    def is_shift(self, threshold=0.3) -> bool: ...
 ```
 
+---
+
+## ToneEvent — SSE wire format
+
+`ToneEvent` is the stable SSE wire type emitted by Linnet's annotation stream
+and consumed by `<LinnetWidget />` embeds in Osprey, Falcon, and other products.
+
+**Field names are locked as of cf-voice v0.1.0** (cf-core#40).
+
+### JSON shape
+
+```json
+{
+  "event_type": "tone",
+  "timestamp": 4.82,
+  "label": "Warmly impatient",
+  "confidence": 0.79,
+  "speaker_id": "speaker_a",
+  "subtext": "Tone: Frustrated",
+  "affect": "frustrated",
+  "shift_magnitude": 0.74,
+  "shift_direction": "more_urgent",
+  "prosody_flags": ["fast_rate", "rising"],
+  "session_id": "ses_abc123"
+}
+```
+
+### Field reference
+
+| Field | Type | Stable | Description |
+|---|---|---|---|
+| `event_type` | `"tone"` | yes | Always `"tone"` for ToneEvent |
+| `timestamp` | `float` | yes | Seconds since session start |
+| `label` | `str` | yes | Human-readable tone descriptor ("Warmly impatient") |
+| `confidence` | `float` | yes | 0.0–1.0. Below ~0.55 = speculative |
+| `speaker_id` | `str` | yes | Ephemeral diarization label ("speaker_a"). Resets per session |
+| `subtext` | `str \| null` | yes | Annotation text. Generic: `"Tone: Frustrated"`. Elcor: `"With barely concealed frustration:"` |
+| `affect` | `str` | yes | AFFECT_LABELS key ("frustrated"). See `cf_voice.events.AFFECT_LABELS` |
+| `shift_magnitude` | `float` | yes | 0.0–1.0. High = meaningful register change from previous frame |
+| `shift_direction` | `str` | yes | `"warmer"` \| `"colder"` \| `"more_urgent"` \| `"stable"` |
+| `prosody_flags` | `str[]` | no | Raw prosody signals ("fast_rate", "rising", "flat_pitch", "low_energy"). Subject to change |
+| `session_id` | `str` | yes | Caller-assigned. Correlates events to a conversation session. Defaults to `""` when the caller does not set one |
+
+### SSE envelope
+
+Linnet emits events in standard SSE format:
+
+```
+event: tone-event
+data: {"event_type":"tone","timestamp":4.82,...}
+
+```
+
+Host apps subscribing via `<LinnetWidget />` receive a `MessageEvent` whose `type` is `"tone-event"`.
+
+### Elcor mode
+
+`subtext` switches format when the session is in Elcor mode (an easter egg unlocked by cumulative session time). The generic format is always available; Elcor is opt-in via the session flag. Example renderings:
+
+| Affect | Generic | Elcor |
+|---|---|---|
+| frustrated | `Tone: Frustrated` | `With barely concealed frustration:` |
+| warm | `Tone: Warm` | `Warmly:` |
+| scripted | `Tone: Scripted` | `Reading from a script:` |
+| dismissive | `Tone: Dismissive` | `With polite dismissiveness:` |
+| tired | `Tone: Tired` | `With audible fatigue:` |
+
+---
+
 ## Mock mode
 
-Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. No GPU or microphone required. Useful for CI and frontend development.
+Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. Mock mode emits synthetic `VoiceFrame` objects on a timer; no GPU, microphone, or `HF_TOKEN` is required. The API surface is identical to real mode.
+
+---
 
 ## Module structure
 
 | Module | License | Purpose |
 |--------|---------|---------|
 | `cf_voice.models` | MIT | `VoiceFrame` dataclass |
-| `cf_voice.io` | MIT | Audio capture, mock generator |
-| `cf_voice.context` | BSL 1.1* | Tone classification, diarization |
+| `cf_voice.events` | MIT | `AudioEvent`, `ToneEvent`, wire format types |
+| `cf_voice.io` | MIT | `VoiceIO` base, `MockVoiceIO`, `make_io()` factory |
+| `cf_voice.capture` | BSL 1.1 | `MicVoiceIO` — real mic capture, 2s windowing |
+| `cf_voice.stt` | BSL 1.1 | `WhisperSTT` — faster-whisper async wrapper |
+| `cf_voice.classify` | BSL 1.1 | `ToneClassifier` — wav2vec2 SER + librosa prosody |
+| `cf_voice.diarize` | BSL 1.1 | `Diarizer` — pyannote.audio async wrapper |
+| `cf_voice.context` | BSL 1.1 | `ContextClassifier` — high-level consumer API |
 
-*BSL applies when real inference models are integrated. Currently stub = MIT.
+BSL 1.1 applies to the inference modules (`capture`, `stt`, `classify`, `diarize`, `context`). The IO layer, types, and wire format are MIT.
+
+---
 
 ## Consumed by
 
-- `Circuit-Forge/linnet` — real-time tone annotation widget
-- `Circuit-Forge/osprey` — telephony bridge voice context
+- `Circuit-Forge/linnet` — real-time tone annotation PWA (primary consumer)
+- `Circuit-Forge/osprey` — telephony bridge voice context (Navigation v0.2.x)
+- `Circuit-Forge/falcon` (planned) — phone form-filling, IVR navigation
diff --git a/cf_voice/events.py b/cf_voice/events.py
index 1622b81..eb4a578 100644
--- a/cf_voice/events.py
+++ b/cf_voice/events.py
@@ -62,11 +62,13 @@ class AudioEvent:
     A single classified event from the parallel audio classifier.
 
     event_type determines how to interpret label and whether subtext is present.
+    speaker_id is the ephemeral local diarization label for this event's speaker.
     """
     timestamp: float
     event_type: EventType
     label: str
     confidence: float
+    speaker_id: str = "speaker_a"
     # Tone annotation — present on ToneEvent only.
     # Generic format (default): "Tone: Frustrated"
     # Elcor format (easter egg):  "With barely concealed frustration:"
@@ -78,6 +80,10 @@ class ToneEvent(AudioEvent):
     """
     Tone/affect classification event.
 
+    This is the SSE wire type for Linnet's annotation stream and the
+    <LinnetWidget /> embed protocol. Field names are stable as of cf-voice
+    v0.1.0 — see cf-core#40 for the wire format spec.
+
     The subtext field carries the human-readable annotation.
     Format is controlled by the caller (elcor flag in the classify request).
     """
@@ -85,6 +91,7 @@ class ToneEvent(AudioEvent):
     shift_magnitude: float = 0.0
     shift_direction: str = "stable"   # "warmer" | "colder" | "more_urgent" | "stable"
     prosody_flags: list[str] = field(default_factory=list)
+    session_id: str = ""              # caller-assigned; correlates events to a session
 
     def __post_init__(self) -> None:
         # Force event_type to "tone" regardless of what the caller passed.