feat: initial cf-voice stub — VoiceFrame API, mock IO, context classifier

- VoiceFrame dataclass: label, confidence, speaker_id, shift_magnitude, timestamp
- MockVoiceIO: async generator of synthetic frames on a timer (CF_VOICE_MOCK=1)
- ContextClassifier: passthrough stub wrapping VoiceIO; _enrich() hook for real classifiers
- make_io() factory: mock mode auto-detected from env, raises NotImplementedError for real audio
- cf-voice-demo CLI entry point for quick smoke-testing
- 12 tests passing; editable install via pip install -e ../cf-voice
This commit is contained in:
pyr0ball 2026-04-06 16:03:07 -07:00
commit 35fc0a088c
10 changed files with 492 additions and 0 deletions

10
.gitignore vendored Normal file
View file

@ -0,0 +1,10 @@
__pycache__/
*.py[cod]
*.egg-info/
dist/
build/
.venv/
.env
*.so
.pytest_cache/
.mypy_cache/

58
README.md Normal file
View file

@ -0,0 +1,58 @@
# cf-voice
CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude.
**Status:** Notation v0.1.x stub — mock mode only. Real classifiers (YAMNet, wav2vec2, pyannote.audio) land incrementally.
## Install
```bash
pip install -e ../cf-voice # editable install alongside sibling repos
```
## Quick start
```python
from cf_voice.context import ContextClassifier
classifier = ContextClassifier.mock() # or from_env() with CF_VOICE_MOCK=1
async for frame in classifier.stream():
print(frame.label, frame.confidence)
```
Or run the demo CLI:
```bash
CF_VOICE_MOCK=1 cf-voice-demo
```
## VoiceFrame
```python
@dataclass
class VoiceFrame:
label: str # e.g. "Warmly impatient"
confidence: float # 0.0–1.0
speaker_id: str # ephemeral local label, e.g. "speaker_a"
shift_magnitude: float # delta from previous frame, 0.0–1.0
timestamp: float # session-relative seconds
```
## Mock mode
Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. No GPU or microphone required. Useful for CI and frontend development.
## Module structure
| Module | License | Purpose |
|--------|---------|---------|
| `cf_voice.models` | MIT | `VoiceFrame` dataclass |
| `cf_voice.io` | MIT | Audio capture, mock generator |
| `cf_voice.context` | BSL 1.1* | Tone classification, diarization |
*BSL applies when real inference models are integrated. Currently stub = MIT.
## Consumed by
- `Circuit-Forge/linnet` — real-time tone annotation widget
- `Circuit-Forge/osprey` — telephony bridge voice context

17
cf_voice/__init__.py Normal file
View file

@ -0,0 +1,17 @@
"""
cf-voice: CircuitForge voice annotation pipeline.
Quick start (mock mode, no GPU required):
from cf_voice.context import ContextClassifier
classifier = ContextClassifier.mock()
async for frame in classifier.stream():
print(frame.label, frame.confidence)
Set CF_VOICE_MOCK=1 in the environment to activate mock mode globally.
"""
from cf_voice.models import VoiceFrame
__version__ = "0.1.0"
__all__ = ["VoiceFrame"]

25
cf_voice/cli.py Normal file
View file

@ -0,0 +1,25 @@
# cf_voice/cli.py — cf-voice-demo entry point
import asyncio
from cf_voice.context import ContextClassifier
async def _run() -> None:
    """Stream mock VoiceFrames to stdout until the user interrupts."""
    print("cf-voice mock stream — Ctrl+C to stop\n")
    classifier = ContextClassifier.mock(interval_s=2.0)
    try:
        async for frame in classifier.stream():
            # "+" marks a frame whose confidence clears the default threshold.
            reliability_mark = "+" if frame.is_reliable() else "?"
            shift_mark = " [SHIFT]" if frame.is_shift() else ""
            line = (
                f"[{frame.timestamp:6.1f}s] {frame.speaker_id} "
                f"{frame.label:<30} conf={frame.confidence:.2f}"
                f"{reliability_mark}{shift_mark}"
            )
            print(line)
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the IO source, even on Ctrl+C.
        await classifier.stop()
def demo() -> None:
    """Synchronous entry point for the `cf-voice-demo` console script."""
    asyncio.run(_run())

74
cf_voice/context.py Normal file
View file

@ -0,0 +1,74 @@
# cf_voice/context.py — tone classification and context enrichment
#
# BSL 1.1 when real inference models are integrated.
# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
#
# Real implementation (Notation v0.1.x) will:
# - Run YAMNet acoustic event detection on the audio buffer
# - Run wav2vec2-based SER (speech emotion recognition)
# - Run librosa prosody extraction (pitch, energy, rate)
# - Combine into enriched VoiceFrame label + confidence
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
from __future__ import annotations
import os
from typing import AsyncIterator
from cf_voice.io import VoiceIO, make_io
from cf_voice.models import VoiceFrame
class ContextClassifier:
    """
    High-level voice context classifier.

    Wraps a VoiceIO source and annotates each VoiceFrame with tone
    context. In the current stub the frames pass through unchanged — the
    enrichment pipeline (YAMNet + wav2vec2 + librosa) is filled in
    incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The wrapped audio source; every frame originates here.
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """
        Build a ContextClassifier configured from the environment.

        CF_VOICE_MOCK=1 selects mock mode (no GPU, no audio hardware needed).
        """
        return cls(io=make_io(interval_s=interval_s))

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Build a ContextClassifier backed by MockVoiceIO. Useful in tests."""
        from cf_voice.io import MockVoiceIO

        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: the enrichment pipeline will run here before each yield.
        """
        async for raw_frame in self._io.stream():
            yield self._enrich(raw_frame)

    async def stop(self) -> None:
        """Ask the underlying IO source to stop streaming."""
        await self._io.stop()

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """
        Apply tone classification to a raw frame.

        Stub: identity transform — returns the frame unchanged.
        Real: will replace label + confidence with classifier output.
        """
        return frame

122
cf_voice/io.py Normal file
View file

@ -0,0 +1,122 @@
# cf_voice/io.py — audio capture and VoiceFrame generation
#
# MIT licensed. This layer handles audio I/O only — no inference.
#
# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are
# emitted on a timer. Real audio capture will be added in Notation v0.1.x
# once pyannote.audio and faster-whisper are integrated.
from __future__ import annotations
import asyncio
import os
import random
import time
from abc import ABC, abstractmethod
from typing import AsyncIterator
from cf_voice.models import VoiceFrame
# Generic tone labels for the annotation stream.
# These are the underlying classifier outputs — the Elcor-style prefix format
# ("With barely concealed frustration:") is applied by the UI layer, not here.
# MockVoiceIO draws uniformly at random from this list for each frame.
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]
# Ephemeral local speaker labels — not tied to identity, reset each session.
_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]
class VoiceIO(ABC):
    """
    Abstract base for all audio capture sources.

    Concrete subclasses produce VoiceFrame objects from an async generator;
    consumers drive it with `async for frame in io_instance.stream(): ...`.
    """

    @abstractmethod
    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield VoiceFrames continuously until stopped."""
        ...

    async def stop(self) -> None:
        """Ask the stream to stop. Override when cleanup is required."""
class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame source for development and CI.

    Emits one frame every `interval_s` seconds with randomised labels,
    confidence, and simulated speaker transitions. Selected automatically
    when CF_VOICE_MOCK=1 is set, or constructed directly in tests.
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        self._interval_s = interval_s
        self._speakers = speakers or _MOCK_SPEAKERS
        self._labels = labels or _MOCK_LABELS
        # Dedicated RNG so a seed gives fully reproducible streams.
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        self._running = True
        session_start = time.monotonic()
        last_label = self._rng.choice(self._labels)
        while self._running:
            await asyncio.sleep(self._interval_s)
            current_label = self._rng.choice(self._labels)
            # A repeated label means no register change; otherwise draw a
            # random magnitude to simulate a tone transition.
            if current_label == last_label:
                magnitude = 0.0
            else:
                magnitude = self._rng.uniform(0.1, 0.9)
            last_label = current_label
            yield VoiceFrame(
                label=current_label,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(magnitude, 3),
                timestamp=round(time.monotonic() - session_start, 2),
            )

    async def stop(self) -> None:
        self._running = False
def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
) -> VoiceIO:
    """
    Factory: build the VoiceIO appropriate for the current environment.

    mock=True or CF_VOICE_MOCK=1  ->  MockVoiceIO (no audio hardware needed)
    otherwise                     ->  real audio capture (not yet implemented)
    """
    if mock is None:
        # Fall back to the environment switch when the caller didn't decide.
        mock = os.environ.get("CF_VOICE_MOCK", "") == "1"
    if not mock:
        raise NotImplementedError(
            "Real audio capture is not yet implemented. "
            "Set CF_VOICE_MOCK=1 or pass mock=True to use synthetic frames."
        )
    return MockVoiceIO(interval_s=interval_s)

43
cf_voice/models.py Normal file
View file

@ -0,0 +1,43 @@
# cf_voice/models.py — VoiceFrame API contract
#
# This module is MIT licensed. All consumers (Linnet, Osprey, etc.)
# import VoiceFrame from here so the shape is consistent across the stack.
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class VoiceFrame:
    """
    A single annotated moment in a voice stream.

    Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
    (tone classification, speaker diarization).

    Fields
    ------
    label            Tone annotation, e.g. "Warmly impatient" or "Deflecting".
                     Generic by default; Elcor-style prefix format is an
                     easter egg surfaced by the product UI, not set here.
    confidence       0.0–1.0. Below ~0.5 the annotation is speculative.
    speaker_id       Ephemeral local label ("speaker_a", "speaker_b").
                     Not tied to identity — resets each session.
    shift_magnitude  Delta from the previous frame's tone, 0.0–1.0.
                     High values indicate a meaningful register shift.
    timestamp        Session-relative seconds since capture started.
    """

    label: str
    confidence: float
    speaker_id: str
    shift_magnitude: float
    timestamp: float

    def is_reliable(self, threshold: float = 0.6) -> bool:
        """Return True when confidence meets the given threshold (inclusive)."""
        return self.confidence >= threshold

    def is_shift(self, threshold: float = 0.3) -> bool:
        """Return True when shift_magnitude indicates a meaningful register change."""
        return self.shift_magnitude >= threshold

39
pyproject.toml Normal file
View file

@ -0,0 +1,39 @@
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
[project]
name = "cf-voice"
version = "0.1.0"
description = "CircuitForge voice annotation pipeline — VoiceFrame API, tone classifiers, speaker diarization"
readme = "README.md"
requires-python = ">=3.11"
license = {text = "MIT"}
dependencies = [
# NOTE(review): the v0.1.0 stub uses only stdlib dataclasses — confirm pydantic
# is actually required before the first release, or drop it.
"pydantic>=2.0",
]
[project.optional-dependencies]
# Real inference backends — not required for stub/mock mode
inference = [
"torch>=2.0",
"torchaudio>=2.0",
"transformers>=4.40",
"pyannote.audio>=3.1",
]
dev = [
"pytest>=8.0",
"pytest-asyncio>=0.23",
]
[project.scripts]
# Quick smoke-test: stream mock frames to stdout
cf-voice-demo = "cf_voice.cli:demo"
[tool.setuptools.packages.find]
where = ["."]
include = ["cf_voice*"]
[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"

0
tests/__init__.py Normal file
View file

104
tests/test_models.py Normal file
View file

@ -0,0 +1,104 @@
import asyncio
import pytest
from cf_voice.models import VoiceFrame
from cf_voice.io import MockVoiceIO, make_io
from cf_voice.context import ContextClassifier
def make_frame(**overrides) -> VoiceFrame:
    """Build a VoiceFrame with sensible defaults, overridable per test."""
    params = {
        "label": "Calm and focused",
        "confidence": 0.8,
        "speaker_id": "speaker_a",
        "shift_magnitude": 0.0,
        "timestamp": 1.0,
    }
    params.update(overrides)
    return VoiceFrame(**params)
class TestVoiceFrame:
    """Unit tests for the VoiceFrame threshold helpers."""

    def test_is_reliable_above_threshold(self):
        frame = make_frame(confidence=0.7)
        assert frame.is_reliable(threshold=0.6)

    def test_is_reliable_below_threshold(self):
        frame = make_frame(confidence=0.4)
        assert not frame.is_reliable(threshold=0.6)

    def test_is_shift_above_threshold(self):
        frame = make_frame(shift_magnitude=0.5)
        assert frame.is_shift(threshold=0.3)

    def test_is_shift_below_threshold(self):
        frame = make_frame(shift_magnitude=0.1)
        assert not frame.is_shift(threshold=0.3)

    def test_default_reliable_threshold(self):
        # The default threshold is 0.6 and the comparison is inclusive.
        assert make_frame(confidence=0.6).is_reliable()
        assert not make_frame(confidence=0.59).is_reliable()
class TestMockVoiceIO:
    """Tests for the synthetic frame generator and the make_io factory."""

    @pytest.mark.asyncio
    async def test_emits_frames(self):
        io = MockVoiceIO(interval_s=0.05, seed=42)
        collected = []
        async for frame in io.stream():
            collected.append(frame)
            if len(collected) == 3:
                await io.stop()
                break
        assert len(collected) == 3
        assert all(isinstance(f, VoiceFrame) for f in collected)

    @pytest.mark.asyncio
    async def test_confidence_in_range(self):
        io = MockVoiceIO(interval_s=0.05, seed=1)
        seen = 0
        async for frame in io.stream():
            assert 0.0 <= frame.confidence <= 1.0
            assert 0.0 <= frame.shift_magnitude <= 1.0
            seen += 1
            if seen == 5:
                await io.stop()
                break

    @pytest.mark.asyncio
    async def test_timestamps_increase(self):
        io = MockVoiceIO(interval_s=0.05, seed=0)
        stamps = []
        async for frame in io.stream():
            stamps.append(frame.timestamp)
            if len(stamps) == 3:
                await io.stop()
                break
        # Session-relative timestamps must be monotonically non-decreasing.
        assert stamps == sorted(stamps)

    def test_make_io_mock_env(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        assert isinstance(make_io(), MockVoiceIO)

    def test_make_io_real_raises(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        with pytest.raises(NotImplementedError):
            make_io(mock=False)
class TestContextClassifier:
    """Tests for the passthrough classifier stub."""

    @pytest.mark.asyncio
    async def test_mock_passthrough(self):
        classifier = ContextClassifier.mock(interval_s=0.05, seed=7)
        received = []
        async for frame in classifier.stream():
            received.append(frame)
            if len(received) == 3:
                await classifier.stop()
                break
        assert len(received) == 3
        assert all(isinstance(f, VoiceFrame) for f in received)

    @pytest.mark.asyncio
    async def test_from_env_mock(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        classifier = ContextClassifier.from_env(interval_s=0.05)
        async for frame in classifier.stream():
            assert isinstance(frame, VoiceFrame)
            await classifier.stop()
            break