feat: initial cf-voice stub — VoiceFrame API, mock IO, context classifier
- VoiceFrame dataclass: label, confidence, speaker_id, shift_magnitude, timestamp
- MockVoiceIO: async generator of synthetic frames on a timer (CF_VOICE_MOCK=1)
- ContextClassifier: passthrough stub wrapping VoiceIO; _enrich() hook for real classifiers
- make_io() factory: mock mode auto-detected from env, raises NotImplementedError for real audio
- cf-voice-demo CLI entry point for quick smoke-testing
- 12 tests passing; editable install via pip install -e ../cf-voice
This commit is contained in:
commit
35fc0a088c
10 changed files with 492 additions and 0 deletions
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.venv/
|
||||
.env
|
||||
*.so
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
58
README.md
Normal file
58
README.md
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
# cf-voice
|
||||
|
||||
CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude.
|
||||
|
||||
**Status:** Notation v0.1.x stub — mock mode only. Real classifiers (YAMNet, wav2vec2, pyannote.audio) land incrementally.
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
pip install -e ../cf-voice # editable install alongside sibling repos
|
||||
```
|
||||
|
||||
## Quick start
|
||||
|
||||
```python
|
||||
from cf_voice.context import ContextClassifier
|
||||
|
||||
classifier = ContextClassifier.mock() # or from_env() with CF_VOICE_MOCK=1
|
||||
async for frame in classifier.stream():
|
||||
print(frame.label, frame.confidence)
|
||||
```
|
||||
|
||||
Or run the demo CLI:
|
||||
|
||||
```bash
|
||||
CF_VOICE_MOCK=1 cf-voice-demo
|
||||
```
|
||||
|
||||
## VoiceFrame
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class VoiceFrame:
|
||||
label: str # e.g. "Warmly impatient"
|
||||
confidence: float # 0.0–1.0
|
||||
speaker_id: str # ephemeral local label, e.g. "speaker_a"
|
||||
shift_magnitude: float # delta from previous frame, 0.0–1.0
|
||||
timestamp: float # session-relative seconds
|
||||
```
|
||||
|
||||
## Mock mode
|
||||
|
||||
Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. No GPU or microphone required. Useful for CI and frontend development.
|
||||
|
||||
## Module structure
|
||||
|
||||
| Module | License | Purpose |
|
||||
|--------|---------|---------|
|
||||
| `cf_voice.models` | MIT | `VoiceFrame` dataclass |
|
||||
| `cf_voice.io` | MIT | Audio capture, mock generator |
|
||||
| `cf_voice.context` | BSL 1.1* | Tone classification, diarization |
|
||||
|
||||
*BSL applies when real inference models are integrated. Currently stub = MIT.
|
||||
|
||||
## Consumed by
|
||||
|
||||
- `Circuit-Forge/linnet` — real-time tone annotation widget
|
||||
- `Circuit-Forge/osprey` — telephony bridge voice context
|
||||
17
cf_voice/__init__.py
Normal file
17
cf_voice/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
"""cf-voice — CircuitForge voice annotation pipeline.

Mock-mode quick start (no GPU required)::

    from cf_voice.context import ContextClassifier

    classifier = ContextClassifier.mock()
    async for frame in classifier.stream():
        print(frame.label, frame.confidence)

Setting CF_VOICE_MOCK=1 in the environment activates mock mode globally.
"""
from cf_voice.models import VoiceFrame

__all__ = ["VoiceFrame"]
__version__ = "0.1.0"
|
||||
25
cf_voice/cli.py
Normal file
25
cf_voice/cli.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# cf_voice/cli.py — cf-voice-demo entry point
|
||||
import asyncio
|
||||
|
||||
from cf_voice.context import ContextClassifier
|
||||
|
||||
|
||||
async def _run() -> None:
    """Stream mock VoiceFrames to stdout until the loop is interrupted."""
    print("cf-voice mock stream — Ctrl+C to stop\n")
    classifier = ContextClassifier.mock(interval_s=2.0)
    try:
        async for frame in classifier.stream():
            # "+" marks frames whose confidence clears the default threshold.
            marker = "+" if frame.is_reliable() else "?"
            suffix = " [SHIFT]" if frame.is_shift() else ""
            line = (
                f"[{frame.timestamp:6.1f}s] {frame.speaker_id} "
                f"{frame.label:<30} conf={frame.confidence:.2f}{marker}{suffix}"
            )
            print(line)
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the underlying IO source, even on interrupt.
        await classifier.stop()
|
||||
|
||||
|
||||
def demo() -> None:
    """Console entry point for ``cf-voice-demo`` (see pyproject [project.scripts]).

    Runs the mock streaming loop. KeyboardInterrupt is caught *here* because
    Ctrl+C during ``asyncio.run()`` cancels the running task and is re-raised
    at the call site — the handler inside the coroutine does not reliably
    fire, so without this guard the CLI would exit with a traceback instead
    of stopping cleanly.
    """
    try:
        asyncio.run(_run())
    except KeyboardInterrupt:
        pass
|
||||
74
cf_voice/context.py
Normal file
74
cf_voice/context.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# cf_voice/context.py — tone classification and context enrichment
|
||||
#
|
||||
# BSL 1.1 when real inference models are integrated.
|
||||
# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
|
||||
#
|
||||
# Real implementation (Notation v0.1.x) will:
|
||||
# - Run YAMNet acoustic event detection on the audio buffer
|
||||
# - Run wav2vec2-based SER (speech emotion recognition)
|
||||
# - Run librosa prosody extraction (pitch, energy, rate)
|
||||
# - Combine into enriched VoiceFrame label + confidence
|
||||
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import AsyncIterator
|
||||
|
||||
from cf_voice.io import VoiceIO, make_io
|
||||
from cf_voice.models import VoiceFrame
|
||||
|
||||
|
||||
class ContextClassifier:
    """High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone
    annotation. In stub mode frames pass through unchanged — the
    enrichment pipeline (YAMNet + wav2vec2 + librosa) is filled in
    incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The underlying frame source (real capture or MockVoiceIO).
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """Build a classifier from the environment.

        CF_VOICE_MOCK=1 selects mock mode (no GPU, no audio hardware needed).
        """
        return cls(io=make_io(interval_s=interval_s))

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Build a classifier backed by MockVoiceIO. Useful in tests."""
        # Imported lazily to keep module import light.
        from cf_voice.io import MockVoiceIO

        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: the enrichment pipeline runs here before each yield.
        """
        async for raw in self._io.stream():
            yield self._enrich(raw)

    async def stop(self) -> None:
        """Ask the underlying IO source to stop streaming."""
        await self._io.stop()

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """Apply tone classification to a raw frame.

        Stub: identity transform — the frame is returned unchanged.
        Real: replaces label + confidence with classifier output.
        """
        return frame
|
||||
122
cf_voice/io.py
Normal file
122
cf_voice/io.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
# cf_voice/io.py — audio capture and VoiceFrame generation
|
||||
#
|
||||
# MIT licensed. This layer handles audio I/O only — no inference.
|
||||
#
|
||||
# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are
|
||||
# emitted on a timer. Real audio capture will be added in Notation v0.1.x
|
||||
# once pyannote.audio and faster-whisper are integrated.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import AsyncIterator
|
||||
|
||||
from cf_voice.models import VoiceFrame
|
||||
|
||||
# Generic tone labels for the annotation stream.
# These are the underlying classifier outputs — the Elcor-style prefix format
# ("With barely concealed frustration:") is applied by the UI layer, not here.
# MockVoiceIO falls back to this list when no custom `labels` is supplied.
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]

# Default ephemeral speaker labels for MockVoiceIO; not tied to real identity.
_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]
|
||||
|
||||
|
||||
class VoiceIO(ABC):
    """
    Base class for all audio capture sources.

    Subclasses implement `stream` as an async generator that yields
    VoiceFrame objects. Consumers should use:
    `async for frame in io_instance.stream(): ...`
    """

    @abstractmethod
    def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield VoiceFrames continuously until stopped.

        Declared as a plain method returning AsyncIterator rather than
        `async def`: calling an async generator function already returns
        an AsyncIterator, so async-generator overrides in subclasses
        type-check without needing `# type: ignore[override]`.
        """
        ...

    async def stop(self) -> None:
        """Signal the stream to stop. Override if cleanup is needed."""
|
||||
|
||||
|
||||
class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame generator for development and CI.

    Emits one frame every `interval_s` seconds with randomised labels,
    confidence, and simulated speaker transitions.

    Activated automatically when CF_VOICE_MOCK=1 is set, or instantiated
    directly in tests.
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        # Fall back to the module defaults when no custom pools are given.
        self._interval_s = interval_s
        self._labels = labels or _MOCK_LABELS
        self._speakers = speakers or _MOCK_SPEAKERS
        # Seedable RNG so tests can reproduce a frame sequence.
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        self._running = True
        started_at = time.monotonic()
        last_label = self._rng.choice(self._labels)

        while self._running:
            await asyncio.sleep(self._interval_s)

            current = self._rng.choice(self._labels)
            # shift_magnitude is 0 when the label repeats, higher for big jumps
            if current == last_label:
                shift = 0.0
            else:
                shift = self._rng.uniform(0.1, 0.9)
            last_label = current

            yield VoiceFrame(
                label=current,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(shift, 3),
                timestamp=round(time.monotonic() - started_at, 2),
            )

    async def stop(self) -> None:
        # The generator exits after its current sleep completes.
        self._running = False
|
||||
|
||||
|
||||
def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
) -> VoiceIO:
    """
    Factory: return a VoiceIO instance appropriate for the current environment.

    mock=True or CF_VOICE_MOCK=1 → MockVoiceIO (no audio hardware needed)
    Otherwise → real audio capture (not yet implemented)
    """
    # An explicit `mock` argument wins; otherwise consult the environment.
    if mock is None:
        use_mock = os.environ.get("CF_VOICE_MOCK", "") == "1"
    else:
        use_mock = mock
    if not use_mock:
        raise NotImplementedError(
            "Real audio capture is not yet implemented. "
            "Set CF_VOICE_MOCK=1 or pass mock=True to use synthetic frames."
        )
    return MockVoiceIO(interval_s=interval_s)
|
||||
43
cf_voice/models.py
Normal file
43
cf_voice/models.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# cf_voice/models.py — VoiceFrame API contract
|
||||
#
|
||||
# This module is MIT licensed. All consumers (Linnet, Osprey, etc.)
|
||||
# import VoiceFrame from here so the shape is consistent across the stack.
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class VoiceFrame:
    """A single annotated moment in a voice stream.

    Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
    (tone classification, speaker diarization).
    """

    # Tone annotation, e.g. "Warmly impatient" or "Deflecting". Generic by
    # default; the Elcor-style prefix format is an easter egg surfaced by
    # the product UI, not set here.
    label: str
    # 0.0–1.0. Below ~0.5 the annotation is speculative.
    confidence: float
    # Ephemeral local label ("speaker_a", "speaker_b"). Not tied to
    # identity — resets each session.
    speaker_id: str
    # Delta from the previous frame's tone, 0.0–1.0. High values indicate
    # a meaningful register shift.
    shift_magnitude: float
    # Session-relative seconds since capture started.
    timestamp: float

    def is_reliable(self, threshold: float = 0.6) -> bool:
        """True when `confidence` is at or above `threshold`."""
        return self.confidence >= threshold

    def is_shift(self, threshold: float = 0.3) -> bool:
        """True when `shift_magnitude` marks a meaningful register change."""
        return self.shift_magnitude >= threshold
|
||||
39
pyproject.toml
Normal file
39
pyproject.toml
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[project]
name = "cf-voice"
version = "0.1.0"
description = "CircuitForge voice annotation pipeline — VoiceFrame API, tone classifiers, speaker diarization"
readme = "README.md"
requires-python = ">=3.11"
# Package-level license; cf_voice.context moves to BSL 1.1 once real
# inference models are integrated (currently stub = MIT, see README).
license = {text = "MIT"}
dependencies = [
    # NOTE(review): pydantic is not imported by the stub modules yet —
    # confirm it is still a required runtime dependency.
    "pydantic>=2.0",
]

[project.optional-dependencies]
# Real inference backends — not required for stub/mock mode
inference = [
    "torch>=2.0",
    "torchaudio>=2.0",
    "transformers>=4.40",
    "pyannote.audio>=3.1",
]
# Test tooling; pytest-asyncio pairs with asyncio_mode = "auto" below.
dev = [
    "pytest>=8.0",
    "pytest-asyncio>=0.23",
]

[project.scripts]
# Quick smoke-test: stream mock frames to stdout
cf-voice-demo = "cf_voice.cli:demo"

[tool.setuptools.packages.find]
where = ["."]
include = ["cf_voice*"]

[tool.pytest.ini_options]
testpaths = ["tests"]
# "auto" runs `async def` tests without requiring @pytest.mark.asyncio.
asyncio_mode = "auto"
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
104
tests/test_models.py
Normal file
104
tests/test_models.py
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
import asyncio
|
||||
import pytest
|
||||
from cf_voice.models import VoiceFrame
|
||||
from cf_voice.io import MockVoiceIO, make_io
|
||||
from cf_voice.context import ContextClassifier
|
||||
|
||||
|
||||
def make_frame(**kwargs) -> VoiceFrame:
    """Build a VoiceFrame with sensible defaults, overridden by kwargs."""
    base = {
        "label": "Calm and focused",
        "confidence": 0.8,
        "speaker_id": "speaker_a",
        "shift_magnitude": 0.0,
        "timestamp": 1.0,
    }
    base.update(kwargs)
    return VoiceFrame(**base)
|
||||
|
||||
|
||||
class TestVoiceFrame:
    """Threshold behaviour of the VoiceFrame helper predicates."""

    def test_is_reliable_above_threshold(self):
        frame = make_frame(confidence=0.7)
        assert frame.is_reliable(threshold=0.6)

    def test_is_reliable_below_threshold(self):
        frame = make_frame(confidence=0.4)
        assert not frame.is_reliable(threshold=0.6)

    def test_is_shift_above_threshold(self):
        frame = make_frame(shift_magnitude=0.5)
        assert frame.is_shift(threshold=0.3)

    def test_is_shift_below_threshold(self):
        frame = make_frame(shift_magnitude=0.1)
        assert not frame.is_shift(threshold=0.3)

    def test_default_reliable_threshold(self):
        # The default cutoff is inclusive at exactly 0.6.
        assert make_frame(confidence=0.6).is_reliable()
        assert not make_frame(confidence=0.59).is_reliable()
|
||||
|
||||
|
||||
class TestMockVoiceIO:
    """Behaviour of the synthetic frame generator and the make_io factory."""

    @pytest.mark.asyncio
    async def test_emits_frames(self):
        source = MockVoiceIO(interval_s=0.05, seed=42)
        collected = []
        async for frame in source.stream():
            collected.append(frame)
            if len(collected) >= 3:
                await source.stop()
                break
        assert len(collected) == 3
        assert all(isinstance(f, VoiceFrame) for f in collected)

    @pytest.mark.asyncio
    async def test_confidence_in_range(self):
        source = MockVoiceIO(interval_s=0.05, seed=1)
        seen = 0
        async for frame in source.stream():
            assert 0.0 <= frame.confidence <= 1.0
            assert 0.0 <= frame.shift_magnitude <= 1.0
            seen += 1
            if seen >= 5:
                await source.stop()
                break

    @pytest.mark.asyncio
    async def test_timestamps_increase(self):
        source = MockVoiceIO(interval_s=0.05, seed=0)
        stamps = []
        async for frame in source.stream():
            stamps.append(frame.timestamp)
            if len(stamps) >= 3:
                await source.stop()
                break
        assert stamps == sorted(stamps)

    def test_make_io_mock_env(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        assert isinstance(make_io(), MockVoiceIO)

    def test_make_io_real_raises(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        with pytest.raises(NotImplementedError):
            make_io(mock=False)
|
||||
|
||||
|
||||
class TestContextClassifier:
    """End-to-end stub behaviour of the classifier wrapper."""

    @pytest.mark.asyncio
    async def test_mock_passthrough(self):
        classifier = ContextClassifier.mock(interval_s=0.05, seed=7)
        received = []
        async for frame in classifier.stream():
            received.append(frame)
            if len(received) >= 3:
                await classifier.stop()
                break
        assert len(received) == 3
        assert all(isinstance(f, VoiceFrame) for f in received)

    @pytest.mark.asyncio
    async def test_from_env_mock(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        classifier = ContextClassifier.from_env(interval_s=0.05)
        # One frame is enough to prove the env-driven mock path works.
        async for frame in classifier.stream():
            assert isinstance(frame, VoiceFrame)
            await classifier.stop()
            break
|
||||
Loading…
Reference in a new issue