commit 35fc0a088c6d65f17d8b895e9d23a1c633e4fa5f Author: pyr0ball Date: Mon Apr 6 16:03:07 2026 -0700 feat: initial cf-voice stub — VoiceFrame API, mock IO, context classifier - VoiceFrame dataclass: label, confidence, speaker_id, shift_magnitude, timestamp - MockVoiceIO: async generator of synthetic frames on a timer (CF_VOICE_MOCK=1) - ContextClassifier: passthrough stub wrapping VoiceIO; _enrich() hook for real classifiers - make_io() factory: mock mode auto-detected from env, raises NotImplementedError for real audio - cf-voice-demo CLI entry point for quick smoke-testing - 12 tests passing; editable install via pip install -e ../cf-voice diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e3c2107 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +__pycache__/ +*.py[cod] +*.egg-info/ +dist/ +build/ +.venv/ +.env +*.so +.pytest_cache/ +.mypy_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..61f2d56 --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +# cf-voice + +CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude. + +**Status:** Notation v0.1.x stub — mock mode only. Real classifiers (YAMNet, wav2vec2, pyannote.audio) land incrementally. + +## Install + +```bash +pip install -e ../cf-voice # editable install alongside sibling repos +``` + +## Quick start + +```python +from cf_voice.context import ContextClassifier + +classifier = ContextClassifier.mock() # or from_env() with CF_VOICE_MOCK=1 +async for frame in classifier.stream(): + print(frame.label, frame.confidence) +``` + +Or run the demo CLI: + +```bash +CF_VOICE_MOCK=1 cf-voice-demo +``` + +## VoiceFrame + +```python +@dataclass +class VoiceFrame: + label: str # e.g. "Warmly impatient" + confidence: float # 0.0–1.0 + speaker_id: str # ephemeral local label, e.g. 
"speaker_a" + shift_magnitude: float # delta from previous frame, 0.0–1.0 + timestamp: float # session-relative seconds +``` + +## Mock mode + +Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. No GPU or microphone required. Useful for CI and frontend development. + +## Module structure + +| Module | License | Purpose | +|--------|---------|---------| +| `cf_voice.models` | MIT | `VoiceFrame` dataclass | +| `cf_voice.io` | MIT | Audio capture, mock generator | +| `cf_voice.context` | BSL 1.1* | Tone classification, diarization | + +*BSL applies when real inference models are integrated. Currently stub = MIT. + +## Consumed by + +- `Circuit-Forge/linnet` — real-time tone annotation widget +- `Circuit-Forge/osprey` — telephony bridge voice context diff --git a/cf_voice/__init__.py b/cf_voice/__init__.py new file mode 100644 index 0000000..844a296 --- /dev/null +++ b/cf_voice/__init__.py @@ -0,0 +1,17 @@ +""" +cf-voice: CircuitForge voice annotation pipeline. + +Quick start (mock mode, no GPU required): + + from cf_voice.context import ContextClassifier + + classifier = ContextClassifier.mock() + async for frame in classifier.stream(): + print(frame.label, frame.confidence) + +Set CF_VOICE_MOCK=1 in the environment to activate mock mode globally. +""" +from cf_voice.models import VoiceFrame + +__version__ = "0.1.0" +__all__ = ["VoiceFrame"] diff --git a/cf_voice/cli.py b/cf_voice/cli.py new file mode 100644 index 0000000..8736ed7 --- /dev/null +++ b/cf_voice/cli.py @@ -0,0 +1,25 @@ +# cf_voice/cli.py — cf-voice-demo entry point +import asyncio + +from cf_voice.context import ContextClassifier + + +async def _run() -> None: + print("cf-voice mock stream — Ctrl+C to stop\n") + classifier = ContextClassifier.mock(interval_s=2.0) + try: + async for frame in classifier.stream(): + reliable = "+" if frame.is_reliable() else "?" 
+ shift = " [SHIFT]" if frame.is_shift() else "" + print( + f"[{frame.timestamp:6.1f}s] {frame.speaker_id} " + f"{frame.label:<30} conf={frame.confidence:.2f}{reliable}{shift}" + ) + except KeyboardInterrupt: + pass + finally: + await classifier.stop() + + +def demo() -> None: + asyncio.run(_run()) diff --git a/cf_voice/context.py b/cf_voice/context.py new file mode 100644 index 0000000..53ddb44 --- /dev/null +++ b/cf_voice/context.py @@ -0,0 +1,74 @@ +# cf_voice/context.py — tone classification and context enrichment +# +# BSL 1.1 when real inference models are integrated. +# Currently a passthrough stub: wraps a VoiceIO source and forwards frames. +# +# Real implementation (Notation v0.1.x) will: +# - Run YAMNet acoustic event detection on the audio buffer +# - Run wav2vec2-based SER (speech emotion recognition) +# - Run librosa prosody extraction (pitch, energy, rate) +# - Combine into enriched VoiceFrame label + confidence +# - Support pyannote.audio speaker diarization (Navigation v0.2.x) +from __future__ import annotations + +import os +from typing import AsyncIterator + +from cf_voice.io import VoiceIO, make_io +from cf_voice.models import VoiceFrame + + +class ContextClassifier: + """ + High-level voice context classifier. + + Wraps a VoiceIO source and enriches each VoiceFrame with tone annotation. + In stub mode the frames pass through unchanged — the enrichment pipeline + (YAMNet + wav2vec2 + librosa) is filled in incrementally. + + Usage + ----- + classifier = ContextClassifier.from_env() + async for frame in classifier.stream(): + print(frame.label, frame.confidence) + """ + + def __init__(self, io: VoiceIO) -> None: + self._io = io + + @classmethod + def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier": + """ + Create a ContextClassifier from environment. + CF_VOICE_MOCK=1 activates mock mode (no GPU, no audio hardware needed). 
+ """ + io = make_io(interval_s=interval_s) + return cls(io=io) + + @classmethod + def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier": + """Create a ContextClassifier backed by MockVoiceIO. Useful in tests.""" + from cf_voice.io import MockVoiceIO + return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed)) + + async def stream(self) -> AsyncIterator[VoiceFrame]: + """ + Yield enriched VoiceFrames continuously. + + Stub: frames from the IO layer pass through unchanged. + Real: enrichment pipeline runs here before yield. + """ + async for frame in self._io.stream(): + yield self._enrich(frame) + + async def stop(self) -> None: + await self._io.stop() + + def _enrich(self, frame: VoiceFrame) -> VoiceFrame: + """ + Apply tone classification to a raw frame. + + Stub: identity transform — returns frame unchanged. + Real: replace label + confidence with classifier output. + """ + return frame diff --git a/cf_voice/io.py b/cf_voice/io.py new file mode 100644 index 0000000..94e6104 --- /dev/null +++ b/cf_voice/io.py @@ -0,0 +1,122 @@ +# cf_voice/io.py — audio capture and VoiceFrame generation +# +# MIT licensed. This layer handles audio I/O only — no inference. +# +# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are +# emitted on a timer. Real audio capture will be added in Notation v0.1.x +# once pyannote.audio and faster-whisper are integrated. +from __future__ import annotations + +import asyncio +import os +import random +import time +from abc import ABC, abstractmethod +from typing import AsyncIterator + +from cf_voice.models import VoiceFrame + +# Generic tone labels for the annotation stream. +# These are the underlying classifier outputs — the Elcor-style prefix format +# ("With barely concealed frustration:") is applied by the UI layer, not here. 
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]

_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]


class VoiceIO(ABC):
    """
    Base class for all audio capture sources.

    Subclasses implement stream() as an async generator yielding VoiceFrame
    objects. Consumers should use:
    `async for frame in io_instance.stream(): ...`
    """

    @abstractmethod
    def stream(self) -> AsyncIterator[VoiceFrame]:
        """
        Yield VoiceFrames continuously until stopped.

        Declared as a plain ``def`` returning ``AsyncIterator[VoiceFrame]``
        rather than ``async def``: concrete implementations are async
        generators, and typing the abstract method as a coroutine would
        force ``# type: ignore[override]`` on every subclass.
        """
        ...

    async def stop(self) -> None:
        """Signal the stream to stop. Override if cleanup is needed."""


class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame generator for development and CI.

    Emits one frame every `interval_s` seconds with randomised labels,
    confidence, and simulated speaker transitions.

    Activated automatically when CF_VOICE_MOCK=1 is set, or instantiated
    directly in tests.
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        # Seconds between synthetic frames.
        self._interval_s = interval_s
        self._speakers = speakers or _MOCK_SPEAKERS
        self._labels = labels or _MOCK_LABELS
        # Dedicated RNG instance so a fixed seed gives reproducible streams.
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield one synthetic VoiceFrame per interval until stop() is called."""
        self._running = True
        start = time.monotonic()
        prev_label = self._rng.choice(self._labels)

        while self._running:
            await asyncio.sleep(self._interval_s)

            label = self._rng.choice(self._labels)
            # shift_magnitude is 0 when the label repeats, higher for big jumps
            shift = 0.0 if label == prev_label else self._rng.uniform(0.1, 0.9)
            prev_label = label

            yield VoiceFrame(
                label=label,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(shift, 3),
                timestamp=round(time.monotonic() - start, 2),
            )

    async def stop(self) -> None:
        """Stop the generator after the frame currently in flight."""
        self._running = False


def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
) -> VoiceIO:
    """
    Factory: return a VoiceIO instance appropriate for the current environment.

    mock=True or CF_VOICE_MOCK=1 → MockVoiceIO (no audio hardware needed)
    Otherwise → real audio capture (not yet implemented)

    Raises
    ------
    NotImplementedError
        When real (non-mock) capture is requested — it is still a stub.
    """
    # An explicit `mock` argument always wins over the environment variable.
    use_mock = mock if mock is not None else os.environ.get("CF_VOICE_MOCK", "") == "1"
    if use_mock:
        return MockVoiceIO(interval_s=interval_s)
    raise NotImplementedError(
        "Real audio capture is not yet implemented. "
        "Set CF_VOICE_MOCK=1 or pass mock=True to use synthetic frames."
    )


# diff --git a/cf_voice/models.py b/cf_voice/models.py
# new file mode 100644
# index 0000000..8cd6535
# --- /dev/null
# +++ b/cf_voice/models.py
# @@ -0,0 +1,43 @@
# cf_voice/models.py — VoiceFrame API contract
#
# This module is MIT licensed. All consumers (Linnet, Osprey, etc.)
+# import VoiceFrame from here so the shape is consistent across the stack. +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class VoiceFrame: + """ + A single annotated moment in a voice stream. + + Produced by cf_voice.io (audio capture) and enriched by cf_voice.context + (tone classification, speaker diarization). + + Fields + ------ + label Tone annotation, e.g. "Warmly impatient" or "Deflecting". + Generic by default; Elcor-style prefix format is an + easter egg surfaced by the product UI, not set here. + confidence 0.0–1.0. Below ~0.5 the annotation is speculative. + speaker_id Ephemeral local label ("speaker_a", "speaker_b"). + Not tied to identity — resets each session. + shift_magnitude Delta from the previous frame's tone, 0.0–1.0. + High values indicate a meaningful register shift. + timestamp Session-relative seconds since capture started. + """ + + label: str + confidence: float + speaker_id: str + shift_magnitude: float + timestamp: float + + def is_reliable(self, threshold: float = 0.6) -> bool: + """Return True when confidence meets the given threshold.""" + return self.confidence >= threshold + + def is_shift(self, threshold: float = 0.3) -> bool: + """Return True when shift_magnitude indicates a meaningful register change.""" + return self.shift_magnitude >= threshold diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..20d3d37 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,39 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "cf-voice" +version = "0.1.0" +description = "CircuitForge voice annotation pipeline — VoiceFrame API, tone classifiers, speaker diarization" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +dependencies = [ + "pydantic>=2.0", +] + +[project.optional-dependencies] +# Real inference backends — not required for stub/mock mode +inference = [ + "torch>=2.0", + 
"torchaudio>=2.0", + "transformers>=4.40", + "pyannote.audio>=3.1", +] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.23", +] + +[project.scripts] +# Quick smoke-test: stream mock frames to stdout +cf-voice-demo = "cf_voice.cli:demo" + +[tool.setuptools.packages.find] +where = ["."] +include = ["cf_voice*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +asyncio_mode = "auto" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..5743df0 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,104 @@ +import asyncio +import pytest +from cf_voice.models import VoiceFrame +from cf_voice.io import MockVoiceIO, make_io +from cf_voice.context import ContextClassifier + + +def make_frame(**kwargs) -> VoiceFrame: + defaults = dict( + label="Calm and focused", + confidence=0.8, + speaker_id="speaker_a", + shift_magnitude=0.0, + timestamp=1.0, + ) + return VoiceFrame(**{**defaults, **kwargs}) + + +class TestVoiceFrame: + def test_is_reliable_above_threshold(self): + assert make_frame(confidence=0.7).is_reliable(threshold=0.6) + + def test_is_reliable_below_threshold(self): + assert not make_frame(confidence=0.4).is_reliable(threshold=0.6) + + def test_is_shift_above_threshold(self): + assert make_frame(shift_magnitude=0.5).is_shift(threshold=0.3) + + def test_is_shift_below_threshold(self): + assert not make_frame(shift_magnitude=0.1).is_shift(threshold=0.3) + + def test_default_reliable_threshold(self): + assert make_frame(confidence=0.6).is_reliable() + assert not make_frame(confidence=0.59).is_reliable() + + +class TestMockVoiceIO: + @pytest.mark.asyncio + async def test_emits_frames(self): + io = MockVoiceIO(interval_s=0.05, seed=42) + frames = [] + async for frame in io.stream(): + frames.append(frame) + if len(frames) >= 3: + await io.stop() + break + assert len(frames) == 3 + assert all(isinstance(f, VoiceFrame) for f in frames) + + 
@pytest.mark.asyncio + async def test_confidence_in_range(self): + io = MockVoiceIO(interval_s=0.05, seed=1) + count = 0 + async for frame in io.stream(): + assert 0.0 <= frame.confidence <= 1.0 + assert 0.0 <= frame.shift_magnitude <= 1.0 + count += 1 + if count >= 5: + await io.stop() + break + + @pytest.mark.asyncio + async def test_timestamps_increase(self): + io = MockVoiceIO(interval_s=0.05, seed=0) + timestamps = [] + async for frame in io.stream(): + timestamps.append(frame.timestamp) + if len(timestamps) >= 3: + await io.stop() + break + assert timestamps == sorted(timestamps) + + def test_make_io_mock_env(self, monkeypatch): + monkeypatch.setenv("CF_VOICE_MOCK", "1") + io = make_io() + assert isinstance(io, MockVoiceIO) + + def test_make_io_real_raises(self, monkeypatch): + monkeypatch.delenv("CF_VOICE_MOCK", raising=False) + with pytest.raises(NotImplementedError): + make_io(mock=False) + + +class TestContextClassifier: + @pytest.mark.asyncio + async def test_mock_passthrough(self): + classifier = ContextClassifier.mock(interval_s=0.05, seed=7) + frames = [] + async for frame in classifier.stream(): + frames.append(frame) + if len(frames) >= 3: + await classifier.stop() + break + assert len(frames) == 3 + assert all(isinstance(f, VoiceFrame) for f in frames) + + @pytest.mark.asyncio + async def test_from_env_mock(self, monkeypatch): + monkeypatch.setenv("CF_VOICE_MOCK", "1") + classifier = ContextClassifier.from_env(interval_s=0.05) + async for frame in classifier.stream(): + assert isinstance(frame, VoiceFrame) + await classifier.stop() + break