feat: initial cf-voice stub — VoiceFrame API, mock IO, context classifier
- VoiceFrame dataclass: label, confidence, speaker_id, shift_magnitude, timestamp
- MockVoiceIO: async generator of synthetic frames on a timer (CF_VOICE_MOCK=1)
- ContextClassifier: passthrough stub wrapping VoiceIO; _enrich() hook for real classifiers
- make_io() factory: mock mode auto-detected from env, raises NotImplementedError for real audio
- cf-voice-demo CLI entry point for quick smoke-testing
- 12 tests passing; editable install via pip install -e ../cf-voice
This commit is contained in:
commit
35fc0a088c
10 changed files with 492 additions and 0 deletions
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.venv/
|
||||
.env
|
||||
*.so
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
58
README.md
Normal file
58
README.md
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
# cf-voice
|
||||
|
||||
CircuitForge voice annotation pipeline. Produces `VoiceFrame` objects from a live audio stream — tone label, confidence, speaker identity, and shift magnitude.
|
||||
|
||||
**Status:** Notation v0.1.x stub — mock mode only. Real classifiers (YAMNet, wav2vec2, pyannote.audio) land incrementally.
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
pip install -e ../cf-voice # editable install alongside sibling repos
|
||||
```
|
||||
|
||||
## Quick start
|
||||
|
||||
```python
|
||||
from cf_voice.context import ContextClassifier
|
||||
|
||||
classifier = ContextClassifier.mock() # or from_env() with CF_VOICE_MOCK=1
|
||||
async for frame in classifier.stream():
|
||||
print(frame.label, frame.confidence)
|
||||
```
|
||||
|
||||
Or run the demo CLI:
|
||||
|
||||
```bash
|
||||
CF_VOICE_MOCK=1 cf-voice-demo
|
||||
```
|
||||
|
||||
## VoiceFrame
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class VoiceFrame:
|
||||
label: str # e.g. "Warmly impatient"
|
||||
confidence: float # 0.0–1.0
|
||||
speaker_id: str # ephemeral local label, e.g. "speaker_a"
|
||||
shift_magnitude: float # delta from previous frame, 0.0–1.0
|
||||
timestamp: float # session-relative seconds
|
||||
```
|
||||
|
||||
## Mock mode
|
||||
|
||||
Set `CF_VOICE_MOCK=1` or pass `mock=True` to `make_io()`. No GPU or microphone required. Useful for CI and frontend development.
|
||||
|
||||
## Module structure
|
||||
|
||||
| Module | License | Purpose |
|
||||
|--------|---------|---------|
|
||||
| `cf_voice.models` | MIT | `VoiceFrame` dataclass |
|
||||
| `cf_voice.io` | MIT | Audio capture, mock generator |
|
||||
| `cf_voice.context` | BSL 1.1* | Tone classification, diarization |
|
||||
|
||||
*BSL applies when real inference models are integrated. Currently stub = MIT.
|
||||
|
||||
## Consumed by
|
||||
|
||||
- `Circuit-Forge/linnet` — real-time tone annotation widget
|
||||
- `Circuit-Forge/osprey` — telephony bridge voice context
|
||||
17
cf_voice/__init__.py
Normal file
17
cf_voice/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
"""cf-voice — CircuitForge voice annotation pipeline.

Mock-mode quick start (no GPU required)::

    from cf_voice.context import ContextClassifier

    classifier = ContextClassifier.mock()
    async for frame in classifier.stream():
        print(frame.label, frame.confidence)

Setting CF_VOICE_MOCK=1 in the environment activates mock mode globally.
"""
from cf_voice.models import VoiceFrame

__all__ = ["VoiceFrame"]
__version__ = "0.1.0"
|
||||
25
cf_voice/cli.py
Normal file
25
cf_voice/cli.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# cf_voice/cli.py — cf-voice-demo entry point
|
||||
import asyncio
|
||||
|
||||
from cf_voice.context import ContextClassifier
|
||||
|
||||
|
||||
async def _run() -> None:
    """Stream mock VoiceFrames to stdout until the loop is interrupted."""
    print("cf-voice mock stream — Ctrl+C to stop\n")
    classifier = ContextClassifier.mock(interval_s=2.0)
    try:
        async for frame in classifier.stream():
            # "+" marks frames whose confidence clears the default threshold.
            marker = "+" if frame.is_reliable() else "?"
            suffix = " [SHIFT]" if frame.is_shift() else ""
            line = (
                f"[{frame.timestamp:6.1f}s] {frame.speaker_id} "
                f"{frame.label:<30} conf={frame.confidence:.2f}{marker}{suffix}"
            )
            print(line)
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the underlying IO source, even on interrupt.
        await classifier.stop()
|
||||
|
||||
|
||||
def demo() -> None:
    """Console entry point for ``cf-voice-demo`` (see pyproject [project.scripts]).

    Runs the mock streaming loop. KeyboardInterrupt is caught *here* because
    Ctrl+C during ``asyncio.run()`` cancels the running task and is re-raised
    at the call site — the handler inside the coroutine does not reliably
    fire, so without this guard the CLI would exit with a traceback instead
    of stopping cleanly.
    """
    try:
        asyncio.run(_run())
    except KeyboardInterrupt:
        pass
|
||||
74
cf_voice/context.py
Normal file
74
cf_voice/context.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# cf_voice/context.py — tone classification and context enrichment
|
||||
#
|
||||
# BSL 1.1 when real inference models are integrated.
|
||||
# Currently a passthrough stub: wraps a VoiceIO source and forwards frames.
|
||||
#
|
||||
# Real implementation (Notation v0.1.x) will:
|
||||
# - Run YAMNet acoustic event detection on the audio buffer
|
||||
# - Run wav2vec2-based SER (speech emotion recognition)
|
||||
# - Run librosa prosody extraction (pitch, energy, rate)
|
||||
# - Combine into enriched VoiceFrame label + confidence
|
||||
# - Support pyannote.audio speaker diarization (Navigation v0.2.x)
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import AsyncIterator
|
||||
|
||||
from cf_voice.io import VoiceIO, make_io
|
||||
from cf_voice.models import VoiceFrame
|
||||
|
||||
|
||||
class ContextClassifier:
    """High-level voice context classifier.

    Wraps a VoiceIO source and enriches each VoiceFrame with tone
    annotation. In stub mode frames pass through unchanged — the
    enrichment pipeline (YAMNet + wav2vec2 + librosa) is filled in
    incrementally.

    Usage
    -----
        classifier = ContextClassifier.from_env()
        async for frame in classifier.stream():
            print(frame.label, frame.confidence)
    """

    def __init__(self, io: VoiceIO) -> None:
        # The underlying frame source (real capture or MockVoiceIO).
        self._io = io

    @classmethod
    def from_env(cls, interval_s: float = 2.5) -> "ContextClassifier":
        """Build a classifier from the environment.

        CF_VOICE_MOCK=1 selects mock mode (no GPU, no audio hardware needed).
        """
        return cls(io=make_io(interval_s=interval_s))

    @classmethod
    def mock(cls, interval_s: float = 2.5, seed: int | None = None) -> "ContextClassifier":
        """Build a classifier backed by MockVoiceIO. Useful in tests."""
        # Imported lazily to keep module import light.
        from cf_voice.io import MockVoiceIO

        return cls(io=MockVoiceIO(interval_s=interval_s, seed=seed))

    async def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield enriched VoiceFrames continuously.

        Stub: frames from the IO layer pass through unchanged.
        Real: the enrichment pipeline runs here before each yield.
        """
        async for raw in self._io.stream():
            yield self._enrich(raw)

    async def stop(self) -> None:
        """Ask the underlying IO source to stop streaming."""
        await self._io.stop()

    def _enrich(self, frame: VoiceFrame) -> VoiceFrame:
        """Apply tone classification to a raw frame.

        Stub: identity transform — the frame is returned unchanged.
        Real: replaces label + confidence with classifier output.
        """
        return frame
|
||||
122
cf_voice/io.py
Normal file
122
cf_voice/io.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
# cf_voice/io.py — audio capture and VoiceFrame generation
|
||||
#
|
||||
# MIT licensed. This layer handles audio I/O only — no inference.
|
||||
#
|
||||
# In mock mode (CF_VOICE_MOCK=1 or MockVoiceIO), synthetic VoiceFrames are
|
||||
# emitted on a timer. Real audio capture will be added in Notation v0.1.x
|
||||
# once pyannote.audio and faster-whisper are integrated.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import AsyncIterator
|
||||
|
||||
from cf_voice.models import VoiceFrame
|
||||
|
||||
# Generic tone labels for the annotation stream.
# These are the underlying classifier outputs — the Elcor-style prefix format
# ("With barely concealed frustration:") is applied by the UI layer, not here.
# MockVoiceIO falls back to this list when no custom `labels` is supplied.
_MOCK_LABELS = [
    "Calm and focused",
    "Warmly impatient",
    "Deflecting",
    "Genuinely curious",
    "Politely dismissive",
    "Nervous but cooperative",
    "Frustrated but contained",
    "Enthusiastic",
    "Tired and compliant",
    "Guardedly optimistic",
    "Apologetically firm",
    "Confused but engaged",
]

# Default ephemeral speaker labels for MockVoiceIO; not tied to real identity.
_MOCK_SPEAKERS = ["speaker_a", "speaker_b"]
|
||||
|
||||
|
||||
class VoiceIO(ABC):
    """
    Base class for all audio capture sources.

    Subclasses implement `stream` as an async generator that yields
    VoiceFrame objects. Consumers should use:
    `async for frame in io_instance.stream(): ...`
    """

    @abstractmethod
    def stream(self) -> AsyncIterator[VoiceFrame]:
        """Yield VoiceFrames continuously until stopped.

        Declared as a plain method returning AsyncIterator rather than
        `async def`: calling an async generator function already returns
        an AsyncIterator, so async-generator overrides in subclasses
        type-check without needing `# type: ignore[override]`.
        """
        ...

    async def stop(self) -> None:
        """Signal the stream to stop. Override if cleanup is needed."""
|
||||
|
||||
|
||||
class MockVoiceIO(VoiceIO):
    """
    Synthetic VoiceFrame generator for development and CI.

    Emits one frame every `interval_s` seconds with randomised labels,
    confidence, and simulated speaker transitions.

    Activated automatically when CF_VOICE_MOCK=1 is set, or instantiated
    directly in tests.
    """

    def __init__(
        self,
        interval_s: float = 2.5,
        speakers: list[str] | None = None,
        labels: list[str] | None = None,
        seed: int | None = None,
    ) -> None:
        # Fall back to the module defaults when no custom pools are given.
        self._interval_s = interval_s
        self._labels = labels or _MOCK_LABELS
        self._speakers = speakers or _MOCK_SPEAKERS
        # Seedable RNG so tests can reproduce a frame sequence.
        self._rng = random.Random(seed)
        self._running = False

    async def stream(self) -> AsyncIterator[VoiceFrame]:  # type: ignore[override]
        self._running = True
        started_at = time.monotonic()
        last_label = self._rng.choice(self._labels)

        while self._running:
            await asyncio.sleep(self._interval_s)

            current = self._rng.choice(self._labels)
            # shift_magnitude is 0 when the label repeats, higher for big jumps
            if current == last_label:
                shift = 0.0
            else:
                shift = self._rng.uniform(0.1, 0.9)
            last_label = current

            yield VoiceFrame(
                label=current,
                confidence=self._rng.uniform(0.55, 0.98),
                speaker_id=self._rng.choice(self._speakers),
                shift_magnitude=round(shift, 3),
                timestamp=round(time.monotonic() - started_at, 2),
            )

    async def stop(self) -> None:
        # The generator exits after its current sleep completes.
        self._running = False
|
||||
|
||||
|
||||
def make_io(
    mock: bool | None = None,
    interval_s: float = 2.5,
) -> VoiceIO:
    """
    Factory: return a VoiceIO instance appropriate for the current environment.

    mock=True or CF_VOICE_MOCK=1 → MockVoiceIO (no audio hardware needed)
    Otherwise → real audio capture (not yet implemented)
    """
    # An explicit `mock` argument wins; otherwise consult the environment.
    if mock is None:
        use_mock = os.environ.get("CF_VOICE_MOCK", "") == "1"
    else:
        use_mock = mock
    if not use_mock:
        raise NotImplementedError(
            "Real audio capture is not yet implemented. "
            "Set CF_VOICE_MOCK=1 or pass mock=True to use synthetic frames."
        )
    return MockVoiceIO(interval_s=interval_s)
|
||||
43
cf_voice/models.py
Normal file
43
cf_voice/models.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# cf_voice/models.py — VoiceFrame API contract
|
||||
#
|
||||
# This module is MIT licensed. All consumers (Linnet, Osprey, etc.)
|
||||
# import VoiceFrame from here so the shape is consistent across the stack.
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class VoiceFrame:
    """A single annotated moment in a voice stream.

    Produced by cf_voice.io (audio capture) and enriched by cf_voice.context
    (tone classification, speaker diarization).
    """

    # Tone annotation, e.g. "Warmly impatient" or "Deflecting". Generic by
    # default; the Elcor-style prefix format is an easter egg surfaced by
    # the product UI, not set here.
    label: str
    # 0.0–1.0. Below ~0.5 the annotation is speculative.
    confidence: float
    # Ephemeral local label ("speaker_a", "speaker_b"). Not tied to
    # identity — resets each session.
    speaker_id: str
    # Delta from the previous frame's tone, 0.0–1.0. High values indicate
    # a meaningful register shift.
    shift_magnitude: float
    # Session-relative seconds since capture started.
    timestamp: float

    def is_reliable(self, threshold: float = 0.6) -> bool:
        """True when `confidence` is at or above `threshold`."""
        return self.confidence >= threshold

    def is_shift(self, threshold: float = 0.3) -> bool:
        """True when `shift_magnitude` marks a meaningful register change."""
        return self.shift_magnitude >= threshold
|
||||
39
pyproject.toml
Normal file
39
pyproject.toml
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[project]
name = "cf-voice"
version = "0.1.0"
description = "CircuitForge voice annotation pipeline — VoiceFrame API, tone classifiers, speaker diarization"
readme = "README.md"
requires-python = ">=3.11"
# Package-level license; cf_voice.context moves to BSL 1.1 once real
# inference models are integrated (currently stub = MIT, see README).
license = {text = "MIT"}
dependencies = [
    # NOTE(review): pydantic is not imported by the stub modules yet —
    # confirm it is still a required runtime dependency.
    "pydantic>=2.0",
]

[project.optional-dependencies]
# Real inference backends — not required for stub/mock mode
inference = [
    "torch>=2.0",
    "torchaudio>=2.0",
    "transformers>=4.40",
    "pyannote.audio>=3.1",
]
# Test tooling; pytest-asyncio pairs with asyncio_mode = "auto" below.
dev = [
    "pytest>=8.0",
    "pytest-asyncio>=0.23",
]

[project.scripts]
# Quick smoke-test: stream mock frames to stdout
cf-voice-demo = "cf_voice.cli:demo"

[tool.setuptools.packages.find]
where = ["."]
include = ["cf_voice*"]

[tool.pytest.ini_options]
testpaths = ["tests"]
# "auto" runs `async def` tests without requiring @pytest.mark.asyncio.
asyncio_mode = "auto"
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
104
tests/test_models.py
Normal file
104
tests/test_models.py
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
import asyncio
|
||||
import pytest
|
||||
from cf_voice.models import VoiceFrame
|
||||
from cf_voice.io import MockVoiceIO, make_io
|
||||
from cf_voice.context import ContextClassifier
|
||||
|
||||
|
||||
def make_frame(**kwargs) -> VoiceFrame:
    """Build a VoiceFrame with sensible defaults, overridden by kwargs."""
    base = {
        "label": "Calm and focused",
        "confidence": 0.8,
        "speaker_id": "speaker_a",
        "shift_magnitude": 0.0,
        "timestamp": 1.0,
    }
    base.update(kwargs)
    return VoiceFrame(**base)
|
||||
|
||||
|
||||
class TestVoiceFrame:
    """Threshold behaviour of the VoiceFrame helper predicates."""

    def test_is_reliable_above_threshold(self):
        frame = make_frame(confidence=0.7)
        assert frame.is_reliable(threshold=0.6)

    def test_is_reliable_below_threshold(self):
        frame = make_frame(confidence=0.4)
        assert not frame.is_reliable(threshold=0.6)

    def test_is_shift_above_threshold(self):
        frame = make_frame(shift_magnitude=0.5)
        assert frame.is_shift(threshold=0.3)

    def test_is_shift_below_threshold(self):
        frame = make_frame(shift_magnitude=0.1)
        assert not frame.is_shift(threshold=0.3)

    def test_default_reliable_threshold(self):
        # The default cutoff is inclusive at exactly 0.6.
        assert make_frame(confidence=0.6).is_reliable()
        assert not make_frame(confidence=0.59).is_reliable()
|
||||
|
||||
|
||||
class TestMockVoiceIO:
    """Behaviour of the synthetic frame generator and the make_io factory."""

    @pytest.mark.asyncio
    async def test_emits_frames(self):
        source = MockVoiceIO(interval_s=0.05, seed=42)
        collected = []
        async for frame in source.stream():
            collected.append(frame)
            if len(collected) >= 3:
                await source.stop()
                break
        assert len(collected) == 3
        assert all(isinstance(f, VoiceFrame) for f in collected)

    @pytest.mark.asyncio
    async def test_confidence_in_range(self):
        source = MockVoiceIO(interval_s=0.05, seed=1)
        seen = 0
        async for frame in source.stream():
            assert 0.0 <= frame.confidence <= 1.0
            assert 0.0 <= frame.shift_magnitude <= 1.0
            seen += 1
            if seen >= 5:
                await source.stop()
                break

    @pytest.mark.asyncio
    async def test_timestamps_increase(self):
        source = MockVoiceIO(interval_s=0.05, seed=0)
        stamps = []
        async for frame in source.stream():
            stamps.append(frame.timestamp)
            if len(stamps) >= 3:
                await source.stop()
                break
        assert stamps == sorted(stamps)

    def test_make_io_mock_env(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        assert isinstance(make_io(), MockVoiceIO)

    def test_make_io_real_raises(self, monkeypatch):
        monkeypatch.delenv("CF_VOICE_MOCK", raising=False)
        with pytest.raises(NotImplementedError):
            make_io(mock=False)
|
||||
|
||||
|
||||
class TestContextClassifier:
    """End-to-end stub behaviour of the classifier wrapper."""

    @pytest.mark.asyncio
    async def test_mock_passthrough(self):
        classifier = ContextClassifier.mock(interval_s=0.05, seed=7)
        received = []
        async for frame in classifier.stream():
            received.append(frame)
            if len(received) >= 3:
                await classifier.stop()
                break
        assert len(received) == 3
        assert all(isinstance(f, VoiceFrame) for f in received)

    @pytest.mark.asyncio
    async def test_from_env_mock(self, monkeypatch):
        monkeypatch.setenv("CF_VOICE_MOCK", "1")
        classifier = ContextClassifier.from_env(interval_s=0.05)
        # One frame is enough to prove the env-driven mock path works.
        async for frame in classifier.stream():
            assert isinstance(frame, VoiceFrame)
            await classifier.stop()
            break
|
||||
Loading…
Reference in a new issue