feat(stt): add cf-stt module — FasterWhisperBackend + managed FastAPI app

- STTBackend Protocol + STTResult/STTSegment frozen dataclasses (base.py)
- MockSTTBackend for CI/tests (no GPU needed, CF_STT_MOCK=1)
- FasterWhisperBackend: loads model once, thread-safe, VRAM estimate by model size
- app.py: FastAPI service runnable as managed process by cf-orch
    POST /transcribe (multipart audio) → STTTranscribeResponse-compatible JSON
    GET /health → {status, model, vram_mb}
- __init__.py: process-level singleton + transcribe() convenience fn
- pyproject.toml: stt-faster-whisper + stt-service optional dep groups
2026-04-08 22:14:46 -07:00 · 2026-04-08 22:14:46 -07:00 · 67493048e2
commit 67493048e2
parent 5766fa82ab
7 changed files with 556 additions and 0 deletions
--- a/circuitforge_core/stt/__init__.py
+++ b/circuitforge_core/stt/__init__.py
@ -0,0 +1,79 @@
+"""
+circuitforge_core.stt — Speech-to-text service module.
+
+Quick start (mock mode — no GPU or model required):
+
+    import os; os.environ["CF_STT_MOCK"] = "1"
+    from circuitforge_core.stt import transcribe
+
+    result = transcribe(open("audio.wav", "rb").read())
+    print(result.text, result.confidence)
+
+Real inference (faster-whisper):
+
+    export CF_STT_MODEL=/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash>
+    from circuitforge_core.stt import transcribe
+
+cf-orch service profile:
+
+    service_type: cf-stt
+    max_mb:       1024 (medium); 600 (base/small)
+    max_concurrent: 3
+    shared:       true
+    managed:
+      exec:       python -m circuitforge_core.stt.app
+      args:       --model <path> --port {port} --gpu-id {gpu_id}
+      port:       8004
+      health:     /health
+"""
+from __future__ import annotations
+
+import os
+
+from circuitforge_core.stt.backends.base import (
+    STTBackend,
+    STTResult,
+    STTSegment,
+    make_stt_backend,
+)
+from circuitforge_core.stt.backends.mock import MockSTTBackend
+
+_backend: STTBackend | None = None
+
+
+def _get_backend() -> STTBackend:
+    """Return the process-level backend, building it on first use.
+
+    The model comes from ``CF_STT_MODEL`` (default ``"mock"``). The mock
+    backend is requested when that value is ``"mock"`` or when
+    ``CF_STT_MOCK`` is ``"1"``.
+    """
+    global _backend
+    if _backend is not None:
+        return _backend
+
+    model_path = os.environ.get("CF_STT_MODEL", "mock")
+    mock_requested = os.environ.get("CF_STT_MOCK", "") == "1"
+    _backend = make_stt_backend(
+        model_path, mock=model_path == "mock" or mock_requested
+    )
+    return _backend
+
+
+def transcribe(
+    audio: bytes,
+    *,
+    language: str | None = None,
+    confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+) -> STTResult:
+    """Run speech-to-text on ``audio`` with the shared process-level backend.
+
+    ``language`` and ``confidence_threshold`` are forwarded unchanged to the
+    backend's own ``transcribe`` method.
+    """
+    backend = _get_backend()
+    return backend.transcribe(
+        audio,
+        language=language,
+        confidence_threshold=confidence_threshold,
+    )
+
+
+def reset_backend() -> None:
+    """Reset the process-level singleton. Test teardown only.
+
+    Clears the cached backend so the next ``_get_backend()`` call builds a
+    fresh one from the current environment variables.
+    """
+    global _backend
+    _backend = None
+
+
+# Public API of circuitforge_core.stt: the names exposed by `import *`.
+__all__ = [
+    "STTBackend",
+    "STTResult",
+    "STTSegment",
+    "MockSTTBackend",
+    "make_stt_backend",
+    "transcribe",
+    "reset_backend",
+]
--- a/circuitforge_core/stt/app.py
+++ b/circuitforge_core/stt/app.py
@ -0,0 +1,150 @@
+"""
+circuitforge_core.stt.app — cf-stt FastAPI service.
+
+Managed by cf-orch as a process-type service. cf-orch starts this via:
+
+    python -m circuitforge_core.stt.app \
+        --model /Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash> \
+        --port 8004 \
+        --gpu-id 0
+
+Endpoints:
+    GET  /health       → {"status": "ok", "model": "<name>", "vram_mb": <n>}
+    POST /transcribe   → STTTranscribeResponse (multipart: audio file)
+
+Audio format: any format ffmpeg understands (WAV, MP3, OGG, FLAC).
+"""
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+import sys
+
+import uvicorn
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+
+from circuitforge_core.stt.backends.base import STTResult, make_stt_backend
+
+logger = logging.getLogger(__name__)
+
+# ── Response model (mirrors circuitforge_orch.contracts.stt.STTTranscribeResponse) ──
+
+# JSON body returned by POST /transcribe. It is filled field-for-field from the
+# backend's STTResult in the /transcribe handler.
+class TranscribeResponse(BaseModel):
+    text: str
+    confidence: float
+    below_threshold: bool
+    language: str | None = None
+    duration_s: float | None = None
+    # One plain dict per segment with keys: start_s, end_s, text, confidence.
+    segments: list[dict] = []
+    model: str = ""
+
+
+# ── App factory ───────────────────────────────────────────────────────────────
+
+def create_app(
+    model_path: str,
+    device: str = "cuda",
+    compute_type: str = "float16",
+    confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    mock: bool = False,
+) -> FastAPI:
+    """Build the cf-stt FastAPI application around a single STT backend.
+
+    The backend is constructed once, here, and shared by every request.
+
+    Args:
+        model_path: Model path or size name handed to ``make_stt_backend``.
+        device: Device passed through to the backend.
+        compute_type: Compute type passed through to the backend.
+        confidence_threshold: Threshold used when a request does not supply
+            ``confidence_threshold_override``.
+        mock: When true, ``make_stt_backend`` is asked for the mock backend.
+
+    Returns:
+        An app exposing ``GET /health`` and ``POST /transcribe``.
+    """
+    app = FastAPI(title="cf-stt", version="0.1.0")
+    backend = make_stt_backend(
+        model_path, device=device, compute_type=compute_type, mock=mock
+    )
+    logger.info("cf-stt ready: model=%r vram=%dMB", backend.model_name, backend.vram_mb)
+
+    # Liveness probe; also reports which model is loaded and its VRAM estimate.
+    @app.get("/health")
+    async def health() -> dict:
+        return {"status": "ok", "model": backend.model_name, "vram_mb": backend.vram_mb}
+
+    # Multipart upload in, TranscribeResponse out. 400 on an empty upload,
+    # 500 (with the exception text as detail) when the backend raises.
+    @app.post("/transcribe", response_model=TranscribeResponse)
+    async def transcribe(
+        audio: UploadFile = File(..., description="Audio file (WAV, MP3, OGG, FLAC, ...)"),
+        language: str | None = Form(None, description="BCP-47 language code hint, e.g. 'en'"),
+        confidence_threshold_override: float | None = Form(
+            None,
+            description="Override default confidence threshold for this request.",
+        ),
+    ) -> TranscribeResponse:
+        audio_bytes = await audio.read()
+        if not audio_bytes:
+            raise HTTPException(status_code=400, detail="Empty audio file")
+
+        # Compare against None instead of relying on truthiness: an explicit
+        # override of 0.0 is a valid request and must not silently fall back
+        # to the service default.
+        threshold = (
+            confidence_threshold
+            if confidence_threshold_override is None
+            else confidence_threshold_override
+        )
+        # NOTE(review): backend.transcribe is a synchronous call made directly
+        # inside this async handler — confirm that blocking here is acceptable
+        # for /health responsiveness during long transcriptions.
+        try:
+            result = backend.transcribe(
+                audio_bytes, language=language, confidence_threshold=threshold
+            )
+        except Exception as exc:
+            logger.exception("Transcription failed")
+            raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+        return TranscribeResponse(
+            text=result.text,
+            confidence=result.confidence,
+            below_threshold=result.below_threshold,
+            language=result.language,
+            duration_s=result.duration_s,
+            segments=[
+                {
+                    "start_s": s.start_s,
+                    "end_s": s.end_s,
+                    "text": s.text,
+                    "confidence": s.confidence,
+                }
+                for s in result.segments
+            ],
+            model=result.model,
+        )
+
+    return app
+
+
+# ── CLI entry point ───────────────────────────────────────────────────────────
+
+def main() -> None:
+    """Parse the command line, build the app and serve it with uvicorn."""
+    parser = argparse.ArgumentParser(description="cf-stt — CircuitForge STT service")
+    add = parser.add_argument
+    add(
+        "--model",
+        required=True,
+        help="Model path or size name (e.g. 'medium', or full local path)",
+    )
+    add("--port", type=int, default=8004)
+    add("--host", default="0.0.0.0")
+    add(
+        "--gpu-id",
+        type=int,
+        default=0,
+        help="CUDA device index (sets CUDA_VISIBLE_DEVICES)",
+    )
+    add("--device", default="cuda", choices=["cuda", "cpu"])
+    add(
+        "--compute-type",
+        default="float16",
+        choices=["float16", "int8", "int8_float16", "float32"],
+        help="Quantisation / compute type passed to faster-whisper",
+    )
+    add(
+        "--confidence-threshold",
+        type=float,
+        default=STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    )
+    add(
+        "--mock",
+        action="store_true",
+        help="Run with mock backend (no GPU, for testing)",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s %(name)s  %(message)s",
+        level=logging.INFO,
+    )
+
+    # cf-orch hands us --gpu-id; publishing it as CUDA_VISIBLE_DEVICES keeps the
+    # process on its assigned GPU. setdefault leaves an already-exported value
+    # untouched.
+    wants_gpu = args.device == "cuda" and not args.mock
+    if wants_gpu:
+        os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))
+
+    # The mock backend can be requested by flag or by environment variable.
+    use_mock = args.mock or os.environ.get("CF_STT_MOCK", "") == "1"
+    uvicorn.run(
+        create_app(
+            model_path=args.model,
+            device=args.device,
+            compute_type=args.compute_type,
+            confidence_threshold=args.confidence_threshold,
+            mock=use_mock,
+        ),
+        host=args.host,
+        port=args.port,
+        log_level="info",
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/circuitforge_core/stt/backends/__init__.py
+++ b/circuitforge_core/stt/backends/__init__.py
@ -0,0 +1,4 @@
+from .base import STTBackend, STTResult, STTSegment, make_stt_backend
+from .mock import MockSTTBackend
+
+# Names re-exported by the backends package (imported from .base and .mock above).
+__all__ = ["STTBackend", "STTResult", "STTSegment", "make_stt_backend", "MockSTTBackend"]
--- a/circuitforge_core/stt/backends/base.py
+++ b/circuitforge_core/stt/backends/base.py
@ -0,0 +1,109 @@
+# circuitforge_core/stt/backends/base.py — STTBackend Protocol + factory
+#
+# MIT licensed. The Protocol and mock are always importable without GPU deps.
+# Real backends require optional extras:
+#   pip install -e "circuitforge-core[stt-faster-whisper]"
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from typing import ClassVar, Protocol, runtime_checkable
+
+
+# ── Result types ──────────────────────────────────────────────────────────────
+
+@dataclass(frozen=True)
+class STTSegment:
+    """Word- or phrase-level segment (included when the backend supports it)."""
+    # Segment boundaries in seconds — presumably offsets from the start of the
+    # audio; confirm against the backend that fills them in.
+    start_s: float
+    end_s: float
+    # Transcribed text for this segment.
+    text: str
+    # Per-segment confidence score.
+    confidence: float
+
+
+@dataclass(frozen=True)
+class STTResult:
+    """
+    Standard result from any STTBackend.transcribe() call.
+
+    confidence is normalised to 0.0–1.0 regardless of the backend's native metric.
+    below_threshold is True when confidence < the configured threshold (default 0.75).
+    This flag is safety-critical for products like Osprey: DTMF must NOT be sent
+    when below_threshold is True.
+
+    CONFIDENCE_DEFAULT_THRESHOLD is a class-level constant. It is not an
+    instance field, so it is not accepted by the constructor and does not take
+    part in repr() or equality.
+    """
+    text: str
+    confidence: float                       # 0.0–1.0
+    below_threshold: bool
+    language: str | None = None
+    duration_s: float | None = None
+    segments: list[STTSegment] = field(default_factory=list)
+    model: str = ""
+
+    # ClassVar keeps the dataclass machinery from turning this constant into
+    # an eighth per-instance field with its own __init__ parameter.
+    CONFIDENCE_DEFAULT_THRESHOLD: ClassVar[float] = 0.75
+
+
+# ── Protocol ──────────────────────────────────────────────────────────────────
+
+@runtime_checkable
+class STTBackend(Protocol):
+    """
+    Abstract interface for speech-to-text backends.
+
+    All backends load their model once at construction time and are safe to
+    call concurrently (the model weights are read-only after load).
+
+    NOTE(review): the concurrency guarantee is a contract implementers must
+    uphold; this Protocol only describes the members a backend must have.
+    """
+
+    def transcribe(
+        self,
+        audio: bytes,
+        *,
+        language: str | None = None,
+        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    ) -> STTResult:
+        """Synchronous transcription. audio is raw PCM or any format ffmpeg understands.
+
+        language is an optional language hint. confidence_threshold is the
+        cut-off implementations compare against when setting
+        STTResult.below_threshold.
+        """
+        ...
+
+    @property
+    def model_name(self) -> str:
+        """Identifier for the loaded model (path stem or size name)."""
+        ...
+
+    @property
+    def vram_mb(self) -> int:
+        """Approximate VRAM footprint in MB. Used by cf-orch service registry."""
+        ...
+
+
+# ── Factory ───────────────────────────────────────────────────────────────────
+
+def make_stt_backend(
+    model_path: str,
+    backend: str | None = None,
+    mock: bool | None = None,
+    device: str = "cuda",
+    compute_type: str = "float16",
+) -> STTBackend:
+    """
+    Build the STTBackend for ``model_path``.
+
+    Mock selection: an explicit ``mock`` (True or False) wins; only when it is
+    None is CF_STT_MOCK consulted ("1" selects MockSTTBackend).
+
+    Backend selection: ``backend`` if given, else CF_STT_BACKEND, else
+    "faster-whisper". Any other name raises ValueError.
+
+    device and compute_type are passed through to the backend and ignored by mock.
+    """
+    if mock is None:
+        mock = os.environ.get("CF_STT_MOCK", "") == "1"
+    if mock:
+        from circuitforge_core.stt.backends.mock import MockSTTBackend
+        return MockSTTBackend(model_name=model_path)
+
+    backend_name = backend if backend else os.environ.get("CF_STT_BACKEND", "faster-whisper")
+    if backend_name != "faster-whisper":
+        raise ValueError(
+            f"Unknown STT backend {backend_name!r}. "
+            "Expected 'faster-whisper'. Set CF_STT_BACKEND or pass backend= explicitly."
+        )
+
+    # Imported lazily so the optional dependency is only needed when used.
+    from circuitforge_core.stt.backends.faster_whisper import FasterWhisperBackend
+    return FasterWhisperBackend(
+        model_path=model_path, device=device, compute_type=compute_type
+    )
--- a/circuitforge_core/stt/backends/faster_whisper.py
+++ b/circuitforge_core/stt/backends/faster_whisper.py
@ -0,0 +1,139 @@
+# circuitforge_core/stt/backends/faster_whisper.py — FasterWhisperBackend
+#
+# MIT licensed. Requires: pip install -e "circuitforge-core[stt-faster-whisper]"
+#
+# Model path can be:
+#   - A size name:  "base", "small", "medium", "large-v3"
+#     (faster-whisper downloads and caches it on first use)
+#   - A local path: "/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/..."
+#     (preferred for air-gapped nodes — no download needed)
+from __future__ import annotations
+
+import io
+import logging
+import os
+import tempfile
+
+from circuitforge_core.stt.backends.base import STTResult, STTSegment
+
+logger = logging.getLogger(__name__)
+
+# VRAM estimates by model size. Used by cf-orch for VRAM budgeting.
+_VRAM_MB_BY_SIZE: dict[str, int] = {
+    "tiny":       200,
+    "base":       350,
+    "small":      600,
+    "medium":    1024,
+    "large":     2048,
+    "large-v2":  2048,
+    "large-v3":  2048,
+    "distil-large-v3": 1500,
+}
+
+def _aggregate_confidence(segments: list) -> float:
+    """Fold per-segment ``no_speech_prob`` values into one confidence score.
+
+    faster-whisper doesn't expose a direct confidence score, so the mean of
+    ``1 - no_speech_prob`` (floored at 0.0) is used as a proxy. This is
+    conservative but directionally correct. A segment without a
+    ``no_speech_prob`` attribute counts as 1.0; an empty list yields 0.0.
+    """
+    count = len(segments)
+    if count == 0:
+        return 0.0
+    speech_scores = (
+        max(0.0, 1.0 - getattr(seg, "no_speech_prob", 0.0)) for seg in segments
+    )
+    return sum(speech_scores) / count
+
+
+class FasterWhisperBackend:
+    """
+    faster-whisper STT backend.
+
+    The WhisperModel is loaded once in __init__ and reused for every
+    transcribe() call.
+
+    NOTE(review): concurrent use relies on WhisperModel being safe to call
+    from multiple threads — confirm against the faster-whisper version in use.
+    """
+
+    def __init__(
+        self,
+        model_path: str,
+        device: str = "cuda",
+        compute_type: str = "float16",
+    ) -> None:
+        """Load the model.
+
+        Args:
+            model_path: Size name or local path handed to WhisperModel.
+            device: Device string handed to WhisperModel.
+            compute_type: Compute type handed to WhisperModel.
+
+        Raises:
+            ImportError: faster-whisper is not installed.
+        """
+        try:
+            from faster_whisper import WhisperModel
+        except ImportError as exc:
+            raise ImportError(
+                "faster-whisper is not installed. "
+                "Run: pip install -e 'circuitforge-core[stt-faster-whisper]'"
+            ) from exc
+
+        logger.info("Loading faster-whisper model from %r (device=%s)", model_path, device)
+        self._model_path = model_path
+        self._device = device
+        self._compute_type = compute_type
+        self._model = WhisperModel(model_path, device=device, compute_type=compute_type)
+        logger.info("faster-whisper model ready")
+
+        self._vram_mb = self._estimate_vram_mb(model_path)
+
+    @staticmethod
+    def _estimate_vram_mb(model_path: str) -> int:
+        """Guess the VRAM footprint in MB from a size name embedded in the path."""
+        # Longest names first: "large" is a substring of "distil-large-v3", so
+        # taking the first match in table order would report 2048 MB for the
+        # distilled model instead of 1500.
+        size_names = sorted(_VRAM_MB_BY_SIZE, key=len, reverse=True)
+        # Cache-style paths look like ".../models--<org>--<name>/snapshots/<hash>",
+        # which puts the size name two levels above the leaf. Check the leaf
+        # first, then at most two parents, so unrelated directories higher up
+        # cannot produce a match.
+        components = [
+            part for part in model_path.replace("\\", "/").lower().split("/") if part
+        ]
+        for component in reversed(components[-3:]):
+            for size in size_names:
+                if size in component:
+                    return _VRAM_MB_BY_SIZE[size]
+        return 1024   # conservative default if size can't be inferred
+
+    def transcribe(
+        self,
+        audio: bytes,
+        *,
+        language: str | None = None,
+        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    ) -> STTResult:
+        """
+        Transcribe raw audio bytes.
+
+        audio can be any format ffmpeg understands (WAV, MP3, OGG, FLAC, etc.).
+        The bytes are written to a temporary file whose path is handed to the
+        model; the file is removed again whether or not transcription succeeds.
+
+        Returns an STTResult whose confidence is the aggregate of the
+        per-segment scores and whose below_threshold flag compares that
+        aggregate against confidence_threshold.
+        """
+        tmp = tempfile.NamedTemporaryFile(suffix=".audio", delete=False)
+        try:
+            # Close the handle before the model opens the file by name.
+            with tmp:
+                tmp.write(audio)
+            segments_gen, info = self._model.transcribe(
+                tmp.name,
+                language=language,
+                word_timestamps=True,
+                vad_filter=True,
+            )
+            # Materialise the segments while the temp file still exists.
+            segments = list(segments_gen)
+        finally:
+            # Inside finally so a failed write cannot leave the file behind.
+            os.unlink(tmp.name)
+
+        text = " ".join(s.text.strip() for s in segments).strip()
+        confidence = _aggregate_confidence(segments)
+        duration_s = getattr(info, "duration", None)
+        # Fall back to the caller's hint when the model reports no language.
+        detected_language = getattr(info, "language", language)
+
+        stt_segments = [
+            STTSegment(
+                start_s=s.start,
+                end_s=s.end,
+                text=s.text.strip(),
+                confidence=max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)),
+            )
+            for s in segments
+        ]
+
+        return STTResult(
+            text=text,
+            confidence=confidence,
+            below_threshold=confidence < confidence_threshold,
+            language=detected_language,
+            duration_s=duration_s,
+            segments=stt_segments,
+            model=self._model_path,
+        )
+
+    @property
+    def model_name(self) -> str:
+        """The model path or size name this backend was constructed with."""
+        return self._model_path
+
+    @property
+    def vram_mb(self) -> int:
+        """Estimated VRAM footprint in MB, inferred from the model path."""
+        return self._vram_mb
--- a/circuitforge_core/stt/backends/mock.py
+++ b/circuitforge_core/stt/backends/mock.py
@ -0,0 +1,54 @@
+# circuitforge_core/stt/backends/mock.py — MockSTTBackend
+#
+# MIT licensed. No GPU, no model file required.
+# Used in tests and CI, and when CF_STT_MOCK=1.
+from __future__ import annotations
+
+from circuitforge_core.stt.backends.base import STTBackend, STTResult
+
+
+class MockSTTBackend:
+    """
+    Canned-response STT backend for tests and CI.
+
+    Every call returns the same transcript and confidence, so callers can
+    assert on the response shape without a GPU or a model file.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "mock",
+        fixed_text: str = "mock transcription",
+        fixed_confidence: float = 0.95,
+    ) -> None:
+        self._fixed_confidence = fixed_confidence
+        self._fixed_text = fixed_text
+        self._model_name = model_name
+
+    def transcribe(
+        self,
+        audio: bytes,
+        *,
+        language: str | None = None,
+        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    ) -> STTResult:
+        """Return the fixed transcript; only the length of ``audio`` is used."""
+        score = self._fixed_confidence
+        # rough estimate: 16kHz 16-bit mono, i.e. 32000 bytes per second
+        approx_duration_s = float(len(audio)) / 32000
+        reported_language = language if language else "en"
+        return STTResult(
+            text=self._fixed_text,
+            confidence=score,
+            below_threshold=score < confidence_threshold,
+            language=reported_language,
+            duration_s=approx_duration_s,
+            model=self._model_name,
+        )
+
+    @property
+    def model_name(self) -> str:
+        """Name given at construction time."""
+        return self._model_name
+
+    @property
+    def vram_mb(self) -> int:
+        """The mock always reports a zero VRAM footprint."""
+        return 0
+
+
+# Satisfy the Protocol at import time (no GPU needed)
+assert isinstance(MockSTTBackend(), STTBackend)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -18,6 +18,27 @@ manage = [
    "platformdirs>=4.0",
    "typer[all]>=0.12",
 ]
+text-llamacpp = [
+    "llama-cpp-python>=0.2.0",
+]
+text-transformers = [
+    "torch>=2.0",
+    "transformers>=4.40",
+    "accelerate>=0.27",
+]
+text-transformers-4bit = [
+    "circuitforge-core[text-transformers]",
+    "bitsandbytes>=0.43",
+]
+stt-faster-whisper = [
+    "faster-whisper>=1.0",
+]
+stt-service = [
+    "circuitforge-core[stt-faster-whisper]",
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.29",
+    "python-multipart>=0.0.9",
+]
 dev = [
    "circuitforge-core[manage]",
    "pytest>=8.0",