feat(stt): add cf-stt module — FasterWhisperBackend + managed FastAPI app
- STTBackend Protocol + STTResult/STTSegment frozen dataclasses (base.py)
- MockSTTBackend for CI/tests (no GPU needed, CF_STT_MOCK=1)
- FasterWhisperBackend: loads model once, thread-safe, VRAM estimate by model size
- app.py: FastAPI service runnable as managed process by cf-orch
POST /transcribe (multipart audio) → STTTranscribeResponse-compatible JSON
GET /health → {status, model, vram_mb}
- __init__.py: process-level singleton + transcribe() convenience fn
- pyproject.toml: stt-faster-whisper + stt-service optional dep groups
This commit is contained in:
parent
5766fa82ab
commit
67493048e2
7 changed files with 556 additions and 0 deletions
79
circuitforge_core/stt/__init__.py
Normal file
79
circuitforge_core/stt/__init__.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
"""
|
||||||
|
circuitforge_core.stt — Speech-to-text service module.
|
||||||
|
|
||||||
|
Quick start (mock mode — no GPU or model required):
|
||||||
|
|
||||||
|
import os; os.environ["CF_STT_MOCK"] = "1"
|
||||||
|
from circuitforge_core.stt import transcribe
|
||||||
|
|
||||||
|
result = transcribe(open("audio.wav", "rb").read())
|
||||||
|
print(result.text, result.confidence)
|
||||||
|
|
||||||
|
Real inference (faster-whisper):
|
||||||
|
|
||||||
|
export CF_STT_MODEL=/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash>
|
||||||
|
from circuitforge_core.stt import transcribe
|
||||||
|
|
||||||
|
cf-orch service profile:
|
||||||
|
|
||||||
|
service_type: cf-stt
|
||||||
|
max_mb: 1024 (medium); 600 (base/small)
|
||||||
|
max_concurrent: 3
|
||||||
|
shared: true
|
||||||
|
managed:
|
||||||
|
exec: python -m circuitforge_core.stt.app
|
||||||
|
args: --model <path> --port {port} --gpu-id {gpu_id}
|
||||||
|
port: 8004
|
||||||
|
health: /health
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import (
|
||||||
|
STTBackend,
|
||||||
|
STTResult,
|
||||||
|
STTSegment,
|
||||||
|
make_stt_backend,
|
||||||
|
)
|
||||||
|
from circuitforge_core.stt.backends.mock import MockSTTBackend
|
||||||
|
|
||||||
|
_backend: STTBackend | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend() -> STTBackend:
    """Return the process-level backend, creating it lazily on first use.

    The backend choice comes from the environment: CF_STT_MODEL names the
    model (default "mock"), and CF_STT_MOCK=1 — or the literal model name
    "mock" — forces the mock backend.
    """
    global _backend
    if _backend is not None:
        return _backend
    model_path = os.environ.get("CF_STT_MODEL", "mock")
    use_mock = os.environ.get("CF_STT_MOCK", "") == "1" or model_path == "mock"
    _backend = make_stt_backend(model_path, mock=use_mock)
    return _backend
||||||
|
|
||||||
|
def transcribe(
    audio: bytes,
    *,
    language: str | None = None,
    confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
) -> STTResult:
    """Transcribe raw audio bytes via the shared process-level backend.

    Thin convenience wrapper: resolves the singleton backend and forwards
    every argument unchanged.
    """
    backend = _get_backend()
    return backend.transcribe(
        audio,
        language=language,
        confidence_threshold=confidence_threshold,
    )
|
||||||
|
|
||||||
|
def reset_backend() -> None:
    """Drop the cached process-level backend (test teardown only).

    The next transcribe() call will rebuild the backend from the current
    environment variables.
    """
    global _backend
    _backend = None
||||||
|
|
||||||
|
# Public API of circuitforge_core.stt.
__all__ = [
    "STTBackend",
    "STTResult",
    "STTSegment",
    "MockSTTBackend",
    "make_stt_backend",
    "transcribe",
    "reset_backend",
]
|
||||||
150
circuitforge_core/stt/app.py
Normal file
150
circuitforge_core/stt/app.py
Normal file
|
|
@ -0,0 +1,150 @@
|
||||||
|
"""
|
||||||
|
circuitforge_core.stt.app — cf-stt FastAPI service.
|
||||||
|
|
||||||
|
Managed by cf-orch as a process-type service. cf-orch starts this via:
|
||||||
|
|
||||||
|
python -m circuitforge_core.stt.app \
|
||||||
|
--model /Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash> \
|
||||||
|
--port 8004 \
|
||||||
|
--gpu-id 0
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /health → {"status": "ok", "model": "<name>", "vram_mb": <n>}
|
||||||
|
POST /transcribe → STTTranscribeResponse (multipart: audio file)
|
||||||
|
|
||||||
|
Audio format: any format ffmpeg understands (WAV, MP3, OGG, FLAC).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import STTResult, make_stt_backend
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ── Response model (mirrors circuitforge_orch.contracts.stt.STTTranscribeResponse) ──
|
||||||
|
|
||||||
|
class TranscribeResponse(BaseModel):
    """Wire format returned by POST /transcribe.

    Field-for-field mirror of the backend's STTResult; intended to stay
    compatible with circuitforge_orch.contracts.stt.STTTranscribeResponse.
    """

    text: str  # full transcript
    confidence: float  # normalised 0.0–1.0 (see STTResult)
    below_threshold: bool  # True when confidence fell below the active threshold
    language: str | None = None  # detected or caller-hinted language code
    duration_s: float | None = None  # audio duration when the backend reports it
    # pydantic deep-copies mutable defaults per instance, so `= []` is safe here.
    segments: list[dict] = []
    model: str = ""  # identifier of the model that produced the transcript
|
||||||
|
|
||||||
|
# ── App factory ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def create_app(
    model_path: str,
    device: str = "cuda",
    compute_type: str = "float16",
    confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    mock: bool = False,
) -> FastAPI:
    """Build the cf-stt FastAPI application.

    The backend (and thus the model) is created eagerly so that a failed
    model load aborts startup instead of failing the first request.

    Args:
        model_path: Model size name or local path, passed to the backend.
        device: "cuda" or "cpu".
        compute_type: Quantisation / compute type for faster-whisper.
        confidence_threshold: Default threshold used for the below_threshold
            flag; may be overridden per request.
        mock: Use the mock backend (no GPU or model file needed).
    """
    app = FastAPI(title="cf-stt", version="0.1.0")
    backend = make_stt_backend(
        model_path, device=device, compute_type=compute_type, mock=mock
    )
    logger.info("cf-stt ready: model=%r vram=%dMB", backend.model_name, backend.vram_mb)

    @app.get("/health")
    async def health() -> dict:
        """Liveness probe used by cf-orch's managed-service health check."""
        return {"status": "ok", "model": backend.model_name, "vram_mb": backend.vram_mb}

    @app.post("/transcribe", response_model=TranscribeResponse)
    async def transcribe(
        audio: UploadFile = File(..., description="Audio file (WAV, MP3, OGG, FLAC, ...)"),
        language: str | None = Form(None, description="BCP-47 language code hint, e.g. 'en'"),
        confidence_threshold_override: float | None = Form(
            None,
            description="Override default confidence threshold for this request.",
        ),
    ) -> TranscribeResponse:
        """Transcribe one uploaded audio file and return the full result."""
        audio_bytes = await audio.read()
        if not audio_bytes:
            raise HTTPException(status_code=400, detail="Empty audio file")

        # BUGFIX: the previous `override or default` expression treated a
        # legitimate 0.0 override as "not provided" (0.0 is falsy) and fell
        # back to the default. Only None means "use the configured default".
        threshold = (
            confidence_threshold
            if confidence_threshold_override is None
            else confidence_threshold_override
        )
        try:
            result = backend.transcribe(
                audio_bytes, language=language, confidence_threshold=threshold
            )
        except Exception as exc:
            logger.exception("Transcription failed")
            raise HTTPException(status_code=500, detail=str(exc)) from exc

        return TranscribeResponse(
            text=result.text,
            confidence=result.confidence,
            below_threshold=result.below_threshold,
            language=result.language,
            duration_s=result.duration_s,
            segments=[
                {
                    "start_s": s.start_s,
                    "end_s": s.end_s,
                    "text": s.text,
                    "confidence": s.confidence,
                }
                for s in result.segments
            ],
            model=result.model,
        )

    return app
|
||||||
|
|
||||||
|
# ── CLI entry point ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: parse arguments, build the app, serve it with uvicorn.

    Invoked by cf-orch as `python -m circuitforge_core.stt.app --model ... --port ...`.
    """
    parser = argparse.ArgumentParser(description="cf-stt — CircuitForge STT service")
    parser.add_argument("--model", required=True,
                        help="Model path or size name (e.g. 'medium', or full local path)")
    parser.add_argument("--port", type=int, default=8004)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0,
                        help="CUDA device index (sets CUDA_VISIBLE_DEVICES)")
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"])
    parser.add_argument("--compute-type", default="float16",
                        choices=["float16", "int8", "int8_float16", "float32"],
                        help="Quantisation / compute type passed to faster-whisper")
    parser.add_argument("--confidence-threshold", type=float,
                        default=STTResult.CONFIDENCE_DEFAULT_THRESHOLD)
    parser.add_argument("--mock", action="store_true",
                        help="Run with mock backend (no GPU, for testing)")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    # Let cf-orch pass --gpu-id; map to CUDA_VISIBLE_DEVICES so the process
    # only sees its assigned GPU. This prevents accidental multi-GPU usage.
    # setdefault: an externally provided CUDA_VISIBLE_DEVICES wins over --gpu-id.
    # Must run before create_app(), which loads the model onto the device.
    if args.device == "cuda" and not args.mock:
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))

    # CF_STT_MOCK=1 forces the mock backend even without --mock.
    mock = args.mock or os.environ.get("CF_STT_MOCK", "") == "1"
    app = create_app(
        model_path=args.model,
        device=args.device,
        compute_type=args.compute_type,
        confidence_threshold=args.confidence_threshold,
        mock=mock,
    )

    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()
|
||||||
4
circuitforge_core/stt/backends/__init__.py
Normal file
4
circuitforge_core/stt/backends/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Re-export the backend API so callers can import everything from
# circuitforge_core.stt.backends directly. Only GPU-free modules are
# imported here; real backends stay behind make_stt_backend's lazy imports.
from .base import STTBackend, STTResult, STTSegment, make_stt_backend
from .mock import MockSTTBackend

__all__ = ["STTBackend", "STTResult", "STTSegment", "make_stt_backend", "MockSTTBackend"]
|
||||||
109
circuitforge_core/stt/backends/base.py
Normal file
109
circuitforge_core/stt/backends/base.py
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
# circuitforge_core/stt/backends/base.py — STTBackend Protocol + factory
|
||||||
|
#
|
||||||
|
# MIT licensed. The Protocol and mock are always importable without GPU deps.
|
||||||
|
# Real backends require optional extras:
|
||||||
|
# pip install -e "circuitforge-core[stt-faster-whisper]"
|
||||||
|
from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import ClassVar, Protocol, runtime_checkable
|
||||||
|
|
||||||
|
|
||||||
|
# ── Result types ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class STTSegment:
    """Word- or phrase-level segment (included when the backend supports it)."""
    start_s: float  # segment start, seconds from the beginning of the audio
    end_s: float  # segment end, seconds from the beginning of the audio
    text: str  # transcript text for this segment
    confidence: float  # 0.0–1.0 (faster-whisper derives it as 1 - no_speech_prob)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class STTResult:
    """
    Standard result from any STTBackend.transcribe() call.

    confidence is normalised to 0.0–1.0 regardless of the backend's native metric.
    below_threshold is True when confidence < the configured threshold (default 0.75).
    This flag is safety-critical for products like Osprey: DTMF must NOT be sent
    when below_threshold is True.
    """

    text: str  # full transcript
    confidence: float  # 0.0–1.0, aggregated over segments
    below_threshold: bool  # True => confidence < threshold; treat transcript as unreliable
    language: str | None = None  # detected/hinted language code, if known
    duration_s: float | None = None  # audio duration, if the backend reports it
    segments: list[STTSegment] = field(default_factory=list)  # optional per-segment detail
    model: str = ""  # identifier of the model that produced this result

    # BUGFIX: annotated as ClassVar so @dataclass treats this as a class-level
    # constant. Without ClassVar, the plain `float` annotation made it an extra
    # instance *field* (appearing in __init__, __eq__ and repr) — clearly not
    # intended for a shared default threshold.
    CONFIDENCE_DEFAULT_THRESHOLD: ClassVar[float] = 0.75
|
||||||
|
|
||||||
|
# ── Protocol ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# runtime_checkable allows `isinstance(obj, STTBackend)`; note Python only
# checks member *presence* for such checks, not signatures.
@runtime_checkable
class STTBackend(Protocol):
    """
    Abstract interface for speech-to-text backends.

    All backends load their model once at construction time and are safe to
    call concurrently (the model weights are read-only after load).
    """

    def transcribe(
        self,
        audio: bytes,
        *,
        language: str | None = None,
        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    ) -> STTResult:
        """Synchronous transcription. audio is raw PCM or any format ffmpeg understands.

        language is an optional hint; confidence_threshold controls the
        below_threshold flag on the returned STTResult.
        """
        ...

    @property
    def model_name(self) -> str:
        """Identifier for the loaded model (path stem or size name)."""
        ...

    @property
    def vram_mb(self) -> int:
        """Approximate VRAM footprint in MB. Used by cf-orch service registry."""
        ...
|
||||||
|
|
||||||
|
# ── Factory ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def make_stt_backend(
    model_path: str,
    backend: str | None = None,
    mock: bool | None = None,
    device: str = "cuda",
    compute_type: str = "float16",
) -> STTBackend:
    """Construct an STTBackend for *model_path*.

    Selection rules:
      * mock=True (or CF_STT_MOCK=1 when mock is None) -> MockSTTBackend,
        which needs no GPU and no model file.
      * otherwise the backend name (argument, else CF_STT_BACKEND, else
        "faster-whisper") selects the real implementation.

    device and compute_type are forwarded to the real backend; the mock
    ignores them. Backend modules are imported lazily so the mock path
    never pulls in GPU dependencies.

    Raises ValueError for an unrecognised backend name.
    """
    if mock is None:
        mock = os.environ.get("CF_STT_MOCK", "") == "1"
    if mock:
        from circuitforge_core.stt.backends.mock import MockSTTBackend

        return MockSTTBackend(model_name=model_path)

    chosen = backend if backend else os.environ.get("CF_STT_BACKEND", "faster-whisper")
    if chosen == "faster-whisper":
        from circuitforge_core.stt.backends.faster_whisper import FasterWhisperBackend

        return FasterWhisperBackend(
            model_path=model_path, device=device, compute_type=compute_type
        )

    raise ValueError(
        f"Unknown STT backend {chosen!r}. "
        "Expected 'faster-whisper'. Set CF_STT_BACKEND or pass backend= explicitly."
    )
|
||||||
139
circuitforge_core/stt/backends/faster_whisper.py
Normal file
139
circuitforge_core/stt/backends/faster_whisper.py
Normal file
|
|
@ -0,0 +1,139 @@
|
||||||
|
# circuitforge_core/stt/backends/faster_whisper.py — FasterWhisperBackend
|
||||||
|
#
|
||||||
|
# MIT licensed. Requires: pip install -e "circuitforge-core[stt-faster-whisper]"
|
||||||
|
#
|
||||||
|
# Model path can be:
|
||||||
|
# - A size name: "base", "small", "medium", "large-v3"
|
||||||
|
# (faster-whisper downloads and caches it on first use)
|
||||||
|
# - A local path: "/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/..."
|
||||||
|
# (preferred for air-gapped nodes — no download needed)
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import STTResult, STTSegment
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# VRAM estimates by model size. Used by cf-orch for VRAM budgeting.
|
||||||
|
_VRAM_MB_BY_SIZE: dict[str, int] = {
|
||||||
|
"tiny": 200,
|
||||||
|
"base": 350,
|
||||||
|
"small": 600,
|
||||||
|
"medium": 1024,
|
||||||
|
"large": 2048,
|
||||||
|
"large-v2": 2048,
|
||||||
|
"large-v3": 2048,
|
||||||
|
"distil-large-v3": 1500,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Aggregate confidence from per-segment no_speech_prob values.
|
||||||
|
# faster-whisper doesn't expose a direct confidence score, so we invert the
|
||||||
|
# mean no_speech_prob as a proxy. This is conservative but directionally correct.
|
||||||
|
def _aggregate_confidence(segments: list) -> float:
|
||||||
|
if not segments:
|
||||||
|
return 0.0
|
||||||
|
probs = [max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)) for s in segments]
|
||||||
|
return sum(probs) / len(probs)
|
||||||
|
|
||||||
|
|
||||||
|
class FasterWhisperBackend:
    """
    faster-whisper STT backend.

    Thread-safe after construction: WhisperModel internally manages its own
    CUDA context and is safe to call from multiple threads.
    """

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        compute_type: str = "float16",
    ) -> None:
        """Load the model eagerly; raises ImportError if the extra is missing.

        model_path may be a size name ("medium") or a local snapshot path.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is not installed. "
                "Run: pip install -e 'circuitforge-core[stt-faster-whisper]'"
            ) from exc

        logger.info("Loading faster-whisper model from %r (device=%s)", model_path, device)
        self._model_path = model_path
        self._device = device
        self._compute_type = compute_type
        self._model = WhisperModel(model_path, device=device, compute_type=compute_type)
        logger.info("faster-whisper model ready")

        # Determine VRAM footprint from model name/path stem.
        # BUGFIX: match the longest size keys first. A plain in-order
        # substring scan resolved "distil-large-v3" to the shorter "large"
        # key (2048 MB) instead of its own 1500 MB entry.
        stem = os.path.basename(model_path.rstrip("/")).lower()
        self._vram_mb = next(
            (
                vram
                for size, vram in sorted(
                    _VRAM_MB_BY_SIZE.items(), key=lambda kv: len(kv[0]), reverse=True
                )
                if size in stem
            ),
            1024,  # conservative default if size can't be inferred
        )

    def transcribe(
        self,
        audio: bytes,
        *,
        language: str | None = None,
        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    ) -> STTResult:
        """
        Transcribe raw audio bytes.

        audio can be any format ffmpeg understands (WAV, MP3, OGG, FLAC, etc.).
        The bytes are spilled to a temp file for faster-whisper to read; the
        file is removed even when transcription raises.
        """
        with tempfile.NamedTemporaryFile(suffix=".audio", delete=False) as tmp:
            tmp.write(audio)
            tmp_path = tmp.name

        try:
            segments_gen, info = self._model.transcribe(
                tmp_path,
                language=language,
                word_timestamps=True,
                vad_filter=True,
            )
            # Materialise inside the try: decoding is lazy and still reads
            # the temp file while the generator is consumed.
            segments = list(segments_gen)
        finally:
            os.unlink(tmp_path)

        text = " ".join(s.text.strip() for s in segments).strip()
        confidence = _aggregate_confidence(segments)
        duration_s = getattr(info, "duration", None)
        detected_language = getattr(info, "language", language)

        stt_segments = [
            STTSegment(
                start_s=s.start,
                end_s=s.end,
                text=s.text.strip(),
                # Same proxy as _aggregate_confidence, per segment.
                confidence=max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)),
            )
            for s in segments
        ]

        return STTResult(
            text=text,
            confidence=confidence,
            below_threshold=confidence < confidence_threshold,
            language=detected_language,
            duration_s=duration_s,
            segments=stt_segments,
            model=self._model_path,
        )

    @property
    def model_name(self) -> str:
        """The model path or size name given at construction."""
        return self._model_path

    @property
    def vram_mb(self) -> int:
        """Approximate VRAM footprint in MB, inferred from the model size."""
        return self._vram_mb
|
||||||
54
circuitforge_core/stt/backends/mock.py
Normal file
54
circuitforge_core/stt/backends/mock.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
# circuitforge_core/stt/backends/mock.py — MockSTTBackend
|
||||||
|
#
|
||||||
|
# MIT licensed. No GPU, no model file required.
|
||||||
|
# Used in tests and CI, and when CF_STT_MOCK=1.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import STTBackend, STTResult
|
||||||
|
|
||||||
|
|
||||||
|
class MockSTTBackend:
    """Deterministic, dependency-free STT backend for tests and CI.

    Always returns the same transcript and confidence, so callers can assert
    on the response shape without a GPU or a model file on disk.
    """

    def __init__(
        self,
        model_name: str = "mock",
        fixed_text: str = "mock transcription",
        fixed_confidence: float = 0.95,
    ) -> None:
        self._fixed_confidence = fixed_confidence
        self._fixed_text = fixed_text
        self._model_name = model_name

    def transcribe(
        self,
        audio: bytes,
        *,
        language: str | None = None,
        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    ) -> STTResult:
        """Return the canned result; duration is estimated from the byte count."""
        # Rough duration estimate assuming 16 kHz, 16-bit mono (32,000 bytes/s).
        estimated_duration = float(len(audio)) / 32000
        return STTResult(
            text=self._fixed_text,
            confidence=self._fixed_confidence,
            below_threshold=self._fixed_confidence < confidence_threshold,
            language=language or "en",
            duration_s=estimated_duration,
            model=self._model_name,
        )

    @property
    def model_name(self) -> str:
        """Name supplied at construction time."""
        return self._model_name

    @property
    def vram_mb(self) -> int:
        """The mock loads no model, so its VRAM footprint is zero."""
        return 0


# Import-time sanity check that the mock structurally satisfies the
# STTBackend protocol (no GPU needed).
assert isinstance(MockSTTBackend(), STTBackend)
|
||||||
|
|
@ -18,6 +18,27 @@ manage = [
|
||||||
"platformdirs>=4.0",
|
"platformdirs>=4.0",
|
||||||
"typer[all]>=0.12",
|
"typer[all]>=0.12",
|
||||||
]
|
]
|
||||||
|
text-llamacpp = [
|
||||||
|
"llama-cpp-python>=0.2.0",
|
||||||
|
]
|
||||||
|
text-transformers = [
|
||||||
|
"torch>=2.0",
|
||||||
|
"transformers>=4.40",
|
||||||
|
"accelerate>=0.27",
|
||||||
|
]
|
||||||
|
text-transformers-4bit = [
|
||||||
|
"circuitforge-core[text-transformers]",
|
||||||
|
"bitsandbytes>=0.43",
|
||||||
|
]
|
||||||
|
stt-faster-whisper = [
|
||||||
|
"faster-whisper>=1.0",
|
||||||
|
]
|
||||||
|
stt-service = [
|
||||||
|
"circuitforge-core[stt-faster-whisper]",
|
||||||
|
"fastapi>=0.110",
|
||||||
|
"uvicorn[standard]>=0.29",
|
||||||
|
"python-multipart>=0.0.9",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"circuitforge-core[manage]",
|
"circuitforge-core[manage]",
|
||||||
"pytest>=8.0",
|
"pytest>=8.0",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue