From 67493048e2617c8915b2e0c6f6dfc77af649d749 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Wed, 8 Apr 2026 22:14:46 -0700 Subject: [PATCH] =?UTF-8?q?feat(stt):=20add=20cf-stt=20module=20=E2=80=94?= =?UTF-8?q?=20FasterWhisperBackend=20+=20managed=20FastAPI=20app?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - STTBackend Protocol + STTResult/STTSegment frozen dataclasses (base.py) - MockSTTBackend for CI/tests (no GPU needed, CF_STT_MOCK=1) - FasterWhisperBackend: loads model once, thread-safe, VRAM estimate by model size - app.py: FastAPI service runnable as managed process by cf-orch POST /transcribe (multipart audio) → STTTranscribeResponse-compatible JSON GET /health → {status, model, vram_mb} - __init__.py: process-level singleton + transcribe() convenience fn - pyproject.toml: stt-faster-whisper + stt-service optional dep groups --- circuitforge_core/stt/__init__.py | 79 +++++++++ circuitforge_core/stt/app.py | 150 ++++++++++++++++++ circuitforge_core/stt/backends/__init__.py | 4 + circuitforge_core/stt/backends/base.py | 109 +++++++++++++ .../stt/backends/faster_whisper.py | 139 ++++++++++++++++ circuitforge_core/stt/backends/mock.py | 54 +++++++ pyproject.toml | 21 +++ 7 files changed, 556 insertions(+) create mode 100644 circuitforge_core/stt/__init__.py create mode 100644 circuitforge_core/stt/app.py create mode 100644 circuitforge_core/stt/backends/__init__.py create mode 100644 circuitforge_core/stt/backends/base.py create mode 100644 circuitforge_core/stt/backends/faster_whisper.py create mode 100644 circuitforge_core/stt/backends/mock.py diff --git a/circuitforge_core/stt/__init__.py b/circuitforge_core/stt/__init__.py new file mode 100644 index 0000000..1c6f1d5 --- /dev/null +++ b/circuitforge_core/stt/__init__.py @@ -0,0 +1,79 @@ +""" +circuitforge_core.stt — Speech-to-text service module. 
+ +Quick start (mock mode — no GPU or model required): + + import os; os.environ["CF_STT_MOCK"] = "1" + from circuitforge_core.stt import transcribe + + result = transcribe(open("audio.wav", "rb").read()) + print(result.text, result.confidence) + +Real inference (faster-whisper): + + export CF_STT_MODEL=/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/ + from circuitforge_core.stt import transcribe + +cf-orch service profile: + + service_type: cf-stt + max_mb: 1024 (medium); 600 (base/small) + max_concurrent: 3 + shared: true + managed: + exec: python -m circuitforge_core.stt.app + args: --model --port {port} --gpu-id {gpu_id} + port: 8004 + health: /health +""" +from __future__ import annotations + +import os + +from circuitforge_core.stt.backends.base import ( + STTBackend, + STTResult, + STTSegment, + make_stt_backend, +) +from circuitforge_core.stt.backends.mock import MockSTTBackend + +_backend: STTBackend | None = None + + +def _get_backend() -> STTBackend: + global _backend + if _backend is None: + model_path = os.environ.get("CF_STT_MODEL", "mock") + mock = model_path == "mock" or os.environ.get("CF_STT_MOCK", "") == "1" + _backend = make_stt_backend(model_path, mock=mock) + return _backend + + +def transcribe( + audio: bytes, + *, + language: str | None = None, + confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD, +) -> STTResult: + """Transcribe audio bytes using the process-level backend.""" + return _get_backend().transcribe( + audio, language=language, confidence_threshold=confidence_threshold + ) + + +def reset_backend() -> None: + """Reset the process-level singleton. 
Test teardown only.""" + global _backend + _backend = None + + +__all__ = [ + "STTBackend", + "STTResult", + "STTSegment", + "MockSTTBackend", + "make_stt_backend", + "transcribe", + "reset_backend", +] diff --git a/circuitforge_core/stt/app.py b/circuitforge_core/stt/app.py new file mode 100644 index 0000000..c39a44f --- /dev/null +++ b/circuitforge_core/stt/app.py @@ -0,0 +1,150 @@ +""" +circuitforge_core.stt.app — cf-stt FastAPI service. + +Managed by cf-orch as a process-type service. cf-orch starts this via: + + python -m circuitforge_core.stt.app \ + --model /Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/ \ + --port 8004 \ + --gpu-id 0 + +Endpoints: + GET /health → {"status": "ok", "model": "", "vram_mb": } + POST /transcribe → STTTranscribeResponse (multipart: audio file) + +Audio format: any format ffmpeg understands (WAV, MP3, OGG, FLAC). +""" +from __future__ import annotations + +import argparse +import logging +import os +import sys + +import uvicorn +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from fastapi.responses import JSONResponse +from pydantic import BaseModel + +from circuitforge_core.stt.backends.base import STTResult, make_stt_backend + +logger = logging.getLogger(__name__) + +# ── Response model (mirrors circuitforge_orch.contracts.stt.STTTranscribeResponse) ── + +class TranscribeResponse(BaseModel): + text: str + confidence: float + below_threshold: bool + language: str | None = None + duration_s: float | None = None + segments: list[dict] = [] + model: str = "" + + +# ── App factory ─────────────────────────────────────────────────────────────── + +def create_app( + model_path: str, + device: str = "cuda", + compute_type: str = "float16", + confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD, + mock: bool = False, +) -> FastAPI: + app = FastAPI(title="cf-stt", version="0.1.0") + backend = make_stt_backend( + model_path, device=device, 
compute_type=compute_type, mock=mock
+    )
+    logger.info("cf-stt ready: model=%r vram=%dMB", backend.model_name, backend.vram_mb)
+
+    @app.get("/health")
+    async def health() -> dict:
+        return {"status": "ok", "model": backend.model_name, "vram_mb": backend.vram_mb}
+
+    @app.post("/transcribe", response_model=TranscribeResponse)
+    async def transcribe(
+        audio: UploadFile = File(..., description="Audio file (WAV, MP3, OGG, FLAC, ...)"),
+        language: str | None = Form(None, description="BCP-47 language code hint, e.g. 'en'"),
+        confidence_threshold_override: float | None = Form(
+            None,
+            description="Override default confidence threshold for this request.",
+        ),
+    ) -> TranscribeResponse:
+        audio_bytes = await audio.read()
+        if not audio_bytes:
+            raise HTTPException(status_code=400, detail="Empty audio file")
+
+        threshold = confidence_threshold_override if confidence_threshold_override is not None else confidence_threshold  # explicit None check: `or` would silently discard a legitimate 0.0 override
+        try:
+            result = backend.transcribe(
+                audio_bytes, language=language, confidence_threshold=threshold
+            )
+        except Exception as exc:
+            logger.exception("Transcription failed")
+            raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+        return TranscribeResponse(
+            text=result.text,
+            confidence=result.confidence,
+            below_threshold=result.below_threshold,
+            language=result.language,
+            duration_s=result.duration_s,
+            segments=[
+                {
+                    "start_s": s.start_s,
+                    "end_s": s.end_s,
+                    "text": s.text,
+                    "confidence": s.confidence,
+                }
+                for s in result.segments
+            ],
+            model=result.model,
+        )
+
+    return app
+
+
+# ── CLI entry point ───────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="cf-stt — CircuitForge STT service")
+    parser.add_argument("--model", required=True,
+                        help="Model path or size name (e.g. 'medium', or full local path)")
+    parser.add_argument("--port", type=int, default=8004)
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--gpu-id", type=int, default=0,
+                        help="CUDA device index (sets CUDA_VISIBLE_DEVICES)")
+    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"])
+    parser.add_argument("--compute-type", default="float16",
+                        choices=["float16", "int8", "int8_float16", "float32"],
+                        help="Quantisation / compute type passed to faster-whisper")
+    parser.add_argument("--confidence-threshold", type=float,
+                        default=STTResult.CONFIDENCE_DEFAULT_THRESHOLD)
+    parser.add_argument("--mock", action="store_true",
+                        help="Run with mock backend (no GPU, for testing)")
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    # Let cf-orch pass --gpu-id; map to CUDA_VISIBLE_DEVICES so the process
+    # only sees its assigned GPU. This prevents accidental multi-GPU usage.
+ if args.device == "cuda" and not args.mock: + os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id)) + + mock = args.mock or os.environ.get("CF_STT_MOCK", "") == "1" + app = create_app( + model_path=args.model, + device=args.device, + compute_type=args.compute_type, + confidence_threshold=args.confidence_threshold, + mock=mock, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + +if __name__ == "__main__": + main() diff --git a/circuitforge_core/stt/backends/__init__.py b/circuitforge_core/stt/backends/__init__.py new file mode 100644 index 0000000..6230b10 --- /dev/null +++ b/circuitforge_core/stt/backends/__init__.py @@ -0,0 +1,4 @@ +from .base import STTBackend, STTResult, STTSegment, make_stt_backend +from .mock import MockSTTBackend + +__all__ = ["STTBackend", "STTResult", "STTSegment", "make_stt_backend", "MockSTTBackend"] diff --git a/circuitforge_core/stt/backends/base.py b/circuitforge_core/stt/backends/base.py new file mode 100644 index 0000000..1f99e66 --- /dev/null +++ b/circuitforge_core/stt/backends/base.py @@ -0,0 +1,109 @@ +# circuitforge_core/stt/backends/base.py — STTBackend Protocol + factory +# +# MIT licensed. The Protocol and mock are always importable without GPU deps. +# Real backends require optional extras: +# pip install -e "circuitforge-core[stt-faster-whisper]" +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +# ── Result types ────────────────────────────────────────────────────────────── + +@dataclass(frozen=True) +class STTSegment: + """Word- or phrase-level segment (included when the backend supports it).""" + start_s: float + end_s: float + text: str + confidence: float + + +@dataclass(frozen=True) +class STTResult: + """ + Standard result from any STTBackend.transcribe() call. + + confidence is normalised to 0.0–1.0 regardless of the backend's native metric. 
+    below_threshold is True when confidence < the configured threshold (default 0.75).
+    This flag is safety-critical for products like Osprey: DTMF must NOT be sent
+    when below_threshold is True.
+    """
+    text: str
+    confidence: float  # 0.0–1.0
+    below_threshold: bool
+    language: str | None = None
+    duration_s: float | None = None
+    segments: list[STTSegment] = field(default_factory=list)
+    model: str = ""
+
+    CONFIDENCE_DEFAULT_THRESHOLD = 0.75  # deliberately unannotated: an annotated class attribute in a dataclass becomes an instance *field*; this must remain a class-level constant
+
+
+# ── Protocol ──────────────────────────────────────────────────────────────────
+
+@runtime_checkable
+class STTBackend(Protocol):
+    """
+    Abstract interface for speech-to-text backends.
+
+    All backends load their model once at construction time and are safe to
+    call concurrently (the model weights are read-only after load).
+    """
+
+    def transcribe(
+        self,
+        audio: bytes,
+        *,
+        language: str | None = None,
+        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    ) -> STTResult:
+        """Synchronous transcription. audio is raw PCM or any format ffmpeg understands."""
+        ...
+
+    @property
+    def model_name(self) -> str:
+        """Identifier for the loaded model (path stem or size name)."""
+        ...
+
+    @property
+    def vram_mb(self) -> int:
+        """Approximate VRAM footprint in MB. Used by cf-orch service registry."""
+        ...
+
+
+# ── Factory ─────────────────────────────────────────────────────────────────
+
+def make_stt_backend(
+    model_path: str,
+    backend: str | None = None,
+    mock: bool | None = None,
+    device: str = "cuda",
+    compute_type: str = "float16",
+) -> STTBackend:
+    """
+    Return an STTBackend for the given model.
+
+    mock=True or CF_STT_MOCK=1 → MockSTTBackend (no GPU, no model file needed)
+    backend="faster-whisper" → FasterWhisperBackend (default)
+
+    device and compute_type are passed through to the backend and ignored by mock.
+ """ + use_mock = mock if mock is not None else os.environ.get("CF_STT_MOCK", "") == "1" + if use_mock: + from circuitforge_core.stt.backends.mock import MockSTTBackend + return MockSTTBackend(model_name=model_path) + + resolved = backend or os.environ.get("CF_STT_BACKEND", "faster-whisper") + if resolved == "faster-whisper": + from circuitforge_core.stt.backends.faster_whisper import FasterWhisperBackend + return FasterWhisperBackend( + model_path=model_path, device=device, compute_type=compute_type + ) + + raise ValueError( + f"Unknown STT backend {resolved!r}. " + "Expected 'faster-whisper'. Set CF_STT_BACKEND or pass backend= explicitly." + ) diff --git a/circuitforge_core/stt/backends/faster_whisper.py b/circuitforge_core/stt/backends/faster_whisper.py new file mode 100644 index 0000000..8bd259f --- /dev/null +++ b/circuitforge_core/stt/backends/faster_whisper.py @@ -0,0 +1,139 @@ +# circuitforge_core/stt/backends/faster_whisper.py — FasterWhisperBackend +# +# MIT licensed. Requires: pip install -e "circuitforge-core[stt-faster-whisper]" +# +# Model path can be: +# - A size name: "base", "small", "medium", "large-v3" +# (faster-whisper downloads and caches it on first use) +# - A local path: "/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/..." +# (preferred for air-gapped nodes — no download needed) +from __future__ import annotations + +import io +import logging +import os +import tempfile + +from circuitforge_core.stt.backends.base import STTResult, STTSegment + +logger = logging.getLogger(__name__) + +# VRAM estimates by model size. Used by cf-orch for VRAM budgeting. +_VRAM_MB_BY_SIZE: dict[str, int] = { + "tiny": 200, + "base": 350, + "small": 600, + "medium": 1024, + "large": 2048, + "large-v2": 2048, + "large-v3": 2048, + "distil-large-v3": 1500, +} + +# Aggregate confidence from per-segment no_speech_prob values. +# faster-whisper doesn't expose a direct confidence score, so we invert the +# mean no_speech_prob as a proxy. 
This is conservative but directionally correct.
+def _aggregate_confidence(segments: list) -> float:
+    if not segments:
+        return 0.0
+    probs = [max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)) for s in segments]
+    return sum(probs) / len(probs)
+
+
+class FasterWhisperBackend:
+    """
+    faster-whisper STT backend.
+
+    Thread-safe after construction: WhisperModel internally manages its own
+    CUDA context and is safe to call from multiple threads.
+    """
+
+    def __init__(
+        self,
+        model_path: str,
+        device: str = "cuda",
+        compute_type: str = "float16",
+    ) -> None:
+        try:
+            from faster_whisper import WhisperModel
+        except ImportError as exc:
+            raise ImportError(
+                "faster-whisper is not installed. "
+                "Run: pip install -e 'circuitforge-core[stt-faster-whisper]'"
+            ) from exc
+
+        logger.info("Loading faster-whisper model from %r (device=%s)", model_path, device)
+        self._model_path = model_path
+        self._device = device
+        self._compute_type = compute_type
+        self._model = WhisperModel(model_path, device=device, compute_type=compute_type)
+        logger.info("faster-whisper model ready")
+
+        # Determine VRAM footprint from model name/path stem.
+        stem = os.path.basename(model_path.rstrip("/")).lower()
+        self._vram_mb = next(  # match longest size key first: "distil-large-v3" must beat its substring "large"
+            (v for k, v in sorted(_VRAM_MB_BY_SIZE.items(), key=lambda kv: -len(kv[0])) if k in stem),
+            1024,  # conservative default if size can't be inferred
+        )
+
+    def transcribe(
+        self,
+        audio: bytes,
+        *,
+        language: str | None = None,
+        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
+    ) -> STTResult:
+        """
+        Transcribe raw audio bytes.
+
+        audio can be any format ffmpeg understands (WAV, MP3, OGG, FLAC, etc.).
+        faster-whisper writes audio to a temp file internally; we follow the
+        same pattern to avoid holding the bytes in memory longer than needed.
+ """ + with tempfile.NamedTemporaryFile(suffix=".audio", delete=False) as tmp: + tmp.write(audio) + tmp_path = tmp.name + + try: + segments_gen, info = self._model.transcribe( + tmp_path, + language=language, + word_timestamps=True, + vad_filter=True, + ) + segments = list(segments_gen) + finally: + os.unlink(tmp_path) + + text = " ".join(s.text.strip() for s in segments).strip() + confidence = _aggregate_confidence(segments) + duration_s = info.duration if hasattr(info, "duration") else None + detected_language = getattr(info, "language", language) + + stt_segments = [ + STTSegment( + start_s=s.start, + end_s=s.end, + text=s.text.strip(), + confidence=max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)), + ) + for s in segments + ] + + return STTResult( + text=text, + confidence=confidence, + below_threshold=confidence < confidence_threshold, + language=detected_language, + duration_s=duration_s, + segments=stt_segments, + model=self._model_path, + ) + + @property + def model_name(self) -> str: + return self._model_path + + @property + def vram_mb(self) -> int: + return self._vram_mb diff --git a/circuitforge_core/stt/backends/mock.py b/circuitforge_core/stt/backends/mock.py new file mode 100644 index 0000000..b9bd9a6 --- /dev/null +++ b/circuitforge_core/stt/backends/mock.py @@ -0,0 +1,54 @@ +# circuitforge_core/stt/backends/mock.py — MockSTTBackend +# +# MIT licensed. No GPU, no model file required. +# Used in tests and CI, and when CF_STT_MOCK=1. +from __future__ import annotations + +from circuitforge_core.stt.backends.base import STTBackend, STTResult + + +class MockSTTBackend: + """ + Deterministic mock STT backend for testing. + + Returns a fixed transcript so tests can assert on the response shape + without needing a GPU or a model file. 
+ """ + + def __init__( + self, + model_name: str = "mock", + fixed_text: str = "mock transcription", + fixed_confidence: float = 0.95, + ) -> None: + self._model_name = model_name + self._fixed_text = fixed_text + self._fixed_confidence = fixed_confidence + + def transcribe( + self, + audio: bytes, + *, + language: str | None = None, + confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD, + ) -> STTResult: + return STTResult( + text=self._fixed_text, + confidence=self._fixed_confidence, + below_threshold=self._fixed_confidence < confidence_threshold, + language=language or "en", + duration_s=float(len(audio)) / 32000, # rough estimate: 16kHz 16-bit mono + model=self._model_name, + ) + + @property + def model_name(self) -> str: + return self._model_name + + @property + def vram_mb(self) -> int: + return 0 + + +# Satisfy the Protocol at import time (no GPU needed) +assert isinstance(MockSTTBackend(), STTBackend) diff --git a/pyproject.toml b/pyproject.toml index 70f5929..cb0ffb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,27 @@ manage = [ "platformdirs>=4.0", "typer[all]>=0.12", ] +text-llamacpp = [ + "llama-cpp-python>=0.2.0", +] +text-transformers = [ + "torch>=2.0", + "transformers>=4.40", + "accelerate>=0.27", +] +text-transformers-4bit = [ + "circuitforge-core[text-transformers]", + "bitsandbytes>=0.43", +] +stt-faster-whisper = [ + "faster-whisper>=1.0", +] +stt-service = [ + "circuitforge-core[stt-faster-whisper]", + "fastapi>=0.110", + "uvicorn[standard]>=0.29", + "python-multipart>=0.0.9", +] dev = [ "circuitforge-core[manage]", "pytest>=8.0",