feat(stt): add cf-stt module — FasterWhisperBackend + managed FastAPI app
Some checks are pending
CI / test (push) Waiting to run
Mirror / mirror (push) Waiting to run

- STTBackend Protocol + STTResult/STTSegment frozen dataclasses (base.py)
- MockSTTBackend for CI/tests (no GPU needed, CF_STT_MOCK=1)
- FasterWhisperBackend: loads model once, thread-safe, VRAM estimate by model size
- app.py: FastAPI service runnable as managed process by cf-orch
  POST /transcribe (multipart audio) → STTTranscribeResponse-compatible JSON
  GET  /health → {status, model, vram_mb}
- __init__.py: process-level singleton + transcribe() convenience fn
- pyproject.toml: stt-faster-whisper + stt-service optional dep groups
This commit is contained in:
pyr0ball 2026-04-08 22:14:46 -07:00
parent 5766fa82ab
commit 67493048e2
7 changed files with 556 additions and 0 deletions

View file

@ -0,0 +1,79 @@
"""
circuitforge_core.stt Speech-to-text service module.
Quick start (mock mode no GPU or model required):
import os; os.environ["CF_STT_MOCK"] = "1"
from circuitforge_core.stt import transcribe
result = transcribe(open("audio.wav", "rb").read())
print(result.text, result.confidence)
Real inference (faster-whisper):
export CF_STT_MODEL=/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash>
from circuitforge_core.stt import transcribe
cf-orch service profile:
service_type: cf-stt
max_mb: 1024 (medium); 600 (base/small)
max_concurrent: 3
shared: true
managed:
exec: python -m circuitforge_core.stt.app
args: --model <path> --port {port} --gpu-id {gpu_id}
port: 8004
health: /health
"""
from __future__ import annotations
import os
from circuitforge_core.stt.backends.base import (
STTBackend,
STTResult,
STTSegment,
make_stt_backend,
)
from circuitforge_core.stt.backends.mock import MockSTTBackend
_backend: STTBackend | None = None
def _get_backend() -> STTBackend:
global _backend
if _backend is None:
model_path = os.environ.get("CF_STT_MODEL", "mock")
mock = model_path == "mock" or os.environ.get("CF_STT_MOCK", "") == "1"
_backend = make_stt_backend(model_path, mock=mock)
return _backend
def transcribe(
audio: bytes,
*,
language: str | None = None,
confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
) -> STTResult:
"""Transcribe audio bytes using the process-level backend."""
return _get_backend().transcribe(
audio, language=language, confidence_threshold=confidence_threshold
)
def reset_backend() -> None:
"""Reset the process-level singleton. Test teardown only."""
global _backend
_backend = None
__all__ = [
"STTBackend",
"STTResult",
"STTSegment",
"MockSTTBackend",
"make_stt_backend",
"transcribe",
"reset_backend",
]

View file

@ -0,0 +1,150 @@
"""
circuitforge_core.stt.app cf-stt FastAPI service.
Managed by cf-orch as a process-type service. cf-orch starts this via:
python -m circuitforge_core.stt.app \
--model /Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash> \
--port 8004 \
--gpu-id 0
Endpoints:
GET /health {"status": "ok", "model": "<name>", "vram_mb": <n>}
POST /transcribe STTTranscribeResponse (multipart: audio file)
Audio format: any format ffmpeg understands (WAV, MP3, OGG, FLAC).
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
import uvicorn
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from circuitforge_core.stt.backends.base import STTResult, make_stt_backend
logger = logging.getLogger(__name__)
# ── Response model (mirrors circuitforge_orch.contracts.stt.STTTranscribeResponse) ──
class TranscribeResponse(BaseModel):
text: str
confidence: float
below_threshold: bool
language: str | None = None
duration_s: float | None = None
segments: list[dict] = []
model: str = ""
# ── App factory ───────────────────────────────────────────────────────────────
def create_app(
model_path: str,
device: str = "cuda",
compute_type: str = "float16",
confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
mock: bool = False,
) -> FastAPI:
app = FastAPI(title="cf-stt", version="0.1.0")
backend = make_stt_backend(
model_path, device=device, compute_type=compute_type, mock=mock
)
logger.info("cf-stt ready: model=%r vram=%dMB", backend.model_name, backend.vram_mb)
@app.get("/health")
async def health() -> dict:
return {"status": "ok", "model": backend.model_name, "vram_mb": backend.vram_mb}
@app.post("/transcribe", response_model=TranscribeResponse)
async def transcribe(
audio: UploadFile = File(..., description="Audio file (WAV, MP3, OGG, FLAC, ...)"),
language: str | None = Form(None, description="BCP-47 language code hint, e.g. 'en'"),
confidence_threshold_override: float | None = Form(
None,
description="Override default confidence threshold for this request.",
),
) -> TranscribeResponse:
audio_bytes = await audio.read()
if not audio_bytes:
raise HTTPException(status_code=400, detail="Empty audio file")
threshold = confidence_threshold_override or confidence_threshold
try:
result = backend.transcribe(
audio_bytes, language=language, confidence_threshold=threshold
)
except Exception as exc:
logger.exception("Transcription failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return TranscribeResponse(
text=result.text,
confidence=result.confidence,
below_threshold=result.below_threshold,
language=result.language,
duration_s=result.duration_s,
segments=[
{
"start_s": s.start_s,
"end_s": s.end_s,
"text": s.text,
"confidence": s.confidence,
}
for s in result.segments
],
model=result.model,
)
return app
# ── CLI entry point ───────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(description="cf-stt — CircuitForge STT service")
parser.add_argument("--model", required=True,
help="Model path or size name (e.g. 'medium', or full local path)")
parser.add_argument("--port", type=int, default=8004)
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--gpu-id", type=int, default=0,
help="CUDA device index (sets CUDA_VISIBLE_DEVICES)")
parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"])
parser.add_argument("--compute-type", default="float16",
choices=["float16", "int8", "int8_float16", "float32"],
help="Quantisation / compute type passed to faster-whisper")
parser.add_argument("--confidence-threshold", type=float,
default=STTResult.CONFIDENCE_DEFAULT_THRESHOLD)
parser.add_argument("--mock", action="store_true",
help="Run with mock backend (no GPU, for testing)")
args = parser.parse_args()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
# Let cf-orch pass --gpu-id; map to CUDA_VISIBLE_DEVICES so the process
# only sees its assigned GPU. This prevents accidental multi-GPU usage.
if args.device == "cuda" and not args.mock:
os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))
mock = args.mock or os.environ.get("CF_STT_MOCK", "") == "1"
app = create_app(
model_path=args.model,
device=args.device,
compute_type=args.compute_type,
confidence_threshold=args.confidence_threshold,
mock=mock,
)
uvicorn.run(app, host=args.host, port=args.port, log_level="info")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,4 @@
from .base import STTBackend, STTResult, STTSegment, make_stt_backend
from .mock import MockSTTBackend
__all__ = ["STTBackend", "STTResult", "STTSegment", "make_stt_backend", "MockSTTBackend"]

View file

@ -0,0 +1,109 @@
# circuitforge_core/stt/backends/base.py — STTBackend Protocol + factory
#
# MIT licensed. The Protocol and mock are always importable without GPU deps.
# Real backends require optional extras:
# pip install -e "circuitforge-core[stt-faster-whisper]"
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import Protocol, runtime_checkable
# ── Result types ──────────────────────────────────────────────────────────────
@dataclass(frozen=True)
class STTSegment:
"""Word- or phrase-level segment (included when the backend supports it)."""
start_s: float
end_s: float
text: str
confidence: float
@dataclass(frozen=True)
class STTResult:
"""
Standard result from any STTBackend.transcribe() call.
confidence is normalised to 0.01.0 regardless of the backend's native metric.
below_threshold is True when confidence < the configured threshold (default 0.75).
This flag is safety-critical for products like Osprey: DTMF must NOT be sent
when below_threshold is True.
"""
text: str
confidence: float # 0.01.0
below_threshold: bool
language: str | None = None
duration_s: float | None = None
segments: list[STTSegment] = field(default_factory=list)
model: str = ""
CONFIDENCE_DEFAULT_THRESHOLD: float = 0.75
# ── Protocol ──────────────────────────────────────────────────────────────────
@runtime_checkable
class STTBackend(Protocol):
"""
Abstract interface for speech-to-text backends.
All backends load their model once at construction time and are safe to
call concurrently (the model weights are read-only after load).
"""
def transcribe(
self,
audio: bytes,
*,
language: str | None = None,
confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
) -> STTResult:
"""Synchronous transcription. audio is raw PCM or any format ffmpeg understands."""
...
@property
def model_name(self) -> str:
"""Identifier for the loaded model (path stem or size name)."""
...
@property
def vram_mb(self) -> int:
"""Approximate VRAM footprint in MB. Used by cf-orch service registry."""
...
# ── Factory ───────────────────────────────────────────────────────────────────
def make_stt_backend(
model_path: str,
backend: str | None = None,
mock: bool | None = None,
device: str = "cuda",
compute_type: str = "float16",
) -> STTBackend:
"""
Return an STTBackend for the given model.
mock=True or CF_STT_MOCK=1 MockSTTBackend (no GPU, no model file needed)
backend="faster-whisper" FasterWhisperBackend (default)
device and compute_type are passed through to the backend and ignored by mock.
"""
use_mock = mock if mock is not None else os.environ.get("CF_STT_MOCK", "") == "1"
if use_mock:
from circuitforge_core.stt.backends.mock import MockSTTBackend
return MockSTTBackend(model_name=model_path)
resolved = backend or os.environ.get("CF_STT_BACKEND", "faster-whisper")
if resolved == "faster-whisper":
from circuitforge_core.stt.backends.faster_whisper import FasterWhisperBackend
return FasterWhisperBackend(
model_path=model_path, device=device, compute_type=compute_type
)
raise ValueError(
f"Unknown STT backend {resolved!r}. "
"Expected 'faster-whisper'. Set CF_STT_BACKEND or pass backend= explicitly."
)

View file

@ -0,0 +1,139 @@
# circuitforge_core/stt/backends/faster_whisper.py — FasterWhisperBackend
#
# MIT licensed. Requires: pip install -e "circuitforge-core[stt-faster-whisper]"
#
# Model path can be:
# - A size name: "base", "small", "medium", "large-v3"
# (faster-whisper downloads and caches it on first use)
# - A local path: "/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/..."
# (preferred for air-gapped nodes — no download needed)
from __future__ import annotations
import io
import logging
import os
import tempfile
from circuitforge_core.stt.backends.base import STTResult, STTSegment
logger = logging.getLogger(__name__)
# VRAM estimates by model size. Used by cf-orch for VRAM budgeting.
_VRAM_MB_BY_SIZE: dict[str, int] = {
"tiny": 200,
"base": 350,
"small": 600,
"medium": 1024,
"large": 2048,
"large-v2": 2048,
"large-v3": 2048,
"distil-large-v3": 1500,
}
# Aggregate confidence from per-segment no_speech_prob values.
# faster-whisper doesn't expose a direct confidence score, so we invert the
# mean no_speech_prob as a proxy. This is conservative but directionally correct.
def _aggregate_confidence(segments: list) -> float:
if not segments:
return 0.0
probs = [max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)) for s in segments]
return sum(probs) / len(probs)
class FasterWhisperBackend:
"""
faster-whisper STT backend.
Thread-safe after construction: WhisperModel internally manages its own
CUDA context and is safe to call from multiple threads.
"""
def __init__(
self,
model_path: str,
device: str = "cuda",
compute_type: str = "float16",
) -> None:
try:
from faster_whisper import WhisperModel
except ImportError as exc:
raise ImportError(
"faster-whisper is not installed. "
"Run: pip install -e 'circuitforge-core[stt-faster-whisper]'"
) from exc
logger.info("Loading faster-whisper model from %r (device=%s)", model_path, device)
self._model_path = model_path
self._device = device
self._compute_type = compute_type
self._model = WhisperModel(model_path, device=device, compute_type=compute_type)
logger.info("faster-whisper model ready")
# Determine VRAM footprint from model name/path stem.
stem = os.path.basename(model_path.rstrip("/")).lower()
self._vram_mb = next(
(v for k, v in _VRAM_MB_BY_SIZE.items() if k in stem),
1024, # conservative default if size can't be inferred
)
def transcribe(
self,
audio: bytes,
*,
language: str | None = None,
confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
) -> STTResult:
"""
Transcribe raw audio bytes.
audio can be any format ffmpeg understands (WAV, MP3, OGG, FLAC, etc.).
faster-whisper writes audio to a temp file internally; we follow the
same pattern to avoid holding the bytes in memory longer than needed.
"""
with tempfile.NamedTemporaryFile(suffix=".audio", delete=False) as tmp:
tmp.write(audio)
tmp_path = tmp.name
try:
segments_gen, info = self._model.transcribe(
tmp_path,
language=language,
word_timestamps=True,
vad_filter=True,
)
segments = list(segments_gen)
finally:
os.unlink(tmp_path)
text = " ".join(s.text.strip() for s in segments).strip()
confidence = _aggregate_confidence(segments)
duration_s = info.duration if hasattr(info, "duration") else None
detected_language = getattr(info, "language", language)
stt_segments = [
STTSegment(
start_s=s.start,
end_s=s.end,
text=s.text.strip(),
confidence=max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)),
)
for s in segments
]
return STTResult(
text=text,
confidence=confidence,
below_threshold=confidence < confidence_threshold,
language=detected_language,
duration_s=duration_s,
segments=stt_segments,
model=self._model_path,
)
@property
def model_name(self) -> str:
return self._model_path
@property
def vram_mb(self) -> int:
return self._vram_mb

View file

@ -0,0 +1,54 @@
# circuitforge_core/stt/backends/mock.py — MockSTTBackend
#
# MIT licensed. No GPU, no model file required.
# Used in tests and CI, and when CF_STT_MOCK=1.
from __future__ import annotations
from circuitforge_core.stt.backends.base import STTBackend, STTResult
class MockSTTBackend:
"""
Deterministic mock STT backend for testing.
Returns a fixed transcript so tests can assert on the response shape
without needing a GPU or a model file.
"""
def __init__(
self,
model_name: str = "mock",
fixed_text: str = "mock transcription",
fixed_confidence: float = 0.95,
) -> None:
self._model_name = model_name
self._fixed_text = fixed_text
self._fixed_confidence = fixed_confidence
def transcribe(
self,
audio: bytes,
*,
language: str | None = None,
confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
) -> STTResult:
return STTResult(
text=self._fixed_text,
confidence=self._fixed_confidence,
below_threshold=self._fixed_confidence < confidence_threshold,
language=language or "en",
duration_s=float(len(audio)) / 32000, # rough estimate: 16kHz 16-bit mono
model=self._model_name,
)
@property
def model_name(self) -> str:
return self._model_name
@property
def vram_mb(self) -> int:
return 0
# Satisfy the Protocol at import time (no GPU needed)
assert isinstance(MockSTTBackend(), STTBackend)

View file

@ -18,6 +18,27 @@ manage = [
"platformdirs>=4.0",
"typer[all]>=0.12",
]
text-llamacpp = [
"llama-cpp-python>=0.2.0",
]
text-transformers = [
"torch>=2.0",
"transformers>=4.40",
"accelerate>=0.27",
]
text-transformers-4bit = [
"circuitforge-core[text-transformers]",
"bitsandbytes>=0.43",
]
stt-faster-whisper = [
"faster-whisper>=1.0",
]
stt-service = [
"circuitforge-core[stt-faster-whisper]",
"fastapi>=0.110",
"uvicorn[standard]>=0.29",
"python-multipart>=0.0.9",
]
dev = [
"circuitforge-core[manage]",
"pytest>=8.0",