feat(stt): add cf-stt module — FasterWhisperBackend + managed FastAPI app
- STTBackend Protocol + STTResult/STTSegment frozen dataclasses (base.py)
- MockSTTBackend for CI/tests (no GPU needed, CF_STT_MOCK=1)
- FasterWhisperBackend: loads model once, thread-safe, VRAM estimate by model size
- app.py: FastAPI service runnable as managed process by cf-orch
POST /transcribe (multipart audio) → STTTranscribeResponse-compatible JSON
GET /health → {status, model, vram_mb}
- __init__.py: process-level singleton + transcribe() convenience fn
- pyproject.toml: stt-faster-whisper + stt-service optional dep groups
This commit is contained in:
parent
5766fa82ab
commit
67493048e2
7 changed files with 556 additions and 0 deletions
79
circuitforge_core/stt/__init__.py
Normal file
79
circuitforge_core/stt/__init__.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
"""
|
||||||
|
circuitforge_core.stt — Speech-to-text service module.
|
||||||
|
|
||||||
|
Quick start (mock mode — no GPU or model required):
|
||||||
|
|
||||||
|
import os; os.environ["CF_STT_MOCK"] = "1"
|
||||||
|
from circuitforge_core.stt import transcribe
|
||||||
|
|
||||||
|
result = transcribe(open("audio.wav", "rb").read())
|
||||||
|
print(result.text, result.confidence)
|
||||||
|
|
||||||
|
Real inference (faster-whisper):
|
||||||
|
|
||||||
|
export CF_STT_MODEL=/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash>
|
||||||
|
from circuitforge_core.stt import transcribe
|
||||||
|
|
||||||
|
cf-orch service profile:
|
||||||
|
|
||||||
|
service_type: cf-stt
|
||||||
|
max_mb: 1024 (medium); 600 (base/small)
|
||||||
|
max_concurrent: 3
|
||||||
|
shared: true
|
||||||
|
managed:
|
||||||
|
exec: python -m circuitforge_core.stt.app
|
||||||
|
args: --model <path> --port {port} --gpu-id {gpu_id}
|
||||||
|
port: 8004
|
||||||
|
health: /health
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import (
|
||||||
|
STTBackend,
|
||||||
|
STTResult,
|
||||||
|
STTSegment,
|
||||||
|
make_stt_backend,
|
||||||
|
)
|
||||||
|
from circuitforge_core.stt.backends.mock import MockSTTBackend
|
||||||
|
|
||||||
|
_backend: STTBackend | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend() -> STTBackend:
    """Return the process-level backend, creating it lazily on first use.

    The backend choice comes from the environment: CF_STT_MODEL names the
    model (default "mock"), and CF_STT_MOCK=1 — or the literal model name
    "mock" — forces the mock backend.
    """
    global _backend
    if _backend is not None:
        return _backend
    model_path = os.environ.get("CF_STT_MODEL", "mock")
    use_mock = os.environ.get("CF_STT_MOCK", "") == "1" or model_path == "mock"
    _backend = make_stt_backend(model_path, mock=use_mock)
    return _backend
||||||
|
|
||||||
|
def transcribe(
    audio: bytes,
    *,
    language: str | None = None,
    confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
) -> STTResult:
    """Transcribe raw audio bytes via the shared process-level backend.

    Thin convenience wrapper: resolves the singleton backend and forwards
    every argument unchanged.
    """
    backend = _get_backend()
    return backend.transcribe(
        audio,
        language=language,
        confidence_threshold=confidence_threshold,
    )
|
||||||
|
|
||||||
|
def reset_backend() -> None:
    """Drop the cached process-level backend (test teardown only).

    The next transcribe() call will rebuild the backend from the current
    environment variables.
    """
    global _backend
    _backend = None
||||||
|
|
||||||
|
# Public API of circuitforge_core.stt.
__all__ = [
    "STTBackend",
    "STTResult",
    "STTSegment",
    "MockSTTBackend",
    "make_stt_backend",
    "transcribe",
    "reset_backend",
]
|
||||||
150
circuitforge_core/stt/app.py
Normal file
150
circuitforge_core/stt/app.py
Normal file
|
|
@ -0,0 +1,150 @@
|
||||||
|
"""
|
||||||
|
circuitforge_core.stt.app — cf-stt FastAPI service.
|
||||||
|
|
||||||
|
Managed by cf-orch as a process-type service. cf-orch starts this via:
|
||||||
|
|
||||||
|
python -m circuitforge_core.stt.app \
|
||||||
|
--model /Library/Assets/LLM/whisper/models/Whisper/faster-whisper/models--Systran--faster-whisper-medium/snapshots/<hash> \
|
||||||
|
--port 8004 \
|
||||||
|
--gpu-id 0
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /health → {"status": "ok", "model": "<name>", "vram_mb": <n>}
|
||||||
|
POST /transcribe → STTTranscribeResponse (multipart: audio file)
|
||||||
|
|
||||||
|
Audio format: any format ffmpeg understands (WAV, MP3, OGG, FLAC).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import STTResult, make_stt_backend
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ── Response model (mirrors circuitforge_orch.contracts.stt.STTTranscribeResponse) ──
|
||||||
|
|
||||||
|
class TranscribeResponse(BaseModel):
    """Wire format returned by POST /transcribe.

    Field-for-field mirror of the backend's STTResult; intended to stay
    compatible with circuitforge_orch.contracts.stt.STTTranscribeResponse.
    """

    text: str  # full transcript
    confidence: float  # normalised 0.0–1.0 (see STTResult)
    below_threshold: bool  # True when confidence fell below the active threshold
    language: str | None = None  # detected or caller-hinted language code
    duration_s: float | None = None  # audio duration when the backend reports it
    # pydantic deep-copies mutable defaults per instance, so `= []` is safe here.
    segments: list[dict] = []
    model: str = ""  # identifier of the model that produced the transcript
|
||||||
|
|
||||||
|
# ── App factory ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def create_app(
    model_path: str,
    device: str = "cuda",
    compute_type: str = "float16",
    confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    mock: bool = False,
) -> FastAPI:
    """Build the cf-stt FastAPI application.

    The backend (and thus the model) is created eagerly so that a failed
    model load aborts startup instead of failing the first request.

    Args:
        model_path: Model size name or local path, passed to the backend.
        device: "cuda" or "cpu".
        compute_type: Quantisation / compute type for faster-whisper.
        confidence_threshold: Default threshold used for the below_threshold
            flag; may be overridden per request.
        mock: Use the mock backend (no GPU or model file needed).
    """
    app = FastAPI(title="cf-stt", version="0.1.0")
    backend = make_stt_backend(
        model_path, device=device, compute_type=compute_type, mock=mock
    )
    logger.info("cf-stt ready: model=%r vram=%dMB", backend.model_name, backend.vram_mb)

    @app.get("/health")
    async def health() -> dict:
        """Liveness probe used by cf-orch's managed-service health check."""
        return {"status": "ok", "model": backend.model_name, "vram_mb": backend.vram_mb}

    @app.post("/transcribe", response_model=TranscribeResponse)
    async def transcribe(
        audio: UploadFile = File(..., description="Audio file (WAV, MP3, OGG, FLAC, ...)"),
        language: str | None = Form(None, description="BCP-47 language code hint, e.g. 'en'"),
        confidence_threshold_override: float | None = Form(
            None,
            description="Override default confidence threshold for this request.",
        ),
    ) -> TranscribeResponse:
        """Transcribe one uploaded audio file and return the full result."""
        audio_bytes = await audio.read()
        if not audio_bytes:
            raise HTTPException(status_code=400, detail="Empty audio file")

        # BUGFIX: the previous `override or default` expression treated a
        # legitimate 0.0 override as "not provided" (0.0 is falsy) and fell
        # back to the default. Only None means "use the configured default".
        threshold = (
            confidence_threshold
            if confidence_threshold_override is None
            else confidence_threshold_override
        )
        try:
            result = backend.transcribe(
                audio_bytes, language=language, confidence_threshold=threshold
            )
        except Exception as exc:
            logger.exception("Transcription failed")
            raise HTTPException(status_code=500, detail=str(exc)) from exc

        return TranscribeResponse(
            text=result.text,
            confidence=result.confidence,
            below_threshold=result.below_threshold,
            language=result.language,
            duration_s=result.duration_s,
            segments=[
                {
                    "start_s": s.start_s,
                    "end_s": s.end_s,
                    "text": s.text,
                    "confidence": s.confidence,
                }
                for s in result.segments
            ],
            model=result.model,
        )

    return app
|
||||||
|
|
||||||
|
# ── CLI entry point ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: parse arguments, build the app, serve it with uvicorn.

    Invoked by cf-orch as `python -m circuitforge_core.stt.app --model ... --port ...`.
    """
    parser = argparse.ArgumentParser(description="cf-stt — CircuitForge STT service")
    parser.add_argument("--model", required=True,
                        help="Model path or size name (e.g. 'medium', or full local path)")
    parser.add_argument("--port", type=int, default=8004)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0,
                        help="CUDA device index (sets CUDA_VISIBLE_DEVICES)")
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"])
    parser.add_argument("--compute-type", default="float16",
                        choices=["float16", "int8", "int8_float16", "float32"],
                        help="Quantisation / compute type passed to faster-whisper")
    parser.add_argument("--confidence-threshold", type=float,
                        default=STTResult.CONFIDENCE_DEFAULT_THRESHOLD)
    parser.add_argument("--mock", action="store_true",
                        help="Run with mock backend (no GPU, for testing)")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    # Let cf-orch pass --gpu-id; map to CUDA_VISIBLE_DEVICES so the process
    # only sees its assigned GPU. This prevents accidental multi-GPU usage.
    # setdefault: an externally provided CUDA_VISIBLE_DEVICES wins over --gpu-id.
    # Must run before create_app(), which loads the model onto the device.
    if args.device == "cuda" and not args.mock:
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))

    # CF_STT_MOCK=1 forces the mock backend even without --mock.
    mock = args.mock or os.environ.get("CF_STT_MOCK", "") == "1"
    app = create_app(
        model_path=args.model,
        device=args.device,
        compute_type=args.compute_type,
        confidence_threshold=args.confidence_threshold,
        mock=mock,
    )

    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()
|
||||||
4
circuitforge_core/stt/backends/__init__.py
Normal file
4
circuitforge_core/stt/backends/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Re-export the backend API so callers can import everything from
# circuitforge_core.stt.backends directly. Only GPU-free modules are
# imported here; real backends stay behind make_stt_backend's lazy imports.
from .base import STTBackend, STTResult, STTSegment, make_stt_backend
from .mock import MockSTTBackend

__all__ = ["STTBackend", "STTResult", "STTSegment", "make_stt_backend", "MockSTTBackend"]
|
||||||
109
circuitforge_core/stt/backends/base.py
Normal file
109
circuitforge_core/stt/backends/base.py
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
# circuitforge_core/stt/backends/base.py — STTBackend Protocol + factory
|
||||||
|
#
|
||||||
|
# MIT licensed. The Protocol and mock are always importable without GPU deps.
|
||||||
|
# Real backends require optional extras:
|
||||||
|
# pip install -e "circuitforge-core[stt-faster-whisper]"
|
||||||
|
from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import ClassVar, Protocol, runtime_checkable
|
||||||
|
|
||||||
|
|
||||||
|
# ── Result types ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class STTSegment:
    """Word- or phrase-level segment (included when the backend supports it)."""
    start_s: float  # segment start, seconds from the beginning of the audio
    end_s: float  # segment end, seconds from the beginning of the audio
    text: str  # transcript text for this segment
    confidence: float  # 0.0–1.0 (faster-whisper derives it as 1 - no_speech_prob)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class STTResult:
    """
    Standard result from any STTBackend.transcribe() call.

    confidence is normalised to 0.0–1.0 regardless of the backend's native metric.
    below_threshold is True when confidence < the configured threshold (default 0.75).
    This flag is safety-critical for products like Osprey: DTMF must NOT be sent
    when below_threshold is True.
    """

    text: str  # full transcript
    confidence: float  # 0.0–1.0, aggregated over segments
    below_threshold: bool  # True => confidence < threshold; treat transcript as unreliable
    language: str | None = None  # detected/hinted language code, if known
    duration_s: float | None = None  # audio duration, if the backend reports it
    segments: list[STTSegment] = field(default_factory=list)  # optional per-segment detail
    model: str = ""  # identifier of the model that produced this result

    # BUGFIX: annotated as ClassVar so @dataclass treats this as a class-level
    # constant. Without ClassVar, the plain `float` annotation made it an extra
    # instance *field* (appearing in __init__, __eq__ and repr) — clearly not
    # intended for a shared default threshold.
    CONFIDENCE_DEFAULT_THRESHOLD: ClassVar[float] = 0.75
|
||||||
|
|
||||||
|
# ── Protocol ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# runtime_checkable allows `isinstance(obj, STTBackend)`; note Python only
# checks member *presence* for such checks, not signatures.
@runtime_checkable
class STTBackend(Protocol):
    """
    Abstract interface for speech-to-text backends.

    All backends load their model once at construction time and are safe to
    call concurrently (the model weights are read-only after load).
    """

    def transcribe(
        self,
        audio: bytes,
        *,
        language: str | None = None,
        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    ) -> STTResult:
        """Synchronous transcription. audio is raw PCM or any format ffmpeg understands.

        language is an optional hint; confidence_threshold controls the
        below_threshold flag on the returned STTResult.
        """
        ...

    @property
    def model_name(self) -> str:
        """Identifier for the loaded model (path stem or size name)."""
        ...

    @property
    def vram_mb(self) -> int:
        """Approximate VRAM footprint in MB. Used by cf-orch service registry."""
        ...
|
||||||
|
|
||||||
|
# ── Factory ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def make_stt_backend(
    model_path: str,
    backend: str | None = None,
    mock: bool | None = None,
    device: str = "cuda",
    compute_type: str = "float16",
) -> STTBackend:
    """Construct an STTBackend for *model_path*.

    Selection rules:
      * mock=True (or CF_STT_MOCK=1 when mock is None) -> MockSTTBackend,
        which needs no GPU and no model file.
      * otherwise the backend name (argument, else CF_STT_BACKEND, else
        "faster-whisper") selects the real implementation.

    device and compute_type are forwarded to the real backend; the mock
    ignores them. Backend modules are imported lazily so the mock path
    never pulls in GPU dependencies.

    Raises ValueError for an unrecognised backend name.
    """
    if mock is None:
        mock = os.environ.get("CF_STT_MOCK", "") == "1"
    if mock:
        from circuitforge_core.stt.backends.mock import MockSTTBackend

        return MockSTTBackend(model_name=model_path)

    chosen = backend if backend else os.environ.get("CF_STT_BACKEND", "faster-whisper")
    if chosen == "faster-whisper":
        from circuitforge_core.stt.backends.faster_whisper import FasterWhisperBackend

        return FasterWhisperBackend(
            model_path=model_path, device=device, compute_type=compute_type
        )

    raise ValueError(
        f"Unknown STT backend {chosen!r}. "
        "Expected 'faster-whisper'. Set CF_STT_BACKEND or pass backend= explicitly."
    )
|
||||||
139
circuitforge_core/stt/backends/faster_whisper.py
Normal file
139
circuitforge_core/stt/backends/faster_whisper.py
Normal file
|
|
@ -0,0 +1,139 @@
|
||||||
|
# circuitforge_core/stt/backends/faster_whisper.py — FasterWhisperBackend
|
||||||
|
#
|
||||||
|
# MIT licensed. Requires: pip install -e "circuitforge-core[stt-faster-whisper]"
|
||||||
|
#
|
||||||
|
# Model path can be:
|
||||||
|
# - A size name: "base", "small", "medium", "large-v3"
|
||||||
|
# (faster-whisper downloads and caches it on first use)
|
||||||
|
# - A local path: "/Library/Assets/LLM/whisper/models/Whisper/faster-whisper/..."
|
||||||
|
# (preferred for air-gapped nodes — no download needed)
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import STTResult, STTSegment
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# VRAM estimates by model size. Used by cf-orch for VRAM budgeting.
|
||||||
|
_VRAM_MB_BY_SIZE: dict[str, int] = {
|
||||||
|
"tiny": 200,
|
||||||
|
"base": 350,
|
||||||
|
"small": 600,
|
||||||
|
"medium": 1024,
|
||||||
|
"large": 2048,
|
||||||
|
"large-v2": 2048,
|
||||||
|
"large-v3": 2048,
|
||||||
|
"distil-large-v3": 1500,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Aggregate confidence from per-segment no_speech_prob values.
|
||||||
|
# faster-whisper doesn't expose a direct confidence score, so we invert the
|
||||||
|
# mean no_speech_prob as a proxy. This is conservative but directionally correct.
|
||||||
|
def _aggregate_confidence(segments: list) -> float:
|
||||||
|
if not segments:
|
||||||
|
return 0.0
|
||||||
|
probs = [max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)) for s in segments]
|
||||||
|
return sum(probs) / len(probs)
|
||||||
|
|
||||||
|
|
||||||
|
class FasterWhisperBackend:
    """
    faster-whisper STT backend.

    Thread-safe after construction: WhisperModel internally manages its own
    CUDA context and is safe to call from multiple threads.
    """

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        compute_type: str = "float16",
    ) -> None:
        """Load the model eagerly; raises ImportError if the extra is missing.

        model_path may be a size name ("medium") or a local snapshot path.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise ImportError(
                "faster-whisper is not installed. "
                "Run: pip install -e 'circuitforge-core[stt-faster-whisper]'"
            ) from exc

        logger.info("Loading faster-whisper model from %r (device=%s)", model_path, device)
        self._model_path = model_path
        self._device = device
        self._compute_type = compute_type
        self._model = WhisperModel(model_path, device=device, compute_type=compute_type)
        logger.info("faster-whisper model ready")

        # Determine VRAM footprint from model name/path stem.
        # BUGFIX: match the longest size keys first. A plain in-order
        # substring scan resolved "distil-large-v3" to the shorter "large"
        # key (2048 MB) instead of its own 1500 MB entry.
        stem = os.path.basename(model_path.rstrip("/")).lower()
        self._vram_mb = next(
            (
                vram
                for size, vram in sorted(
                    _VRAM_MB_BY_SIZE.items(), key=lambda kv: len(kv[0]), reverse=True
                )
                if size in stem
            ),
            1024,  # conservative default if size can't be inferred
        )

    def transcribe(
        self,
        audio: bytes,
        *,
        language: str | None = None,
        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    ) -> STTResult:
        """
        Transcribe raw audio bytes.

        audio can be any format ffmpeg understands (WAV, MP3, OGG, FLAC, etc.).
        The bytes are spilled to a temp file for faster-whisper to read; the
        file is removed even when transcription raises.
        """
        with tempfile.NamedTemporaryFile(suffix=".audio", delete=False) as tmp:
            tmp.write(audio)
            tmp_path = tmp.name

        try:
            segments_gen, info = self._model.transcribe(
                tmp_path,
                language=language,
                word_timestamps=True,
                vad_filter=True,
            )
            # Materialise inside the try: decoding is lazy and still reads
            # the temp file while the generator is consumed.
            segments = list(segments_gen)
        finally:
            os.unlink(tmp_path)

        text = " ".join(s.text.strip() for s in segments).strip()
        confidence = _aggregate_confidence(segments)
        duration_s = getattr(info, "duration", None)
        detected_language = getattr(info, "language", language)

        stt_segments = [
            STTSegment(
                start_s=s.start,
                end_s=s.end,
                text=s.text.strip(),
                # Same proxy as _aggregate_confidence, per segment.
                confidence=max(0.0, 1.0 - getattr(s, "no_speech_prob", 0.0)),
            )
            for s in segments
        ]

        return STTResult(
            text=text,
            confidence=confidence,
            below_threshold=confidence < confidence_threshold,
            language=detected_language,
            duration_s=duration_s,
            segments=stt_segments,
            model=self._model_path,
        )

    @property
    def model_name(self) -> str:
        """The model path or size name given at construction."""
        return self._model_path

    @property
    def vram_mb(self) -> int:
        """Approximate VRAM footprint in MB, inferred from the model size."""
        return self._vram_mb
|
||||||
54
circuitforge_core/stt/backends/mock.py
Normal file
54
circuitforge_core/stt/backends/mock.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
# circuitforge_core/stt/backends/mock.py — MockSTTBackend
|
||||||
|
#
|
||||||
|
# MIT licensed. No GPU, no model file required.
|
||||||
|
# Used in tests and CI, and when CF_STT_MOCK=1.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from circuitforge_core.stt.backends.base import STTBackend, STTResult
|
||||||
|
|
||||||
|
|
||||||
|
class MockSTTBackend:
    """Deterministic, dependency-free STT backend for tests and CI.

    Always returns the same transcript and confidence, so callers can assert
    on the response shape without a GPU or a model file on disk.
    """

    def __init__(
        self,
        model_name: str = "mock",
        fixed_text: str = "mock transcription",
        fixed_confidence: float = 0.95,
    ) -> None:
        self._fixed_confidence = fixed_confidence
        self._fixed_text = fixed_text
        self._model_name = model_name

    def transcribe(
        self,
        audio: bytes,
        *,
        language: str | None = None,
        confidence_threshold: float = STTResult.CONFIDENCE_DEFAULT_THRESHOLD,
    ) -> STTResult:
        """Return the canned result; duration is estimated from the byte count."""
        # Rough duration estimate assuming 16 kHz, 16-bit mono (32,000 bytes/s).
        estimated_duration = float(len(audio)) / 32000
        return STTResult(
            text=self._fixed_text,
            confidence=self._fixed_confidence,
            below_threshold=self._fixed_confidence < confidence_threshold,
            language=language or "en",
            duration_s=estimated_duration,
            model=self._model_name,
        )

    @property
    def model_name(self) -> str:
        """Name supplied at construction time."""
        return self._model_name

    @property
    def vram_mb(self) -> int:
        """The mock loads no model, so its VRAM footprint is zero."""
        return 0


# Import-time sanity check that the mock structurally satisfies the
# STTBackend protocol (no GPU needed).
assert isinstance(MockSTTBackend(), STTBackend)
|
||||||
|
|
@ -18,6 +18,27 @@ manage = [
|
||||||
"platformdirs>=4.0",
|
"platformdirs>=4.0",
|
||||||
"typer[all]>=0.12",
|
"typer[all]>=0.12",
|
||||||
]
|
]
|
||||||
|
text-llamacpp = [
|
||||||
|
"llama-cpp-python>=0.2.0",
|
||||||
|
]
|
||||||
|
text-transformers = [
|
||||||
|
"torch>=2.0",
|
||||||
|
"transformers>=4.40",
|
||||||
|
"accelerate>=0.27",
|
||||||
|
]
|
||||||
|
text-transformers-4bit = [
|
||||||
|
"circuitforge-core[text-transformers]",
|
||||||
|
"bitsandbytes>=0.43",
|
||||||
|
]
|
||||||
|
stt-faster-whisper = [
|
||||||
|
"faster-whisper>=1.0",
|
||||||
|
]
|
||||||
|
stt-service = [
|
||||||
|
"circuitforge-core[stt-faster-whisper]",
|
||||||
|
"fastapi>=0.110",
|
||||||
|
"uvicorn[standard]>=0.29",
|
||||||
|
"python-multipart>=0.0.9",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"circuitforge-core[manage]",
|
"circuitforge-core[manage]",
|
||||||
"pytest>=8.0",
|
"pytest>=8.0",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue