feat(tts): add cf-tts module with ChatterboxTurbo backend and FastAPI service
Some checks are pending
CI / test (push) Waiting to run
Mirror / mirror (push) Waiting to run

- TTSBackend Protocol + TTSResult dataclass (audio_bytes, sample_rate, duration_s, format)
- MockTTSBackend: silent WAV clip, no GPU required, Protocol assert at import
- ChatterboxTurboBackend: ResembleAI chatterbox-turbo via chatterbox-tts package
  - from_local() loads model from snapshot dir
  - audio_prompt voice cloning via temp file
  - _encode_audio helper: OGG (default), WAV, MP3 via torchaudio
- circuitforge_core.tts module-level synthesize() singleton (CF_TTS_MODEL / CF_TTS_MOCK)
- FastAPI app: GET /health, POST /synthesize (multipart form, returns audio bytes)
  - default format: ogg (smaller than WAV, no patents)
  - X-Duration-S / X-Model / X-Sample-Rate response headers
  - CLI: --model --port --host --gpu-id --mock
- pyproject.toml: tts-chatterbox + tts-service extras
- Sample rate: 24000 Hz (S3GEN_SR from chatterbox internals)
This commit is contained in:
pyr0ball 2026-04-08 23:15:05 -07:00
parent 67493048e2
commit 3075e5d3da
7 changed files with 426 additions and 0 deletions

View file

@ -0,0 +1,87 @@
"""
circuitforge_core.tts Text-to-speech service module.
Quick start (mock mode no GPU or model required):
import os; os.environ["CF_TTS_MOCK"] = "1"
from circuitforge_core.tts import synthesize
result = synthesize("Hello world")
open("out.ogg", "wb").write(result.audio_bytes)
Real inference (chatterbox-turbo):
export CF_TTS_MODEL=/Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/<hash>
from circuitforge_core.tts import synthesize
cf-orch service profile:
service_type: cf-tts
max_mb: 768
max_concurrent: 1
shared: true
managed:
exec: python -m circuitforge_core.tts.app
args: --model <path> --port {port} --gpu-id {gpu_id}
port: 8005
health: /health
"""
from __future__ import annotations
import os
from circuitforge_core.tts.backends.base import (
AudioFormat,
TTSBackend,
TTSResult,
make_tts_backend,
)
from circuitforge_core.tts.backends.mock import MockTTSBackend
_backend: TTSBackend | None = None  # process-level singleton; see _get_backend()


def _get_backend() -> TTSBackend:
    """Return the process-wide backend, creating it lazily on first use.

    CF_TTS_MODEL selects the model snapshot path ("mock" or unset selects
    the mock backend); CF_TTS_MOCK=1 forces mock mode regardless.
    """
    global _backend
    if _backend is None:
        model_path = os.environ.get("CF_TTS_MODEL", "mock")
        use_mock = os.environ.get("CF_TTS_MOCK", "") == "1" or model_path == "mock"
        _backend = make_tts_backend(model_path, mock=use_mock)
    return _backend
def synthesize(
    text: str,
    *,
    exaggeration: float = 0.5,
    cfg_weight: float = 0.5,
    temperature: float = 0.8,
    audio_prompt: bytes | None = None,
    format: AudioFormat = "ogg",
) -> TTSResult:
    """Synthesize speech from text using the process-level backend.

    The singleton backend is created on the first call; all keyword
    arguments are forwarded to the backend unchanged.
    """
    backend = _get_backend()
    return backend.synthesize(
        text,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
        audio_prompt=audio_prompt,
        format=format,
    )
def reset_backend() -> None:
    """Drop the process-level singleton (intended for test teardown only)."""
    global _backend
    _backend = None
# Public API of circuitforge_core.tts: types, the mock backend, the factory,
# and the module-level singleton helpers.
__all__ = [
    "AudioFormat",
    "TTSBackend",
    "TTSResult",
    "MockTTSBackend",
    "make_tts_backend",
    "synthesize",
    "reset_backend",
]

View file

@ -0,0 +1,103 @@
"""
cf-tts FastAPI service managed by cf-orch.
Endpoints:
GET /health {"status": "ok", "model": str, "vram_mb": int}
POST /synthesize audio bytes (Content-Type: audio/ogg or audio/wav or audio/mpeg)
Usage:
python -m circuitforge_core.tts.app \
--model /Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/<hash> \
--port 8005 \
--gpu-id 0
"""
from __future__ import annotations
import argparse
import os
from typing import Annotated, Literal
from fastapi import FastAPI, Form, HTTPException, UploadFile
from fastapi.responses import Response
from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, make_tts_backend
# Maps TTSResult.format -> HTTP Content-Type for the /synthesize response.
_CONTENT_TYPES: dict[str, str] = {
    "ogg": "audio/ogg",
    "wav": "audio/wav",
    "mp3": "audio/mpeg",
}
app = FastAPI(title="cf-tts")
# Process-level backend; assigned in the __main__ block before the server starts.
_backend: TTSBackend | None = None
@app.get("/health")
def health() -> dict:
if _backend is None:
raise HTTPException(503, detail="backend not initialised")
return {"status": "ok", "model": _backend.model_name, "vram_mb": _backend.vram_mb}
@app.post("/synthesize")
async def synthesize(
text: Annotated[str, Form()],
format: Annotated[AudioFormat, Form()] = "ogg",
exaggeration: Annotated[float, Form()] = 0.5,
cfg_weight: Annotated[float, Form()] = 0.5,
temperature: Annotated[float, Form()] = 0.8,
audio_prompt: UploadFile | None = None,
) -> Response:
if _backend is None:
raise HTTPException(503, detail="backend not initialised")
if not text.strip():
raise HTTPException(422, detail="text must not be empty")
prompt_bytes: bytes | None = None
if audio_prompt is not None:
prompt_bytes = await audio_prompt.read()
result = _backend.synthesize(
text,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
temperature=temperature,
audio_prompt=prompt_bytes,
format=format,
)
return Response(
content=result.audio_bytes,
media_type=_CONTENT_TYPES.get(result.format, "audio/ogg"),
headers={
"X-Duration-S": str(round(result.duration_s, 3)),
"X-Model": result.model,
"X-Sample-Rate": str(result.sample_rate),
},
)
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="cf-tts service")
p.add_argument("--model", required=True)
p.add_argument("--port", type=int, default=8005)
p.add_argument("--host", default="0.0.0.0")
p.add_argument("--gpu-id", type=int, default=0)
p.add_argument("--mock", action="store_true")
return p.parse_args()
if __name__ == "__main__":
import uvicorn
args = _parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
mock = args.mock or args.model == "mock"
device = "cpu" if mock else "cuda"
global _backend
_backend = make_tts_backend(args.model, mock=mock, device=device)
print(f"cf-tts backend ready: {_backend.model_name} ({_backend.vram_mb} MB)")
uvicorn.run(app, host=args.host, port=args.port)

View file

@ -0,0 +1,4 @@
"""Public surface of the TTS backends package (base protocol + mock)."""
from .base import AudioFormat, TTSBackend, TTSResult, make_tts_backend
from .mock import MockTTSBackend

__all__ = ["AudioFormat", "TTSBackend", "TTSResult", "make_tts_backend", "MockTTSBackend"]

View file

@ -0,0 +1,84 @@
"""
TTSBackend Protocol backend-agnostic TTS interface.
All backends return TTSResult with audio bytes in the requested format.
Supported formats: ogg (default, smallest), wav (uncompressed, always works), mp3.
"""
from __future__ import annotations
import io
from dataclasses import dataclass, field
from typing import Literal, Protocol, runtime_checkable
# Supported output containers; "ogg" is the service-wide default.
AudioFormat = Literal["ogg", "wav", "mp3"]
@dataclass(frozen=True)
class TTSResult:
    """Immutable result of a single synthesis call."""

    # Encoded audio in `format` (container bytes, ready to write to disk).
    audio_bytes: bytes
    # Sample rate of the audio in Hz.
    sample_rate: int
    # Clip length in seconds.
    duration_s: float
    # Container/codec of audio_bytes.
    format: AudioFormat = "ogg"
    # Identifier of the backend/model that produced the clip.
    model: str = ""
@runtime_checkable
class TTSBackend(Protocol):
    """Structural interface every TTS backend implements.

    `runtime_checkable` enables the import-time `isinstance` sanity checks
    performed in the concrete backend modules.
    """

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Render *text* to audio; optional voice-clone prompt bytes."""
        ...

    @property
    def model_name(self) -> str:
        """Human-readable backend/model identifier."""
        ...

    @property
    def vram_mb(self) -> int:
        """Approximate GPU memory footprint in MB (0 for CPU/mock)."""
        ...
def _encode_audio(
wav_tensor, # torch.Tensor shape [1, T] or [T]
sample_rate: int,
format: AudioFormat,
) -> bytes:
"""Convert a torch tensor to audio bytes in the requested format."""
import torch
import torchaudio
wav = wav_tensor
if wav.dim() == 1:
wav = wav.unsqueeze(0)
wav = wav.to(torch.float32).cpu()
buf = io.BytesIO()
if format == "wav":
torchaudio.save(buf, wav, sample_rate, format="wav")
elif format == "ogg":
torchaudio.save(buf, wav, sample_rate, format="ogg", encoding="vorbis")
elif format == "mp3":
# torchaudio MP3 encode requires ffmpeg backend; fall back to wav on failure
try:
torchaudio.save(buf, wav, sample_rate, format="mp3")
except Exception:
buf = io.BytesIO()
torchaudio.save(buf, wav, sample_rate, format="wav")
return buf.getvalue()
def make_tts_backend(
    model_path: str,
    *,
    mock: bool = False,
    device: str = "cuda",
) -> TTSBackend:
    """Construct a TTS backend: chatterbox-turbo, or the mock when requested.

    Backend modules are imported lazily so the mock path never touches
    torch/chatterbox.
    """
    if not mock:
        from circuitforge_core.tts.backends.chatterbox import ChatterboxTurboBackend

        return ChatterboxTurboBackend(model_path=model_path, device=device)

    from circuitforge_core.tts.backends.mock import MockTTSBackend

    return MockTTSBackend()

View file

@ -0,0 +1,82 @@
"""ChatterboxTurboBackend — ResembleAI chatterbox-turbo TTS via chatterbox-tts package."""
from __future__ import annotations
import io
import os
import tempfile
from circuitforge_core.tts.backends.base import (
AudioFormat,
TTSBackend,
TTSResult,
_encode_audio,
)
_VRAM_MB = 768  # conservative estimate for chatterbox-turbo weights


class ChatterboxTurboBackend:
    """TTSBackend implementation backed by ResembleAI chatterbox-turbo.

    Loads weights from a local snapshot directory via ChatterboxTTS.from_local;
    supports voice cloning through an optional audio prompt that is spooled
    to a temporary WAV file for the model to read.
    """

    def __init__(self, model_path: str, device: str = "cuda") -> None:
        """Load the model from *model_path* onto *device*.

        Uses setdefault so a launcher that already pinned a GPU via
        CUDA_VISIBLE_DEVICES keeps its choice.
        """
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
        # Lazy imports keep this module importable without chatterbox-tts
        # installed (mock-only environments).
        from chatterbox.models.s3gen import S3GEN_SR
        from chatterbox.tts import ChatterboxTTS
        self._sr = S3GEN_SR  # model output sample rate
        self._device = device
        self._model = ChatterboxTTS.from_local(model_path, device=device)
        self._model_path = model_path

    @property
    def model_name(self) -> str:
        """Identifier reported in /health and TTSResult.model."""
        return f"chatterbox-turbo@{os.path.basename(self._model_path)}"

    @property
    def vram_mb(self) -> int:
        """Approximate GPU footprint in MB (static estimate, not measured)."""
        return _VRAM_MB

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Generate speech for *text* and encode it via _encode_audio.

        If *audio_prompt* bytes are given they are written to a temp WAV
        file whose path is passed to the model as the cloning reference.
        """
        audio_prompt_path: str | None = None
        _tmp = None
        if audio_prompt is not None:
            # delete=False so the path stays valid after flush; the file is
            # removed in the finally block below.
            # NOTE(review): the file is still open while the model reads it
            # by path — fine on POSIX, may fail on Windows; confirm targets.
            _tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            _tmp.write(audio_prompt)
            _tmp.flush()
            audio_prompt_path = _tmp.name
        try:
            wav = self._model.generate(
                text,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                temperature=temperature,
                audio_prompt_path=audio_prompt_path,
            )
        finally:
            if _tmp is not None:
                _tmp.close()
                os.unlink(_tmp.name)
        # Last tensor dimension is the sample count.
        duration_s = wav.shape[-1] / self._sr
        audio_bytes = _encode_audio(wav, self._sr, format)
        return TTSResult(
            audio_bytes=audio_bytes,
            sample_rate=self._sr,
            duration_s=duration_s,
            format=format,
            model=self.model_name,
        )
# Import-time sanity check that the class structurally satisfies the
# runtime-checkable TTSBackend Protocol.  __new__ bypasses __init__, so no
# model is loaded.  NOTE: asserts are stripped under `python -O`.
assert isinstance(
    ChatterboxTurboBackend.__new__(ChatterboxTurboBackend), TTSBackend
), "ChatterboxTurboBackend must satisfy TTSBackend Protocol"

View file

@ -0,0 +1,56 @@
"""MockTTSBackend — no GPU, no model required. Returns a silent WAV clip."""
from __future__ import annotations
import io
import struct
import wave
from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, TTSResult
_SAMPLE_RATE = 24000
def _silent_wav(duration_s: float = 0.5, sample_rate: int = _SAMPLE_RATE) -> bytes:
num_samples = int(duration_s * sample_rate)
buf = io.BytesIO()
with wave.open(buf, "wb") as w:
w.setnchannels(1)
w.setsampwidth(2)
w.setframerate(sample_rate)
w.writeframes(struct.pack(f"<{num_samples}h", *([0] * num_samples)))
return buf.getvalue()
class MockTTSBackend:
    """Minimal TTSBackend implementation for tests and CI."""

    @property
    def model_name(self) -> str:
        return "mock-tts"

    @property
    def vram_mb(self) -> int:
        return 0

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Return a silent clip sized by word count; tuning knobs are ignored.

        Always emits WAV bytes and reports format="wav" regardless of the
        requested format.
        """
        word_count = len(text.split())
        duration_s = max(0.1, word_count * 0.3)
        return TTSResult(
            audio_bytes=_silent_wav(duration_s),
            sample_rate=_SAMPLE_RATE,
            duration_s=duration_s,
            format="wav",
            model=self.model_name,
        )
# Import-time Protocol conformance check (stripped under `python -O`).
assert isinstance(MockTTSBackend(), TTSBackend), "MockTTSBackend must satisfy TTSBackend Protocol"

View file

@ -39,6 +39,16 @@ stt-service = [
"uvicorn[standard]>=0.29",
"python-multipart>=0.0.9",
]
# Chatterbox-turbo inference backend: model/encoding deps only.
tts-chatterbox = [
    "chatterbox-tts>=0.1",
    "torchaudio>=2.0",
]
# Full cf-tts FastAPI service: backend extra plus the web stack.
tts-service = [
    "circuitforge-core[tts-chatterbox]",
    "fastapi>=0.110",
    "uvicorn[standard]>=0.29",
    "python-multipart>=0.0.9",
]
dev = [
"circuitforge-core[manage]",
"pytest>=8.0",