From 3075e5d3dacdf363f6cdb6f5fe78009fb6cc7f4b Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Wed, 8 Apr 2026 23:15:05 -0700
Subject: [PATCH] feat(tts): add cf-tts module with ChatterboxTurbo backend and FastAPI service

- TTSBackend Protocol + TTSResult dataclass (audio_bytes, sample_rate, duration_s, format)
- MockTTSBackend: silent WAV clip, no GPU required, Protocol assert at import
- ChatterboxTurboBackend: ResembleAI chatterbox-turbo via chatterbox-tts package
  - from_local() loads model from snapshot dir
  - audio_prompt voice cloning via temp file
  - _encode_audio helper: OGG (default), WAV, MP3 via torchaudio
- circuitforge_core.tts module-level synthesize() singleton (CF_TTS_MODEL / CF_TTS_MOCK)
- FastAPI app: GET /health, POST /synthesize (multipart form, returns audio bytes)
  - default format: ogg (smaller than WAV, no patents)
  - X-Duration-S / X-Model / X-Sample-Rate response headers
  - CLI: --model --port --host --gpu-id --mock
- pyproject.toml: tts-chatterbox + tts-service extras
- Sample rate: 24000 Hz (S3GEN_SR from chatterbox internals)
---
 circuitforge_core/tts/__init__.py            |  88 ++++++++++++++++
 circuitforge_core/tts/app.py                 | 107 +++++++++++++++++++
 circuitforge_core/tts/backends/__init__.py   |   4 +
 circuitforge_core/tts/backends/base.py       |  91 ++++++++++++++++
 circuitforge_core/tts/backends/chatterbox.py |  91 ++++++++++++++++
 circuitforge_core/tts/backends/mock.py       |  58 +++++++++++
 pyproject.toml                               |  10 ++
 7 files changed, 449 insertions(+)
 create mode 100644 circuitforge_core/tts/__init__.py
 create mode 100644 circuitforge_core/tts/app.py
 create mode 100644 circuitforge_core/tts/backends/__init__.py
 create mode 100644 circuitforge_core/tts/backends/base.py
 create mode 100644 circuitforge_core/tts/backends/chatterbox.py
 create mode 100644 circuitforge_core/tts/backends/mock.py

diff --git a/circuitforge_core/tts/__init__.py b/circuitforge_core/tts/__init__.py
new file mode 100644
index 0000000..36ce8b1
--- /dev/null
+++ b/circuitforge_core/tts/__init__.py
@@ -0,0 +1,88 @@
+"""
+circuitforge_core.tts — Text-to-speech service module.
+
+Quick start (mock mode — no GPU or model required):
+
+    import os; os.environ["CF_TTS_MOCK"] = "1"
+    from circuitforge_core.tts import synthesize
+
+    result = synthesize("Hello world")
+    open("out.ogg", "wb").write(result.audio_bytes)
+
+Real inference (chatterbox-turbo):
+
+    export CF_TTS_MODEL=/Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/
+    from circuitforge_core.tts import synthesize
+
+cf-orch service profile:
+
+    service_type: cf-tts
+    max_mb: 768
+    max_concurrent: 1
+    shared: true
+    managed:
+      exec: python -m circuitforge_core.tts.app
+      args: --model --port {port} --gpu-id {gpu_id}
+      port: 8005
+      health: /health
+"""
+from __future__ import annotations
+
+import os
+
+from circuitforge_core.tts.backends.base import (
+    AudioFormat,
+    TTSBackend,
+    TTSResult,
+    make_tts_backend,
+)
+from circuitforge_core.tts.backends.mock import MockTTSBackend
+
+_backend: TTSBackend | None = None
+
+
+def _get_backend() -> TTSBackend:
+    """Return the process-level backend, building it from env vars on first use."""
+    global _backend
+    if _backend is None:
+        model_path = os.environ.get("CF_TTS_MODEL", "mock")
+        mock = model_path == "mock" or os.environ.get("CF_TTS_MOCK", "") == "1"
+        _backend = make_tts_backend(model_path, mock=mock)
+    return _backend
+
+
+def synthesize(
+    text: str,
+    *,
+    exaggeration: float = 0.5,
+    cfg_weight: float = 0.5,
+    temperature: float = 0.8,
+    audio_prompt: bytes | None = None,
+    format: AudioFormat = "ogg",
+) -> TTSResult:
+    """Synthesize speech from text using the process-level backend."""
+    return _get_backend().synthesize(
+        text,
+        exaggeration=exaggeration,
+        cfg_weight=cfg_weight,
+        temperature=temperature,
+        audio_prompt=audio_prompt,
+        format=format,
+    )
+
+
+def reset_backend() -> None:
+    """Reset the process-level singleton. Test teardown only."""
+    global _backend
+    _backend = None
+
+
+__all__ = [
+    "AudioFormat",
+    "TTSBackend",
+    "TTSResult",
+    "MockTTSBackend",
+    "make_tts_backend",
+    "synthesize",
+    "reset_backend",
+]
diff --git a/circuitforge_core/tts/app.py b/circuitforge_core/tts/app.py
new file mode 100644
index 0000000..fb9e9e8
--- /dev/null
+++ b/circuitforge_core/tts/app.py
@@ -0,0 +1,107 @@
+"""
+cf-tts FastAPI service — managed by cf-orch.
+
+Endpoints:
+    GET  /health      → {"status": "ok", "model": str, "vram_mb": int}
+    POST /synthesize  → audio bytes (Content-Type: audio/ogg or audio/wav or audio/mpeg)
+
+Usage:
+    python -m circuitforge_core.tts.app \
+        --model /Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/ \
+        --port 8005 \
+        --gpu-id 0
+"""
+from __future__ import annotations
+
+import argparse
+import os
+from typing import Annotated, Literal
+
+from fastapi import FastAPI, Form, HTTPException, UploadFile
+from fastapi.responses import Response
+
+from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, make_tts_backend
+
+_CONTENT_TYPES: dict[str, str] = {
+    "ogg": "audio/ogg",
+    "wav": "audio/wav",
+    "mp3": "audio/mpeg",
+}
+
+app = FastAPI(title="cf-tts")
+_backend: TTSBackend | None = None
+
+
+@app.get("/health")
+def health() -> dict:
+    """Report readiness plus the loaded model's name and VRAM estimate."""
+    if _backend is None:
+        raise HTTPException(503, detail="backend not initialised")
+    return {"status": "ok", "model": _backend.model_name, "vram_mb": _backend.vram_mb}
+
+
+@app.post("/synthesize")
+async def synthesize(
+    text: Annotated[str, Form()],
+    format: Annotated[AudioFormat, Form()] = "ogg",
+    exaggeration: Annotated[float, Form()] = 0.5,
+    cfg_weight: Annotated[float, Form()] = 0.5,
+    temperature: Annotated[float, Form()] = 0.8,
+    audio_prompt: UploadFile | None = None,
+) -> Response:
+    """Synthesize ``text`` and return the encoded audio with metadata headers."""
+    if _backend is None:
+        raise HTTPException(503, detail="backend not initialised")
+    if not text.strip():
+        raise HTTPException(422, detail="text must not be empty")
+
+    prompt_bytes: bytes | None = None
+    if audio_prompt is not None:
+        prompt_bytes = await audio_prompt.read()
+
+    result = _backend.synthesize(
+        text,
+        exaggeration=exaggeration,
+        cfg_weight=cfg_weight,
+        temperature=temperature,
+        audio_prompt=prompt_bytes,
+        format=format,
+    )
+    return Response(
+        content=result.audio_bytes,
+        media_type=_CONTENT_TYPES.get(result.format, "audio/ogg"),
+        headers={
+            "X-Duration-S": str(round(result.duration_s, 3)),
+            "X-Model": result.model,
+            "X-Sample-Rate": str(result.sample_rate),
+        },
+    )
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the service's command-line options."""
+    p = argparse.ArgumentParser(description="cf-tts service")
+    p.add_argument("--model", required=True)
+    p.add_argument("--port", type=int, default=8005)
+    p.add_argument("--host", default="0.0.0.0")
+    p.add_argument("--gpu-id", type=int, default=0)
+    p.add_argument("--mock", action="store_true")
+    return p.parse_args()
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    args = _parse_args()
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
+
+    mock = args.mock or args.model == "mock"
+    device = "cpu" if mock else "cuda"
+
+    # Already at module scope, so plain assignment rebinds the module global.
+    # (`global _backend` here is a SyntaxError: the name is annotated above.)
+    _backend = make_tts_backend(args.model, mock=mock, device=device)
+    print(f"cf-tts backend ready: {_backend.model_name} ({_backend.vram_mb} MB)")
+
+    uvicorn.run(app, host=args.host, port=args.port)
diff --git a/circuitforge_core/tts/backends/__init__.py b/circuitforge_core/tts/backends/__init__.py
new file mode 100644
index 0000000..4beacd2
--- /dev/null
+++ b/circuitforge_core/tts/backends/__init__.py
@@ -0,0 +1,4 @@
+from .base import AudioFormat, TTSBackend, TTSResult, make_tts_backend
+from .mock import MockTTSBackend
+
+__all__ = ["AudioFormat", "TTSBackend", "TTSResult", "make_tts_backend", "MockTTSBackend"]
diff --git a/circuitforge_core/tts/backends/base.py b/circuitforge_core/tts/backends/base.py
new file mode 100644
index 0000000..fe9859a
--- /dev/null
+++ b/circuitforge_core/tts/backends/base.py
@@ -0,0 +1,91 @@
+"""
+TTSBackend Protocol — backend-agnostic TTS interface.
+
+All backends return TTSResult with audio bytes in the requested format.
+Supported formats: ogg (default, smallest), wav (uncompressed, always works), mp3.
+"""
+from __future__ import annotations
+
+import io
+from dataclasses import dataclass, field
+from typing import Literal, Protocol, runtime_checkable
+
+AudioFormat = Literal["ogg", "wav", "mp3"]
+
+
+@dataclass(frozen=True)
+class TTSResult:
+    """Encoded audio plus the metadata describing it."""
+
+    audio_bytes: bytes
+    sample_rate: int
+    duration_s: float
+    format: AudioFormat = "ogg"
+    model: str = ""
+
+
+@runtime_checkable
+class TTSBackend(Protocol):
+    """Structural interface every TTS backend implements."""
+
+    def synthesize(
+        self,
+        text: str,
+        *,
+        exaggeration: float = 0.5,
+        cfg_weight: float = 0.5,
+        temperature: float = 0.8,
+        audio_prompt: bytes | None = None,
+        format: AudioFormat = "ogg",
+    ) -> TTSResult: ...
+
+    @property
+    def model_name(self) -> str: ...
+
+    @property
+    def vram_mb(self) -> int: ...
+
+
+def _encode_audio(
+    wav_tensor,  # torch.Tensor shape [1, T] or [T]
+    sample_rate: int,
+    format: AudioFormat,
+) -> bytes:
+    """Encode a torch tensor as audio bytes (MP3 falls back to WAV if unavailable)."""
+    import torch
+    import torchaudio
+
+    wav = wav_tensor
+    if wav.dim() == 1:
+        wav = wav.unsqueeze(0)
+    wav = wav.to(torch.float32).cpu()
+
+    buf = io.BytesIO()
+    if format == "wav":
+        torchaudio.save(buf, wav, sample_rate, format="wav")
+    elif format == "ogg":
+        torchaudio.save(buf, wav, sample_rate, format="ogg", encoding="vorbis")
+    elif format == "mp3":
+        # torchaudio MP3 encode requires ffmpeg backend; fall back to wav on failure
+        try:
+            torchaudio.save(buf, wav, sample_rate, format="mp3")
+        except Exception:
+            buf = io.BytesIO()
+            torchaudio.save(buf, wav, sample_rate, format="wav")
+    else:
+        raise ValueError(f"unsupported audio format: {format!r}")
+    return buf.getvalue()
+
+
+def make_tts_backend(
+    model_path: str,
+    *,
+    mock: bool = False,
+    device: str = "cuda",
+) -> TTSBackend:
+    """Build the mock backend when ``mock`` is set, else the chatterbox backend."""
+    if mock:
+        from circuitforge_core.tts.backends.mock import MockTTSBackend
+        return MockTTSBackend()
+    from circuitforge_core.tts.backends.chatterbox import ChatterboxTurboBackend
+    return ChatterboxTurboBackend(model_path=model_path, device=device)
diff --git a/circuitforge_core/tts/backends/chatterbox.py b/circuitforge_core/tts/backends/chatterbox.py
new file mode 100644
index 0000000..101380c
--- /dev/null
+++ b/circuitforge_core/tts/backends/chatterbox.py
@@ -0,0 +1,91 @@
+"""ChatterboxTurboBackend — ResembleAI chatterbox-turbo TTS via chatterbox-tts package."""
+from __future__ import annotations
+
+import io
+import os
+import tempfile
+
+from circuitforge_core.tts.backends.base import (
+    AudioFormat,
+    TTSBackend,
+    TTSResult,
+    _encode_audio,
+)
+
+_VRAM_MB = 768  # conservative estimate for chatterbox-turbo weights
+
+
+class ChatterboxTurboBackend:
+    """TTSBackend implementation backed by a locally stored chatterbox model."""
+
+    def __init__(self, model_path: str, device: str = "cuda") -> None:
+        os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
+        from chatterbox.models.s3gen import S3GEN_SR
+        from chatterbox.tts import ChatterboxTTS
+
+        self._sr = S3GEN_SR
+        self._device = device
+        self._model = ChatterboxTTS.from_local(model_path, device=device)
+        self._model_path = model_path
+
+    @property
+    def model_name(self) -> str:
+        # normpath drops a trailing slash, which would make basename() empty.
+        return f"chatterbox-turbo@{os.path.basename(os.path.normpath(self._model_path))}"
+
+    @property
+    def vram_mb(self) -> int:
+        return _VRAM_MB
+
+    def synthesize(
+        self,
+        text: str,
+        *,
+        exaggeration: float = 0.5,
+        cfg_weight: float = 0.5,
+        temperature: float = 0.8,
+        audio_prompt: bytes | None = None,
+        format: AudioFormat = "ogg",
+    ) -> TTSResult:
+        """Generate speech for ``text``, optionally cloning the ``audio_prompt`` voice."""
+        audio_prompt_path: str | None = None
+        try:
+            if audio_prompt is not None:
+                # The model takes a path, so stage the prompt bytes on disk. The
+                # file is closed before generate() reads it and removed afterwards,
+                # even when the write or the generation fails.
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+                    audio_prompt_path = tmp.name
+                    tmp.write(audio_prompt)
+            wav = self._model.generate(
+                text,
+                exaggeration=exaggeration,
+                cfg_weight=cfg_weight,
+                temperature=temperature,
+                audio_prompt_path=audio_prompt_path,
+            )
+        finally:
+            if audio_prompt_path is not None:
+                os.unlink(audio_prompt_path)
+
+        duration_s = wav.shape[-1] / self._sr
+        audio_bytes = _encode_audio(wav, self._sr, format)
+        # _encode_audio may fall back to WAV when MP3 encoding is unavailable;
+        # label the result with what was actually produced (WAV starts with RIFF).
+        actual_format: AudioFormat = "wav" if audio_bytes[:4] == b"RIFF" else format
+        return TTSResult(
+            audio_bytes=audio_bytes,
+            sample_rate=self._sr,
+            duration_s=duration_s,
+            format=actual_format,
+            model=self.model_name,
+        )
+
+
+# Checked on the class rather than a bare __new__() instance: before Python 3.12
+# isinstance() against a runtime Protocol evaluates properties via hasattr(), and
+# model_name raises AttributeError when __init__ has not set _model_path.
+assert all(
+    hasattr(ChatterboxTurboBackend, member)
+    for member in ("synthesize", "model_name", "vram_mb")
+), "ChatterboxTurboBackend must satisfy TTSBackend Protocol"
diff --git a/circuitforge_core/tts/backends/mock.py b/circuitforge_core/tts/backends/mock.py
new file mode 100644
index 0000000..b998adc
--- /dev/null
+++ b/circuitforge_core/tts/backends/mock.py
@@ -0,0 +1,58 @@
+"""MockTTSBackend — no GPU, no model required. Returns a silent WAV clip."""
+from __future__ import annotations
+
+import io
+import struct
+import wave
+
+from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, TTSResult
+
+_SAMPLE_RATE = 24000
+
+
+def _silent_wav(duration_s: float = 0.5, sample_rate: int = _SAMPLE_RATE) -> bytes:
+    """Return a mono 16-bit PCM WAV file containing ``duration_s`` seconds of silence."""
+    num_samples = int(duration_s * sample_rate)
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as w:
+        w.setnchannels(1)
+        w.setsampwidth(2)
+        w.setframerate(sample_rate)
+        w.writeframes(bytes(2 * num_samples))  # two zero bytes per 16-bit sample
+    return buf.getvalue()
+
+
+class MockTTSBackend:
+    """Minimal TTSBackend implementation for tests and CI."""
+
+    @property
+    def model_name(self) -> str:
+        return "mock-tts"
+
+    @property
+    def vram_mb(self) -> int:
+        return 0
+
+    def synthesize(
+        self,
+        text: str,
+        *,
+        exaggeration: float = 0.5,
+        cfg_weight: float = 0.5,
+        temperature: float = 0.8,
+        audio_prompt: bytes | None = None,
+        format: AudioFormat = "ogg",
+    ) -> TTSResult:
+        """Return silence sized by word count; always WAV, whatever ``format`` asks for."""
+        duration_s = max(0.1, len(text.split()) * 0.3)
+        audio = _silent_wav(duration_s)
+        return TTSResult(
+            audio_bytes=audio,
+            sample_rate=_SAMPLE_RATE,
+            duration_s=duration_s,
+            format="wav",
+            model=self.model_name,
+        )
+
+
+assert isinstance(MockTTSBackend(), TTSBackend), "MockTTSBackend must satisfy TTSBackend Protocol"
diff --git a/pyproject.toml b/pyproject.toml
index cb0ffb6..d34a5a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,16 @@ stt-service = [
     "uvicorn[standard]>=0.29",
     "python-multipart>=0.0.9",
 ]
+tts-chatterbox = [
+    "chatterbox-tts>=0.1",
+    "torchaudio>=2.0",
+]
+tts-service = [
+    "circuitforge-core[tts-chatterbox]",
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.29",
+    "python-multipart>=0.0.9",
+]
 dev = [
     "circuitforge-core[manage]",
     "pytest>=8.0",