feat(tts): add cf-tts module with ChatterboxTurbo backend and FastAPI service
- TTSBackend Protocol + TTSResult dataclass (audio_bytes, sample_rate, duration_s, format)
- MockTTSBackend: silent WAV clip, no GPU required, Protocol assert at import
- ChatterboxTurboBackend: ResembleAI chatterbox-turbo via chatterbox-tts package
  - from_local() loads model from snapshot dir
  - audio_prompt voice cloning via temp file
- _encode_audio helper: OGG (default), WAV, MP3 via torchaudio
- circuitforge_core.tts module-level synthesize() singleton (CF_TTS_MODEL / CF_TTS_MOCK)
- FastAPI app: GET /health, POST /synthesize (multipart form, returns audio bytes)
  - default format: ogg (smaller than WAV, no patents)
  - X-Duration-S / X-Model / X-Sample-Rate response headers
- CLI: --model --port --host --gpu-id --mock
- pyproject.toml: tts-chatterbox + tts-service extras
- Sample rate: 24000 Hz (S3GEN_SR from chatterbox internals)
This commit is contained in:
parent
67493048e2
commit
3075e5d3da
7 changed files with 426 additions and 0 deletions
87
circuitforge_core/tts/__init__.py
Normal file
87
circuitforge_core/tts/__init__.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
"""
|
||||
circuitforge_core.tts — Text-to-speech service module.
|
||||
|
||||
Quick start (mock mode — no GPU or model required):
|
||||
|
||||
import os; os.environ["CF_TTS_MOCK"] = "1"
|
||||
from circuitforge_core.tts import synthesize
|
||||
|
||||
result = synthesize("Hello world")
|
||||
open("out.ogg", "wb").write(result.audio_bytes)
|
||||
|
||||
Real inference (chatterbox-turbo):
|
||||
|
||||
export CF_TTS_MODEL=/Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/<hash>
|
||||
from circuitforge_core.tts import synthesize
|
||||
|
||||
cf-orch service profile:
|
||||
|
||||
service_type: cf-tts
|
||||
max_mb: 768
|
||||
max_concurrent: 1
|
||||
shared: true
|
||||
managed:
|
||||
exec: python -m circuitforge_core.tts.app
|
||||
args: --model <path> --port {port} --gpu-id {gpu_id}
|
||||
port: 8005
|
||||
health: /health
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
from circuitforge_core.tts.backends.base import (
|
||||
AudioFormat,
|
||||
TTSBackend,
|
||||
TTSResult,
|
||||
make_tts_backend,
|
||||
)
|
||||
from circuitforge_core.tts.backends.mock import MockTTSBackend
|
||||
|
||||
_backend: TTSBackend | None = None
|
||||
|
||||
|
||||
def _get_backend() -> TTSBackend:
    """Return the process-level TTS backend, constructing it on first call.

    The backend is chosen from the environment the first time it is needed:
    ``CF_TTS_MODEL`` supplies the model path (defaulting to ``"mock"``), and
    the mock backend is selected when that value is ``"mock"`` or when
    ``CF_TTS_MOCK`` is ``"1"``. The instance is cached in the module-level
    ``_backend`` and reused by every later call.
    """
    global _backend
    if _backend is not None:
        return _backend

    configured_model = os.environ.get("CF_TTS_MODEL", "mock")
    if configured_model == "mock":
        use_mock = True
    else:
        use_mock = os.environ.get("CF_TTS_MOCK", "") == "1"

    _backend = make_tts_backend(configured_model, mock=use_mock)
    return _backend
|
||||
|
||||
|
||||
def synthesize(
    text: str,
    *,
    exaggeration: float = 0.5,
    cfg_weight: float = 0.5,
    temperature: float = 0.8,
    audio_prompt: bytes | None = None,
    format: AudioFormat = "ogg",
) -> TTSResult:
    """Synthesize speech for *text* with the process-level backend.

    Every keyword argument is forwarded unchanged to the backend's own
    ``synthesize`` method; the backend is obtained from ``_get_backend()``.

    Args:
        text: Text to speak.
        exaggeration: Forwarded to the backend.
        cfg_weight: Forwarded to the backend.
        temperature: Forwarded to the backend.
        audio_prompt: Optional reference audio bytes, forwarded to the backend.
        format: Requested output container (``"ogg"``, ``"wav"`` or ``"mp3"``).

    Returns:
        The ``TTSResult`` produced by the backend.
    """
    backend = _get_backend()
    result = backend.synthesize(
        text,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
        audio_prompt=audio_prompt,
        format=format,
    )
    return result
|
||||
|
||||
|
||||
def reset_backend() -> None:
    """Drop the cached process-level backend (intended for test teardown only).

    After this call the module-level ``_backend`` is ``None`` again.
    """
    global _backend
    _backend = None
|
||||
|
||||
|
||||
__all__ = [
|
||||
"AudioFormat",
|
||||
"TTSBackend",
|
||||
"TTSResult",
|
||||
"MockTTSBackend",
|
||||
"make_tts_backend",
|
||||
"synthesize",
|
||||
"reset_backend",
|
||||
]
|
||||
103
circuitforge_core/tts/app.py
Normal file
103
circuitforge_core/tts/app.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
"""
|
||||
cf-tts FastAPI service — managed by cf-orch.
|
||||
|
||||
Endpoints:
|
||||
GET /health → {"status": "ok", "model": str, "vram_mb": int}
|
||||
POST /synthesize → audio bytes (Content-Type: audio/ogg or audio/wav or audio/mpeg)
|
||||
|
||||
Usage:
|
||||
python -m circuitforge_core.tts.app \
|
||||
--model /Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/<hash> \
|
||||
--port 8005 \
|
||||
--gpu-id 0
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from typing import Annotated, Literal
|
||||
|
||||
from fastapi import FastAPI, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import Response
|
||||
|
||||
from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, make_tts_backend
|
||||
|
||||
_CONTENT_TYPES: dict[str, str] = {
|
||||
"ogg": "audio/ogg",
|
||||
"wav": "audio/wav",
|
||||
"mp3": "audio/mpeg",
|
||||
}
|
||||
|
||||
app = FastAPI(title="cf-tts")
|
||||
_backend: TTSBackend | None = None
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> dict:
    """Report service liveness together with the loaded model's identity.

    Returns:
        ``{"status": "ok", "model": <backend.model_name>, "vram_mb": <backend.vram_mb>}``.

    Raises:
        HTTPException: 503 while the module-level backend has not been set.
    """
    backend = _backend
    if backend is None:
        raise HTTPException(503, detail="backend not initialised")

    payload = {
        "status": "ok",
        "model": backend.model_name,
        "vram_mb": backend.vram_mb,
    }
    return payload
|
||||
|
||||
|
||||
@app.post("/synthesize")
async def synthesize(
    text: Annotated[str, Form()],
    format: Annotated[AudioFormat, Form()] = "ogg",
    exaggeration: Annotated[float, Form()] = 0.5,
    cfg_weight: Annotated[float, Form()] = 0.5,
    temperature: Annotated[float, Form()] = 0.8,
    audio_prompt: UploadFile | None = None,
) -> Response:
    """Synthesize *text* and return the encoded audio as the response body.

    Form fields are forwarded to the backend's ``synthesize``. The response's
    ``Content-Type`` is derived from the format reported on the result (not the
    requested one), and ``X-Duration-S`` / ``X-Model`` / ``X-Sample-Rate``
    headers carry the result metadata.

    Raises:
        HTTPException: 503 if the backend has not been set; 422 if *text* is
            empty or whitespace-only.
    """
    backend = _backend
    if backend is None:
        raise HTTPException(503, detail="backend not initialised")
    if not text.strip():
        raise HTTPException(422, detail="text must not be empty")

    # The optional upload is read fully into memory and handed over as bytes.
    prompt_bytes: bytes | None = (
        None if audio_prompt is None else await audio_prompt.read()
    )

    # NOTE(review): this is a synchronous call made inside the coroutine, so it
    # presumably holds the event loop for the duration of synthesis - confirm
    # that is acceptable for concurrent /health probes.
    result = backend.synthesize(
        text,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
        audio_prompt=prompt_bytes,
        format=format,
    )

    response_headers = {
        "X-Duration-S": str(round(result.duration_s, 3)),
        "X-Model": result.model,
        "X-Sample-Rate": str(result.sample_rate),
    }
    content_type = _CONTENT_TYPES.get(result.format, "audio/ogg")
    return Response(
        content=result.audio_bytes,
        media_type=content_type,
        headers=response_headers,
    )
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
    """Parse the cf-tts service command line (taken from ``sys.argv``).

    Returns:
        A namespace with ``model`` (required), ``port`` (int, default 8005),
        ``host`` (default ``"0.0.0.0"``), ``gpu_id`` (int, default 0) and
        ``mock`` (bool flag, default ``False``).
    """
    parser = argparse.ArgumentParser(description="cf-tts service")

    # Declared in the order they should appear in --help.
    option_specs = (
        ("--model", {"required": True}),
        ("--port", {"type": int, "default": 8005}),
        ("--host", {"default": "0.0.0.0"}),
        ("--gpu-id", {"type": int, "default": 0}),
        ("--mock", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import uvicorn

    args = _parse_args()

    # Expose only the requested GPU to CUDA. This is set before the backend is
    # constructed so that model loading sees the restricted device list.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    mock = args.mock or args.model == "mock"
    device = "cpu" if mock else "cuda"

    # This block already executes at module scope, so a plain assignment
    # rebinds the module-level `_backend` that the route handlers read.
    # A `global _backend` statement here is not just redundant: because the
    # name is assigned (and annotated) earlier in the same module, Python
    # rejects it at compile time with a SyntaxError.
    _backend = make_tts_backend(args.model, mock=mock, device=device)
    print(f"cf-tts backend ready: {_backend.model_name} ({_backend.vram_mb} MB)")

    uvicorn.run(app, host=args.host, port=args.port)
|
||||
4
circuitforge_core/tts/backends/__init__.py
Normal file
4
circuitforge_core/tts/backends/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
from .base import AudioFormat, TTSBackend, TTSResult, make_tts_backend
|
||||
from .mock import MockTTSBackend
|
||||
|
||||
__all__ = ["AudioFormat", "TTSBackend", "TTSResult", "make_tts_backend", "MockTTSBackend"]
|
||||
84
circuitforge_core/tts/backends/base.py
Normal file
84
circuitforge_core/tts/backends/base.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
"""
|
||||
TTSBackend Protocol — backend-agnostic TTS interface.
|
||||
|
||||
All backends return TTSResult with audio bytes in the requested format.
|
||||
Supported formats: ogg (default, smallest), wav (uncompressed, always works), mp3.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Literal, Protocol, runtime_checkable
|
||||
|
||||
# Output containers a backend can be asked to produce.
AudioFormat = Literal["ogg", "wav", "mp3"]


@dataclass(frozen=True)
class TTSResult:
    """Immutable outcome of a single synthesis call."""

    # Encoded audio payload, in the container named by `format`.
    audio_bytes: bytes
    # Sample rate of the audio, in Hz.
    sample_rate: int
    # Length of the clip, in seconds.
    duration_s: float
    # Container the backend reports for `audio_bytes`.
    format: AudioFormat = "ogg"
    # Name of the model that produced the audio (backends pass their model_name).
    model: str = ""
|
||||
|
||||
|
||||
@runtime_checkable
class TTSBackend(Protocol):
    """Structural interface that every TTS backend implements.

    The protocol is runtime-checkable, so ``isinstance(obj, TTSBackend)``
    works; such a check only verifies that the members exist, not that their
    signatures match.
    """

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Turn *text* into audio and return it as a ``TTSResult``.

        ``exaggeration``, ``cfg_weight`` and ``temperature`` are generation
        controls whose interpretation is left to the backend. ``audio_prompt``
        optionally carries reference audio as raw bytes, and ``format`` names
        the requested output container.
        """
        ...

    @property
    def model_name(self) -> str:
        """Identifier of the loaded model."""
        ...

    @property
    def vram_mb(self) -> int:
        """VRAM figure, in MB, that the backend reports for itself."""
        ...
|
||||
|
||||
|
||||
def _encode_audio(
    wav_tensor,  # torch.Tensor shape [1, T] or [T]
    sample_rate: int,
    format: AudioFormat,
) -> bytes:
    """Convert a torch tensor to audio bytes in the requested format.

    Args:
        wav_tensor: Waveform tensor. A 1-D tensor gets a leading dimension
            added; the data is converted to float32 on the CPU before encoding.
        sample_rate: Sample rate passed to ``torchaudio.save``.
        format: ``"wav"``, ``"ogg"`` (encoded as Vorbis) or ``"mp3"``.

    Returns:
        The encoded audio. If MP3 encoding fails, WAV bytes are returned
        instead and a warning is logged; the return value alone does not tell
        the caller that this happened.

    Raises:
        ValueError: If *format* is not one of the supported values. Without
            this check an unknown format would silently yield empty bytes.
    """
    # Validate before the heavy imports so a bad request fails fast.
    if format not in ("wav", "ogg", "mp3"):
        raise ValueError(f"unsupported audio format: {format!r}")

    import logging

    import torch
    import torchaudio

    wav = wav_tensor
    if wav.dim() == 1:
        wav = wav.unsqueeze(0)
    wav = wav.to(torch.float32).cpu()

    buf = io.BytesIO()
    if format == "wav":
        torchaudio.save(buf, wav, sample_rate, format="wav")
    elif format == "ogg":
        torchaudio.save(buf, wav, sample_rate, format="ogg", encoding="vorbis")
    else:  # "mp3"
        # torchaudio MP3 encode requires ffmpeg backend; fall back to wav on failure
        try:
            torchaudio.save(buf, wav, sample_rate, format="mp3")
        except Exception:
            # Deliberate best-effort fallback, but leave a trace: the bytes
            # returned below are WAV even though MP3 was requested.
            logging.getLogger(__name__).warning(
                "MP3 encoding failed; falling back to WAV", exc_info=True
            )
            # Start from a fresh buffer in case the failed attempt wrote
            # partial data.
            buf = io.BytesIO()
            torchaudio.save(buf, wav, sample_rate, format="wav")
    return buf.getvalue()
|
||||
|
||||
|
||||
def make_tts_backend(
    model_path: str,
    *,
    mock: bool = False,
    device: str = "cuda",
) -> TTSBackend:
    """Build a TTS backend instance.

    Backend modules are imported inside the function, so only the module for
    the selected backend is loaded.

    Args:
        model_path: Passed to ``ChatterboxTurboBackend``; unused when *mock*
            is true.
        mock: When true, return a ``MockTTSBackend`` instead of a real model.
        device: Device string passed to ``ChatterboxTurboBackend``.

    Returns:
        The constructed backend.
    """
    if not mock:
        from circuitforge_core.tts.backends.chatterbox import ChatterboxTurboBackend

        return ChatterboxTurboBackend(model_path=model_path, device=device)

    from circuitforge_core.tts.backends.mock import MockTTSBackend

    return MockTTSBackend()
|
||||
82
circuitforge_core/tts/backends/chatterbox.py
Normal file
82
circuitforge_core/tts/backends/chatterbox.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
"""ChatterboxTurboBackend — ResembleAI chatterbox-turbo TTS via chatterbox-tts package."""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from circuitforge_core.tts.backends.base import (
|
||||
AudioFormat,
|
||||
TTSBackend,
|
||||
TTSResult,
|
||||
_encode_audio,
|
||||
)
|
||||
|
||||
_VRAM_MB = 768 # conservative estimate for chatterbox-turbo weights
|
||||
|
||||
|
||||
class ChatterboxTurboBackend:
    """TTS backend that runs a locally stored chatterbox model."""

    def __init__(self, model_path: str, device: str = "cuda") -> None:
        """Load the model from *model_path* onto *device*.

        Args:
            model_path: Directory passed to ``ChatterboxTTS.from_local``.
            device: Device string passed to ``ChatterboxTTS.from_local``.
        """
        # Only supplies a default; a value already present in the environment
        # is left untouched.
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
        # Imported here so the chatterbox package is only required when this
        # backend is actually instantiated.
        from chatterbox.models.s3gen import S3GEN_SR
        from chatterbox.tts import ChatterboxTTS

        self._sr = S3GEN_SR
        self._device = device
        self._model = ChatterboxTTS.from_local(model_path, device=device)
        self._model_path = model_path

    @property
    def model_name(self) -> str:
        """Model identifier: ``chatterbox-turbo@`` plus the basename of the model path."""
        return f"chatterbox-turbo@{os.path.basename(self._model_path)}"

    @property
    def vram_mb(self) -> int:
        """Fixed VRAM estimate in MB (the module-level ``_VRAM_MB``)."""
        return _VRAM_MB

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Generate speech for *text* and encode it in the requested format.

        Args:
            text: Text to speak.
            exaggeration: Forwarded to the model's ``generate``.
            cfg_weight: Forwarded to the model's ``generate``.
            temperature: Forwarded to the model's ``generate``.
            audio_prompt: Optional reference audio as raw bytes. It is written
                to a temporary ``.wav`` file whose path is handed to the model,
                and the file is removed afterwards.
            format: Output container passed to ``_encode_audio``.

        Returns:
            A ``TTSResult`` carrying the encoded audio, the model sample rate,
            the clip duration (samples / sample rate) and this backend's
            ``model_name``.
        """
        prompt_path: str | None = None
        try:
            if audio_prompt is not None:
                # The model takes a path rather than bytes. The file is fully
                # written and closed before generate() opens it by name, and
                # creation happens inside the try so a failed write cannot
                # leave the temp file behind.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    prompt_path = tmp.name
                    tmp.write(audio_prompt)

            wav = self._model.generate(
                text,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                temperature=temperature,
                audio_prompt_path=prompt_path,
            )
        finally:
            if prompt_path is not None:
                os.unlink(prompt_path)

        # The last dimension of the generated tensor is the sample count.
        duration_s = wav.shape[-1] / self._sr
        audio_bytes = _encode_audio(wav, self._sr, format)
        return TTSResult(
            audio_bytes=audio_bytes,
            sample_rate=self._sr,
            duration_s=duration_s,
            format=format,
            model=self.model_name,
        )
|
||||
|
||||
|
||||
# Import-time conformance check, done against the class rather than an
# instance. An instance made with __new__ has no _model_path, and before
# Python 3.12 isinstance() on a runtime-checkable Protocol evaluates each
# member with hasattr(), which runs the model_name property, hits
# AttributeError and makes the check fail at import. Looking the members up on
# the class never invokes the property getters.
_missing_members = [
    name
    for name in vars(TTSBackend)
    if not name.startswith("_") and not hasattr(ChatterboxTurboBackend, name)
]
assert not _missing_members, "ChatterboxTurboBackend must satisfy TTSBackend Protocol"
|
||||
56
circuitforge_core/tts/backends/mock.py
Normal file
56
circuitforge_core/tts/backends/mock.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
"""MockTTSBackend — no GPU, no model required. Returns a silent WAV clip."""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import struct
|
||||
import wave
|
||||
|
||||
from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, TTSResult
|
||||
|
||||
_SAMPLE_RATE = 24000


def _silent_wav(duration_s: float = 0.5, sample_rate: int = _SAMPLE_RATE) -> bytes:
    """Return a complete WAV file (mono, 16-bit PCM) containing only silence.

    Args:
        duration_s: Clip length in seconds. The sample count is
            ``int(duration_s * sample_rate)``; negative values are treated as
            zero and yield a valid, empty clip.
        sample_rate: Frame rate written to the WAV header.

    Returns:
        The WAV file contents as bytes.
    """
    num_samples = max(0, int(duration_s * sample_rate))
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)  # 2 bytes per sample -> 16-bit PCM
        w.setframerate(sample_rate)
        # Silence is all-zero samples. bytes(n) yields n zero bytes directly,
        # so there is no need to build a list of zeros and pack it.
        w.writeframes(bytes(2 * num_samples))
    return buf.getvalue()
|
||||
|
||||
|
||||
class MockTTSBackend:
    """Stand-in TTSBackend for tests and CI: needs no model and no GPU."""

    @property
    def model_name(self) -> str:
        """Fixed identifier of the mock backend."""
        return "mock-tts"

    @property
    def vram_mb(self) -> int:
        """The mock reports zero VRAM."""
        return 0

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Return a silent clip whose length scales with the word count of *text*.

        The clip lasts 0.3 s per whitespace-separated word, with a floor of
        0.1 s. The generation parameters, ``audio_prompt`` and the requested
        ``format`` are all ignored: the payload is always WAV and the result
        is labelled ``"wav"`` accordingly.
        """
        word_count = len(text.split())
        clip_seconds = max(0.1, word_count * 0.3)
        silence = _silent_wav(clip_seconds)
        return TTSResult(
            audio_bytes=silence,
            sample_rate=_SAMPLE_RATE,
            duration_s=clip_seconds,
            format="wav",
            model=self.model_name,
        )
|
||||
|
||||
|
||||
assert isinstance(MockTTSBackend(), TTSBackend), "MockTTSBackend must satisfy TTSBackend Protocol"
|
||||
|
|
@ -39,6 +39,16 @@ stt-service = [
|
|||
"uvicorn[standard]>=0.29",
|
||||
"python-multipart>=0.0.9",
|
||||
]
|
||||
tts-chatterbox = [
|
||||
"chatterbox-tts>=0.1",
|
||||
"torchaudio>=2.0",
|
||||
]
|
||||
tts-service = [
|
||||
"circuitforge-core[tts-chatterbox]",
|
||||
"fastapi>=0.110",
|
||||
"uvicorn[standard]>=0.29",
|
||||
"python-multipart>=0.0.9",
|
||||
]
|
||||
dev = [
|
||||
"circuitforge-core[manage]",
|
||||
"pytest>=8.0",
|
||||
|
|
|
|||
Loading…
Reference in a new issue