feat(tts): add cf-tts module with ChatterboxTurbo backend and FastAPI service
- TTSBackend Protocol + TTSResult dataclass (audio_bytes, sample_rate, duration_s, format)
- MockTTSBackend: silent WAV clip, no GPU required, Protocol assert at import
- ChatterboxTurboBackend: ResembleAI chatterbox-turbo via chatterbox-tts package
  - from_local() loads model from snapshot dir
  - audio_prompt voice cloning via temp file
- _encode_audio helper: OGG (default), WAV, MP3 via torchaudio
- circuitforge_core.tts module-level synthesize() singleton (CF_TTS_MODEL / CF_TTS_MOCK)
- FastAPI app: GET /health, POST /synthesize (multipart form, returns audio bytes)
  - default format: ogg (smaller than WAV, no patents)
  - X-Duration-S / X-Model / X-Sample-Rate response headers
- CLI: --model --port --host --gpu-id --mock
- pyproject.toml: tts-chatterbox + tts-service extras
- Sample rate: 24000 Hz (S3GEN_SR from chatterbox internals)
This commit is contained in:
parent
67493048e2
commit
3075e5d3da
7 changed files with 426 additions and 0 deletions
87
circuitforge_core/tts/__init__.py
Normal file
87
circuitforge_core/tts/__init__.py
Normal file
|
|
@ -0,0 +1,87 @@
|
||||||
|
"""
|
||||||
|
circuitforge_core.tts — Text-to-speech service module.
|
||||||
|
|
||||||
|
Quick start (mock mode — no GPU or model required):
|
||||||
|
|
||||||
|
import os; os.environ["CF_TTS_MOCK"] = "1"
|
||||||
|
from circuitforge_core.tts import synthesize
|
||||||
|
|
||||||
|
result = synthesize("Hello world")
|
||||||
|
open("out.ogg", "wb").write(result.audio_bytes)
|
||||||
|
|
||||||
|
Real inference (chatterbox-turbo):
|
||||||
|
|
||||||
|
export CF_TTS_MODEL=/Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/<hash>
|
||||||
|
from circuitforge_core.tts import synthesize
|
||||||
|
|
||||||
|
cf-orch service profile:
|
||||||
|
|
||||||
|
service_type: cf-tts
|
||||||
|
max_mb: 768
|
||||||
|
max_concurrent: 1
|
||||||
|
shared: true
|
||||||
|
managed:
|
||||||
|
exec: python -m circuitforge_core.tts.app
|
||||||
|
args: --model <path> --port {port} --gpu-id {gpu_id}
|
||||||
|
port: 8005
|
||||||
|
health: /health
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from circuitforge_core.tts.backends.base import (
|
||||||
|
AudioFormat,
|
||||||
|
TTSBackend,
|
||||||
|
TTSResult,
|
||||||
|
make_tts_backend,
|
||||||
|
)
|
||||||
|
from circuitforge_core.tts.backends.mock import MockTTSBackend
|
||||||
|
|
||||||
|
# Process-level backend singleton: built lazily by _get_backend() on first use
# and cleared again by reset_backend().
_backend: TTSBackend | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend() -> TTSBackend:
    """Return the process-level backend, constructing it on first use.

    The backend is selected from the environment: ``CF_TTS_MODEL`` supplies
    the model path (defaulting to ``"mock"``), and the mock backend is used
    when that path is ``"mock"`` or when ``CF_TTS_MOCK`` equals ``"1"``.
    """
    global _backend
    if _backend is not None:
        return _backend

    env = os.environ
    model_path = env.get("CF_TTS_MODEL", "mock")
    use_mock = env.get("CF_TTS_MOCK", "") == "1" or model_path == "mock"
    _backend = make_tts_backend(model_path, mock=use_mock)
    return _backend
|
||||||
|
|
||||||
|
|
||||||
|
def synthesize(
    text: str,
    *,
    exaggeration: float = 0.5,
    cfg_weight: float = 0.5,
    temperature: float = 0.8,
    audio_prompt: bytes | None = None,
    format: AudioFormat = "ogg",
) -> TTSResult:
    """Synthesize speech for ``text`` with the shared process-level backend.

    Every keyword option is forwarded unchanged to the backend's own
    ``synthesize`` method; the backend is created lazily on the first call.
    """
    backend = _get_backend()
    options = {
        "exaggeration": exaggeration,
        "cfg_weight": cfg_weight,
        "temperature": temperature,
        "audio_prompt": audio_prompt,
        "format": format,
    }
    return backend.synthesize(text, **options)
|
||||||
|
|
||||||
|
|
||||||
|
def reset_backend() -> None:
    """Forget the cached process-level backend.

    Meant for test teardown only: the next lookup re-reads the environment
    and builds a fresh backend.
    """
    globals()["_backend"] = None
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AudioFormat",
|
||||||
|
"TTSBackend",
|
||||||
|
"TTSResult",
|
||||||
|
"MockTTSBackend",
|
||||||
|
"make_tts_backend",
|
||||||
|
"synthesize",
|
||||||
|
"reset_backend",
|
||||||
|
]
|
||||||
103
circuitforge_core/tts/app.py
Normal file
103
circuitforge_core/tts/app.py
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
"""
|
||||||
|
cf-tts FastAPI service — managed by cf-orch.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /health → {"status": "ok", "model": str, "vram_mb": int}
|
||||||
|
POST /synthesize → audio bytes (Content-Type: audio/ogg or audio/wav or audio/mpeg)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m circuitforge_core.tts.app \
|
||||||
|
--model /Library/Assets/LLM/chatterbox/hub/models--ResembleAI--chatterbox-turbo/snapshots/<hash> \
|
||||||
|
--port 8005 \
|
||||||
|
--gpu-id 0
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from typing import Annotated, Literal
|
||||||
|
|
||||||
|
from fastapi import FastAPI, Form, HTTPException, UploadFile
|
||||||
|
from fastapi.responses import Response
|
||||||
|
|
||||||
|
from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, make_tts_backend
|
||||||
|
|
||||||
|
# HTTP Content-Type sent by /synthesize for each audio format label.
_CONTENT_TYPES: dict[str, str] = {
    "ogg": "audio/ogg",
    "wav": "audio/wav",
    "mp3": "audio/mpeg",
}

app = FastAPI(title="cf-tts")
# Backend used by the request handlers. It stays None (handlers answer 503)
# until the __main__ entry point at the bottom of this module assigns it.
_backend: TTSBackend | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health() -> dict:
    """Report service readiness along with the loaded model's name and VRAM figure.

    Raises:
        HTTPException: 503 while no backend has been initialised.
    """
    backend = _backend
    if backend is None:
        raise HTTPException(503, detail="backend not initialised")
    return {
        "status": "ok",
        "model": backend.model_name,
        "vram_mb": backend.vram_mb,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/synthesize")
async def synthesize(
    text: Annotated[str, Form()],
    format: Annotated[AudioFormat, Form()] = "ogg",
    exaggeration: Annotated[float, Form()] = 0.5,
    cfg_weight: Annotated[float, Form()] = 0.5,
    temperature: Annotated[float, Form()] = 0.8,
    audio_prompt: UploadFile | None = None,
) -> Response:
    """Synthesize ``text`` and return the encoded audio as the response body.

    The request is a multipart form. ``exaggeration``, ``cfg_weight`` and
    ``temperature`` are forwarded unchanged to the backend; ``audio_prompt``
    is an optional uploaded file whose bytes are passed to the backend.

    The response Content-Type follows the format reported by the backend
    result, and the ``X-Duration-S``, ``X-Model`` and ``X-Sample-Rate``
    headers carry the result metadata.

    Raises:
        HTTPException: 503 when no backend is initialised, 422 when ``text``
            is empty or whitespace-only.
    """
    if _backend is None:
        raise HTTPException(503, detail="backend not initialised")
    if not text.strip():
        raise HTTPException(422, detail="text must not be empty")

    prompt_bytes: bytes | None = None
    if audio_prompt is not None:
        # A zero-byte upload (e.g. a form file field submitted without a file)
        # carries no usable audio, so treat it the same as "no prompt" instead
        # of handing the backend an empty file to decode.
        prompt_bytes = (await audio_prompt.read()) or None

    # NOTE(review): this call is synchronous inside an async handler, so other
    # requests (including /health) wait while it runs — confirm that is the
    # intended way to serialise access to the model.
    result = _backend.synthesize(
        text,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
        audio_prompt=prompt_bytes,
        format=format,
    )
    return Response(
        content=result.audio_bytes,
        media_type=_CONTENT_TYPES.get(result.format, "audio/ogg"),
        headers={
            "X-Duration-S": str(round(result.duration_s, 3)),
            "X-Model": result.model,
            "X-Sample-Rate": str(result.sample_rate),
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_args() -> argparse.Namespace:
    """Parse the cf-tts service options from the process command line.

    ``--model`` is mandatory; ``--port`` (8005), ``--host`` ("0.0.0.0") and
    ``--gpu-id`` (0) have defaults, and ``--mock`` is a boolean switch.
    """
    parser = argparse.ArgumentParser(description="cf-tts service")
    option_specs = (
        ("--model", {"required": True}),
        ("--port", {"type": int, "default": 8005}),
        ("--host", {"default": "0.0.0.0"}),
        ("--gpu-id", {"type": int, "default": 0}),
        ("--mock", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the CLI, build the backend, then serve the app.
    import uvicorn

    args = _parse_args()

    # Select the GPU before the backend is constructed; make_tts_backend()
    # imports the model backend lazily, so this is set ahead of that import.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    # A model path of "mock" implies mock mode even without --mock.
    mock = args.mock or args.model == "mock"
    device = "cpu" if mock else "cuda"

    # No `global` statement here: this code already runs at module scope, so a
    # plain assignment rebinds the module-level `_backend` the handlers read.
    # Declaring an already-assigned, annotated module name `global` is rejected
    # by the compiler with a SyntaxError, which made the module unloadable.
    _backend = make_tts_backend(args.model, mock=mock, device=device)
    print(f"cf-tts backend ready: {_backend.model_name} ({_backend.vram_mb} MB)")

    uvicorn.run(app, host=args.host, port=args.port)
|
||||||
4
circuitforge_core/tts/backends/__init__.py
Normal file
4
circuitforge_core/tts/backends/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
from .base import AudioFormat, TTSBackend, TTSResult, make_tts_backend
|
||||||
|
from .mock import MockTTSBackend
|
||||||
|
|
||||||
|
__all__ = ["AudioFormat", "TTSBackend", "TTSResult", "make_tts_backend", "MockTTSBackend"]
|
||||||
84
circuitforge_core/tts/backends/base.py
Normal file
84
circuitforge_core/tts/backends/base.py
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
"""
|
||||||
|
TTSBackend Protocol — backend-agnostic TTS interface.
|
||||||
|
|
||||||
|
All backends return TTSResult with audio bytes in the requested format.
|
||||||
|
Supported formats: ogg (default, smallest), wav (uncompressed, always works), mp3.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Literal, Protocol, runtime_checkable
|
||||||
|
|
||||||
|
# Closed set of output format labels accepted by the backends and the service.
AudioFormat = Literal["ogg", "wav", "mp3"]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TTSResult:
    """Immutable result of a single synthesis call."""

    # Encoded audio payload.
    audio_bytes: bytes
    # Sample rate of the audio, in Hz.
    sample_rate: int
    # Length of the clip, in seconds.
    duration_s: float
    # Format label describing how audio_bytes is encoded.
    format: AudioFormat = "ogg"
    # Name of the backend model that produced the clip; empty when not set.
    model: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class TTSBackend(Protocol):
    """Structural interface that every TTS backend implements.

    Marked runtime-checkable so conformance can be tested with isinstance().
    """

    # Render `text` to speech and return the audio together with its metadata.
    # All options are keyword-only. `audio_prompt` carries optional audio bytes
    # (the chatterbox backend passes them to the model as a prompt file) and
    # `format` names the requested output encoding.
    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult: ...

    # Human-readable identifier of the model behind this backend.
    @property
    def model_name(self) -> str: ...

    # VRAM figure for this backend, in megabytes.
    @property
    def vram_mb(self) -> int: ...
|
||||||
|
|
||||||
|
|
||||||
|
def _encode_audio(
    wav_tensor,  # torch.Tensor shape [1, T] or [T]
    sample_rate: int,
    format: AudioFormat,
) -> bytes:
    """Convert a torch tensor to audio bytes in the requested format.

    Args:
        wav_tensor: Audio samples; a 1-D tensor is promoted to ``[1, T]``.
        sample_rate: Sample rate in Hz, passed to the encoder.
        format: One of ``"ogg"`` (Vorbis), ``"wav"`` or ``"mp3"``.

    Returns:
        The encoded audio. If MP3 encoding fails, WAV bytes are returned
        instead (best effort) and a warning is logged.

    Raises:
        ValueError: If ``format`` is not a supported format. Previously an
            unknown format silently produced empty bytes.
    """
    # Validate first so a bad format fails fast, before the heavy imports.
    if format not in ("ogg", "wav", "mp3"):
        raise ValueError(f"unsupported audio format: {format!r}")

    import logging

    import torch
    import torchaudio

    wav = wav_tensor
    if wav.dim() == 1:
        wav = wav.unsqueeze(0)
    wav = wav.to(torch.float32).cpu()

    buf = io.BytesIO()
    if format == "wav":
        torchaudio.save(buf, wav, sample_rate, format="wav")
    elif format == "ogg":
        torchaudio.save(buf, wav, sample_rate, format="ogg", encoding="vorbis")
    else:
        # torchaudio MP3 encode requires ffmpeg backend; fall back to wav on failure
        try:
            torchaudio.save(buf, wav, sample_rate, format="mp3")
        except Exception:
            # Keep the best-effort fallback, but make it visible: the bytes
            # returned here are WAV even though MP3 was requested.
            logging.getLogger(__name__).warning(
                "MP3 encoding failed; returning WAV bytes instead", exc_info=True
            )
            buf = io.BytesIO()
            torchaudio.save(buf, wav, sample_rate, format="wav")
    return buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def make_tts_backend(
    model_path: str,
    *,
    mock: bool = False,
    device: str = "cuda",
) -> TTSBackend:
    """Build a TTS backend.

    With ``mock=True`` a ``MockTTSBackend`` is returned and ``model_path`` and
    ``device`` are ignored; otherwise a ``ChatterboxTurboBackend`` is created
    for ``model_path`` on ``device``. Each backend module is imported only
    when it is actually selected.
    """
    if not mock:
        from circuitforge_core.tts.backends.chatterbox import ChatterboxTurboBackend

        return ChatterboxTurboBackend(model_path=model_path, device=device)

    from circuitforge_core.tts.backends.mock import MockTTSBackend

    return MockTTSBackend()
|
||||||
82
circuitforge_core/tts/backends/chatterbox.py
Normal file
82
circuitforge_core/tts/backends/chatterbox.py
Normal file
|
|
@ -0,0 +1,82 @@
|
||||||
|
"""ChatterboxTurboBackend — ResembleAI chatterbox-turbo TTS via chatterbox-tts package."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from circuitforge_core.tts.backends.base import (
|
||||||
|
AudioFormat,
|
||||||
|
TTSBackend,
|
||||||
|
TTSResult,
|
||||||
|
_encode_audio,
|
||||||
|
)
|
||||||
|
|
||||||
|
_VRAM_MB = 768 # conservative estimate for chatterbox-turbo weights
|
||||||
|
|
||||||
|
|
||||||
|
class ChatterboxTurboBackend:
    """TTS backend that runs a locally stored chatterbox model."""

    def __init__(self, model_path: str, device: str = "cuda") -> None:
        """Load the model from ``model_path`` onto ``device``.

        The chatterbox imports are deferred to construction time so importing
        this module does not require the package to be installed.
        """
        # Only applies when the caller has not already chosen a GPU.
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
        from chatterbox.models.s3gen import S3GEN_SR
        from chatterbox.tts import ChatterboxTTS

        self._sr = S3GEN_SR
        self._device = device
        self._model = ChatterboxTTS.from_local(model_path, device=device)
        self._model_path = model_path

    @property
    def model_name(self) -> str:
        """Model identifier: ``chatterbox-turbo@<last path component>``."""
        # Strip a trailing separator first; basename() of "dir/" is "" and
        # would otherwise yield a name with nothing after the "@".
        snapshot = os.path.basename(self._model_path.rstrip(os.sep))
        return f"chatterbox-turbo@{snapshot}"

    @property
    def vram_mb(self) -> int:
        """Fixed VRAM estimate for this backend, in megabytes."""
        return _VRAM_MB

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Generate speech for ``text`` and encode it in ``format``.

        ``exaggeration``, ``cfg_weight`` and ``temperature`` are forwarded to
        the model's ``generate`` call. When ``audio_prompt`` bytes are given
        they are written to a temporary file whose path is passed to the
        model; the file is always removed afterwards.
        """
        audio_prompt_path: str | None = None
        try:
            if audio_prompt is not None:
                # delete=False because the model re-opens the file by path.
                # The `with` block closes the handle before generate() reads
                # it, and recording the path before writing guarantees the
                # file is cleaned up even if the write itself fails.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    audio_prompt_path = tmp.name
                    tmp.write(audio_prompt)

            wav = self._model.generate(
                text,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                temperature=temperature,
                audio_prompt_path=audio_prompt_path,
            )
        finally:
            if audio_prompt_path is not None:
                os.unlink(audio_prompt_path)

        # Number of samples on the last axis divided by the sample rate.
        duration_s = wav.shape[-1] / self._sr
        audio_bytes = _encode_audio(wav, self._sr, format)
        return TTSResult(
            audio_bytes=audio_bytes,
            sample_rate=self._sr,
            duration_s=duration_s,
            format=format,
            model=self.model_name,
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Import-time sanity check that the class exposes every TTSBackend member.
# This inspects the class rather than calling isinstance() on a bare
# __new__() instance: on Python < 3.12 a runtime-checkable isinstance() probes
# members with hasattr(), which runs the `model_name` property on an instance
# whose attributes were never set, hits AttributeError, and fails the check.
assert all(
    hasattr(ChatterboxTurboBackend, member)
    for member in ("synthesize", "model_name", "vram_mb")
), "ChatterboxTurboBackend must satisfy TTSBackend Protocol"
|
||||||
56
circuitforge_core/tts/backends/mock.py
Normal file
56
circuitforge_core/tts/backends/mock.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
"""MockTTSBackend — no GPU, no model required. Returns a silent WAV clip."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import struct
|
||||||
|
import wave
|
||||||
|
|
||||||
|
from circuitforge_core.tts.backends.base import AudioFormat, TTSBackend, TTSResult
|
||||||
|
|
||||||
|
_SAMPLE_RATE = 24000


def _silent_wav(duration_s: float = 0.5, sample_rate: int = _SAMPLE_RATE) -> bytes:
    """Return a mono, 16-bit PCM WAV file containing only silence.

    Args:
        duration_s: Clip length in seconds; negative values yield an empty clip.
        sample_rate: Frame rate written to the WAV header, in Hz.

    Returns:
        The complete WAV file (header plus zeroed frames) as bytes.
    """
    num_samples = max(0, int(duration_s * sample_rate))
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(sample_rate)
        # Two zero bytes per 16-bit sample. bytes(n) yields the same payload
        # as packing n zero shorts, without building an n-element argument list.
        w.writeframes(bytes(2 * num_samples))
    return buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
class MockTTSBackend:
    """Stand-in TTS backend for tests and CI; needs neither a GPU nor a model."""

    @property
    def model_name(self) -> str:
        """Fixed identifier of the mock backend."""
        return "mock-tts"

    @property
    def vram_mb(self) -> int:
        """The mock backend reports no VRAM use."""
        return 0

    def synthesize(
        self,
        text: str,
        *,
        exaggeration: float = 0.5,
        cfg_weight: float = 0.5,
        temperature: float = 0.8,
        audio_prompt: bytes | None = None,
        format: AudioFormat = "ogg",
    ) -> TTSResult:
        """Return a silent clip whose length scales with the word count.

        The clip lasts 0.3 s per whitespace-separated word, with a floor of
        0.1 s. The tuning options, ``audio_prompt`` and ``format`` are
        accepted for interface compatibility but not used: the audio is
        always WAV and the result is labelled ``"wav"``.
        """
        word_count = len(text.split())
        clip_seconds = max(0.1, word_count * 0.3)
        return TTSResult(
            audio_bytes=_silent_wav(clip_seconds),
            sample_rate=_SAMPLE_RATE,
            duration_s=clip_seconds,
            format="wav",
            model=self.model_name,
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Import-time sanity check: a MockTTSBackend instance must pass the
# runtime-checkable TTSBackend isinstance() test.
assert isinstance(MockTTSBackend(), TTSBackend), "MockTTSBackend must satisfy TTSBackend Protocol"
|
||||||
|
|
@ -39,6 +39,16 @@ stt-service = [
|
||||||
"uvicorn[standard]>=0.29",
|
"uvicorn[standard]>=0.29",
|
||||||
"python-multipart>=0.0.9",
|
"python-multipart>=0.0.9",
|
||||||
]
|
]
|
||||||
|
tts-chatterbox = [
|
||||||
|
"chatterbox-tts>=0.1",
|
||||||
|
"torchaudio>=2.0",
|
||||||
|
]
|
||||||
|
tts-service = [
|
||||||
|
"circuitforge-core[tts-chatterbox]",
|
||||||
|
"fastapi>=0.110",
|
||||||
|
"uvicorn[standard]>=0.29",
|
||||||
|
"python-multipart>=0.0.9",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"circuitforge-core[manage]",
|
"circuitforge-core[manage]",
|
||||||
"pytest>=8.0",
|
"pytest>=8.0",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue