feat(text): add OpenAI-compat /v1/chat/completions endpoint
Some checks are pending
CI / test (push) Waiting to run
Mirror / mirror (push) Waiting to run

Adds POST /v1/chat/completions to the cf-text FastAPI service so it can
be used as an openai_compat backend in LLMRouter without any router changes.
The endpoint accepts the standard OpenAI chat request format and returns
a standard chat.completion response.

4 tests added; all 36 text tests pass.
This commit is contained in:
pyr0ball 2026-04-12 17:04:58 -07:00
parent fc52d32574
commit 69a338bd98
2 changed files with 293 additions and 0 deletions

View file

@ -0,0 +1,226 @@
"""
cf-text FastAPI service managed by cf-orch.
Lightweight local text generation. Supports GGUF models via llama.cpp and
HuggingFace transformers. Sits alongside vllm/ollama for products that need
fast, frequent inference from small local models (3B-7B, Q4).
Endpoints:
    GET  /health               -> {"status": "ok", "model": str, "vram_mb": int, "backend": str}
    POST /generate             -> GenerateResponse
    POST /chat                 -> GenerateResponse
    POST /v1/chat/completions  -> OAIChatResponse (OpenAI-compatible)
Usage:
python -m circuitforge_core.text.app \
--model /Library/Assets/LLM/qwen2.5-3b-instruct-q4_k_m.gguf \
--port 8006 \
--gpu-id 0
Mock mode (no model or GPU required):
CF_TEXT_MOCK=1 python -m circuitforge_core.text.app --port 8006
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import time
import uuid
from functools import partial
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
from circuitforge_core.text.backends.base import make_text_backend
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Active text backend shared by all route handlers; stays None until
# create_app() assigns it.
_backend = None
# ── Request / response models ─────────────────────────────────────────────────
class GenerateRequest(BaseModel):
    """JSON body accepted by ``POST /generate``."""

    # Prompt text handed to the backend.
    prompt: str
    # Generation budget; 512 when the client omits it.
    max_tokens: int = 512
    # Sampling temperature; 0.7 when the client omits it.
    temperature: float = 0.7
    # Optional stop sequences, forwarded to the backend as ``stop``.
    stop: list[str] | None = None
class ChatMessageModel(BaseModel):
    """A single conversation turn inside a ``ChatRequest``."""

    # Speaker role string, passed to the backend unchanged.
    role: str
    # Text of the turn.
    content: str
class ChatRequest(BaseModel):
    """JSON body accepted by ``POST /chat``."""

    # Conversation turns, in the order the client supplied them.
    messages: list[ChatMessageModel]
    # Generation budget; 512 when the client omits it.
    max_tokens: int = 512
    # Sampling temperature; 0.7 when the client omits it.
    temperature: float = 0.7
class GenerateResponse(BaseModel):
    """Response body returned by ``POST /generate`` and ``POST /chat``."""

    # Generated text taken from the backend result.
    text: str
    # Token count reported by the backend result; 0 when not supplied.
    tokens_used: int = 0
    # Model name reported by the backend result; empty when not supplied.
    model: str = ""
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
class OAIMessageModel(BaseModel):
    """Chat message in the OpenAI-style request and response bodies."""

    # Speaker role string (the endpoint answers with "assistant").
    role: str
    # Plain-text message content.
    content: str
class OAIChatRequest(BaseModel):
    """JSON body accepted by ``POST /v1/chat/completions``."""

    # Requested model name; echoed in the response when the backend result
    # carries no model name of its own.
    model: str = "cf-text"
    # Conversation turns, in the order the client supplied them.
    messages: list[OAIMessageModel]
    # Generation budget; None lets the endpoint apply its own fallback.
    max_tokens: int | None = None
    # Sampling temperature; 0.7 when the client omits it.
    temperature: float = 0.7
    # Client's streaming preference flag; False when omitted.
    stream: bool = False
class OAIChoice(BaseModel):
    """One entry of the ``choices`` list in an ``OAIChatResponse``."""

    # Position of this choice in the list; defaults to 0.
    index: int = 0
    # The generated message.
    message: OAIMessageModel
    # Reason generation ended; defaults to "stop".
    finish_reason: str = "stop"
class OAIUsage(BaseModel):
    """Token accounting section of an ``OAIChatResponse``; all counts default to 0."""

    # Tokens attributed to the prompt.
    prompt_tokens: int = 0
    # Tokens attributed to the generated completion.
    completion_tokens: int = 0
    # Overall token count for the request.
    total_tokens: int = 0
class OAIChatResponse(BaseModel):
    """Response body returned by ``POST /v1/chat/completions``."""

    # Identifier of this completion.
    id: str
    # Object type tag; fixed default "chat.completion".
    object: str = "chat.completion"
    # Creation time as an integer timestamp.
    created: int
    # Name of the model reported for this completion.
    model: str
    # Generated choices.
    choices: list[OAIChoice]
    # Token accounting for the request.
    usage: OAIUsage
# ── App factory ───────────────────────────────────────────────────────────────
def create_app(
    model_path: str,
    gpu_id: int = 0,
    backend: str | None = None,
    mock: bool = False,
) -> FastAPI:
    """Build the cf-text FastAPI application and create its text backend.

    The backend is created through ``make_text_backend`` and stored in the
    module-level ``_backend`` global, which every route handler reads at
    request time.

    Args:
        model_path: Model location forwarded to ``make_text_backend``.
        gpu_id: CUDA device index. Written to ``CUDA_VISIBLE_DEVICES`` only
            when that environment variable is not already set.
        backend: Backend selector forwarded to ``make_text_backend``.
        mock: Forwarded to ``make_text_backend`` as ``mock``.

    Returns:
        A FastAPI app exposing ``GET /health``, ``POST /generate``,
        ``POST /chat`` and ``POST /v1/chat/completions``.
    """
    global _backend
    # setdefault: an externally provided CUDA_VISIBLE_DEVICES wins over gpu_id.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))
    _backend = make_text_backend(model_path, backend=backend, mock=mock)
    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)

    app = FastAPI(title="cf-text", version="0.1.0")

    def _require_backend():
        """Return the active backend, or raise 503 when none is set."""
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        return _backend

    async def _chat_off_loop(active, messages, *, max_tokens: int, temperature: float):
        """Run the synchronous ``chat`` call in the default executor."""
        turns = [BackendChatMessage(m.role, m.content) for m in messages]
        # Handlers always run inside the event loop, so get_running_loop() is
        # the right accessor; get_event_loop() is discouraged in coroutines.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            partial(active.chat, turns, max_tokens=max_tokens, temperature=temperature),
        )

    @app.get("/health")
    def health() -> dict:
        """Report status plus the backend's model name and VRAM figure."""
        active = _require_backend()
        return {
            "status": "ok",
            "model": active.model_name,
            "vram_mb": active.vram_mb,
        }

    @app.post("/generate")
    async def generate(req: GenerateRequest) -> GenerateResponse:
        """Generate text for a raw prompt via the backend's async API."""
        active = _require_backend()
        result = await active.generate_async(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=req.stop,
        )
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
            model=result.model,
        )

    @app.post("/chat")
    async def chat(req: ChatRequest) -> GenerateResponse:
        """Answer a chat conversation and return the native response shape."""
        active = _require_backend()
        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking
        result = await _chat_off_loop(
            active,
            req.messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
            model=result.model,
        )

    @app.post("/v1/chat/completions")
    async def oai_chat_completions(req: OAIChatRequest) -> OAIChatResponse:
        """OpenAI-compatible chat completions endpoint.

        Allows LLMRouter (and any openai_compat client) to use cf-text
        without a custom backend type - just set base_url to this service's
        /v1 prefix.

        Raises:
            HTTPException: 503 when no backend is set; 400 when the request
                asks for a streamed response.
        """
        active = _require_backend()
        if req.stream:
            # Only a single JSON chat.completion body is produced here. A
            # client asking for streaming would wait for server-sent events
            # that never arrive, so refuse explicitly instead of ignoring
            # the flag.
            raise HTTPException(
                400,
                detail="streaming is not supported by cf-text; send stream=false",
            )
        # A missing (or zero) max_tokens falls back to 512.
        max_tok = req.max_tokens or 512
        result = await _chat_off_loop(
            active,
            req.messages,
            max_tokens=max_tok,
            temperature=req.temperature,
        )
        reply = OAIMessageModel(role="assistant", content=result.text)
        return OAIChatResponse(
            id=f"cftext-{uuid.uuid4().hex[:12]}",
            created=int(time.time()),
            # Prefer the backend-reported model name; otherwise echo the request's.
            model=result.model or req.model,
            choices=[OAIChoice(message=reply)],
            # Only tokens_used is read from the result, so prompt_tokens keeps
            # its default of 0.
            usage=OAIUsage(
                completion_tokens=result.tokens_used,
                total_tokens=result.tokens_used,
            ),
        )

    return app
# ── CLI entrypoint ────────────────────────────────────────────────────────────
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="cf-text inference server")
parser.add_argument("--model", default=os.environ.get("CF_TEXT_MODEL", "mock"),
help="Path to GGUF file or HF model ID")
parser.add_argument("--port", type=int, default=8006)
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--gpu-id", type=int, default=0,
help="CUDA device index to use")
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
parser.add_argument("--mock", action="store_true",
help="Run in mock mode (no model or GPU needed)")
return parser.parse_args()
if __name__ == "__main__":
    # Configure logging first so messages logged while the app is being
    # built are emitted.
    logging.basicConfig(
        level=logging.INFO,
        # Keep a separator between logger name and message; without it the
        # two fields run together in every log line.
        format="%(asctime)s %(levelname)s %(name)s - %(message)s",
    )
    args = _parse_args()
    # Mock mode is enabled by --mock, by CF_TEXT_MOCK=1, or when the model
    # argument is the "mock" placeholder.
    mock = args.mock or os.environ.get("CF_TEXT_MOCK", "") == "1" or args.model == "mock"
    app = create_app(
        model_path=args.model,
        gpu_id=args.gpu_id,
        backend=args.backend,
        mock=mock,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")

View file

@ -0,0 +1,67 @@
# tests/test_text/test_oai_compat.py
"""Tests for the OpenAI-compatible /v1/chat/completions endpoint on cf-text."""
import pytest
from fastapi.testclient import TestClient
from circuitforge_core.text.app import create_app
@pytest.fixture()
def client():
    """Provide a TestClient bound to a cf-text app created in mock mode."""
    return TestClient(create_app(model_path="mock", mock=True))
def test_oai_chat_completions_returns_200(client: TestClient) -> None:
    """A minimal, valid chat request is answered with HTTP 200."""
    payload = {
        "model": "cf-text",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    response = client.post("/v1/chat/completions", json=payload)
    assert response.status_code == 200
def test_oai_chat_completions_response_shape(client: TestClient) -> None:
    """The body exposes choices[0].message.content, which LLMRouter reads."""
    conversation = [
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": "Write a short greeting."},
    ]
    payload = {"model": "cf-text", "messages": conversation, "max_tokens": 64}
    body = client.post("/v1/chat/completions", json=payload).json()

    assert "choices" in body
    assert len(body["choices"]) == 1
    message = body["choices"][0]["message"]
    assert message["role"] == "assistant"
    assert isinstance(message["content"], str)
    assert len(message["content"]) > 0
def test_oai_chat_completions_includes_metadata(client: TestClient) -> None:
    """Response includes id, object, created, model, and usage fields."""
    resp = client.post(
        "/v1/chat/completions",
        json={"model": "cf-text", "messages": [{"role": "user", "content": "Hi"}]},
    )
    # Check the status first so an error response fails here rather than on
    # a field lookup below.
    assert resp.status_code == 200
    data = resp.json()
    assert data["object"] == "chat.completion"
    assert isinstance(data["id"], str)
    assert data["id"].startswith("cftext-")
    assert isinstance(data["created"], int)
    # The docstring promises a model check; assert it explicitly.
    assert isinstance(data["model"], str)
    assert "usage" in data
def test_health_endpoint_still_works(client: TestClient) -> None:
    """GET /health keeps answering 200 with status "ok" alongside the OAI route."""
    response = client.get("/health")
    assert response.status_code == 200
    body = response.json()
    assert body["status"] == "ok"