feat(text): add OpenAI-compat /v1/chat/completions endpoint
Adds POST /v1/chat/completions to the cf-text FastAPI service so it can be used as an openai_compat backend in LLMRouter without any router changes. The endpoint accepts the standard OpenAI chat request format and returns a standard chat.completion response. 4 tests added; all 36 text tests pass.
This commit is contained in:
parent
fc52d32574
commit
69a338bd98
2 changed files with 293 additions and 0 deletions
226
circuitforge_core/text/app.py
Normal file
226
circuitforge_core/text/app.py
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
"""
|
||||
cf-text FastAPI service — managed by cf-orch.
|
||||
|
||||
Lightweight local text generation. Supports GGUF models via llama.cpp and
|
||||
HuggingFace transformers. Sits alongside vllm/ollama for products that need
|
||||
fast, frequent inference from small local models (3B–7B Q4).
|
||||
|
||||
Endpoints:
|
||||
GET /health → {"status": "ok", "model": str, "vram_mb": int}
|
||||
POST /generate → GenerateResponse
|
||||
POST /chat → GenerateResponse
POST /v1/chat/completions → OAIChatResponse (OpenAI-compatible)
|
||||
|
||||
Usage:
|
||||
python -m circuitforge_core.text.app \
|
||||
--model /Library/Assets/LLM/qwen2.5-3b-instruct-q4_k_m.gguf \
|
||||
--port 8006 \
|
||||
--gpu-id 0
|
||||
|
||||
Mock mode (no model or GPU required):
|
||||
CF_TEXT_MOCK=1 python -m circuitforge_core.text.app --port 8006
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from functools import partial
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
|
||||
from circuitforge_core.text.backends.base import make_text_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_backend = None
|
||||
|
||||
|
||||
# ── Request / response models ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
class GenerateRequest(BaseModel):
    """Request body for ``POST /generate`` (raw prompt completion)."""

    # Prompt text handed to the backend as-is.
    prompt: str
    # Generation limit forwarded to the backend as ``max_tokens``.
    max_tokens: int = 512
    # Sampling temperature forwarded to the backend.
    temperature: float = 0.7
    # Optional stop sequences; ``None`` means the caller supplied none.
    stop: list[str] | None = None
|
||||
|
||||
|
||||
class ChatMessageModel(BaseModel):
    """One chat turn as received by ``POST /chat``."""

    # Speaker role; passed through to the backend without validation here.
    role: str
    # Text of the message.
    content: str
|
||||
|
||||
|
||||
class ChatRequest(BaseModel):
    """Request body for ``POST /chat``."""

    # Conversation so far, in the order the client sent it.
    messages: list[ChatMessageModel]
    # Generation limit forwarded to the backend as ``max_tokens``.
    max_tokens: int = 512
    # Sampling temperature forwarded to the backend.
    temperature: float = 0.7
|
||||
|
||||
|
||||
class GenerateResponse(BaseModel):
    """Response body shared by ``POST /generate`` and ``POST /chat``."""

    # Generated text copied from the backend result.
    text: str
    # Token count copied from the backend result; defaults to 0.
    tokens_used: int = 0
    # Model name copied from the backend result; defaults to "".
    model: str = ""
|
||||
|
||||
|
||||
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
|
||||
|
||||
|
||||
class OAIMessageModel(BaseModel):
    """Chat message in the OpenAI wire format (request and response side)."""

    # Speaker role; the endpoint sets "assistant" on the message it returns.
    role: str
    # Plain-text message content.
    content: str
|
||||
|
||||
|
||||
class OAIChatRequest(BaseModel):
    """Request body for ``POST /v1/chat/completions``."""

    # Requested model name; the endpoint only echoes it back when the
    # backend result carries no model name of its own.
    model: str = "cf-text"
    # Conversation so far, in the order the client sent it.
    messages: list[OAIMessageModel]
    # Optional generation limit; the endpoint substitutes 512 when unset.
    max_tokens: int | None = None
    # Sampling temperature forwarded to the backend.
    temperature: float = 0.7
    # Accepted for client compatibility; the endpoint always returns a
    # single, non-streamed completion.
    stream: bool = False
|
||||
|
||||
|
||||
class OAIChoice(BaseModel):
    """Single entry of the ``choices`` array in a chat completion response."""

    # Position within ``choices``; defaults to 0.
    index: int = 0
    # The generated assistant message.
    message: OAIMessageModel
    # Why generation ended; defaults to "stop".
    finish_reason: str = "stop"
|
||||
|
||||
|
||||
class OAIUsage(BaseModel):
    """Token accounting block of a chat completion response."""

    # Not populated by the endpoint, so it stays at the default of 0.
    prompt_tokens: int = 0
    # Filled from the backend result's ``tokens_used``.
    completion_tokens: int = 0
    # Filled from the backend result's ``tokens_used`` as well.
    total_tokens: int = 0
|
||||
|
||||
|
||||
class OAIChatResponse(BaseModel):
    """Response body of ``POST /v1/chat/completions``.

    Field names (including ``id`` and ``object``) follow the OpenAI wire
    format and therefore intentionally mirror builtin names.
    """

    # Response identifier; the endpoint generates a "cftext-" prefixed value.
    id: str
    # Object type tag; always "chat.completion" unless overridden.
    object: str = "chat.completion"
    # Creation time as integer Unix seconds.
    created: int
    # Name of the model that produced the completion.
    model: str
    # Generated alternatives; the endpoint returns exactly one.
    choices: list[OAIChoice]
    # Token accounting for the request.
    usage: OAIUsage
|
||||
|
||||
|
||||
# ── App factory ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def create_app(
    model_path: str,
    gpu_id: int = 0,
    backend: str | None = None,
    mock: bool = False,
) -> FastAPI:
    """Build the cf-text FastAPI application and initialise the text backend.

    Args:
        model_path: Model location handed to ``make_text_backend``.
        gpu_id: CUDA device index. Exported as ``CUDA_VISIBLE_DEVICES`` unless
            that variable is already present in the environment.
        backend: Optional backend name forwarded to ``make_text_backend``.
        mock: Forwarded to ``make_text_backend`` as its ``mock`` flag.

    Returns:
        A FastAPI app exposing ``GET /health``, ``POST /generate``,
        ``POST /chat`` and ``POST /v1/chat/completions``.

    Note:
        The backend lives in the module-level ``_backend`` global, so calling
        this factory again replaces the backend seen by every app created
        earlier in the same process.
    """
    global _backend

    # setdefault: an externally provided CUDA_VISIBLE_DEVICES wins over gpu_id.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))

    _backend = make_text_backend(model_path, backend=backend, mock=mock)
    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)

    app = FastAPI(title="cf-text", version="0.1.0")

    def _require_backend():
        """Return the active backend, or raise HTTP 503 when none is set."""
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        return _backend

    async def _run_chat(request_messages, max_tokens, temperature):
        """Convert request messages and run the blocking chat call off-loop."""
        active = _require_backend()
        messages = [BackendChatMessage(m.role, m.content) for m in request_messages]
        # chat() is sync-only in the Protocol; run it in a worker thread so the
        # event loop keeps serving other requests while the model generates.
        return await asyncio.to_thread(
            active.chat,
            messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

    @app.get("/health")
    def health() -> dict:
        """Report liveness plus the loaded model's name and VRAM usage."""
        active = _require_backend()
        return {
            "status": "ok",
            "model": active.model_name,
            "vram_mb": active.vram_mb,
        }

    @app.post("/generate")
    async def generate(req: GenerateRequest) -> GenerateResponse:
        """Complete a raw prompt via the backend's async generate call."""
        active = _require_backend()
        result = await active.generate_async(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=req.stop,
        )
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
            model=result.model,
        )

    @app.post("/chat")
    async def chat(req: ChatRequest) -> GenerateResponse:
        """Run a chat completion and return it in the native cf-text shape."""
        result = await _run_chat(req.messages, req.max_tokens, req.temperature)
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
            model=result.model,
        )

    @app.post("/v1/chat/completions")
    async def oai_chat_completions(req: OAIChatRequest) -> OAIChatResponse:
        """OpenAI-compatible chat completions endpoint.

        Allows LLMRouter (and any openai_compat client) to use cf-text
        without a custom backend type — just set base_url to this service's
        /v1 prefix.
        """
        if req.stream:
            # Streaming is not implemented; make the fallback visible in the
            # logs instead of silently ignoring the flag.
            logger.warning(
                "stream=True requested on /v1/chat/completions; "
                "returning a single non-streamed completion"
            )
        # A missing (or zero) max_tokens falls back to the service default.
        result = await _run_chat(req.messages, req.max_tokens or 512, req.temperature)
        reply = OAIMessageModel(role="assistant", content=result.text)
        return OAIChatResponse(
            id=f"cftext-{uuid.uuid4().hex[:12]}",
            created=int(time.time()),
            # Prefer the backend's model name; echo the request's otherwise.
            model=result.model or req.model,
            choices=[OAIChoice(message=reply)],
            # Only completion tokens are reported by the backend result, so
            # prompt_tokens stays at its default of 0.
            usage=OAIUsage(
                completion_tokens=result.tokens_used,
                total_tokens=result.tokens_used,
            ),
        )

    return app
|
||||
|
||||
|
||||
# ── CLI entrypoint ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="cf-text inference server")
|
||||
parser.add_argument("--model", default=os.environ.get("CF_TEXT_MODEL", "mock"),
|
||||
help="Path to GGUF file or HF model ID")
|
||||
parser.add_argument("--port", type=int, default=8006)
|
||||
parser.add_argument("--host", default="0.0.0.0")
|
||||
parser.add_argument("--gpu-id", type=int, default=0,
|
||||
help="CUDA device index to use")
|
||||
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
|
||||
parser.add_argument("--mock", action="store_true",
|
||||
help="Run in mock mode (no model or GPU needed)")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: configure logging, parse the CLI, build the app and
    # hand it to uvicorn.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
        level=logging.INFO,
    )
    args = _parse_args()

    # Mock mode can be requested three ways: the --mock flag, CF_TEXT_MOCK=1
    # in the environment, or the literal model name "mock".
    env_requests_mock = os.environ.get("CF_TEXT_MOCK", "") == "1"
    mock = args.mock or env_requests_mock or args.model == "mock"

    app = create_app(
        model_path=args.model,
        gpu_id=args.gpu_id,
        backend=args.backend,
        mock=mock,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
|
||||
67
tests/test_text/test_oai_compat.py
Normal file
67
tests/test_text/test_oai_compat.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# tests/test_text/test_oai_compat.py
|
||||
"""Tests for the OpenAI-compatible /v1/chat/completions endpoint on cf-text."""
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from circuitforge_core.text.app import create_app
|
||||
|
||||
|
||||
@pytest.fixture()
def client():
    """Provide a TestClient bound to a cf-text app running in mock mode."""
    return TestClient(create_app(model_path="mock", mock=True))
|
||||
|
||||
|
||||
def test_oai_chat_completions_returns_200(client: TestClient) -> None:
    """A minimal, well-formed chat request is answered with HTTP 200."""
    payload = {
        "model": "cf-text",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    response = client.post("/v1/chat/completions", json=payload)
    assert response.status_code == 200
|
||||
|
||||
|
||||
def test_oai_chat_completions_response_shape(client: TestClient) -> None:
    """Response contains the fields LLMRouter expects: choices[0].message.content."""
    resp = client.post(
        "/v1/chat/completions",
        json={
            "model": "cf-text",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Write a short greeting."},
            ],
            "max_tokens": 64,
        },
    )
    # Check the status first so an error response fails here with a clear
    # message rather than as a KeyError while inspecting the body.
    assert resp.status_code == 200
    data = resp.json()
    assert "choices" in data
    assert len(data["choices"]) == 1
    choice = data["choices"][0]
    assert choice["message"]["role"] == "assistant"
    assert isinstance(choice["message"]["content"], str)
    assert len(choice["message"]["content"]) > 0
|
||||
|
||||
|
||||
def test_oai_chat_completions_includes_metadata(client: TestClient) -> None:
    """Response includes id, object, created, model, and usage fields."""
    resp = client.post(
        "/v1/chat/completions",
        json={"model": "cf-text", "messages": [{"role": "user", "content": "Hi"}]},
    )
    # Check the status first so an error response fails with a clear message.
    assert resp.status_code == 200
    data = resp.json()
    assert data["object"] == "chat.completion"
    assert isinstance(data["id"], str)
    assert data["id"].startswith("cftext-")
    assert isinstance(data["created"], int)
    # The docstring promises a model field; verify it is actually present.
    assert isinstance(data["model"], str)
    assert "usage" in data
|
||||
|
||||
|
||||
def test_health_endpoint_still_works(client: TestClient) -> None:
    """GET /health keeps answering 200 with status "ok" alongside the OAI route."""
    health = client.get("/health")
    assert health.status_code == 200
    body = health.json()
    assert body["status"] == "ok"
|
||||
Loading…
Reference in a new issue