feat(text): add OpenAI-compat /v1/chat/completions endpoint
Adds POST /v1/chat/completions to the cf-text FastAPI service so it can be used as an openai_compat backend in LLMRouter without any router changes. The endpoint accepts the standard OpenAI chat request format and returns a standard chat.completion response. 4 tests added; all 36 text tests pass.
This commit is contained in:
parent
fc52d32574
commit
69a338bd98
2 changed files with 293 additions and 0 deletions
226
circuitforge_core/text/app.py
Normal file
226
circuitforge_core/text/app.py
Normal file
|
|
@ -0,0 +1,226 @@
|
||||||
|
"""
|
||||||
|
cf-text FastAPI service — managed by cf-orch.
|
||||||
|
|
||||||
|
Lightweight local text generation. Supports GGUF models via llama.cpp and
|
||||||
|
HuggingFace transformers. Sits alongside vllm/ollama for products that need
|
||||||
|
fast, frequent inference from small local models (3B–7B Q4).
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /health → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
|
||||||
|
POST /generate → GenerateResponse
|
||||||
|
POST /chat → GenerateResponse
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m circuitforge_core.text.app \
|
||||||
|
--model /Library/Assets/LLM/qwen2.5-3b-instruct-q4_k_m.gguf \
|
||||||
|
--port 8006 \
|
||||||
|
--gpu-id 0
|
||||||
|
|
||||||
|
Mock mode (no model or GPU required):
|
||||||
|
CF_TEXT_MOCK=1 python -m circuitforge_core.text.app --port 8006
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
|
||||||
|
from circuitforge_core.text.backends.base import make_text_backend
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_backend = None  # Active text backend instance; set exactly once by create_app().
|
||||||
|
|
||||||
|
|
||||||
|
# ── Request / response models ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateRequest(BaseModel):
    """Request body for POST /generate (raw prompt completion)."""

    prompt: str
    max_tokens: int = 512
    temperature: float = 0.7
    # Optional stop sequences forwarded to the backend; None means no stops.
    stop: list[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class ChatMessageModel(BaseModel):
    """One chat turn for the native POST /chat endpoint."""

    role: str  # e.g. "system" / "user" / "assistant" — not validated here
    content: str
|
||||||
|
|
||||||
|
|
||||||
|
class ChatRequest(BaseModel):
    """Request body for the native POST /chat endpoint."""

    messages: list[ChatMessageModel]
    max_tokens: int = 512
    temperature: float = 0.7
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateResponse(BaseModel):
    """Response body shared by POST /generate and POST /chat."""

    text: str
    tokens_used: int = 0  # completion token count as reported by the backend
    model: str = ""  # backend's model name; may be empty if unreported
|
||||||
|
|
||||||
|
|
||||||
|
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
|
||||||
|
|
||||||
|
|
||||||
|
class OAIMessageModel(BaseModel):
    """One chat message in the OpenAI wire format (request and response)."""

    role: str
    content: str
|
||||||
|
|
||||||
|
|
||||||
|
class OAIChatRequest(BaseModel):
    """OpenAI-style chat completion request (the subset cf-text honours)."""

    model: str = "cf-text"  # echoed back when the backend reports no model name
    messages: list[OAIMessageModel]
    max_tokens: int | None = None  # None falls back to the service default of 512
    temperature: float = 0.7
    # Accepted for wire compatibility; the endpoint does not read this field,
    # so streaming requests receive a normal JSON body — TODO confirm intended.
    stream: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class OAIChoice(BaseModel):
    """Single completion choice; cf-text always returns exactly one."""

    index: int = 0
    message: OAIMessageModel
    finish_reason: str = "stop"  # always "stop"; length truncation not signalled
|
||||||
|
|
||||||
|
|
||||||
|
class OAIUsage(BaseModel):
    """Token accounting in OpenAI format.

    The endpoint only fills completion_tokens/total_tokens from the backend's
    tokens_used; prompt_tokens stays 0 (the backend result does not report it).
    """

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class OAIChatResponse(BaseModel):
    """Top-level OpenAI chat.completion response envelope."""

    id: str  # "cftext-" + 12 hex chars, generated per request
    object: str = "chat.completion"
    created: int  # Unix timestamp (seconds)
    model: str
    choices: list[OAIChoice]
    usage: OAIUsage
|
||||||
|
|
||||||
|
|
||||||
|
# ── App factory ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def create_app(
    model_path: str,
    gpu_id: int = 0,
    backend: str | None = None,
    mock: bool = False,
) -> FastAPI:
    """Initialise the global text backend and build the cf-text FastAPI app.

    Args:
        model_path: Path to a GGUF file, an HF model ID, or "mock".
        gpu_id: CUDA device index; written to CUDA_VISIBLE_DEVICES only if
            the variable is not already set.
        backend: Force "llamacpp" or "transformers"; auto-detect when None.
        mock: Run with no real model or GPU (tests / CF_TEXT_MOCK=1).

    Returns:
        A configured FastAPI application with /health, /generate, /chat and
        the OpenAI-compatible /v1/chat/completions routes.
    """
    global _backend

    # setdefault: an operator-provided CUDA_VISIBLE_DEVICES takes precedence.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))

    _backend = make_text_backend(model_path, backend=backend, mock=mock)
    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)

    app = FastAPI(title="cf-text", version="0.1.0")

    @app.get("/health")
    def health() -> dict:
        """Liveness probe with basic backend metadata."""
        # NOTE(review): the module docstring advertises a "backend" key in
        # this payload; it is not returned here — confirm which is intended.
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        return {
            "status": "ok",
            "model": _backend.model_name,
            "vram_mb": _backend.vram_mb,
        }

    @app.post("/generate")
    async def generate(req: GenerateRequest) -> GenerateResponse:
        """Raw prompt completion via the backend's native async path."""
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        result = await _backend.generate_async(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=req.stop,
        )
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
            model=result.model,
        )

    @app.post("/chat")
    async def chat(req: ChatRequest) -> GenerateResponse:
        """Native chat endpoint returning the cf-text response shape."""
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking.
        # get_running_loop() instead of the deprecated get_event_loop(): we are
        # always inside the running loop here, and get_event_loop() emits a
        # DeprecationWarning (and may misbehave) from coroutines on 3.10+.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            partial(_backend.chat, messages,
                    max_tokens=req.max_tokens, temperature=req.temperature),
        )
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
            model=result.model,
        )

    @app.post("/v1/chat/completions")
    async def oai_chat_completions(req: OAIChatRequest) -> OAIChatResponse:
        """OpenAI-compatible chat completions endpoint.

        Allows LLMRouter (and any openai_compat client) to use cf-text
        without a custom backend type — just set base_url to this service's
        /v1 prefix.
        """
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        # NOTE(review): req.stream is ignored — a stream=true client gets a
        # plain JSON body, not SSE. Confirm whether it should be rejected.
        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
        max_tok = req.max_tokens or 512  # OpenAI clients may omit max_tokens
        # Same sync-Protocol workaround as /chat; see comment there.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
        )
        return OAIChatResponse(
            id=f"cftext-{uuid.uuid4().hex[:12]}",
            created=int(time.time()),
            model=result.model or req.model,
            choices=[OAIChoice(message=OAIMessageModel(role="assistant", content=result.text))],
            # prompt_tokens unavailable from the backend result; left at 0.
            usage=OAIUsage(completion_tokens=result.tokens_used, total_tokens=result.tokens_used),
        )

    return app
|
||||||
|
|
||||||
|
|
||||||
|
# ── CLI entrypoint ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="cf-text inference server")
|
||||||
|
parser.add_argument("--model", default=os.environ.get("CF_TEXT_MODEL", "mock"),
|
||||||
|
help="Path to GGUF file or HF model ID")
|
||||||
|
parser.add_argument("--port", type=int, default=8006)
|
||||||
|
parser.add_argument("--host", default="0.0.0.0")
|
||||||
|
parser.add_argument("--gpu-id", type=int, default=0,
|
||||||
|
help="CUDA device index to use")
|
||||||
|
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
|
||||||
|
parser.add_argument("--mock", action="store_true",
|
||||||
|
help="Run in mock mode (no model or GPU needed)")
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
    )
    args = _parse_args()
    # Mock mode is enabled by the CLI flag, the env var, or the literal
    # "mock" model path — any one of the three suffices.
    use_mock = (
        args.mock
        or os.environ.get("CF_TEXT_MOCK", "") == "1"
        or args.model == "mock"
    )
    server_app = create_app(
        model_path=args.model,
        gpu_id=args.gpu_id,
        backend=args.backend,
        mock=use_mock,
    )
    uvicorn.run(server_app, host=args.host, port=args.port, log_level="info")
|
||||||
67
tests/test_text/test_oai_compat.py
Normal file
67
tests/test_text/test_oai_compat.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
# tests/test_text/test_oai_compat.py
|
||||||
|
"""Tests for the OpenAI-compatible /v1/chat/completions endpoint on cf-text."""
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from circuitforge_core.text.app import create_app
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def client():
    """TestClient over a mock-mode cf-text app (no model or GPU required)."""
    return TestClient(create_app(model_path="mock", mock=True))
|
||||||
|
|
||||||
|
|
||||||
|
def test_oai_chat_completions_returns_200(client: TestClient) -> None:
    """POST /v1/chat/completions returns 200 with a valid request."""
    payload = {
        "model": "cf-text",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    response = client.post("/v1/chat/completions", json=payload)
    assert response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
def test_oai_chat_completions_response_shape(client: TestClient) -> None:
    """Response contains the fields LLMRouter expects: choices[0].message.content."""
    resp = client.post(
        "/v1/chat/completions",
        json={
            "model": "cf-text",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Write a short greeting."},
            ],
            "max_tokens": 64,
        },
    )
    # Assert the status first so an endpoint failure reports the HTTP code
    # instead of a confusing KeyError from the shape assertions below.
    assert resp.status_code == 200
    data = resp.json()
    assert "choices" in data
    assert len(data["choices"]) == 1
    choice = data["choices"][0]
    assert choice["message"]["role"] == "assistant"
    assert isinstance(choice["message"]["content"], str)
    assert len(choice["message"]["content"]) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_oai_chat_completions_includes_metadata(client: TestClient) -> None:
    """Response includes id, object, created, model, and usage fields."""
    resp = client.post(
        "/v1/chat/completions",
        json={"model": "cf-text", "messages": [{"role": "user", "content": "Hi"}]},
    )
    # Assert the status first so an endpoint failure reports the HTTP code
    # instead of a confusing KeyError from the metadata assertions below.
    assert resp.status_code == 200
    data = resp.json()
    assert data["object"] == "chat.completion"
    assert isinstance(data["id"], str)
    assert data["id"].startswith("cftext-")
    assert isinstance(data["created"], int)
    assert "usage" in data
|
||||||
|
|
||||||
|
|
||||||
|
def test_health_endpoint_still_works(client: TestClient) -> None:
    """Existing /health endpoint is unaffected by the new OAI route."""
    response = client.get("/health")
    assert response.status_code == 200
    body = response.json()
    assert body["status"] == "ok"
|
||||||
Loading…
Reference in a new issue