feat(text): add OpenAI-compat /v1/chat/completions endpoint

Adds POST /v1/chat/completions to the cf-text FastAPI service so it can be used as an openai_compat backend in LLMRouter without any router changes. The endpoint accepts the standard OpenAI chat request format and returns a standard chat.completion response. 4 tests added; all 36 text tests pass.
2026-04-12 17:04:58 -07:00 · 2026-04-12 17:04:58 -07:00 · 69a338bd98
commit 69a338bd98
parent fc52d32574
2 changed files with 293 additions and 0 deletions
--- a/circuitforge_core/text/app.py
+++ b/circuitforge_core/text/app.py
@ -0,0 +1,226 @@
+"""
+cf-text FastAPI service — managed by cf-orch.
+
+Lightweight local text generation. Supports GGUF models via llama.cpp and
+HuggingFace transformers. Sits alongside vllm/ollama for products that need
+fast, frequent inference from small local models (3B–7B Q4).
+
+Endpoints:
+  GET  /health      → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
+  POST /generate    → GenerateResponse
+  POST /chat        → GenerateResponse
+
+Usage:
+    python -m circuitforge_core.text.app \
+        --model /Library/Assets/LLM/qwen2.5-3b-instruct-q4_k_m.gguf \
+        --port 8006 \
+        --gpu-id 0
+
+Mock mode (no model or GPU required):
+    CF_TEXT_MOCK=1 python -m circuitforge_core.text.app --port 8006
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+import time
+import uuid
+from functools import partial
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
+from circuitforge_core.text.backends.base import make_text_backend
+
+logger = logging.getLogger(__name__)
+
+_backend = None
+
+
+# ── Request / response models ─────────────────────────────────────────────────
+
+
+class GenerateRequest(BaseModel):
+    prompt: str
+    max_tokens: int = 512
+    temperature: float = 0.7
+    stop: list[str] | None = None
+
+
+class ChatMessageModel(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    messages: list[ChatMessageModel]
+    max_tokens: int = 512
+    temperature: float = 0.7
+
+
+class GenerateResponse(BaseModel):
+    text: str
+    tokens_used: int = 0
+    model: str = ""
+
+
+# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
+
+
+class OAIMessageModel(BaseModel):
+    role: str
+    content: str
+
+
+class OAIChatRequest(BaseModel):
+    model: str = "cf-text"
+    messages: list[OAIMessageModel]
+    max_tokens: int | None = None
+    temperature: float = 0.7
+    stream: bool = False
+
+
+class OAIChoice(BaseModel):
+    index: int = 0
+    message: OAIMessageModel
+    finish_reason: str = "stop"
+
+
+class OAIUsage(BaseModel):
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_tokens: int = 0
+
+
+class OAIChatResponse(BaseModel):
+    id: str
+    object: str = "chat.completion"
+    created: int
+    model: str
+    choices: list[OAIChoice]
+    usage: OAIUsage
+
+
+# ── App factory ───────────────────────────────────────────────────────────────
+
+
+def create_app(
+    model_path: str,
+    gpu_id: int = 0,
+    backend: str | None = None,
+    mock: bool = False,
+) -> FastAPI:
+    global _backend
+
+    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))
+
+    _backend = make_text_backend(model_path, backend=backend, mock=mock)
+    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
+
+    app = FastAPI(title="cf-text", version="0.1.0")
+
+    @app.get("/health")
+    def health() -> dict:
+        if _backend is None:
+            raise HTTPException(503, detail="backend not initialised")
+        return {
+            "status": "ok",
+            "model": _backend.model_name,
+            "vram_mb": _backend.vram_mb,
+        }
+
+    @app.post("/generate")
+    async def generate(req: GenerateRequest) -> GenerateResponse:
+        if _backend is None:
+            raise HTTPException(503, detail="backend not initialised")
+        result = await _backend.generate_async(
+            req.prompt,
+            max_tokens=req.max_tokens,
+            temperature=req.temperature,
+            stop=req.stop,
+        )
+        return GenerateResponse(
+            text=result.text,
+            tokens_used=result.tokens_used,
+            model=result.model,
+        )
+
+    @app.post("/chat")
+    async def chat(req: ChatRequest) -> GenerateResponse:
+        if _backend is None:
+            raise HTTPException(503, detail="backend not initialised")
+        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking
+        loop = asyncio.get_event_loop()
+        result = await loop.run_in_executor(
+            None,
+            partial(_backend.chat, messages,
+                    max_tokens=req.max_tokens, temperature=req.temperature),
+        )
+        return GenerateResponse(
+            text=result.text,
+            tokens_used=result.tokens_used,
+            model=result.model,
+        )
+
+    @app.post("/v1/chat/completions")
+    async def oai_chat_completions(req: OAIChatRequest) -> OAIChatResponse:
+        """OpenAI-compatible chat completions endpoint.
+
+        Allows LLMRouter (and any openai_compat client) to use cf-text
+        without a custom backend type — just set base_url to this service's
+        /v1 prefix.
+        """
+        if _backend is None:
+            raise HTTPException(503, detail="backend not initialised")
+        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        max_tok = req.max_tokens or 512
+        loop = asyncio.get_event_loop()
+        result = await loop.run_in_executor(
+            None,
+            partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
+        )
+        return OAIChatResponse(
+            id=f"cftext-{uuid.uuid4().hex[:12]}",
+            created=int(time.time()),
+            model=result.model or req.model,
+            choices=[OAIChoice(message=OAIMessageModel(role="assistant", content=result.text))],
+            usage=OAIUsage(completion_tokens=result.tokens_used, total_tokens=result.tokens_used),
+        )
+
+    return app
+
+
+# ── CLI entrypoint ────────────────────────────────────────────────────────────
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="cf-text inference server")
+    parser.add_argument("--model", default=os.environ.get("CF_TEXT_MODEL", "mock"),
+                        help="Path to GGUF file or HF model ID")
+    parser.add_argument("--port", type=int, default=8006)
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--gpu-id", type=int, default=0,
+                        help="CUDA device index to use")
+    parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
+    parser.add_argument("--mock", action="store_true",
+                        help="Run in mock mode (no model or GPU needed)")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s %(levelname)s %(name)s — %(message)s")
+    args = _parse_args()
+    mock = args.mock or os.environ.get("CF_TEXT_MOCK", "") == "1" or args.model == "mock"
+    app = create_app(
+        model_path=args.model,
+        gpu_id=args.gpu_id,
+        backend=args.backend,
+        mock=mock,
+    )
+    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
--- a/tests/test_text/test_oai_compat.py
+++ b/tests/test_text/test_oai_compat.py
@ -0,0 +1,67 @@
+# tests/test_text/test_oai_compat.py
+"""Tests for the OpenAI-compatible /v1/chat/completions endpoint on cf-text."""
+import pytest
+from fastapi.testclient import TestClient
+
+from circuitforge_core.text.app import create_app
+
+
+@pytest.fixture()
+def client():
+    app = create_app(model_path="mock", mock=True)
+    return TestClient(app)
+
+
+def test_oai_chat_completions_returns_200(client: TestClient) -> None:
+    """POST /v1/chat/completions returns 200 with a valid request."""
+    resp = client.post(
+        "/v1/chat/completions",
+        json={
+            "model": "cf-text",
+            "messages": [{"role": "user", "content": "Hello"}],
+        },
+    )
+    assert resp.status_code == 200
+
+
+def test_oai_chat_completions_response_shape(client: TestClient) -> None:
+    """Response contains the fields LLMRouter expects: choices[0].message.content."""
+    resp = client.post(
+        "/v1/chat/completions",
+        json={
+            "model": "cf-text",
+            "messages": [
+                {"role": "system", "content": "You are helpful."},
+                {"role": "user", "content": "Write a short greeting."},
+            ],
+            "max_tokens": 64,
+        },
+    )
+    data = resp.json()
+    assert "choices" in data
+    assert len(data["choices"]) == 1
+    choice = data["choices"][0]
+    assert choice["message"]["role"] == "assistant"
+    assert isinstance(choice["message"]["content"], str)
+    assert len(choice["message"]["content"]) > 0
+
+
+def test_oai_chat_completions_includes_metadata(client: TestClient) -> None:
+    """Response includes id, object, created, model, and usage fields."""
+    resp = client.post(
+        "/v1/chat/completions",
+        json={"model": "cf-text", "messages": [{"role": "user", "content": "Hi"}]},
+    )
+    data = resp.json()
+    assert data["object"] == "chat.completion"
+    assert isinstance(data["id"], str)
+    assert data["id"].startswith("cftext-")
+    assert isinstance(data["created"], int)
+    assert "usage" in data
+
+
+def test_health_endpoint_still_works(client: TestClient) -> None:
+    """Existing /health endpoint is unaffected by the new OAI route."""
+    resp = client.get("/health")
+    assert resp.status_code == 200
+    assert resp.json()["status"] == "ok"