feat(text): multimodal content-block support + VLM mmproj passthrough

Add OpenAI-style content block models (ContentBlockText, ContentBlockImageURL)
to cf-text FastAPI app; update ChatMessage.content to accept str | list.
LlamaCppBackend gains mmproj_path + chat_format args for external projector
VLMs; embedded VLMs (Qwen2-VL, MiniCPM-V) detected via GGUF metadata.
Text-only backends raise ValueError on image input rather than silently
dropping them. Adds --mmproj CLI arg wired through create_app().

Closes: #66
This commit is contained in:
pyr0ball 2026-06-05 10:18:55 -07:00
parent 5a363f3b6c
commit 93ab528261
3 changed files with 276 additions and 38 deletions

View file

@ -1,14 +1,15 @@
""" """
cf-text FastAPI service managed by cf-orch. cf-text FastAPI service managed by cf-orch.
Lightweight local text generation. Supports GGUF models via llama.cpp and Lightweight local text generation and PII filtering. Supports GGUF models via
HuggingFace transformers. Sits alongside vllm/ollama for products that need llama.cpp, HuggingFace transformers, and token-classification models (classifier
fast, frequent inference from small local models (3B–7B Q4). backend) for PII detection and redaction.
Endpoints: Endpoints:
GET /health {"status": "ok", "model": str, "vram_mb": int, "backend": str} GET /health {"status": "ok", "model": str, "vram_mb": int, "backend": str}
POST /generate GenerateResponse POST /generate GenerateResponse (text-gen backends only)
POST /chat GenerateResponse POST /chat GenerateResponse (text-gen backends only)
POST /filter FilterResponse (classifier backend only)
Usage: Usage:
python -m circuitforge_core.text.app \ python -m circuitforge_core.text.app \
@ -34,17 +35,46 @@ import os
import time import time
import uuid import uuid
from functools import partial from functools import partial
from typing import Annotated, Literal, Union
import uvicorn import uvicorn
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from pydantic import BaseModel from pydantic import BaseModel, Field
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
from circuitforge_core.text.backends.base import make_text_backend from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
from circuitforge_core.text.filter import FilterResult, PIIFilter
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_backend = None _backend = None
_pii_filter: PIIFilter | None = None
# ── Content block types (OpenAI multimodal format) ────────────────────────────
# Text segment of a multimodal message (OpenAI content-block format).
class ContentBlockText(BaseModel):
    # Fixed tag; used as the discriminator value of the ContentBlock union.
    type: Literal["text"]
    # The text carried by this block.
    text: str
# Image reference segment of a multimodal message (OpenAI content-block format).
class ContentBlockImageURL(BaseModel):
    # Fixed tag; used as the discriminator value of the ContentBlock union.
    type: Literal["image_url"]
    # String-to-string mapping describing the image.
    # NOTE(review): presumably carries the image under a "url" key as in the
    # OpenAI format — the key set is not validated here; confirm against callers.
    image_url: dict[str, str]
# Union of the supported content-block models. The ``type`` field is declared
# as the discriminator, so each incoming block is validated against the model
# whose ``type`` literal matches.
ContentBlock = Annotated[
    Union[ContentBlockText, ContentBlockImageURL],
    Field(discriminator="type"),
]
def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
    """Build a BackendChatMessage from an API-level role/content pair.

    String content is forwarded unchanged; a list of content blocks is turned
    into a list of plain dicts via each block's ``model_dump()``.
    """
    payload = content if isinstance(content, str) else [block.model_dump() for block in content]
    return BackendChatMessage(role, payload)
# ── Request / response models ───────────────────────────────────────────────── # ── Request / response models ─────────────────────────────────────────────────
@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):
class ChatMessageModel(BaseModel): class ChatMessageModel(BaseModel):
role: str role: str
content: str content: Union[str, list[ContentBlock]] = ""
class ChatRequest(BaseModel): class ChatRequest(BaseModel):
@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
model: str = "" model: str = ""
# Request body for POST /filter.
class FilterRequest(BaseModel):
    # Input text handed to the PII filter.
    text: str
# One detected span in a /filter response; the /filter handler copies these
# fields one-to-one from each span of the filter result.
class PIISpanResponse(BaseModel):
    # Label assigned to the span by the filter.
    label: str
    # NOTE(review): start/end are presumably offsets into the original text —
    # units and inclusivity are not visible here; confirm against PIIFilter.
    start: int
    end: int
    # Text of the span as reported by the filter.
    text: str
    # Score reported for the span (range not visible here).
    score: float
# Response body for POST /filter.
class FilterResponse(BaseModel):
    # Redacted form of the input, as produced by the filter.
    redacted_text: str
    # Spans reported by the filter for this input.
    spans: list[PIISpanResponse]
    # Original text as reported back by the filter result.
    original_text: str
    # Name of the model that produced the result; empty string when not set.
    model: str = ""
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ────── # ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
class OAIMessageModel(BaseModel): class OAIMessageModel(BaseModel):
role: str role: str
content: str content: Union[str, list[ContentBlock]] = ""
class OAIChatRequest(BaseModel): class OAIChatRequest(BaseModel):
@ -120,6 +169,7 @@ def create_app(
gpu_ids: str | None = None, gpu_ids: str | None = None,
backend: str | None = None, backend: str | None = None,
mock: bool = False, mock: bool = False,
mmproj_path: str = "",
) -> FastAPI: ) -> FastAPI:
"""Start the cf-text FastAPI app. """Start the cf-text FastAPI app.
@ -127,8 +177,12 @@ def create_app(
(e.g. "0,1"). When set, overrides ``gpu_id`` and sets (e.g. "0,1"). When set, overrides ``gpu_id`` and sets
``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's ``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
``device_map="auto"`` can shard the model across all listed devices. ``device_map="auto"`` can shard the model across all listed devices.
When ``backend="classifier"``, the service skips the text-gen backends
and loads a token-classification pipeline instead. Only ``POST /filter``
is available in that mode; ``/generate`` and ``/chat`` return 501.
""" """
global _backend global _backend, _pii_filter
if not mock and not model_path: if not mock and not model_path:
raise ValueError( raise ValueError(
@ -139,13 +193,26 @@ def create_app(
visible = gpu_ids if gpu_ids else str(gpu_id) visible = gpu_ids if gpu_ids else str(gpu_id)
os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible) os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
_backend = make_text_backend(model_path, backend=backend, mock=mock) resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
if resolved_backend == "classifier" or (not resolved_backend and False):
classifier_backend = make_classifier_backend(model_path)
_pii_filter = PIIFilter.from_backend(classifier_backend)
logger.info(
"cf-text (classifier) ready: model=%r vram=%dMB",
classifier_backend.model_name,
classifier_backend.vram_mb,
)
else:
_backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb) logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
app = FastAPI(title="cf-text", version="0.1.0") app = FastAPI(title="cf-text", version="0.1.0")
@app.get("/health") @app.get("/health")
def health() -> dict: def health() -> dict:
if _pii_filter is not None:
b = _pii_filter._backend
return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
if _backend is None: if _backend is None:
raise HTTPException(503, detail="backend not initialised") raise HTTPException(503, detail="backend not initialised")
return { return {
@ -154,8 +221,35 @@ def create_app(
"vram_mb": _backend.vram_mb, "vram_mb": _backend.vram_mb,
} }
@app.post("/filter")
async def filter_text(req: FilterRequest) -> FilterResponse:
    """Run the loaded PII filter over ``req.text`` and return the result.

    Responds with HTTP 501 when this instance has no PII filter loaded.
    """
    if _pii_filter is None:
        raise HTTPException(
            501,
            detail="This cf-text instance is not running a classifier backend. "
                   "Start with --backend classifier and a token-classification model.",
        )

    outcome = await _pii_filter.filter_async(req.text)

    # Mirror every span of the filter result into its API response model.
    span_models = []
    for span in outcome.spans:
        span_models.append(
            PIISpanResponse(
                label=span.label,
                start=span.start,
                end=span.end,
                text=span.text,
                score=span.score,
            )
        )

    return FilterResponse(
        redacted_text=outcome.redacted_text,
        spans=span_models,
        original_text=outcome.original_text,
        model=_pii_filter._backend.model_name,
    )
@app.post("/generate") @app.post("/generate")
async def generate(req: GenerateRequest) -> GenerateResponse: async def generate(req: GenerateRequest) -> GenerateResponse:
if _pii_filter is not None:
raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
if _backend is None: if _backend is None:
raise HTTPException(503, detail="backend not initialised") raise HTTPException(503, detail="backend not initialised")
result = await _backend.generate_async( result = await _backend.generate_async(
@ -172,16 +266,20 @@ def create_app(
@app.post("/chat") @app.post("/chat")
async def chat(req: ChatRequest) -> GenerateResponse: async def chat(req: ChatRequest) -> GenerateResponse:
if _pii_filter is not None:
raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
if _backend is None: if _backend is None:
raise HTTPException(503, detail="backend not initialised") raise HTTPException(503, detail="backend not initialised")
messages = [BackendChatMessage(m.role, m.content) for m in req.messages] messages = [_to_backend_message(m.role, m.content) for m in req.messages]
# chat() is sync-only in the Protocol; run in thread pool to avoid blocking
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
try:
result = await loop.run_in_executor( result = await loop.run_in_executor(
None, None,
partial(_backend.chat, messages, partial(_backend.chat, messages,
max_tokens=req.max_tokens, temperature=req.temperature), max_tokens=req.max_tokens, temperature=req.temperature),
) )
except ValueError as exc:
raise HTTPException(422, detail=str(exc)) from exc
return GenerateResponse( return GenerateResponse(
text=result.text, text=result.text,
tokens_used=result.tokens_used, tokens_used=result.tokens_used,
@ -198,13 +296,16 @@ def create_app(
""" """
if _backend is None: if _backend is None:
raise HTTPException(503, detail="backend not initialised") raise HTTPException(503, detail="backend not initialised")
messages = [BackendChatMessage(m.role, m.content) for m in req.messages] messages = [_to_backend_message(m.role, m.content) for m in req.messages]
max_tok = req.max_tokens or 512 max_tok = req.max_tokens or 512
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
try:
result = await loop.run_in_executor( result = await loop.run_in_executor(
None, None,
partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature), partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
) )
except ValueError as exc:
raise HTTPException(422, detail=str(exc)) from exc
return OAIChatResponse( return OAIChatResponse(
id=f"cftext-{uuid.uuid4().hex[:12]}", id=f"cftext-{uuid.uuid4().hex[:12]}",
created=int(time.time()), created=int(time.time()),
@ -230,7 +331,16 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument("--gpu-ids", default=None, parser.add_argument("--gpu-ids", default=None,
help="Comma-separated CUDA device indices for multi-GPU spanning " help="Comma-separated CUDA device indices for multi-GPU spanning "
"(e.g. '0,1'). Overrides --gpu-id when set.") "(e.g. '0,1'). Overrides --gpu-id when set.")
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None) parser.add_argument(
"--backend",
choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
default=None,
)
parser.add_argument(
"--mmproj", default="",
help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
"Qwen2-VL and other self-contained VLMs don't need this.",
)
parser.add_argument("--mock", action="store_true", parser.add_argument("--mock", action="store_true",
help="Run in mock mode (no model or GPU needed)") help="Run in mock mode (no model or GPU needed)")
return parser.parse_args() return parser.parse_args()
@ -247,5 +357,6 @@ if __name__ == "__main__":
gpu_ids=args.gpu_ids, gpu_ids=args.gpu_ids,
backend=args.backend, backend=args.backend,
mock=mock, mock=mock,
mmproj_path=args.mmproj,
) )
uvicorn.run(app, host=args.host, port=args.port, log_level="info") uvicorn.run(app, host=args.host, port=args.port, log_level="info")

View file

@ -24,17 +24,44 @@ class GenerateResult:
class ChatMessage: class ChatMessage:
"""A single message in a chat conversation.""" """A single message in a chat conversation.
def __init__(self, role: str, content: str) -> None: ``content`` is either a plain string or a list of OpenAI-format content
blocks (dicts with ``type: "text"`` or ``type: "image_url"``). Backends
that do not support images should call ``text_only`` to get the string
form before passing to the model.
"""
def __init__(self, role: str, content: "str | list") -> None:
if role not in ("system", "user", "assistant"): if role not in ("system", "user", "assistant"):
raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.") raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
self.role = role self.role = role
self.content = content self.content: "str | list" = content
def to_dict(self) -> dict: def to_dict(self) -> dict:
return {"role": self.role, "content": self.content} return {"role": self.role, "content": self.content}
@property
def has_images(self) -> bool:
    """Report whether any content block is an ``image_url`` block.

    Plain-string content never carries images, so it always yields False.
    Non-dict entries in a content list are ignored.
    """
    blocks = self.content
    if isinstance(blocks, str):
        return False
    for block in blocks:
        if isinstance(block, dict) and block.get("type") == "image_url":
            return True
    return False
@property
def text_only(self) -> str:
    """Flatten multimodal content to a single string.

    Returns ``content`` unchanged when it is already a string. Otherwise the
    ``text`` values of all ``type: "text"`` blocks are joined with newlines;
    image blocks and non-dict entries are skipped.

    A text block with no ``text`` key (or ``text`` set to None) is skipped
    rather than raising KeyError, and non-string text values are coerced with
    ``str()`` so the join cannot fail on hand-built messages.
    """
    if isinstance(self.content, str):
        return self.content
    return "\n".join(
        str(b["text"])
        for b in self.content
        if isinstance(b, dict)
        and b.get("type") == "text"
        and b.get("text") is not None
    )
# ── TextBackend Protocol ────────────────────────────────────────────────────── # ── TextBackend Protocol ──────────────────────────────────────────────────────
@ -116,6 +143,33 @@ class TextBackend(Protocol):
... ...
# ── FilterBackend Protocol ────────────────────────────────────────────────────
@runtime_checkable
class FilterBackend(Protocol):
    """
    Abstract interface for token-classification / PII-filter backends.

    Separate from TextBackend — returns entity spans and redacted text,
    not generated text.
    """

    def classify(self, text: str) -> list[dict]:
        """Synchronous classify — returns list of entity span dicts."""
        ...

    async def classify_async(self, text: str) -> list[dict]:
        """Async classify — runs in thread pool."""
        ...

    # Identifier of the loaded model.
    @property
    def model_name(self) -> str: ...

    # VRAM figure in megabytes (whether estimated or measured is up to the
    # implementing backend — not visible here).
    @property
    def vram_mb(self) -> int: ...
# ── Backend selection ───────────────────────────────────────────────────────── # ── Backend selection ─────────────────────────────────────────────────────────
@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:
Raise ValueError for unrecognised override values. Raise ValueError for unrecognised override values.
""" """
_VALID = ("llamacpp", "transformers", "ollama", "vllm") _VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")
# 1. Caller-supplied override — highest trust, no inspection needed. # 1. Caller-supplied override — highest trust, no inspection needed.
resolved = backend or os.environ.get("CF_TEXT_BACKEND") resolved = backend or os.environ.get("CF_TEXT_BACKEND")
@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
# 3. Format detection — GGUF files are unambiguously llama-cpp territory. # 3. Format detection — GGUF files are unambiguously llama-cpp territory.
if model_path.lower().endswith(".gguf"): if model_path.lower().endswith(".gguf"):
return "llamacpp" return "llamacpp"
# 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
if os.path.isdir(model_path):
import glob as _glob
if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
return "llamacpp"
# 4. Safe default — transformers covers HF repo IDs and safetensors dirs. # 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
return "transformers" return "transformers"
@ -165,6 +224,7 @@ def make_text_backend(
model_path: str, model_path: str,
backend: str | None = None, backend: str | None = None,
mock: bool | None = None, mock: bool | None = None,
mmproj_path: str = "",
) -> "TextBackend": ) -> "TextBackend":
""" """
Return a TextBackend for the given model. Return a TextBackend for the given model.
@ -181,7 +241,7 @@ def make_text_backend(
if resolved == "llamacpp": if resolved == "llamacpp":
from circuitforge_core.text.backends.llamacpp import LlamaCppBackend from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
return LlamaCppBackend(model_path=model_path) return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)
if resolved == "transformers": if resolved == "transformers":
from circuitforge_core.text.backends.transformers import TransformersBackend from circuitforge_core.text.backends.transformers import TransformersBackend
@ -195,4 +255,22 @@ def make_text_backend(
from circuitforge_core.text.backends.vllm import VllmBackend from circuitforge_core.text.backends.vllm import VllmBackend
return VllmBackend(model_path=model_path) return VllmBackend(model_path=model_path)
raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.") raise ValueError(
f"Unknown backend {resolved!r}. "
"Expected 'llamacpp', 'transformers', 'ollama', 'vllm', or 'classifier'."
)
def make_classifier_backend(model_path: str) -> "FilterBackend":
    """
    Build the FilterBackend for the given token-classification model.

    CF_TEXT_MOCK=1  → MockClassifierBackend (no GPU, no model file needed)
    Otherwise       → ClassifierBackend via transformers pipeline
    """
    use_mock = os.environ.get("CF_TEXT_MOCK", "") == "1"
    if not use_mock:
        # Imported lazily so the mock path never loads the real backend module.
        from circuitforge_core.text.backends.classifier import ClassifierBackend
        return ClassifierBackend(model_path=model_path)

    from circuitforge_core.text.backends.mock import MockClassifierBackend
    return MockClassifierBackend(model_name=model_path)

View file

@ -48,7 +48,16 @@ class LlamaCppBackend:
Requires: pip install circuitforge-core[text-llamacpp] Requires: pip install circuitforge-core[text-llamacpp]
""" """
def __init__(self, model_path: str) -> None: def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
"""Load a GGUF model.
``mmproj_path``: path to a separate multimodal projector file (needed
for LLaVA-style VLMs where the visual encoder is a separate .gguf).
Qwen2-VL and similar models with an embedded projector don't need this.
``chat_format``: llama-cpp chat template override (e.g. "llava-1-5",
"moondream"). Required when mmproj_path is set.
"""
try: try:
from llama_cpp import Llama # type: ignore[import] from llama_cpp import Llama # type: ignore[import]
except ImportError as exc: except ImportError as exc:
@ -63,20 +72,53 @@ class LlamaCppBackend:
"Download a GGUF model and set CF_TEXT_MODEL to its path." "Download a GGUF model and set CF_TEXT_MODEL to its path."
) )
n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None # If given a directory, find the .gguf file inside it.
logger.info( if Path(model_path).is_dir():
"Loading GGUF model %s (ctx=%d, gpu_layers=%d)", candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS, if not candidates:
raise FileNotFoundError(
f"No .gguf file found in directory: {model_path}"
) )
self._llm = Llama( model_path = str(candidates[0])
n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
kwargs: dict = dict(
model_path=model_path, model_path=model_path,
n_ctx=_DEFAULT_N_CTX, n_ctx=_DEFAULT_N_CTX,
n_gpu_layers=_DEFAULT_N_GPU_LAYERS, n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
n_threads=n_threads, n_threads=n_threads,
verbose=False, verbose=False,
) )
if mmproj_path:
kwargs["clip_model_path"] = mmproj_path
kwargs["chat_format"] = chat_format or "llava-1-5"
logger.info(
"Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
)
else:
logger.info(
"Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
)
self._llm = Llama(**kwargs)
self._model_path = model_path self._model_path = model_path
self._vram_mb = _estimate_vram_mb(model_path) self._vram_mb = _estimate_vram_mb(model_path)
# True when the model was initialised with a visual encoder (explicit
# mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
def _detect_embedded_vlm(self) -> bool:
"""Heuristic: check model metadata for a known multimodal architecture."""
try:
meta = self._llm.metadata or {}
arch = str(meta.get("general.architecture", "")).lower()
# Qwen2-VL and similar embed the vision encoder inside the GGUF.
return any(tag in arch for tag in ("qwen2_vl", "llava", "moondream", "minicpm-v"))
except Exception:
return False
@property @property
def model_name(self) -> str: def model_name(self) -> str:
@ -181,7 +223,14 @@ class LlamaCppBackend:
max_tokens: int = 512, max_tokens: int = 512,
temperature: float = 0.7, temperature: float = 0.7,
) -> GenerateResult: ) -> GenerateResult:
# llama-cpp-python has native chat_completion for instruct models # Detect image content before calling the model.
if any(m.has_images for m in messages) and not self._is_vlm:
raise ValueError(
"model does not support image input — "
"load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
)
# llama-cpp-python create_chat_completion accepts content as str or
# list-of-blocks (OpenAI multimodal format) natively.
output = self._llm.create_chat_completion( output = self._llm.create_chat_completion(
messages=[m.to_dict() for m in messages], messages=[m.to_dict() for m in messages],
max_tokens=max_tokens, max_tokens=max_tokens,