feat(text): multimodal content-block support + VLM mmproj passthrough
Add OpenAI-style content-block models (ContentBlockText, ContentBlockImageURL) to the cf-text FastAPI app and update ChatMessage.content to accept str | list. LlamaCppBackend gains mmproj_path and chat_format arguments for VLMs that use an external projector; embedded VLMs (Qwen2-VL, MiniCPM-V) are detected via GGUF metadata. Text-only backends raise ValueError on image input rather than silently dropping it. Adds a --mmproj CLI argument wired through create_app(). Closes: #66
This commit is contained in:
parent
5a363f3b6c
commit
93ab528261
3 changed files with 276 additions and 38 deletions
|
|
@ -1,14 +1,15 @@
|
|||
"""
|
||||
cf-text FastAPI service — managed by cf-orch.
|
||||
|
||||
Lightweight local text generation. Supports GGUF models via llama.cpp and
|
||||
HuggingFace transformers. Sits alongside vllm/ollama for products that need
|
||||
fast, frequent inference from small local models (3B–7B Q4).
|
||||
Lightweight local text generation and PII filtering. Supports GGUF models via
|
||||
llama.cpp, HuggingFace transformers, and token-classification models (classifier
|
||||
backend) for PII detection and redaction.
|
||||
|
||||
Endpoints:
|
||||
GET /health → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
|
||||
POST /generate → GenerateResponse
|
||||
POST /chat → GenerateResponse
|
||||
POST /generate → GenerateResponse (text-gen backends only)
|
||||
POST /chat → GenerateResponse (text-gen backends only)
|
||||
POST /filter → FilterResponse (classifier backend only)
|
||||
|
||||
Usage:
|
||||
python -m circuitforge_core.text.app \
|
||||
|
|
@ -34,17 +35,46 @@ import os
|
|||
import time
|
||||
import uuid
|
||||
from functools import partial
|
||||
from typing import Annotated, Literal, Union
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
|
||||
from circuitforge_core.text.backends.base import make_text_backend
|
||||
from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
|
||||
from circuitforge_core.text.filter import FilterResult, PIIFilter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_backend = None
|
||||
_pii_filter: PIIFilter | None = None
|
||||
|
||||
|
||||
# ── Content block types (OpenAI multimodal format) ────────────────────────────
|
||||
|
||||
|
||||
class ContentBlockText(BaseModel):
    """Text part of an OpenAI-style multimodal message."""

    # Fixed tag value identifying this block as text.
    type: Literal["text"]
    # The text carried by this block.
    text: str
|
||||
|
||||
|
||||
class ContentBlockImageURL(BaseModel):
    """Image part of an OpenAI-style multimodal message."""

    # Fixed tag value identifying this block as an image reference.
    type: Literal["image_url"]
    # String-to-string mapping describing the image.
    # NOTE(review): presumably carries at least a "url" entry, as in the
    # OpenAI format — confirm against callers.
    image_url: dict[str, str]
|
||||
|
||||
|
||||
# Tagged union of the supported content blocks; the ``type`` field is the
# discriminator that selects the concrete model during validation.
_ContentBlockUnion = Union[ContentBlockText, ContentBlockImageURL]
ContentBlock = Annotated[_ContentBlockUnion, Field(discriminator="type")]
|
||||
|
||||
|
||||
def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
    """Build the backend-side chat message for one API message.

    Plain-string content is forwarded unchanged; a list of content-block
    models is converted to a list of plain dicts via ``model_dump()``.
    """
    if not isinstance(content, str):
        # Backends work with raw dicts rather than pydantic models.
        content = [block.model_dump() for block in content]
    return BackendChatMessage(role, content)
|
||||
|
||||
|
||||
# ── Request / response models ─────────────────────────────────────────────────
|
||||
|
|
@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):
|
|||
|
||||
class ChatMessageModel(BaseModel):
    """One message of a chat request."""

    # Speaker of the message.
    role: str
    # Either plain text or a list of content blocks; an omitted value
    # defaults to the empty string.
    content: Union[str, list[ContentBlock]] = ""
|
||||
|
||||
|
||||
class ChatRequest(BaseModel):
|
||||
|
|
@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
|
|||
model: str = ""
|
||||
|
||||
|
||||
class FilterRequest(BaseModel):
    """Request body for ``POST /filter``."""

    # Text handed to the PII filter.
    text: str
|
||||
|
||||
|
||||
class PIISpanResponse(BaseModel):
    """One detected span as returned by ``POST /filter``."""

    # Label assigned to the span.
    label: str
    # NOTE(review): start/end are presumably offsets into the original
    # text — confirm the unit and whether ``end`` is exclusive.
    start: int
    end: int
    # Text of the span.
    text: str
    # Score reported for the span.
    score: float
|
||||
|
||||
|
||||
class FilterResponse(BaseModel):
    """Response body for ``POST /filter``."""

    # Input text after redaction.
    redacted_text: str
    # The spans reported by the filter.
    spans: list[PIISpanResponse]
    # The text as it was submitted.
    original_text: str
    # Name of the model that produced the result; empty when not supplied.
    model: str = ""
|
||||
|
||||
|
||||
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
|
||||
|
||||
|
||||
class OAIMessageModel(BaseModel):
    """One message of an OpenAI-compatible chat request."""

    # Speaker of the message.
    role: str
    # Either plain text or a list of content blocks; an omitted value
    # defaults to the empty string.
    content: Union[str, list[ContentBlock]] = ""
|
||||
|
||||
|
||||
class OAIChatRequest(BaseModel):
|
||||
|
|
@ -120,6 +169,7 @@ def create_app(
|
|||
gpu_ids: str | None = None,
|
||||
backend: str | None = None,
|
||||
mock: bool = False,
|
||||
mmproj_path: str = "",
|
||||
) -> FastAPI:
|
||||
"""Start the cf-text FastAPI app.
|
||||
|
||||
|
|
@ -127,8 +177,12 @@ def create_app(
|
|||
(e.g. "0,1"). When set, overrides ``gpu_id`` and sets
|
||||
``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
|
||||
``device_map="auto"`` can shard the model across all listed devices.
|
||||
|
||||
When ``backend="classifier"``, the service skips the text-gen backends
|
||||
and loads a token-classification pipeline instead. Only ``POST /filter``
|
||||
is available in that mode; ``/generate`` and ``/chat`` return 501.
|
||||
"""
|
||||
global _backend
|
||||
global _backend, _pii_filter
|
||||
|
||||
if not mock and not model_path:
|
||||
raise ValueError(
|
||||
|
|
@ -139,13 +193,26 @@ def create_app(
|
|||
visible = gpu_ids if gpu_ids else str(gpu_id)
|
||||
os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
|
||||
|
||||
_backend = make_text_backend(model_path, backend=backend, mock=mock)
|
||||
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
|
||||
resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
|
||||
if resolved_backend == "classifier" or (not resolved_backend and False):
|
||||
classifier_backend = make_classifier_backend(model_path)
|
||||
_pii_filter = PIIFilter.from_backend(classifier_backend)
|
||||
logger.info(
|
||||
"cf-text (classifier) ready: model=%r vram=%dMB",
|
||||
classifier_backend.model_name,
|
||||
classifier_backend.vram_mb,
|
||||
)
|
||||
else:
|
||||
_backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
|
||||
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
|
||||
|
||||
app = FastAPI(title="cf-text", version="0.1.0")
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> dict:
|
||||
if _pii_filter is not None:
|
||||
b = _pii_filter._backend
|
||||
return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
|
||||
if _backend is None:
|
||||
raise HTTPException(503, detail="backend not initialised")
|
||||
return {
|
||||
|
|
@ -154,8 +221,35 @@ def create_app(
|
|||
"vram_mb": _backend.vram_mb,
|
||||
}
|
||||
|
||||
@app.post("/filter")
async def filter_text(req: FilterRequest) -> FilterResponse:
    """Run the loaded PII filter over ``req.text`` and return its result.

    Raises:
        HTTPException: 501 when no PII filter is loaded in this instance.
    """
    if _pii_filter is None:
        not_loaded = (
            "This cf-text instance is not running a classifier backend. "
            "Start with --backend classifier and a token-classification model."
        )
        raise HTTPException(501, detail=not_loaded)

    outcome = await _pii_filter.filter_async(req.text)

    # Convert each span of the filter result into its response model.
    span_models = []
    for span in outcome.spans:
        span_models.append(
            PIISpanResponse(
                label=span.label,
                start=span.start,
                end=span.end,
                text=span.text,
                score=span.score,
            )
        )

    return FilterResponse(
        redacted_text=outcome.redacted_text,
        spans=span_models,
        original_text=outcome.original_text,
        model=_pii_filter._backend.model_name,
    )
|
||||
|
||||
@app.post("/generate")
|
||||
async def generate(req: GenerateRequest) -> GenerateResponse:
|
||||
if _pii_filter is not None:
|
||||
raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
|
||||
if _backend is None:
|
||||
raise HTTPException(503, detail="backend not initialised")
|
||||
result = await _backend.generate_async(
|
||||
|
|
@ -172,16 +266,20 @@ def create_app(
|
|||
|
||||
@app.post("/chat")
|
||||
async def chat(req: ChatRequest) -> GenerateResponse:
|
||||
if _pii_filter is not None:
|
||||
raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
|
||||
if _backend is None:
|
||||
raise HTTPException(503, detail="backend not initialised")
|
||||
messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
|
||||
# chat() is sync-only in the Protocol; run in thread pool to avoid blocking
|
||||
messages = [_to_backend_message(m.role, m.content) for m in req.messages]
|
||||
loop = asyncio.get_event_loop()
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
partial(_backend.chat, messages,
|
||||
max_tokens=req.max_tokens, temperature=req.temperature),
|
||||
)
|
||||
try:
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
partial(_backend.chat, messages,
|
||||
max_tokens=req.max_tokens, temperature=req.temperature),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(422, detail=str(exc)) from exc
|
||||
return GenerateResponse(
|
||||
text=result.text,
|
||||
tokens_used=result.tokens_used,
|
||||
|
|
@ -198,13 +296,16 @@ def create_app(
|
|||
"""
|
||||
if _backend is None:
|
||||
raise HTTPException(503, detail="backend not initialised")
|
||||
messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
|
||||
messages = [_to_backend_message(m.role, m.content) for m in req.messages]
|
||||
max_tok = req.max_tokens or 512
|
||||
loop = asyncio.get_event_loop()
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
|
||||
)
|
||||
try:
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(422, detail=str(exc)) from exc
|
||||
return OAIChatResponse(
|
||||
id=f"cftext-{uuid.uuid4().hex[:12]}",
|
||||
created=int(time.time()),
|
||||
|
|
@ -230,7 +331,16 @@ def _parse_args() -> argparse.Namespace:
|
|||
parser.add_argument("--gpu-ids", default=None,
|
||||
help="Comma-separated CUDA device indices for multi-GPU spanning "
|
||||
"(e.g. '0,1'). Overrides --gpu-id when set.")
|
||||
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mmproj", default="",
|
||||
help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
|
||||
"Qwen2-VL and other self-contained VLMs don't need this.",
|
||||
)
|
||||
parser.add_argument("--mock", action="store_true",
|
||||
help="Run in mock mode (no model or GPU needed)")
|
||||
return parser.parse_args()
|
||||
|
|
@ -247,5 +357,6 @@ if __name__ == "__main__":
|
|||
gpu_ids=args.gpu_ids,
|
||||
backend=args.backend,
|
||||
mock=mock,
|
||||
mmproj_path=args.mmproj,
|
||||
)
|
||||
uvicorn.run(app, host=args.host, port=args.port, log_level="info")
|
||||
|
|
|
|||
|
|
@ -24,17 +24,44 @@ class GenerateResult:
|
|||
|
||||
|
||||
class ChatMessage:
    """A single message in a chat conversation.

    ``content`` is either a plain string or a list of OpenAI-format content
    blocks (dicts with ``type: "text"`` or ``type: "image_url"``). Backends
    that do not support images should read ``text_only`` to get the string
    form before passing to the model.
    """

    def __init__(self, role: str, content: "str | list") -> None:
        """Store ``role`` and ``content``.

        Raises:
            ValueError: if ``role`` is not ``system``, ``user`` or ``assistant``.
        """
        if role not in ("system", "user", "assistant"):
            raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
        self.role = role
        self.content: "str | list" = content

    def to_dict(self) -> dict:
        """Return ``{"role": ..., "content": ...}`` with content passed through as stored."""
        return {"role": self.role, "content": self.content}

    @property
    def has_images(self) -> bool:
        """True when at least one content block is an image_url block."""
        if isinstance(self.content, str):
            return False
        return any(
            isinstance(block, dict) and block.get("type") == "image_url"
            for block in self.content
        )

    @property
    def text_only(self) -> str:
        """Flatten multimodal content to text.

        String content is returned as-is. For list content, the ``text`` of
        every text block is joined with newlines; non-dict entries, non-text
        blocks, and text blocks whose ``text`` is missing or not a string are
        skipped instead of raising.
        """
        if isinstance(self.content, str):
            return self.content
        pieces = []
        for block in self.content:
            if not isinstance(block, dict) or block.get("type") != "text":
                continue
            text = block.get("text")
            # A malformed text block must not break flattening.
            if isinstance(text, str):
                pieces.append(text)
        return "\n".join(pieces)
|
||||
|
||||
|
||||
# ── TextBackend Protocol ──────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -116,6 +143,33 @@ class TextBackend(Protocol):
|
|||
...
|
||||
|
||||
|
||||
# ── FilterBackend Protocol ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@runtime_checkable
class FilterBackend(Protocol):
    """
    Structural interface for token-classification / PII-filter backends.

    Kept apart from TextBackend: implementations return entity spans and
    redacted text, not generated text.
    """

    @property
    def model_name(self) -> str:
        """Name of the loaded model."""
        ...

    @property
    def vram_mb(self) -> int:
        """VRAM figure for the loaded model, in MB."""
        ...

    def classify(self, text: str) -> list[dict]:
        """Synchronous classify — returns list of entity span dicts."""
        ...

    async def classify_async(self, text: str) -> list[dict]:
        """Async classify — runs in thread pool."""
        ...
|
||||
|
||||
|
||||
# ── Backend selection ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
|
@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:
|
|||
|
||||
Raise ValueError for unrecognised override values.
|
||||
"""
|
||||
_VALID = ("llamacpp", "transformers", "ollama", "vllm")
|
||||
_VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")
|
||||
|
||||
# 1. Caller-supplied override — highest trust, no inspection needed.
|
||||
resolved = backend or os.environ.get("CF_TEXT_BACKEND")
|
||||
|
|
@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
|
|||
# 3. Format detection — GGUF files are unambiguously llama-cpp territory.
|
||||
if model_path.lower().endswith(".gguf"):
|
||||
return "llamacpp"
|
||||
# 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
|
||||
if os.path.isdir(model_path):
|
||||
import glob as _glob
|
||||
if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
|
||||
return "llamacpp"
|
||||
|
||||
# 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
|
||||
return "transformers"
|
||||
|
|
@ -165,6 +224,7 @@ def make_text_backend(
|
|||
model_path: str,
|
||||
backend: str | None = None,
|
||||
mock: bool | None = None,
|
||||
mmproj_path: str = "",
|
||||
) -> "TextBackend":
|
||||
"""
|
||||
Return a TextBackend for the given model.
|
||||
|
|
@ -181,7 +241,7 @@ def make_text_backend(
|
|||
|
||||
if resolved == "llamacpp":
|
||||
from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
|
||||
return LlamaCppBackend(model_path=model_path)
|
||||
return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)
|
||||
|
||||
if resolved == "transformers":
|
||||
from circuitforge_core.text.backends.transformers import TransformersBackend
|
||||
|
|
@ -195,4 +255,22 @@ def make_text_backend(
|
|||
from circuitforge_core.text.backends.vllm import VllmBackend
|
||||
return VllmBackend(model_path=model_path)
|
||||
|
||||
raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
|
||||
raise ValueError(
|
||||
f"Unknown backend {resolved!r}. "
|
||||
"Expected 'llamacpp', 'transformers', 'ollama', 'vllm', or 'classifier'."
|
||||
)
|
||||
|
||||
|
||||
def make_classifier_backend(model_path: str) -> "FilterBackend":
    """
    Build the FilterBackend for a token-classification model.

    With ``CF_TEXT_MOCK=1`` in the environment a MockClassifierBackend is
    returned, with ``model_path`` passed as its ``model_name``. In every
    other case a ClassifierBackend is constructed for ``model_path``.
    Backend modules are imported lazily, only for the branch taken.
    """
    mock_requested = os.environ.get("CF_TEXT_MOCK", "") == "1"
    if not mock_requested:
        from circuitforge_core.text.backends.classifier import ClassifierBackend

        return ClassifierBackend(model_path=model_path)

    from circuitforge_core.text.backends.mock import MockClassifierBackend

    return MockClassifierBackend(model_name=model_path)
|
||||
|
|
|
|||
|
|
@ -48,7 +48,16 @@ class LlamaCppBackend:
|
|||
Requires: pip install circuitforge-core[text-llamacpp]
|
||||
"""
|
||||
|
||||
def __init__(self, model_path: str) -> None:
|
||||
def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
|
||||
"""Load a GGUF model.
|
||||
|
||||
``mmproj_path``: path to a separate multimodal projector file (needed
|
||||
for LLaVA-style VLMs where the visual encoder is a separate .gguf).
|
||||
Qwen2-VL and similar models with an embedded projector don't need this.
|
||||
|
||||
``chat_format``: llama-cpp chat template override (e.g. "llava-1-5",
|
||||
"moondream"). Required when mmproj_path is set.
|
||||
"""
|
||||
try:
|
||||
from llama_cpp import Llama # type: ignore[import]
|
||||
except ImportError as exc:
|
||||
|
|
@ -63,20 +72,53 @@ class LlamaCppBackend:
|
|||
"Download a GGUF model and set CF_TEXT_MODEL to its path."
|
||||
)
|
||||
|
||||
# If given a directory, find the .gguf file inside it.
|
||||
if Path(model_path).is_dir():
|
||||
candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
|
||||
if not candidates:
|
||||
raise FileNotFoundError(
|
||||
f"No .gguf file found in directory: {model_path}"
|
||||
)
|
||||
model_path = str(candidates[0])
|
||||
|
||||
n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
|
||||
logger.info(
|
||||
"Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
|
||||
model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
|
||||
)
|
||||
self._llm = Llama(
|
||||
|
||||
kwargs: dict = dict(
|
||||
model_path=model_path,
|
||||
n_ctx=_DEFAULT_N_CTX,
|
||||
n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
|
||||
n_threads=n_threads,
|
||||
verbose=False,
|
||||
)
|
||||
if mmproj_path:
|
||||
kwargs["clip_model_path"] = mmproj_path
|
||||
kwargs["chat_format"] = chat_format or "llava-1-5"
|
||||
logger.info(
|
||||
"Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
|
||||
model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
|
||||
model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
|
||||
)
|
||||
|
||||
self._llm = Llama(**kwargs)
|
||||
self._model_path = model_path
|
||||
self._vram_mb = _estimate_vram_mb(model_path)
|
||||
# True when the model was initialised with a visual encoder (explicit
|
||||
# mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
|
||||
self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
|
||||
|
||||
def _detect_embedded_vlm(self) -> bool:
    """Heuristic: check model metadata for a known multimodal architecture.

    Reads ``general.architecture`` from ``self._llm.metadata`` and looks for
    a known VLM family tag in it. The comparison ignores case and the
    ``_`` / ``-`` separators, so ``qwen2vl``, ``qwen2_vl`` and ``qwen2-vl``
    are all recognised.

    Returns:
        True when a known tag is found; False otherwise, including when the
        metadata cannot be read.
    """
    try:
        meta = self._llm.metadata or {}
        arch = str(meta.get("general.architecture", "")).lower()
    except Exception:
        # Detection is best-effort: unreadable metadata means "not a VLM".
        return False
    # Strip separators so differently spelled names compare equal.
    normalised = arch.replace("_", "").replace("-", "")
    return any(tag in normalised for tag in ("qwen2vl", "llava", "moondream", "minicpmv"))
|
||||
|
||||
@property
|
||||
def model_name(self) -> str:
|
||||
|
|
@ -181,7 +223,14 @@ class LlamaCppBackend:
|
|||
max_tokens: int = 512,
|
||||
temperature: float = 0.7,
|
||||
) -> GenerateResult:
|
||||
# llama-cpp-python has native chat_completion for instruct models
|
||||
# Detect image content before calling the model.
|
||||
if any(m.has_images for m in messages) and not self._is_vlm:
|
||||
raise ValueError(
|
||||
"model does not support image input — "
|
||||
"load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
|
||||
)
|
||||
# llama-cpp-python create_chat_completion accepts content as str or
|
||||
# list-of-blocks (OpenAI multimodal format) natively.
|
||||
output = self._llm.create_chat_completion(
|
||||
messages=[m.to_dict() for m in messages],
|
||||
max_tokens=max_tokens,
|
||||
|
|
|
|||
Loading…
Reference in a new issue