feat(text): multimodal content-block support + VLM mmproj passthrough

Add OpenAI-style content block models (ContentBlockText, ContentBlockImageURL) to cf-text FastAPI app; update ChatMessage.content to accept str | list. LlamaCppBackend gains mmproj_path + chat_format args for external projector VLMs; embedded VLMs (Qwen2-VL, MiniCPM-V) detected via GGUF metadata. Text-only backends raise ValueError on image input rather than silently dropping it. Adds --mmproj CLI arg wired through create_app(). Closes: #66
2026-06-05 10:18:55 -07:00 · 2026-06-05 10:18:55 -07:00 · 93ab528261
commit 93ab528261
parent 5a363f3b6c
3 changed files with 276 additions and 38 deletions
--- a/circuitforge_core/text/app.py
+++ b/circuitforge_core/text/app.py
@ -1,14 +1,15 @@
 """
 cf-text FastAPI service — managed by cf-orch.
-Lightweight local text generation. Supports GGUF models via llama.cpp and
+Lightweight local text generation and PII filtering. Supports GGUF models via
-HuggingFace transformers. Sits alongside vllm/ollama for products that need
+llama.cpp, HuggingFace transformers, and token-classification models (classifier
-fast, frequent inference from small local models (3B–7B Q4).
+backend) for PII detection and redaction.
 Endpoints:
  GET  /health      → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
-  POST /generate    → GenerateResponse
+  POST /generate    → GenerateResponse          (text-gen backends only)
-  POST /chat        → GenerateResponse
+  POST /chat        → GenerateResponse          (text-gen backends only)
  POST /filter      → FilterResponse            (classifier backend only)
 Usage:
    python -m circuitforge_core.text.app \
@ -34,17 +35,46 @@ import os
 import time
 import uuid
 from functools import partial
 from typing import Annotated, Literal, Union
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
-from circuitforge_core.text.backends.base import make_text_backend
+from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
 from circuitforge_core.text.filter import FilterResult, PIIFilter
 logger = logging.getLogger(__name__)
 _backend = None
 _pii_filter: PIIFilter | None = None
 # ── Content block types (OpenAI multimodal format) ────────────────────────────
 class ContentBlockText(BaseModel):
    """Text content block: ``{"type": "text", "text": ...}``."""
    type: Literal["text"]
    text: str
 class ContentBlockImageURL(BaseModel):
    """Image content block: ``{"type": "image_url", "image_url": {...}}``.

    ``image_url`` is a string-to-string mapping; which keys it carries is not
    validated by this model.
    """
    type: Literal["image_url"]
    image_url: dict[str, str]
 # Tagged union of the supported content blocks; the ``type`` field is the
 # discriminator that selects the concrete model during validation.
 ContentBlock = Annotated[
    Union[ContentBlockText, ContentBlockImageURL],
    Field(discriminator="type"),
 ]
 def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
    """Build a BackendChatMessage from an API-level role/content pair.

    String content is passed through untouched; a list of content-block
    models is dumped to plain dicts (``model_dump()``) first.
    """
    payload = content if isinstance(content, str) else [block.model_dump() for block in content]
    return BackendChatMessage(role, payload)
 # ── Request / response models ─────────────────────────────────────────────────
@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):
 class ChatMessageModel(BaseModel):
    role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""
 class ChatRequest(BaseModel):
@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
    model: str = ""
 class FilterRequest(BaseModel):
    """Request body for ``POST /filter``: the text to run through the PII filter."""
    text: str
 class PIISpanResponse(BaseModel):
    """One detected span, as listed in ``FilterResponse.spans``.

    Fields are copied one-to-one from the spans of the filter result.
    NOTE(review): ``start``/``end`` are presumably offsets into the original
    text — confirm against PIIFilter.
    """
    label: str
    start: int
    end: int
    text: str
    score: float
 class FilterResponse(BaseModel):
    """Response body for ``POST /filter``.

    Carries the redacted text, the detected spans, the original text, and the
    name of the model that produced the result (empty string by default).
    """
    redacted_text: str
    spans: list[PIISpanResponse]
    original_text: str
    model: str = ""
 # ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
 class OAIMessageModel(BaseModel):
    role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""
 class OAIChatRequest(BaseModel):
@ -120,6 +169,7 @@ def create_app(
    gpu_ids: str | None = None,
    backend: str | None = None,
    mock: bool = False,
    mmproj_path: str = "",
 ) -> FastAPI:
    """Start the cf-text FastAPI app.
@ -127,8 +177,12 @@ def create_app(
    (e.g. "0,1"). When set, overrides ``gpu_id`` and sets
    ``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
    ``device_map="auto"`` can shard the model across all listed devices.
    When ``backend="classifier"``, the service skips the text-gen backends
    and loads a token-classification pipeline instead. Only ``POST /filter``
    is available in that mode; ``/generate`` and ``/chat`` return 501.
    """
-    global _backend
+    global _backend, _pii_filter
    if not mock and not model_path:
        raise ValueError(
@ -139,13 +193,26 @@ def create_app(
    visible = gpu_ids if gpu_ids else str(gpu_id)
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
-    _backend = make_text_backend(model_path, backend=backend, mock=mock)
+    resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
-    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
+    if resolved_backend == "classifier" or (not resolved_backend and False):
        classifier_backend = make_classifier_backend(model_path)
        _pii_filter = PIIFilter.from_backend(classifier_backend)
        logger.info(
            "cf-text (classifier) ready: model=%r vram=%dMB",
            classifier_backend.model_name,
            classifier_backend.vram_mb,
        )
    else:
        _backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
        logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
    app = FastAPI(title="cf-text", version="0.1.0")
    @app.get("/health")
    def health() -> dict:
        if _pii_filter is not None:
            b = _pii_filter._backend
            return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        return {
@ -154,8 +221,35 @@ def create_app(
            "vram_mb": _backend.vram_mb,
        }
    @app.post("/filter")
    async def filter_text(req: FilterRequest) -> FilterResponse:
        """Run the request text through the loaded PII filter.

        Responds with 501 when no classifier backend is loaded; otherwise
        returns the filter result converted to the response models.
        """
        if _pii_filter is None:
            raise HTTPException(
                501,
                detail="This cf-text instance is not running a classifier backend. "
                       "Start with --backend classifier and a token-classification model.",
            )
        outcome = await _pii_filter.filter_async(req.text)
        # Convert each result span into its response model, field by field.
        span_models = [
            PIISpanResponse(
                label=span.label,
                start=span.start,
                end=span.end,
                text=span.text,
                score=span.score,
            )
            for span in outcome.spans
        ]
        return FilterResponse(
            redacted_text=outcome.redacted_text,
            spans=span_models,
            original_text=outcome.original_text,
            model=_pii_filter._backend.model_name,
        )
    @app.post("/generate")
    async def generate(req: GenerateRequest) -> GenerateResponse:
        if _pii_filter is not None:
            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        result = await _backend.generate_async(
@ -172,16 +266,20 @@ def create_app(
    @app.post("/chat")
    async def chat(req: ChatRequest) -> GenerateResponse:
        if _pii_filter is not None:
            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
+        try:
-            None,
+            result = await loop.run_in_executor(
-            partial(_backend.chat, messages,
+                None,
-                    max_tokens=req.max_tokens, temperature=req.temperature),
+                partial(_backend.chat, messages,
-        )
+                        max_tokens=req.max_tokens, temperature=req.temperature),
            )
        except ValueError as exc:
            raise HTTPException(422, detail=str(exc)) from exc
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
@ -198,13 +296,16 @@ def create_app(
        """
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
        max_tok = req.max_tokens or 512
        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
+        try:
-            None,
+            result = await loop.run_in_executor(
-            partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
+                None,
-        )
+                partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
            )
        except ValueError as exc:
            raise HTTPException(422, detail=str(exc)) from exc
        return OAIChatResponse(
            id=f"cftext-{uuid.uuid4().hex[:12]}",
            created=int(time.time()),
@ -230,7 +331,16 @@ def _parse_args() -> argparse.Namespace:
    parser.add_argument("--gpu-ids", default=None,
                        help="Comma-separated CUDA device indices for multi-GPU spanning "
                             "(e.g. '0,1'). Overrides --gpu-id when set.")
-    parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
+    parser.add_argument(
        "--backend",
        choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
        default=None,
    )
    parser.add_argument(
        "--mmproj", default="",
        help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
             "Qwen2-VL and other self-contained VLMs don't need this.",
    )
    parser.add_argument("--mock", action="store_true",
                        help="Run in mock mode (no model or GPU needed)")
    return parser.parse_args()
@ -247,5 +357,6 @@ if __name__ == "__main__":
        gpu_ids=args.gpu_ids,
        backend=args.backend,
        mock=mock,
        mmproj_path=args.mmproj,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
--- a/circuitforge_core/text/backends/base.py
+++ b/circuitforge_core/text/backends/base.py
@ -24,17 +24,44 @@ class GenerateResult:
 class ChatMessage:
-    """A single message in a chat conversation."""
+    """A single message in a chat conversation.
-    def __init__(self, role: str, content: str) -> None:
+    ``content`` is either a plain string or a list of OpenAI-format content
    blocks (dicts with ``type: "text"`` or ``type: "image_url"``).  Backends
    that do not support images should call ``text_only`` to get the string
    form before passing to the model.
    """
    def __init__(self, role: str, content: "str | list") -> None:
        if role not in ("system", "user", "assistant"):
            raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
        self.role = role
-        self.content = content
+        self.content: "str | list" = content
    def to_dict(self) -> dict:
        return {"role": self.role, "content": self.content}
    @property
    def has_images(self) -> bool:
        """True when at least one content block is an image_url block."""
        if isinstance(self.content, str):
            return False
        return any(
            isinstance(b, dict) and b.get("type") == "image_url"
            for b in self.content
        )
    @property
    def text_only(self) -> str:
        """Flatten multimodal content to text. Returns content as-is if already str."""
        if isinstance(self.content, str):
            return self.content
        return "\n".join(
            b["text"]
            for b in self.content
            if isinstance(b, dict) and b.get("type") == "text"
        )
 # ── TextBackend Protocol ──────────────────────────────────────────────────────
@ -116,6 +143,33 @@ class TextBackend(Protocol):
        ...
 # ── FilterBackend Protocol ────────────────────────────────────────────────────
@runtime_checkable
 class FilterBackend(Protocol):
    """
    Abstract interface for token-classification / PII-filter backends.
    Separate from TextBackend — returns entity spans and redacted text,
    not generated text.

    Structural (duck-typed) interface: any object providing ``classify``,
    ``classify_async``, ``model_name`` and ``vram_mb`` satisfies it.
    NOTE(review): the methods declared here only return span dicts; redaction
    presumably happens in the wrapper built on top of the backend — confirm.
    """
    def classify(self, text: str) -> list[dict]:
        """Synchronous classify — returns list of entity span dicts."""
        ...
    async def classify_async(self, text: str) -> list[dict]:
        """Async classify — runs in thread pool."""
        ...
    # Identifier of the loaded model.
    @property
    def model_name(self) -> str: ...
    # VRAM figure for the loaded model, in megabytes.
    @property
    def vram_mb(self) -> int: ...
 # ── Backend selection ─────────────────────────────────────────────────────────
@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:
    Raise ValueError for unrecognised override values.
    """
-    _VALID = ("llamacpp", "transformers", "ollama", "vllm")
+    _VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")
    # 1. Caller-supplied override — highest trust, no inspection needed.
    resolved = backend or os.environ.get("CF_TEXT_BACKEND")
@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
    # 3. Format detection — GGUF files are unambiguously llama-cpp territory.
    if model_path.lower().endswith(".gguf"):
        return "llamacpp"
    # 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
    if os.path.isdir(model_path):
        import glob as _glob
        if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
            return "llamacpp"
    # 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
    return "transformers"
@ -165,6 +224,7 @@ def make_text_backend(
    model_path: str,
    backend: str | None = None,
    mock: bool | None = None,
    mmproj_path: str = "",
 ) -> "TextBackend":
    """
    Return a TextBackend for the given model.
@ -181,7 +241,7 @@ def make_text_backend(
    if resolved == "llamacpp":
        from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
-        return LlamaCppBackend(model_path=model_path)
+        return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)
    if resolved == "transformers":
        from circuitforge_core.text.backends.transformers import TransformersBackend
@ -195,4 +255,22 @@ def make_text_backend(
        from circuitforge_core.text.backends.vllm import VllmBackend
        return VllmBackend(model_path=model_path)
-    raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
+    raise ValueError(
        f"Unknown backend {resolved!r}. "
        "Expected 'llamacpp', 'transformers', 'ollama', 'vllm', or 'classifier'."
    )
 def make_classifier_backend(model_path: str) -> "FilterBackend":
    """
    Build the FilterBackend used for token classification.

    With ``CF_TEXT_MOCK=1`` in the environment a MockClassifierBackend is
    returned (named after ``model_path``); in every other case a
    ClassifierBackend is constructed for ``model_path``. Backend modules are
    imported lazily so only the selected one is loaded.
    """
    use_mock = os.environ.get("CF_TEXT_MOCK", "") == "1"
    if not use_mock:
        from circuitforge_core.text.backends.classifier import ClassifierBackend
        return ClassifierBackend(model_path=model_path)
    from circuitforge_core.text.backends.mock import MockClassifierBackend
    return MockClassifierBackend(model_name=model_path)
--- a/circuitforge_core/text/backends/llamacpp.py
+++ b/circuitforge_core/text/backends/llamacpp.py
@ -48,7 +48,16 @@ class LlamaCppBackend:
    Requires: pip install circuitforge-core[text-llamacpp]
    """
-    def __init__(self, model_path: str) -> None:
+    def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
        """Load a GGUF model.
        ``mmproj_path``: path to a separate multimodal projector file (needed
        for LLaVA-style VLMs where the visual encoder is a separate .gguf).
        Qwen2-VL and similar models with an embedded projector don't need this.
        ``chat_format``: llama-cpp chat template override (e.g. "llava-1-5",
        "moondream").  Required when mmproj_path is set.
        """
        try:
            from llama_cpp import Llama  # type: ignore[import]
        except ImportError as exc:
@ -63,20 +72,53 @@ class LlamaCppBackend:
                "Download a GGUF model and set CF_TEXT_MODEL to its path."
            )
        # If given a directory, find the .gguf file inside it.
        if Path(model_path).is_dir():
            candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
            if not candidates:
                raise FileNotFoundError(
                    f"No .gguf file found in directory: {model_path}"
                )
            model_path = str(candidates[0])
        n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
-        logger.info(
+
-            "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
+        kwargs: dict = dict(
            model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
        )
        self._llm = Llama(
            model_path=model_path,
            n_ctx=_DEFAULT_N_CTX,
            n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
            n_threads=n_threads,
            verbose=False,
        )
        if mmproj_path:
            kwargs["clip_model_path"] = mmproj_path
            kwargs["chat_format"] = chat_format or "llava-1-5"
            logger.info(
                "Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
                model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
            )
        else:
            logger.info(
                "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
                model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
            )
        self._llm = Llama(**kwargs)
        self._model_path = model_path
        self._vram_mb = _estimate_vram_mb(model_path)
        # True when the model was initialised with a visual encoder (explicit
        # mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
        self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
    def _detect_embedded_vlm(self) -> bool:
        """Heuristic: check model metadata for a known multimodal architecture."""
        try:
            meta = self._llm.metadata or {}
            arch = str(meta.get("general.architecture", "")).lower()
            # Qwen2-VL and similar embed the vision encoder inside the GGUF.
            return any(tag in arch for tag in ("qwen2_vl", "llava", "moondream", "minicpm-v"))
        except Exception:
            return False
    @property
    def model_name(self) -> str:
@ -181,7 +223,14 @@ class LlamaCppBackend:
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> GenerateResult:
-        # llama-cpp-python has native chat_completion for instruct models
+        # Detect image content before calling the model.
        if any(m.has_images for m in messages) and not self._is_vlm:
            raise ValueError(
                "model does not support image input — "
                "load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
            )
        # llama-cpp-python create_chat_completion accepts content as str or
        # list-of-blocks (OpenAI multimodal format) natively.
        output = self._llm.create_chat_completion(
            messages=[m.to_dict() for m in messages],
            max_tokens=max_tokens,