feat(text): multimodal content-block support + VLM mmproj passthrough

Add OpenAI-style content block models (ContentBlockText, ContentBlockImageURL) to the cf-text FastAPI app; update ChatMessage.content to accept str | list. LlamaCppBackend gains mmproj_path + chat_format args for external projector VLMs; embedded VLMs (Qwen2-VL, MiniCPM-V) detected via GGUF metadata. Text-only backends raise ValueError on image input rather than silently dropping it. Adds --mmproj CLI arg wired through create_app(). Closes: #66
2026-06-05 10:18:55 -07:00 · 2026-06-05 10:18:55 -07:00 · 93ab528261
commit 93ab528261
parent 5a363f3b6c
3 changed files with 276 additions and 38 deletions
--- a/circuitforge_core/text/app.py
+++ b/circuitforge_core/text/app.py
@ -1,14 +1,15 @@
 """
 cf-text FastAPI service — managed by cf-orch.

-Lightweight local text generation. Supports GGUF models via llama.cpp and
-HuggingFace transformers. Sits alongside vllm/ollama for products that need
-fast, frequent inference from small local models (3B–7B Q4).
+Lightweight local text generation and PII filtering. Supports GGUF models via
+llama.cpp, HuggingFace transformers, and token-classification models (classifier
+backend) for PII detection and redaction.

 Endpoints:
  GET  /health      → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
-  POST /generate    → GenerateResponse
-  POST /chat        → GenerateResponse
+  POST /generate    → GenerateResponse          (text-gen backends only)
+  POST /chat        → GenerateResponse          (text-gen backends only)
+  POST /filter      → FilterResponse            (classifier backend only)

 Usage:
    python -m circuitforge_core.text.app \
@ -34,17 +35,46 @@ import os
 import time
 import uuid
 from functools import partial
+from typing import Annotated, Literal, Union

 import uvicorn
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
-from circuitforge_core.text.backends.base import make_text_backend
+from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
+from circuitforge_core.text.filter import FilterResult, PIIFilter

 logger = logging.getLogger(__name__)

 _backend = None
+_pii_filter: PIIFilter | None = None
+
+
+# ── Content block types (OpenAI multimodal format) ────────────────────────────
+
+
+class ContentBlockText(BaseModel):
+    """Text content block: ``type`` is fixed to ``"text"`` and ``text`` carries the string."""
+
+    type: Literal["text"]
+    text: str
+
+
+class ContentBlockImageURL(BaseModel):
+    """Image content block: ``type`` is fixed to ``"image_url"``."""
+
+    type: Literal["image_url"]
+    # String-to-string mapping describing the image.  Presumably carries a
+    # "url" entry as in the OpenAI chat format — TODO confirm; the keys are
+    # not validated here.
+    image_url: dict[str, str]
+
+
+# Tagged union of the supported content blocks; the ``type`` field is the
+# discriminator that selects which model a given block is validated as.
+ContentBlock = Annotated[
+    Union[ContentBlockText, ContentBlockImageURL],
+    Field(discriminator="type"),
+]
+
+
+def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
+    """Build a BackendChatMessage from an API-level role and content.
+
+    String content is forwarded unchanged; a list of content-block models is
+    dumped to plain dicts before being handed to the backend message.
+    """
+    payload = content if isinstance(content, str) else [block.model_dump() for block in content]
+    return BackendChatMessage(role, payload)


 # ── Request / response models ─────────────────────────────────────────────────
@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):

 class ChatMessageModel(BaseModel):
    role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""


 class ChatRequest(BaseModel):
@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
    model: str = ""


+class FilterRequest(BaseModel):
+    """Request body for ``POST /filter``."""
+
+    text: str  # input handed to the PII filter by the /filter handler
+
+
+class PIISpanResponse(BaseModel):
+    """One detected span as serialised in ``FilterResponse.spans``.
+
+    The /filter handler copies each field one-to-one from the span objects
+    of the filter result.
+    """
+
+    label: str
+    # NOTE(review): start/end are presumably character offsets into the
+    # original text — confirm against PIIFilter before relying on it.
+    start: int
+    end: int
+    text: str
+    score: float
+
+
+class FilterResponse(BaseModel):
+    """Response body for ``POST /filter``."""
+
+    redacted_text: str
+    spans: list[PIISpanResponse]
+    original_text: str
+    model: str = ""  # set by the /filter handler to the classifier backend's model_name
+
+
 # ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────


 class OAIMessageModel(BaseModel):
    role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""


 class OAIChatRequest(BaseModel):
@ -120,6 +169,7 @@ def create_app(
    gpu_ids: str | None = None,
    backend: str | None = None,
    mock: bool = False,
+    mmproj_path: str = "",
 ) -> FastAPI:
    """Start the cf-text FastAPI app.

@ -127,8 +177,12 @@ def create_app(
    (e.g. "0,1"). When set, overrides ``gpu_id`` and sets
    ``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
    ``device_map="auto"`` can shard the model across all listed devices.
+
+    When ``backend="classifier"``, the service skips the text-gen backends
+    and loads a token-classification pipeline instead. Only ``POST /filter``
+    is available in that mode; ``/generate`` and ``/chat`` return 501.
    """
-    global _backend
+    global _backend, _pii_filter

    if not mock and not model_path:
        raise ValueError(
@ -139,13 +193,26 @@ def create_app(
    visible = gpu_ids if gpu_ids else str(gpu_id)
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)

-    _backend = make_text_backend(model_path, backend=backend, mock=mock)
-    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
+    # Explicit argument wins; otherwise fall back to the CF_TEXT_BACKEND env var.
+    resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
+    # Classifier mode is strictly opt-in.  The previous condition carried an
+    # extra ``or (not resolved_backend and False)`` term that could never be
+    # true, so dropping it leaves the branch taken in exactly the same cases.
+    if resolved_backend == "classifier":
+        classifier_backend = make_classifier_backend(model_path)
+        _pii_filter = PIIFilter.from_backend(classifier_backend)
+        logger.info(
+            "cf-text (classifier) ready: model=%r vram=%dMB",
+            classifier_backend.model_name,
+            classifier_backend.vram_mb,
+        )
+    else:
+        _backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
+        logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)

    app = FastAPI(title="cf-text", version="0.1.0")

    @app.get("/health")
    def health() -> dict:
+        if _pii_filter is not None:
+            b = _pii_filter._backend
+            return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        return {
@ -154,8 +221,35 @@ def create_app(
            "vram_mb": _backend.vram_mb,
        }

+    @app.post("/filter")
+    async def filter_text(req: FilterRequest) -> FilterResponse:
+        """Run the PII filter over ``req.text``; answer 501 when no classifier is loaded."""
+        if _pii_filter is None:
+            raise HTTPException(
+                501,
+                detail="This cf-text instance is not running a classifier backend. "
+                       "Start with --backend classifier and a token-classification model.",
+            )
+        outcome = await _pii_filter.filter_async(req.text)
+        # Convert each span of the filter result into its response model.
+        span_models = []
+        for span in outcome.spans:
+            span_models.append(
+                PIISpanResponse(
+                    label=span.label,
+                    start=span.start,
+                    end=span.end,
+                    text=span.text,
+                    score=span.score,
+                )
+            )
+        return FilterResponse(
+            redacted_text=outcome.redacted_text,
+            spans=span_models,
+            original_text=outcome.original_text,
+            model=_pii_filter._backend.model_name,
+        )
+
    @app.post("/generate")
    async def generate(req: GenerateRequest) -> GenerateResponse:
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
        result = await _backend.generate_async(
@ -172,16 +266,20 @@ def create_app(

    @app.post("/chat")
    async def chat(req: ChatRequest) -> GenerateResponse:
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
-        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            partial(_backend.chat, messages,
-                    max_tokens=req.max_tokens, temperature=req.temperature),
-        )
+        try:
+            result = await loop.run_in_executor(
+                None,
+                partial(_backend.chat, messages,
+                        max_tokens=req.max_tokens, temperature=req.temperature),
+            )
+        except ValueError as exc:
+            raise HTTPException(422, detail=str(exc)) from exc
        return GenerateResponse(
            text=result.text,
            tokens_used=result.tokens_used,
@ -198,13 +296,16 @@ def create_app(
        """
        if _backend is None:
            raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
        max_tok = req.max_tokens or 512
        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
-        )
+        try:
+            result = await loop.run_in_executor(
+                None,
+                partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
+            )
+        except ValueError as exc:
+            raise HTTPException(422, detail=str(exc)) from exc
        return OAIChatResponse(
            id=f"cftext-{uuid.uuid4().hex[:12]}",
            created=int(time.time()),
@ -230,7 +331,16 @@ def _parse_args() -> argparse.Namespace:
    parser.add_argument("--gpu-ids", default=None,
                        help="Comma-separated CUDA device indices for multi-GPU spanning "
                             "(e.g. '0,1'). Overrides --gpu-id when set.")
-    parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
+    parser.add_argument(
+        "--backend",
+        choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
+        default=None,
+    )
+    parser.add_argument(
+        "--mmproj", default="",
+        help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
+             "Qwen2-VL and other self-contained VLMs don't need this.",
+    )
    parser.add_argument("--mock", action="store_true",
                        help="Run in mock mode (no model or GPU needed)")
    return parser.parse_args()
@ -247,5 +357,6 @@ if __name__ == "__main__":
        gpu_ids=args.gpu_ids,
        backend=args.backend,
        mock=mock,
+        mmproj_path=args.mmproj,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
--- a/circuitforge_core/text/backends/base.py
+++ b/circuitforge_core/text/backends/base.py
@ -24,17 +24,44 @@ class GenerateResult:


 class ChatMessage:
-    """A single message in a chat conversation."""
+    """A single message in a chat conversation.

-    def __init__(self, role: str, content: str) -> None:
+    ``content`` is either a plain string or a list of OpenAI-format content
+    blocks (dicts with ``type: "text"`` or ``type: "image_url"``).  Backends
+    that do not support images should call ``text_only`` to get the string
+    form before passing to the model.
+    """
+
+    def __init__(self, role: str, content: "str | list") -> None:
        if role not in ("system", "user", "assistant"):
            raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
        self.role = role
-        self.content = content
+        self.content: "str | list" = content

    def to_dict(self) -> dict:
        return {"role": self.role, "content": self.content}

+    @property
+    def has_images(self) -> bool:
+        """Report whether any content block is a dict whose type is ``image_url``.
+
+        Plain-string content never contains images.
+        """
+        blocks = self.content
+        if isinstance(blocks, str):
+            return False
+        for block in blocks:
+            if isinstance(block, dict) and block.get("type") == "image_url":
+                return True
+        return False
+
+    @property
+    def text_only(self) -> str:
+        """Return the textual part of the message.
+
+        String content is returned unchanged.  For block lists, the ``text``
+        of every dict block typed ``text`` is collected in order and joined
+        with newlines; all other blocks are left out.
+        """
+        if isinstance(self.content, str):
+            return self.content
+        pieces = []
+        for block in self.content:
+            if isinstance(block, dict) and block.get("type") == "text":
+                pieces.append(block["text"])
+        return "\n".join(pieces)
+

 # ── TextBackend Protocol ──────────────────────────────────────────────────────

@ -116,6 +143,33 @@ class TextBackend(Protocol):
        ...


+# ── FilterBackend Protocol ────────────────────────────────────────────────────
+
+
+@runtime_checkable
+class FilterBackend(Protocol):
+    """
+    Abstract interface for token-classification / PII-filter backends.
+
+    Separate from TextBackend — returns entity spans and redacted text,
+    not generated text.
+
+    Marked ``runtime_checkable`` so ``isinstance`` checks against this
+    protocol are permitted.
+    """
+
+    def classify(self, text: str) -> list[dict]:
+        """Synchronous classify — returns list of entity span dicts."""
+        ...
+
+    async def classify_async(self, text: str) -> list[dict]:
+        """Async classify — runs in thread pool."""
+        ...
+
+    # Name of the loaded model; create_app logs it at startup.
+    @property
+    def model_name(self) -> str: ...
+
+    # VRAM figure in megabytes; create_app logs it at startup.
+    @property
+    def vram_mb(self) -> int: ...
+
+
 # ── Backend selection ─────────────────────────────────────────────────────────


@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:

    Raise ValueError for unrecognised override values.
    """
-    _VALID = ("llamacpp", "transformers", "ollama", "vllm")
+    _VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")

    # 1. Caller-supplied override — highest trust, no inspection needed.
    resolved = backend or os.environ.get("CF_TEXT_BACKEND")
@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
    # 3. Format detection — GGUF files are unambiguously llama-cpp territory.
    if model_path.lower().endswith(".gguf"):
        return "llamacpp"
+    # 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
+    if os.path.isdir(model_path):
+        import glob as _glob
+        if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
+            return "llamacpp"

    # 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
    return "transformers"
@ -165,6 +224,7 @@ def make_text_backend(
    model_path: str,
    backend: str | None = None,
    mock: bool | None = None,
+    mmproj_path: str = "",
 ) -> "TextBackend":
    """
    Return a TextBackend for the given model.
@ -181,7 +241,7 @@ def make_text_backend(

    if resolved == "llamacpp":
        from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
-        return LlamaCppBackend(model_path=model_path)
+        return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)

    if resolved == "transformers":
        from circuitforge_core.text.backends.transformers import TransformersBackend
@ -195,4 +255,22 @@ def make_text_backend(
        from circuitforge_core.text.backends.vllm import VllmBackend
        return VllmBackend(model_path=model_path)

-    raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
+    raise ValueError(
+        f"Unknown backend {resolved!r}. "
+        "Expected 'llamacpp', 'transformers', 'ollama', 'vllm', or 'classifier'."
+    )
+
+
+def make_classifier_backend(model_path: str) -> "FilterBackend":
+    """
+    Build the FilterBackend used for token-classification (PII) filtering.
+
+    With CF_TEXT_MOCK=1 in the environment a MockClassifierBackend named
+    after ``model_path`` is returned; in every other case a
+    ClassifierBackend is constructed for ``model_path``.  Backend modules
+    are imported lazily so only the selected one is loaded.
+    """
+    use_mock = os.environ.get("CF_TEXT_MOCK", "") == "1"
+    if not use_mock:
+        from circuitforge_core.text.backends.classifier import ClassifierBackend
+        return ClassifierBackend(model_path=model_path)
+
+    from circuitforge_core.text.backends.mock import MockClassifierBackend
+    return MockClassifierBackend(model_name=model_path)
--- a/circuitforge_core/text/backends/llamacpp.py
+++ b/circuitforge_core/text/backends/llamacpp.py
@ -48,7 +48,16 @@ class LlamaCppBackend:
    Requires: pip install circuitforge-core[text-llamacpp]
    """

-    def __init__(self, model_path: str) -> None:
+    def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
+        """Load a GGUF model.
+
+        ``mmproj_path``: path to a separate multimodal projector file (needed
+        for LLaVA-style VLMs where the visual encoder is a separate .gguf).
+        Qwen2-VL and similar models with an embedded projector don't need this.
+
+        ``chat_format``: llama-cpp chat template override (e.g. "llava-1-5",
+        "moondream").  Only applied when mmproj_path is set; defaults to
+        "llava-1-5" when left empty.
+        """
        try:
            from llama_cpp import Llama  # type: ignore[import]
        except ImportError as exc:
@ -63,20 +72,53 @@ class LlamaCppBackend:
                "Download a GGUF model and set CF_TEXT_MODEL to its path."
            )

+        # If given a directory, find the .gguf weights file inside it.
+        if Path(model_path).is_dir():
+            candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
+            if not candidates:
+                raise FileNotFoundError(
+                    f"No .gguf file found in directory: {model_path}"
+                )
+            # VLM repos often keep the projector next to the weights as
+            # "mmproj-*.gguf", and that name can sort first (e.g. ahead of
+            # "model-*.gguf" or "moondream*.gguf").  Prefer files that are not
+            # projectors; if nothing else exists, keep the previous
+            # first-match behaviour.
+            weights = [p for p in candidates if not p.name.lower().startswith("mmproj")]
+            model_path = str((weights or candidates)[0])
+
        n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
-        logger.info(
-            "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
-            model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
-        )
-        self._llm = Llama(
+
+        kwargs: dict = dict(
            model_path=model_path,
            n_ctx=_DEFAULT_N_CTX,
            n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
            n_threads=n_threads,
            verbose=False,
        )
+        if mmproj_path:
+            kwargs["clip_model_path"] = mmproj_path
+            kwargs["chat_format"] = chat_format or "llava-1-5"
+            logger.info(
+                "Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
+                model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
+            )
+        else:
+            logger.info(
+                "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
+                model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
+            )
+
+        self._llm = Llama(**kwargs)
        self._model_path = model_path
        self._vram_mb = _estimate_vram_mb(model_path)
+        # True when the model was initialised with a visual encoder (explicit
+        # mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
+        self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
+
+    def _detect_embedded_vlm(self) -> bool:
+        """Heuristic: report whether GGUF metadata names a known multimodal architecture.
+
+        Reads ``general.architecture`` from ``self._llm.metadata`` and matches
+        it, lower-cased and with ``_``/``-`` separators removed, against a set
+        of VLM architecture tags.  Separators are stripped because GGUF files
+        spell the Qwen2-VL architecture ``qwen2vl``, which a ``qwen2_vl`` tag
+        can never match as a substring; every value the separator-carrying
+        tags matched before still matches after normalisation.
+
+        Returns:
+            True when a known tag occurs in the architecture string; False
+            otherwise, including when the metadata is missing or unreadable.
+        """
+        try:
+            meta = self._llm.metadata or {}
+            arch = str(meta.get("general.architecture", "")).lower()
+        except Exception:
+            # Deliberately best-effort: a failed metadata probe must not stop
+            # an otherwise usable model from loading as text-only.
+            return False
+        normalized = arch.replace("_", "").replace("-", "")
+        return any(tag in normalized for tag in ("qwen2vl", "llava", "moondream", "minicpmv"))

    @property
    def model_name(self) -> str:
@ -181,7 +223,14 @@ class LlamaCppBackend:
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> GenerateResult:
-        # llama-cpp-python has native chat_completion for instruct models
+        # Detect image content before calling the model.
+        if any(m.has_images for m in messages) and not self._is_vlm:
+            raise ValueError(
+                "model does not support image input — "
+                "load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
+            )
+        # llama-cpp-python create_chat_completion accepts content as str or
+        # list-of-blocks (OpenAI multimodal format) natively.
        output = self._llm.create_chat_completion(
            messages=[m.to_dict() for m in messages],
            max_tokens=max_tokens,