diff --git a/circuitforge_core/text/app.py b/circuitforge_core/text/app.py
index fc84f35..3e9ead1 100644
--- a/circuitforge_core/text/app.py
+++ b/circuitforge_core/text/app.py
@@ -1,14 +1,15 @@
 """
 cf-text FastAPI service — managed by cf-orch.
 
-Lightweight local text generation. Supports GGUF models via llama.cpp and
-HuggingFace transformers. Sits alongside vllm/ollama for products that need
-fast, frequent inference from small local models (3B–7B Q4).
+Lightweight local text generation and PII filtering. Supports GGUF models via
+llama.cpp, HuggingFace transformers, and token-classification models (classifier
+backend) for PII detection and redaction.
 
 Endpoints:
   GET  /health      → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
-  POST /generate    → GenerateResponse
-  POST /chat        → GenerateResponse
+  POST /generate    → GenerateResponse          (text-gen backends only)
+  POST /chat        → GenerateResponse          (text-gen backends only)
+  POST /filter      → FilterResponse            (classifier backend only)
 
 Usage:
     python -m circuitforge_core.text.app \
@@ -34,17 +35,46 @@ import os
 import time
 import uuid
 from functools import partial
+from typing import Annotated, Literal, Union
 
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
-from circuitforge_core.text.backends.base import make_text_backend
+from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
+from circuitforge_core.text.filter import FilterResult, PIIFilter
 
 logger = logging.getLogger(__name__)
 
 _backend = None
+_pii_filter: PIIFilter | None = None
+
+
+# ── Content block types (OpenAI multimodal format) ────────────────────────────
+
+
+class ContentBlockText(BaseModel):
+    type: Literal["text"]  # fixed tag; the ContentBlock union discriminates on this field
+    text: str  # text payload of the block
+
+
+class ContentBlockImageURL(BaseModel):
+    type: Literal["image_url"]  # fixed tag; the ContentBlock union discriminates on this field
+    image_url: dict[str, str]  # str-to-str mapping (presumably {"url": ...} as in the OpenAI format; TODO confirm)
+
+
+ContentBlock = Annotated[
+    Union[ContentBlockText, ContentBlockImageURL],
+    Field(discriminator="type"),  # the "type" value selects which block model is validated
+]
+
+
+def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
+    """Build a BackendChatMessage; content blocks are dumped to plain dicts, strings pass through."""
+    if not isinstance(content, str):
+        content = [block.model_dump() for block in content]
+    return BackendChatMessage(role, content)
 
 
 # ── Request / response models ─────────────────────────────────────────────────
@@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):
 
 class ChatMessageModel(BaseModel):
     role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""  # plain text or a list of content blocks; "" when omitted
 
 
 class ChatRequest(BaseModel):
@@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
     model: str = ""
 
 
+class FilterRequest(BaseModel):
+    text: str  # input text handed to the PII filter by POST /filter
+
+
+class PIISpanResponse(BaseModel):
+    label: str  # label of the detected span
+    start: int  # span start (presumably an offset into the input text; TODO confirm unit)
+    end: int  # span end (same unit as start; inclusive/exclusive not visible here)
+    text: str  # span text as reported by the filter
+    score: float  # score reported by the filter for this span
+
+
+class FilterResponse(BaseModel):
+    redacted_text: str  # redacted_text of the filter result
+    spans: list[PIISpanResponse]  # one entry per span in the filter result
+    original_text: str  # original_text of the filter result, echoed back to the caller
+    model: str = ""  # model name of the backend that produced the result
+
+
 # ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
 
 
 class OAIMessageModel(BaseModel):
     role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""  # plain text or a list of content blocks; "" when omitted
 
 
 class OAIChatRequest(BaseModel):
@@ -120,6 +169,7 @@ def create_app(
     gpu_ids: str | None = None,
     backend: str | None = None,
     mock: bool = False,
+    mmproj_path: str = "",
 ) -> FastAPI:
     """Start the cf-text FastAPI app.
 
@@ -127,8 +177,12 @@ def create_app(
     (e.g. "0,1"). When set, overrides ``gpu_id`` and sets
     ``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
     ``device_map="auto"`` can shard the model across all listed devices.
+
+    When ``backend="classifier"``, the service skips the text-gen backends
+    and loads a token-classification pipeline instead. In that mode ``POST /filter``
+    and ``GET /health`` are served; ``/generate`` and ``/chat`` return 501.
     """
-    global _backend
+    global _backend, _pii_filter
 
     if not mock and not model_path:
         raise ValueError(
@@ -139,13 +193,26 @@ def create_app(
     visible = gpu_ids if gpu_ids else str(gpu_id)
     os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
 
-    _backend = make_text_backend(model_path, backend=backend, mock=mock)
-    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
+    resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
+    # Clear both module-level handles first so a repeated create_app() call in
+    # the other mode cannot leave a stale backend that the handlers still see.
+    _backend, _pii_filter = None, None
+    if resolved_backend == "classifier":
+        classifier_backend = make_classifier_backend(model_path)
+        _pii_filter = PIIFilter.from_backend(classifier_backend)
+        logger.info("cf-text (classifier) ready: model=%r vram=%dMB",
+                    classifier_backend.model_name, classifier_backend.vram_mb)
+    else:
+        _backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
+        logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
 
     app = FastAPI(title="cf-text", version="0.1.0")
 
     @app.get("/health")
     def health() -> dict:
+        if _pii_filter is not None:
+            b = _pii_filter._backend
+            return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
         return {
@@ -154,8 +221,35 @@ def create_app(
             "vram_mb": _backend.vram_mb,
         }
 
+    @app.post("/filter")
+    async def filter_text(req: FilterRequest) -> FilterResponse:
+        """Run the PII filter over ``req.text``; 501 unless a classifier backend is loaded."""
+        if _pii_filter is None:
+            raise HTTPException(
+                501,
+                detail="This cf-text instance is not running a classifier backend. "
+                       "Start with --backend classifier and a token-classification model.",
+            )
+        outcome = await _pii_filter.filter_async(req.text)
+        # Copy each detected span field-by-field into the response schema.
+        span_models = [
+            PIISpanResponse(
+                label=span.label, start=span.start, end=span.end,
+                text=span.text, score=span.score,
+            )
+            for span in outcome.spans
+        ]
+        return FilterResponse(
+            redacted_text=outcome.redacted_text,
+            spans=span_models,
+            original_text=outcome.original_text,
+            model=_pii_filter._backend.model_name,
+        )
+
     @app.post("/generate")
     async def generate(req: GenerateRequest) -> GenerateResponse:
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
         result = await _backend.generate_async(
@@ -172,16 +266,20 @@ def create_app(
 
     @app.post("/chat")
     async def chat(req: ChatRequest) -> GenerateResponse:
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
-        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
         loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            partial(_backend.chat, messages,
-                    max_tokens=req.max_tokens, temperature=req.temperature),
-        )
+        try:
+            result = await loop.run_in_executor(
+                None,
+                partial(_backend.chat, messages,
+                        max_tokens=req.max_tokens, temperature=req.temperature),
+            )
+        except ValueError as exc:
+            raise HTTPException(422, detail=str(exc)) from exc
         return GenerateResponse(
             text=result.text,
             tokens_used=result.tokens_used,
@@ -198,13 +296,16 @@ def create_app(
         """
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
         max_tok = req.max_tokens or 512
         loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
-        )
+        try:
+            result = await loop.run_in_executor(
+                None,
+                partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
+            )
+        except ValueError as exc:
+            raise HTTPException(422, detail=str(exc)) from exc
         return OAIChatResponse(
             id=f"cftext-{uuid.uuid4().hex[:12]}",
             created=int(time.time()),
@@ -230,7 +331,16 @@ def _parse_args() -> argparse.Namespace:
     parser.add_argument("--gpu-ids", default=None,
                         help="Comma-separated CUDA device indices for multi-GPU spanning "
                              "(e.g. '0,1'). Overrides --gpu-id when set.")
-    parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
+    parser.add_argument(
+        "--backend",
+        choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
+        default=None,
+    )
+    parser.add_argument(
+        "--mmproj", default="",
+        help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
+             "Qwen2-VL and other self-contained VLMs don't need this.",
+    )
     parser.add_argument("--mock", action="store_true",
                         help="Run in mock mode (no model or GPU needed)")
     return parser.parse_args()
@@ -247,5 +357,6 @@ if __name__ == "__main__":
         gpu_ids=args.gpu_ids,
         backend=args.backend,
         mock=mock,
+        mmproj_path=args.mmproj,
     )
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
diff --git a/circuitforge_core/text/backends/base.py b/circuitforge_core/text/backends/base.py
index 4982778..5e326d2 100644
--- a/circuitforge_core/text/backends/base.py
+++ b/circuitforge_core/text/backends/base.py
@@ -24,17 +24,44 @@ class GenerateResult:
 
 
 class ChatMessage:
-    """A single message in a chat conversation."""
+    """A single message in a chat conversation.
 
-    def __init__(self, role: str, content: str) -> None:
+    ``content`` is either a plain string or a list of OpenAI-format content
+    blocks (dicts with ``type: "text"`` or ``type: "image_url"``).  Backends
+    that do not support images should read ``text_only`` to get the string
+    form before passing to the model.
+    """
+
+    def __init__(self, role: str, content: "str | list") -> None:
         if role not in ("system", "user", "assistant"):
             raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
         self.role = role
-        self.content = content
+        self.content: "str | list" = content
 
     def to_dict(self) -> dict:
         return {"role": self.role, "content": self.content}
 
+    @property
+    def has_images(self) -> bool:
+        """True when content is a list holding at least one ``image_url`` block dict."""
+        if isinstance(self.content, str):
+            return False
+        return any(
+            isinstance(b, dict) and b.get("type") == "image_url"
+            for b in self.content
+        )
+
+    @property
+    def text_only(self) -> str:
+        """Return str content as-is, else the well-formed text blocks joined by newlines."""
+        if isinstance(self.content, str):
+            return self.content
+        return "\n".join(
+            b["text"]
+            for b in self.content
+            if isinstance(b, dict) and b.get("type") == "text" and isinstance(b.get("text"), str)
+        )
+
 
 # ── TextBackend Protocol ──────────────────────────────────────────────────────
 
@@ -116,6 +143,33 @@ class TextBackend(Protocol):
         ...
 
 
+# ── FilterBackend Protocol ────────────────────────────────────────────────────
+
+
+@runtime_checkable
+class FilterBackend(Protocol):
+    """
+    Structural interface for token-classification / PII-filter backends.
+
+    Separate from TextBackend — implementations return entity span dicts
+    for a piece of text, not generated text.
+    """
+
+    def classify(self, text: str) -> list[dict]:
+        """Classify ``text`` synchronously and return a list of entity span dicts."""
+        ...
+
+    async def classify_async(self, text: str) -> list[dict]:
+        """Awaitable counterpart of ``classify``; intended to run in a thread pool."""
+        ...
+
+    @property
+    def model_name(self) -> str: ...
+
+    @property
+    def vram_mb(self) -> int: ...
+
+
 # ── Backend selection ─────────────────────────────────────────────────────────
 
 
@@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:
 
     Raise ValueError for unrecognised override values.
     """
-    _VALID = ("llamacpp", "transformers", "ollama", "vllm")
+    _VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")
 
     # 1. Caller-supplied override — highest trust, no inspection needed.
     resolved = backend or os.environ.get("CF_TEXT_BACKEND")
@@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
     # 3. Format detection — GGUF files are unambiguously llama-cpp territory.
     if model_path.lower().endswith(".gguf"):
         return "llamacpp"
+    # 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
+    if os.path.isdir(model_path):
+        import glob as _glob
+        if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
+            return "llamacpp"
 
     # 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
     return "transformers"
@@ -165,6 +224,7 @@ def make_text_backend(
     model_path: str,
     backend: str | None = None,
     mock: bool | None = None,
+    mmproj_path: str = "",
 ) -> "TextBackend":
     """
     Return a TextBackend for the given model.
@@ -181,7 +241,7 @@ def make_text_backend(
 
     if resolved == "llamacpp":
         from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
-        return LlamaCppBackend(model_path=model_path)
+        return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)
 
     if resolved == "transformers":
         from circuitforge_core.text.backends.transformers import TransformersBackend
@@ -195,4 +255,22 @@ def make_text_backend(
         from circuitforge_core.text.backends.vllm import VllmBackend
         return VllmBackend(model_path=model_path)
 
-    raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
+    raise ValueError(
+        f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', "
+        "or 'vllm' (for 'classifier' call make_classifier_backend() instead)."
+    )
+
+
+def make_classifier_backend(model_path: str) -> "FilterBackend":
+    """
+    Build the FilterBackend that serves a token-classification model.
+
+    With CF_TEXT_MOCK=1 a MockClassifierBackend is returned (no GPU, no model
+    file needed); otherwise a ClassifierBackend via transformers pipeline.
+    """
+    if os.environ.get("CF_TEXT_MOCK", "") != "1":
+        from circuitforge_core.text.backends.classifier import ClassifierBackend
+        return ClassifierBackend(model_path=model_path)
+
+    from circuitforge_core.text.backends.mock import MockClassifierBackend
+    return MockClassifierBackend(model_name=model_path)
diff --git a/circuitforge_core/text/backends/llamacpp.py b/circuitforge_core/text/backends/llamacpp.py
index 2ddc932..98f42e0 100644
--- a/circuitforge_core/text/backends/llamacpp.py
+++ b/circuitforge_core/text/backends/llamacpp.py
@@ -48,7 +48,16 @@ class LlamaCppBackend:
     Requires: pip install circuitforge-core[text-llamacpp]
     """
 
-    def __init__(self, model_path: str) -> None:
+    def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
+        """Load a GGUF model.
+
+        ``mmproj_path``: path to a separate multimodal projector file (needed
+        for LLaVA-style VLMs where the visual encoder is a separate .gguf).
+        Qwen2-VL and similar models with an embedded projector don't need this.
+
+        ``chat_format``: llama-cpp chat template override (e.g. "llava-1-5",
+        "moondream").  Only applied when mmproj_path is set; defaults to "llava-1-5".
+        """
         try:
             from llama_cpp import Llama  # type: ignore[import]
         except ImportError as exc:
@@ -63,20 +72,53 @@ class LlamaCppBackend:
                 "Download a GGUF model and set CF_TEXT_MODEL to its path."
             )
 
+        # If given a directory, find the .gguf file inside it.
+        if Path(model_path).is_dir():
+            candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
+            if not candidates:
+                raise FileNotFoundError(
+                    f"No .gguf file found in directory: {model_path}"
+                )
+            model_path = str(candidates[0])
+
         n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
-        logger.info(
-            "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
-            model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
-        )
-        self._llm = Llama(
+
+        kwargs: dict = dict(
             model_path=model_path,
             n_ctx=_DEFAULT_N_CTX,
             n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
             n_threads=n_threads,
             verbose=False,
         )
+        if mmproj_path:
+            kwargs["clip_model_path"] = mmproj_path
+            kwargs["chat_format"] = chat_format or "llava-1-5"
+            logger.info(
+                "Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
+                model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
+            )
+        else:
+            logger.info(
+                "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
+                model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
+            )
+
+        self._llm = Llama(**kwargs)
         self._model_path = model_path
         self._vram_mb = _estimate_vram_mb(model_path)
+        # True when the model was initialised with a visual encoder (explicit
+        # mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
+        self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
+
+    def _detect_embedded_vlm(self) -> bool:
+        """Heuristic: True when the GGUF metadata names a known multimodal architecture."""
+        try:
+            meta = self._llm.metadata or {}
+            arch = str(meta.get("general.architecture", "")).lower()
+            # GGUF spells the Qwen2-VL architecture "qwen2vl"; the underscore form is kept too.
+            return any(tag in arch for tag in ("qwen2vl", "qwen2_vl", "llava", "moondream", "minicpm-v"))
+        except Exception:  # best-effort probe: unreadable metadata is treated as "not a VLM"
+            return False
 
     @property
     def model_name(self) -> str:
@@ -181,7 +223,14 @@ class LlamaCppBackend:
         max_tokens: int = 512,
         temperature: float = 0.7,
     ) -> GenerateResult:
-        # llama-cpp-python has native chat_completion for instruct models
+        # Detect image content before calling the model.
+        if any(m.has_images for m in messages) and not self._is_vlm:
+            raise ValueError(
+                "model does not support image input — "
+                "load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
+            )
+        # llama-cpp-python create_chat_completion accepts content as str or
+        # list-of-blocks (OpenAI multimodal format) natively.
         output = self._llm.create_chat_completion(
             messages=[m.to_dict() for m in messages],
             max_tokens=max_tokens,