diff --git a/circuitforge_core/text/app.py b/circuitforge_core/text/app.py
index fc84f35..3e9ead1 100644
--- a/circuitforge_core/text/app.py
+++ b/circuitforge_core/text/app.py
@@ -1,14 +1,15 @@
 """
 cf-text FastAPI service — managed by cf-orch.
 
-Lightweight local text generation. Supports GGUF models via llama.cpp and
-HuggingFace transformers. Sits alongside vllm/ollama for products that need
-fast, frequent inference from small local models (3B–7B Q4).
+Lightweight local text generation and PII filtering. Supports GGUF models via
+llama.cpp, HuggingFace transformers, and token-classification models (classifier
+backend) for PII detection and redaction.
 
 Endpoints:
     GET /health → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
-    POST /generate → GenerateResponse
-    POST /chat → GenerateResponse
+    POST /generate → GenerateResponse (text-gen backends only)
+    POST /chat → GenerateResponse (text-gen backends only)
+    POST /filter → FilterResponse (classifier backend only)
 
 Usage:
     python -m circuitforge_core.text.app \
@@ -34,17 +35,46 @@ import os
 import time
 import uuid
 from functools import partial
+from typing import Annotated, Literal, Union
 
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
-from circuitforge_core.text.backends.base import make_text_backend
+from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
+from circuitforge_core.text.filter import FilterResult, PIIFilter
 
 logger = logging.getLogger(__name__)
 
 _backend = None
+_pii_filter: PIIFilter | None = None
+
+
+# ── Content block types (OpenAI multimodal format) ────────────────────────────
+
+
+class ContentBlockText(BaseModel):
+    type: Literal["text"]
+    text: str
+
+
+class ContentBlockImageURL(BaseModel):
+    type: Literal["image_url"]
+    image_url: dict[str, str]
+
+
+ContentBlock = Annotated[
+    Union[ContentBlockText, ContentBlockImageURL],
+    Field(discriminator="type"),
+]
+
+
+def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
+    """Convert an API message to a BackendChatMessage with raw content dicts."""
+    if isinstance(content, str):
+        return BackendChatMessage(role, content)
+    return BackendChatMessage(role, [b.model_dump() for b in content])
 
 
 # ── Request / response models ─────────────────────────────────────────────────
@@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):
 
 class ChatMessageModel(BaseModel):
     role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""
 
 
 class ChatRequest(BaseModel):
@@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
     model: str = ""
 
 
+class FilterRequest(BaseModel):
+    text: str
+
+
+class PIISpanResponse(BaseModel):
+    label: str
+    start: int
+    end: int
+    text: str
+    score: float
+
+
+class FilterResponse(BaseModel):
+    redacted_text: str
+    spans: list[PIISpanResponse]
+    original_text: str
+    model: str = ""
+
+
 # ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
 
 
 class OAIMessageModel(BaseModel):
     role: str
-    content: str
+    content: Union[str, list[ContentBlock]] = ""
 
 
 class OAIChatRequest(BaseModel):
@@ -120,6 +169,7 @@ def create_app(
     gpu_ids: str | None = None,
     backend: str | None = None,
     mock: bool = False,
+    mmproj_path: str = "",
 ) -> FastAPI:
     """Start the cf-text FastAPI app.
 
@@ -127,8 +177,12 @@ def create_app(
     (e.g. "0,1"). When set, overrides ``gpu_id`` and sets
     ``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
     ``device_map="auto"`` can shard the model across all listed devices.
+
+    When ``backend="classifier"``, the service skips the text-gen backends
+    and loads a token-classification pipeline instead. Only ``POST /filter``
+    is available in that mode; all text-generation endpoints return 501.
     """
-    global _backend
+    global _backend, _pii_filter
 
     if not mock and not model_path:
         raise ValueError(
@@ -139,13 +193,28 @@ def create_app(
     visible = gpu_ids if gpu_ids else str(gpu_id)
     os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
 
-    _backend = make_text_backend(model_path, backend=backend, mock=mock)
-    logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
+    resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
+    if resolved_backend == "classifier":
+        classifier_backend = make_classifier_backend(model_path, mock=mock)
+        _pii_filter = PIIFilter.from_backend(classifier_backend)
+        _backend = None
+        logger.info(
+            "cf-text (classifier) ready: model=%r vram=%dMB",
+            classifier_backend.model_name,
+            classifier_backend.vram_mb,
+        )
+    else:
+        _pii_filter = None
+        _backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
+        logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
 
     app = FastAPI(title="cf-text", version="0.1.0")
 
     @app.get("/health")
     def health() -> dict:
+        if _pii_filter is not None:
+            b = _pii_filter._backend
+            return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
         return {
@@ -154,8 +223,35 @@ def create_app(
             "vram_mb": _backend.vram_mb,
         }
 
+    @app.post("/filter")
+    async def filter_text(req: FilterRequest) -> FilterResponse:
+        if _pii_filter is None:
+            raise HTTPException(
+                501,
+                detail="This cf-text instance is not running a classifier backend. "
+                "Start with --backend classifier and a token-classification model.",
+            )
+        result = await _pii_filter.filter_async(req.text)
+        return FilterResponse(
+            redacted_text=result.redacted_text,
+            spans=[
+                PIISpanResponse(
+                    label=s.label,
+                    start=s.start,
+                    end=s.end,
+                    text=s.text,
+                    score=s.score,
+                )
+                for s in result.spans
+            ],
+            original_text=result.original_text,
+            model=_pii_filter._backend.model_name,
+        )
+
     @app.post("/generate")
     async def generate(req: GenerateRequest) -> GenerateResponse:
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
         result = await _backend.generate_async(
@@ -172,16 +268,20 @@ def create_app(
 
     @app.post("/chat")
     async def chat(req: ChatRequest) -> GenerateResponse:
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
-        # chat() is sync-only in the Protocol; run in thread pool to avoid blocking
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
         loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            partial(_backend.chat, messages,
-                    max_tokens=req.max_tokens, temperature=req.temperature),
-        )
+        try:
+            result = await loop.run_in_executor(
+                None,
+                partial(_backend.chat, messages,
+                        max_tokens=req.max_tokens, temperature=req.temperature),
+            )
+        except ValueError as exc:
+            raise HTTPException(422, detail=str(exc)) from exc
         return GenerateResponse(
             text=result.text,
             tokens_used=result.tokens_used,
@@ -198,13 +298,18 @@ def create_app(
         """
+        if _pii_filter is not None:
+            raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
         if _backend is None:
             raise HTTPException(503, detail="backend not initialised")
-        messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
+        messages = [_to_backend_message(m.role, m.content) for m in req.messages]
         max_tok = req.max_tokens or 512
         loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
-        )
+        try:
+            result = await loop.run_in_executor(
+                None,
+                partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
+            )
+        except ValueError as exc:
+            raise HTTPException(422, detail=str(exc)) from exc
         return OAIChatResponse(
             id=f"cftext-{uuid.uuid4().hex[:12]}",
             created=int(time.time()),
@@ -230,7 +335,16 @@ def _parse_args() -> argparse.Namespace:
     parser.add_argument("--gpu-ids", default=None,
                         help="Comma-separated CUDA device indices for multi-GPU spanning "
                              "(e.g. '0,1'). Overrides --gpu-id when set.")
-    parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
+    parser.add_argument(
+        "--backend",
+        choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
+        default=None,
+    )
+    parser.add_argument(
+        "--mmproj", default="",
+        help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
+             "Qwen2-VL and other self-contained VLMs don't need this.",
+    )
     parser.add_argument("--mock", action="store_true",
                         help="Run in mock mode (no model or GPU needed)")
     return parser.parse_args()
@@ -247,5 +361,6 @@ if __name__ == "__main__":
         gpu_ids=args.gpu_ids,
         backend=args.backend,
         mock=mock,
+        mmproj_path=args.mmproj,
     )
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
diff --git a/circuitforge_core/text/backends/base.py b/circuitforge_core/text/backends/base.py
index 4982778..5e326d2 100644
--- a/circuitforge_core/text/backends/base.py
+++ b/circuitforge_core/text/backends/base.py
@@ -24,17 +24,44 @@ class GenerateResult:
 
 
 class ChatMessage:
-    """A single message in a chat conversation."""
+    """A single message in a chat conversation.
 
-    def __init__(self, role: str, content: str) -> None:
+    ``content`` is either a plain string or a list of OpenAI-format content
+    blocks (dicts with ``type: "text"`` or ``type: "image_url"``). Backends
+    that do not support images should call ``text_only`` to get the string
+    form before passing to the model.
+    """
+
+    def __init__(self, role: str, content: "str | list") -> None:
         if role not in ("system", "user", "assistant"):
             raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
         self.role = role
-        self.content = content
+        self.content: "str | list" = content
 
     def to_dict(self) -> dict:
         return {"role": self.role, "content": self.content}
 
+    @property
+    def has_images(self) -> bool:
+        """True when at least one content block is an image_url block."""
+        if isinstance(self.content, str):
+            return False
+        return any(
+            isinstance(b, dict) and b.get("type") == "image_url"
+            for b in self.content
+        )
+
+    @property
+    def text_only(self) -> str:
+        """Flatten multimodal content to text. Returns content as-is if already str."""
+        if isinstance(self.content, str):
+            return self.content
+        return "\n".join(
+            b["text"]
+            for b in self.content
+            if isinstance(b, dict) and b.get("type") == "text"
+        )
+
 
 # ── TextBackend Protocol ──────────────────────────────────────────────────────
 
@@ -116,6 +143,33 @@ class TextBackend(Protocol):
         ...
 
 
+# ── FilterBackend Protocol ────────────────────────────────────────────────────
+
+
+@runtime_checkable
+class FilterBackend(Protocol):
+    """
+    Abstract interface for token-classification / PII-filter backends.
+
+    Separate from TextBackend — returns entity spans and redacted text,
+    not generated text.
+    """
+
+    def classify(self, text: str) -> list[dict]:
+        """Synchronous classify — returns list of entity span dicts."""
+        ...
+
+    async def classify_async(self, text: str) -> list[dict]:
+        """Async classify — runs in thread pool."""
+        ...
+
+    @property
+    def model_name(self) -> str: ...
+
+    @property
+    def vram_mb(self) -> int: ...
+
+
 # ── Backend selection ─────────────────────────────────────────────────────────
 
 
@@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:
 
     Raise ValueError for unrecognised override values.
     """
-    _VALID = ("llamacpp", "transformers", "ollama", "vllm")
+    _VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")
 
     # 1. Caller-supplied override — highest trust, no inspection needed.
     resolved = backend or os.environ.get("CF_TEXT_BACKEND")
@@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
     # 3. Format detection — GGUF files are unambiguously llama-cpp territory.
     if model_path.lower().endswith(".gguf"):
         return "llamacpp"
+    # 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
+    if os.path.isdir(model_path):
+        import glob as _glob
+        if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
+            return "llamacpp"
 
     # 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
     return "transformers"
@@ -165,6 +224,7 @@ def make_text_backend(
     model_path: str,
     backend: str | None = None,
     mock: bool | None = None,
+    mmproj_path: str = "",
 ) -> "TextBackend":
     """
     Return a TextBackend for the given model.
@@ -181,7 +241,7 @@ def make_text_backend(
 
     if resolved == "llamacpp":
         from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
-        return LlamaCppBackend(model_path=model_path)
+        return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)
 
     if resolved == "transformers":
         from circuitforge_core.text.backends.transformers import TransformersBackend
@@ -195,4 +255,28 @@ def make_text_backend(
         from circuitforge_core.text.backends.vllm import VllmBackend
         return VllmBackend(model_path=model_path)
 
-    raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
+    if resolved == "classifier":
+        raise ValueError(
+            "'classifier' is not a text-generation backend; "
+            "use make_classifier_backend() instead."
+        )
+
+    raise ValueError(
+        f"Unknown backend {resolved!r}. "
+        "Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'."
+    )
+
+
+def make_classifier_backend(model_path: str, mock: bool | None = None) -> "FilterBackend":
+    """
+    Return a FilterBackend for the given token-classification model.
+
+    mock=True or CF_TEXT_MOCK=1 → MockClassifierBackend (no GPU, no model file needed)
+    Otherwise → ClassifierBackend via transformers pipeline
+    """
+    if mock or os.environ.get("CF_TEXT_MOCK", "") == "1":
+        from circuitforge_core.text.backends.mock import MockClassifierBackend
+        return MockClassifierBackend(model_name=model_path)
+
+    from circuitforge_core.text.backends.classifier import ClassifierBackend
+    return ClassifierBackend(model_path=model_path)
diff --git a/circuitforge_core/text/backends/llamacpp.py b/circuitforge_core/text/backends/llamacpp.py
index 2ddc932..98f42e0 100644
--- a/circuitforge_core/text/backends/llamacpp.py
+++ b/circuitforge_core/text/backends/llamacpp.py
@@ -48,7 +48,16 @@ class LlamaCppBackend:
     Requires: pip install circuitforge-core[text-llamacpp]
     """
 
-    def __init__(self, model_path: str) -> None:
+    def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
+        """Load a GGUF model.
+
+        ``mmproj_path``: path to a separate multimodal projector file (needed
+        for LLaVA-style VLMs where the visual encoder is a separate .gguf).
+        Qwen2-VL and similar models with an embedded projector don't need this.
+
+        ``chat_format``: VLM chat handler to use with ``mmproj_path`` (e.g.
+        "llava-1-5", "moondream"). Defaults to "llava-1-5"; ignored without mmproj_path.
+        """
         try:
             from llama_cpp import Llama  # type: ignore[import]
         except ImportError as exc:
@@ -63,20 +72,66 @@ class LlamaCppBackend:
                 "Download a GGUF model and set CF_TEXT_MODEL to its path."
             )
 
+        # If given a directory, find the .gguf file inside it.
+        if Path(model_path).is_dir():
+            candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
+            if not candidates:
+                raise FileNotFoundError(
+                    f"No .gguf file found in directory: {model_path}"
+                )
+            model_path = str(candidates[0])
+
         n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
-        logger.info(
-            "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
-            model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
-        )
-        self._llm = Llama(
+
+        kwargs: dict = dict(
             model_path=model_path,
             n_ctx=_DEFAULT_N_CTX,
             n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
             n_threads=n_threads,
             verbose=False,
         )
+        if mmproj_path:
+            # Llama() has no clip_model_path parameter: the projector is loaded
+            # by a chat handler object that is passed in as chat_handler.
+            from llama_cpp import llama_chat_format  # type: ignore[import]
+
+            vlm_format = chat_format or "llava-1-5"
+            handler_name = {
+                "llava-1-5": "Llava15ChatHandler",
+                "llava-1-6": "Llava16ChatHandler",
+                "moondream": "MoondreamChatHandler",
+                "moondream2": "MoondreamChatHandler",
+            }.get(vlm_format, "")
+            handler_cls = getattr(llama_chat_format, handler_name, None)
+            if handler_cls is None:
+                raise ValueError(f"Unsupported VLM chat_format: {vlm_format!r}")
+            kwargs["chat_handler"] = handler_cls(clip_model_path=mmproj_path, verbose=False)
+            logger.info(
+                "Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
+                model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
+            )
+        else:
+            logger.info(
+                "Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
+                model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
+            )
+
+        self._llm = Llama(**kwargs)
         self._model_path = model_path
         self._vram_mb = _estimate_vram_mb(model_path)
+        # True when the model was initialised with a visual encoder (explicit
+        # mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
+        self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
+
+    def _detect_embedded_vlm(self) -> bool:
+        """Heuristic: check model metadata for a known multimodal architecture."""
+        try:
+            meta = self._llm.metadata or {}
+            arch = str(meta.get("general.architecture", "")).lower()
+            # Qwen2-VL and similar embed the vision encoder inside the GGUF.
+            return any(tag in arch for tag in ("qwen2_vl", "llava", "moondream", "minicpm-v"))
+        except Exception:
+            return False
 
     @property
     def model_name(self) -> str:
@@ -181,7 +236,14 @@ class LlamaCppBackend:
         max_tokens: int = 512,
         temperature: float = 0.7,
     ) -> GenerateResult:
-        # llama-cpp-python has native chat_completion for instruct models
+        # Detect image content before calling the model.
+        if any(m.has_images for m in messages) and not self._is_vlm:
+            raise ValueError(
+                "model does not support image input — "
+                "load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
+            )
+        # llama-cpp-python create_chat_completion accepts content as str or
+        # list-of-blocks (OpenAI multimodal format) natively.
         output = self._llm.create_chat_completion(
             messages=[m.to_dict() for m in messages],
             max_tokens=max_tokens,