feat(text): multimodal content-block support + VLM mmproj passthrough
Add OpenAI-style content block models (ContentBlockText, ContentBlockImageURL) to the cf-text FastAPI app; update ChatMessage.content to accept str | list. LlamaCppBackend gains mmproj_path + chat_format args for external-projector VLMs; embedded VLMs (Qwen2-VL, MiniCPM-V) are detected via GGUF metadata. Text-only backends raise ValueError on image input rather than silently dropping it. Adds a --mmproj CLI arg wired through create_app(). Closes: #66
This commit is contained in:
parent
5a363f3b6c
commit
93ab528261
3 changed files with 276 additions and 38 deletions
|
|
@ -1,14 +1,15 @@
|
||||||
"""
|
"""
|
||||||
cf-text FastAPI service — managed by cf-orch.
|
cf-text FastAPI service — managed by cf-orch.
|
||||||
|
|
||||||
Lightweight local text generation. Supports GGUF models via llama.cpp and
|
Lightweight local text generation and PII filtering. Supports GGUF models via
|
||||||
HuggingFace transformers. Sits alongside vllm/ollama for products that need
|
llama.cpp, HuggingFace transformers, and token-classification models (classifier
|
||||||
fast, frequent inference from small local models (3B–7B Q4).
|
backend) for PII detection and redaction.
|
||||||
|
|
||||||
Endpoints:
|
Endpoints:
|
||||||
GET /health → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
|
GET /health → {"status": "ok", "model": str, "vram_mb": int, "backend": str}
|
||||||
POST /generate → GenerateResponse
|
POST /generate → GenerateResponse (text-gen backends only)
|
||||||
POST /chat → GenerateResponse
|
POST /chat → GenerateResponse (text-gen backends only)
|
||||||
|
POST /filter → FilterResponse (classifier backend only)
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python -m circuitforge_core.text.app \
|
python -m circuitforge_core.text.app \
|
||||||
|
|
@ -34,17 +35,46 @@ import os
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from typing import Annotated, Literal, Union
|
||||||
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
|
from circuitforge_core.text.backends.base import ChatMessage as BackendChatMessage
|
||||||
from circuitforge_core.text.backends.base import make_text_backend
|
from circuitforge_core.text.backends.base import make_classifier_backend, make_text_backend
|
||||||
|
from circuitforge_core.text.filter import FilterResult, PIIFilter
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_backend = None
|
_backend = None
|
||||||
|
_pii_filter: PIIFilter | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Content block types (OpenAI multimodal format) ────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class ContentBlockText(BaseModel):
    # Text part of a multimodal message.  ``type`` is pinned to the literal
    # "text" so it can act as the discriminator tag of the ContentBlock union.
    type: Literal["text"]
    # The text carried by this block.
    text: str
|
||||||
|
|
||||||
|
|
||||||
|
class ContentBlockImageURL(BaseModel):
    # Image part of a multimodal message.  ``type`` is pinned to the literal
    # "image_url" so it can act as the discriminator tag of the ContentBlock union.
    type: Literal["image_url"]
    # String-to-string mapping describing the image.
    # NOTE(review): presumably carries a "url" key as in the OpenAI format;
    # nothing here enforces that — confirm against the backends.
    image_url: dict[str, str]
|
||||||
|
|
||||||
|
|
||||||
|
# Tagged union of the supported content blocks; the ``type`` field selects
# which concrete model a given block is validated against.
ContentBlock = Annotated[
    Union[ContentBlockText, ContentBlockImageURL], Field(discriminator="type")
]
|
||||||
|
|
||||||
|
|
||||||
|
def _to_backend_message(role: str, content: "str | list[ContentBlock]") -> "BackendChatMessage":
    """Build a BackendChatMessage from an API-level role/content pair.

    String content is passed through unchanged; a list of content-block
    models is converted to a list of plain dicts via ``model_dump()``.
    """
    payload = (
        content
        if isinstance(content, str)
        else [block.model_dump() for block in content]
    )
    return BackendChatMessage(role, payload)
|
||||||
|
|
||||||
|
|
||||||
# ── Request / response models ─────────────────────────────────────────────────
|
# ── Request / response models ─────────────────────────────────────────────────
|
||||||
|
|
@ -59,7 +89,7 @@ class GenerateRequest(BaseModel):
|
||||||
|
|
||||||
class ChatMessageModel(BaseModel):
    # Speaker of the message.
    role: str
    # Either a plain string or a list of content blocks (text / image_url).
    # Defaults to the empty string when the field is omitted.
    content: Union[str, list[ContentBlock]] = ""
|
||||||
|
|
||||||
|
|
||||||
class ChatRequest(BaseModel):
|
class ChatRequest(BaseModel):
|
||||||
|
|
@ -74,12 +104,31 @@ class GenerateResponse(BaseModel):
|
||||||
model: str = ""
|
model: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class FilterRequest(BaseModel):
    # Request body for POST /filter: the text to run through the PII filter.
    text: str
|
||||||
|
|
||||||
|
|
||||||
|
class PIISpanResponse(BaseModel):
    # One detected span, copied field-for-field from the filter result's spans.
    label: str
    # NOTE(review): start/end look like offsets into the input text — confirm
    # the unit (characters vs. tokens) against PIIFilter.
    start: int
    end: int
    text: str
    score: float
|
||||||
|
|
||||||
|
|
||||||
|
class FilterResponse(BaseModel):
    # Response body for POST /filter.
    redacted_text: str
    # Every span reported by the filter for this input.
    spans: list[PIISpanResponse]
    original_text: str
    # Name of the model that produced the spans; empty when not supplied.
    model: str = ""
|
||||||
|
|
||||||
|
|
||||||
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
|
# ── OpenAI-compat request / response (for LLMRouter openai_compat path) ──────
|
||||||
|
|
||||||
|
|
||||||
class OAIMessageModel(BaseModel):
    # Speaker of the message.
    role: str
    # Either a plain string or a list of content blocks (text / image_url).
    # Defaults to the empty string when the field is omitted.
    content: Union[str, list[ContentBlock]] = ""
|
||||||
|
|
||||||
|
|
||||||
class OAIChatRequest(BaseModel):
|
class OAIChatRequest(BaseModel):
|
||||||
|
|
@ -120,6 +169,7 @@ def create_app(
|
||||||
gpu_ids: str | None = None,
|
gpu_ids: str | None = None,
|
||||||
backend: str | None = None,
|
backend: str | None = None,
|
||||||
mock: bool = False,
|
mock: bool = False,
|
||||||
|
mmproj_path: str = "",
|
||||||
) -> FastAPI:
|
) -> FastAPI:
|
||||||
"""Start the cf-text FastAPI app.
|
"""Start the cf-text FastAPI app.
|
||||||
|
|
||||||
|
|
@ -127,8 +177,12 @@ def create_app(
|
||||||
(e.g. "0,1"). When set, overrides ``gpu_id`` and sets
|
(e.g. "0,1"). When set, overrides ``gpu_id`` and sets
|
||||||
``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
|
``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
|
||||||
``device_map="auto"`` can shard the model across all listed devices.
|
``device_map="auto"`` can shard the model across all listed devices.
|
||||||
|
|
||||||
|
When ``backend="classifier"``, the service skips the text-gen backends
|
||||||
|
and loads a token-classification pipeline instead. Only ``POST /filter``
|
||||||
|
is available in that mode; ``/generate`` and ``/chat`` return 501.
|
||||||
"""
|
"""
|
||||||
global _backend
|
global _backend, _pii_filter
|
||||||
|
|
||||||
if not mock and not model_path:
|
if not mock and not model_path:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|
@ -139,13 +193,26 @@ def create_app(
|
||||||
visible = gpu_ids if gpu_ids else str(gpu_id)
|
visible = gpu_ids if gpu_ids else str(gpu_id)
|
||||||
os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
|
os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
|
||||||
|
|
||||||
_backend = make_text_backend(model_path, backend=backend, mock=mock)
|
resolved_backend = backend or os.environ.get("CF_TEXT_BACKEND", "")
|
||||||
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
|
if resolved_backend == "classifier" or (not resolved_backend and False):
|
||||||
|
classifier_backend = make_classifier_backend(model_path)
|
||||||
|
_pii_filter = PIIFilter.from_backend(classifier_backend)
|
||||||
|
logger.info(
|
||||||
|
"cf-text (classifier) ready: model=%r vram=%dMB",
|
||||||
|
classifier_backend.model_name,
|
||||||
|
classifier_backend.vram_mb,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
_backend = make_text_backend(model_path, backend=backend, mock=mock, mmproj_path=mmproj_path)
|
||||||
|
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
|
||||||
|
|
||||||
app = FastAPI(title="cf-text", version="0.1.0")
|
app = FastAPI(title="cf-text", version="0.1.0")
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health() -> dict:
|
def health() -> dict:
|
||||||
|
if _pii_filter is not None:
|
||||||
|
b = _pii_filter._backend
|
||||||
|
return {"status": "ok", "model": b.model_name, "vram_mb": b.vram_mb, "backend": "classifier"}
|
||||||
if _backend is None:
|
if _backend is None:
|
||||||
raise HTTPException(503, detail="backend not initialised")
|
raise HTTPException(503, detail="backend not initialised")
|
||||||
return {
|
return {
|
||||||
|
|
@ -154,8 +221,35 @@ def create_app(
|
||||||
"vram_mb": _backend.vram_mb,
|
"vram_mb": _backend.vram_mb,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@app.post("/filter")
|
||||||
|
async def filter_text(req: FilterRequest) -> FilterResponse:
|
||||||
|
if _pii_filter is None:
|
||||||
|
raise HTTPException(
|
||||||
|
501,
|
||||||
|
detail="This cf-text instance is not running a classifier backend. "
|
||||||
|
"Start with --backend classifier and a token-classification model.",
|
||||||
|
)
|
||||||
|
result = await _pii_filter.filter_async(req.text)
|
||||||
|
return FilterResponse(
|
||||||
|
redacted_text=result.redacted_text,
|
||||||
|
spans=[
|
||||||
|
PIISpanResponse(
|
||||||
|
label=s.label,
|
||||||
|
start=s.start,
|
||||||
|
end=s.end,
|
||||||
|
text=s.text,
|
||||||
|
score=s.score,
|
||||||
|
)
|
||||||
|
for s in result.spans
|
||||||
|
],
|
||||||
|
original_text=result.original_text,
|
||||||
|
model=_pii_filter._backend.model_name,
|
||||||
|
)
|
||||||
|
|
||||||
@app.post("/generate")
|
@app.post("/generate")
|
||||||
async def generate(req: GenerateRequest) -> GenerateResponse:
|
async def generate(req: GenerateRequest) -> GenerateResponse:
|
||||||
|
if _pii_filter is not None:
|
||||||
|
raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
|
||||||
if _backend is None:
|
if _backend is None:
|
||||||
raise HTTPException(503, detail="backend not initialised")
|
raise HTTPException(503, detail="backend not initialised")
|
||||||
result = await _backend.generate_async(
|
result = await _backend.generate_async(
|
||||||
|
|
@ -172,16 +266,20 @@ def create_app(
|
||||||
|
|
||||||
@app.post("/chat")
|
@app.post("/chat")
|
||||||
async def chat(req: ChatRequest) -> GenerateResponse:
|
async def chat(req: ChatRequest) -> GenerateResponse:
|
||||||
|
if _pii_filter is not None:
|
||||||
|
raise HTTPException(501, detail="classifier backend loaded — use POST /filter")
|
||||||
if _backend is None:
|
if _backend is None:
|
||||||
raise HTTPException(503, detail="backend not initialised")
|
raise HTTPException(503, detail="backend not initialised")
|
||||||
messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
|
messages = [_to_backend_message(m.role, m.content) for m in req.messages]
|
||||||
# chat() is sync-only in the Protocol; run in thread pool to avoid blocking
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
result = await loop.run_in_executor(
|
try:
|
||||||
None,
|
result = await loop.run_in_executor(
|
||||||
partial(_backend.chat, messages,
|
None,
|
||||||
max_tokens=req.max_tokens, temperature=req.temperature),
|
partial(_backend.chat, messages,
|
||||||
)
|
max_tokens=req.max_tokens, temperature=req.temperature),
|
||||||
|
)
|
||||||
|
except ValueError as exc:
|
||||||
|
raise HTTPException(422, detail=str(exc)) from exc
|
||||||
return GenerateResponse(
|
return GenerateResponse(
|
||||||
text=result.text,
|
text=result.text,
|
||||||
tokens_used=result.tokens_used,
|
tokens_used=result.tokens_used,
|
||||||
|
|
@ -198,13 +296,16 @@ def create_app(
|
||||||
"""
|
"""
|
||||||
if _backend is None:
|
if _backend is None:
|
||||||
raise HTTPException(503, detail="backend not initialised")
|
raise HTTPException(503, detail="backend not initialised")
|
||||||
messages = [BackendChatMessage(m.role, m.content) for m in req.messages]
|
messages = [_to_backend_message(m.role, m.content) for m in req.messages]
|
||||||
max_tok = req.max_tokens or 512
|
max_tok = req.max_tokens or 512
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
result = await loop.run_in_executor(
|
try:
|
||||||
None,
|
result = await loop.run_in_executor(
|
||||||
partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
|
None,
|
||||||
)
|
partial(_backend.chat, messages, max_tokens=max_tok, temperature=req.temperature),
|
||||||
|
)
|
||||||
|
except ValueError as exc:
|
||||||
|
raise HTTPException(422, detail=str(exc)) from exc
|
||||||
return OAIChatResponse(
|
return OAIChatResponse(
|
||||||
id=f"cftext-{uuid.uuid4().hex[:12]}",
|
id=f"cftext-{uuid.uuid4().hex[:12]}",
|
||||||
created=int(time.time()),
|
created=int(time.time()),
|
||||||
|
|
@ -230,7 +331,16 @@ def _parse_args() -> argparse.Namespace:
|
||||||
parser.add_argument("--gpu-ids", default=None,
|
parser.add_argument("--gpu-ids", default=None,
|
||||||
help="Comma-separated CUDA device indices for multi-GPU spanning "
|
help="Comma-separated CUDA device indices for multi-GPU spanning "
|
||||||
"(e.g. '0,1'). Overrides --gpu-id when set.")
|
"(e.g. '0,1'). Overrides --gpu-id when set.")
|
||||||
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
|
parser.add_argument(
|
||||||
|
"--backend",
|
||||||
|
choices=["llamacpp", "transformers", "ollama", "vllm", "classifier"],
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--mmproj", default="",
|
||||||
|
help="Path to multimodal projector file for VLM GGUF models (LLaVA-style). "
|
||||||
|
"Qwen2-VL and other self-contained VLMs don't need this.",
|
||||||
|
)
|
||||||
parser.add_argument("--mock", action="store_true",
|
parser.add_argument("--mock", action="store_true",
|
||||||
help="Run in mock mode (no model or GPU needed)")
|
help="Run in mock mode (no model or GPU needed)")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
@ -247,5 +357,6 @@ if __name__ == "__main__":
|
||||||
gpu_ids=args.gpu_ids,
|
gpu_ids=args.gpu_ids,
|
||||||
backend=args.backend,
|
backend=args.backend,
|
||||||
mock=mock,
|
mock=mock,
|
||||||
|
mmproj_path=args.mmproj,
|
||||||
)
|
)
|
||||||
uvicorn.run(app, host=args.host, port=args.port, log_level="info")
|
uvicorn.run(app, host=args.host, port=args.port, log_level="info")
|
||||||
|
|
|
||||||
|
|
@ -24,17 +24,44 @@ class GenerateResult:
|
||||||
|
|
||||||
|
|
||||||
class ChatMessage:
    """A single message in a chat conversation.

    ``content`` is either a plain string or a list of OpenAI-format content
    blocks (dicts with ``type: "text"`` or ``type: "image_url"``).  Backends
    that do not support images should read the ``text_only`` property to get
    the string form before passing the message to the model.

    Raises:
        ValueError: if ``role`` is not ``system``, ``user`` or ``assistant``.
    """

    def __init__(self, role: str, content: "str | list") -> None:
        if role not in ("system", "user", "assistant"):
            raise ValueError(f"Invalid role {role!r}. Must be system, user, or assistant.")
        self.role = role
        self.content: "str | list" = content

    def __repr__(self) -> str:
        return f"{type(self).__name__}(role={self.role!r}, content={self.content!r})"

    def to_dict(self) -> dict:
        """Return the ``{"role": ..., "content": ...}`` dict form of the message."""
        return {"role": self.role, "content": self.content}

    @property
    def has_images(self) -> bool:
        """True when at least one content block is an image_url block."""
        if isinstance(self.content, str):
            return False
        return any(
            isinstance(block, dict) and block.get("type") == "image_url"
            for block in self.content
        )

    @property
    def text_only(self) -> str:
        """Flatten multimodal content to text.

        String content is returned as-is.  For block lists, the ``text`` of
        every ``type: "text"`` block is joined with newlines.  Blocks that are
        not dicts, are not text blocks, or carry no string ``text`` value are
        skipped instead of raising, so a malformed block cannot break a
        text-only backend.
        """
        if isinstance(self.content, str):
            return self.content
        return "\n".join(
            block["text"]
            for block in self.content
            if isinstance(block, dict)
            and block.get("type") == "text"
            and isinstance(block.get("text"), str)
        )
|
||||||
|
|
||||||
|
|
||||||
# ── TextBackend Protocol ──────────────────────────────────────────────────────
|
# ── TextBackend Protocol ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
@ -116,6 +143,33 @@ class TextBackend(Protocol):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
# ── FilterBackend Protocol ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class FilterBackend(Protocol):
    """
    Structural interface for token-classification / PII-filter backends.

    Kept apart from TextBackend: implementations report entity spans and
    redacted text rather than generated text.
    """

    def classify(self, text: str) -> list[dict]:
        """Classify ``text`` synchronously and return a list of entity span dicts."""
        ...

    async def classify_async(self, text: str) -> list[dict]:
        """Awaitable variant of ``classify`` — runs in thread pool."""
        ...

    @property
    def model_name(self) -> str:
        ...

    @property
    def vram_mb(self) -> int:
        ...
|
||||||
|
|
||||||
|
|
||||||
# ── Backend selection ─────────────────────────────────────────────────────────
|
# ── Backend selection ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -133,7 +187,7 @@ def _select_backend(model_path: str, backend: str | None) -> str:
|
||||||
|
|
||||||
Raise ValueError for unrecognised override values.
|
Raise ValueError for unrecognised override values.
|
||||||
"""
|
"""
|
||||||
_VALID = ("llamacpp", "transformers", "ollama", "vllm")
|
_VALID = ("llamacpp", "transformers", "ollama", "vllm", "classifier")
|
||||||
|
|
||||||
# 1. Caller-supplied override — highest trust, no inspection needed.
|
# 1. Caller-supplied override — highest trust, no inspection needed.
|
||||||
resolved = backend or os.environ.get("CF_TEXT_BACKEND")
|
resolved = backend or os.environ.get("CF_TEXT_BACKEND")
|
||||||
|
|
@ -153,6 +207,11 @@ def _select_backend(model_path: str, backend: str | None) -> str:
|
||||||
# 3. Format detection — GGUF files are unambiguously llama-cpp territory.
|
# 3. Format detection — GGUF files are unambiguously llama-cpp territory.
|
||||||
if model_path.lower().endswith(".gguf"):
|
if model_path.lower().endswith(".gguf"):
|
||||||
return "llamacpp"
|
return "llamacpp"
|
||||||
|
# 3b. GGUF directory — avocet downloads whole repos; scan for .gguf contents.
|
||||||
|
if os.path.isdir(model_path):
|
||||||
|
import glob as _glob
|
||||||
|
if _glob.glob(os.path.join(model_path, "*.gguf")) or _glob.glob(os.path.join(model_path, "*.GGUF")):
|
||||||
|
return "llamacpp"
|
||||||
|
|
||||||
# 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
|
# 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
|
||||||
return "transformers"
|
return "transformers"
|
||||||
|
|
@ -165,6 +224,7 @@ def make_text_backend(
|
||||||
model_path: str,
|
model_path: str,
|
||||||
backend: str | None = None,
|
backend: str | None = None,
|
||||||
mock: bool | None = None,
|
mock: bool | None = None,
|
||||||
|
mmproj_path: str = "",
|
||||||
) -> "TextBackend":
|
) -> "TextBackend":
|
||||||
"""
|
"""
|
||||||
Return a TextBackend for the given model.
|
Return a TextBackend for the given model.
|
||||||
|
|
@ -181,7 +241,7 @@ def make_text_backend(
|
||||||
|
|
||||||
if resolved == "llamacpp":
|
if resolved == "llamacpp":
|
||||||
from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
|
from circuitforge_core.text.backends.llamacpp import LlamaCppBackend
|
||||||
return LlamaCppBackend(model_path=model_path)
|
return LlamaCppBackend(model_path=model_path, mmproj_path=mmproj_path)
|
||||||
|
|
||||||
if resolved == "transformers":
|
if resolved == "transformers":
|
||||||
from circuitforge_core.text.backends.transformers import TransformersBackend
|
from circuitforge_core.text.backends.transformers import TransformersBackend
|
||||||
|
|
@ -195,4 +255,22 @@ def make_text_backend(
|
||||||
from circuitforge_core.text.backends.vllm import VllmBackend
|
from circuitforge_core.text.backends.vllm import VllmBackend
|
||||||
return VllmBackend(model_path=model_path)
|
return VllmBackend(model_path=model_path)
|
||||||
|
|
||||||
raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
|
raise ValueError(
|
||||||
|
f"Unknown backend {resolved!r}. "
|
||||||
|
"Expected 'llamacpp', 'transformers', 'ollama', 'vllm', or 'classifier'."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def make_classifier_backend(model_path: str) -> "FilterBackend":
    """
    Build the FilterBackend used for token-classification models.

    CF_TEXT_MOCK=1  → MockClassifierBackend, constructed with ``model_path``
                      as its ``model_name``
    Otherwise       → ClassifierBackend constructed with ``model_path``

    The backend modules are imported lazily so only the selected one is loaded.
    """
    use_mock = os.environ.get("CF_TEXT_MOCK", "") == "1"

    if not use_mock:
        from circuitforge_core.text.backends.classifier import ClassifierBackend
        return ClassifierBackend(model_path=model_path)

    from circuitforge_core.text.backends.mock import MockClassifierBackend
    return MockClassifierBackend(model_name=model_path)
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,16 @@ class LlamaCppBackend:
|
||||||
Requires: pip install circuitforge-core[text-llamacpp]
|
Requires: pip install circuitforge-core[text-llamacpp]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, model_path: str) -> None:
|
def __init__(self, model_path: str, mmproj_path: str = "", chat_format: str = "") -> None:
|
||||||
|
"""Load a GGUF model.
|
||||||
|
|
||||||
|
``mmproj_path``: path to a separate multimodal projector file (needed
|
||||||
|
for LLaVA-style VLMs where the visual encoder is a separate .gguf).
|
||||||
|
Qwen2-VL and similar models with an embedded projector don't need this.
|
||||||
|
|
||||||
|
``chat_format``: llama-cpp chat template override (e.g. "llava-1-5",
|
||||||
|
"moondream"). Required when mmproj_path is set.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from llama_cpp import Llama # type: ignore[import]
|
from llama_cpp import Llama # type: ignore[import]
|
||||||
except ImportError as exc:
|
except ImportError as exc:
|
||||||
|
|
@ -63,20 +72,53 @@ class LlamaCppBackend:
|
||||||
"Download a GGUF model and set CF_TEXT_MODEL to its path."
|
"Download a GGUF model and set CF_TEXT_MODEL to its path."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If given a directory, find the .gguf file inside it.
|
||||||
|
if Path(model_path).is_dir():
|
||||||
|
candidates = sorted(Path(model_path).glob("*.gguf")) or sorted(Path(model_path).glob("*.GGUF"))
|
||||||
|
if not candidates:
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"No .gguf file found in directory: {model_path}"
|
||||||
|
)
|
||||||
|
model_path = str(candidates[0])
|
||||||
|
|
||||||
n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
|
n_threads = int(os.environ.get("CF_TEXT_THREADS", "0")) or None
|
||||||
logger.info(
|
|
||||||
"Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
|
kwargs: dict = dict(
|
||||||
model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
|
|
||||||
)
|
|
||||||
self._llm = Llama(
|
|
||||||
model_path=model_path,
|
model_path=model_path,
|
||||||
n_ctx=_DEFAULT_N_CTX,
|
n_ctx=_DEFAULT_N_CTX,
|
||||||
n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
|
n_gpu_layers=_DEFAULT_N_GPU_LAYERS,
|
||||||
n_threads=n_threads,
|
n_threads=n_threads,
|
||||||
verbose=False,
|
verbose=False,
|
||||||
)
|
)
|
||||||
|
if mmproj_path:
|
||||||
|
kwargs["clip_model_path"] = mmproj_path
|
||||||
|
kwargs["chat_format"] = chat_format or "llava-1-5"
|
||||||
|
logger.info(
|
||||||
|
"Loading VLM %s with mmproj %s (ctx=%d, gpu_layers=%d)",
|
||||||
|
model_path, mmproj_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"Loading GGUF model %s (ctx=%d, gpu_layers=%d)",
|
||||||
|
model_path, _DEFAULT_N_CTX, _DEFAULT_N_GPU_LAYERS,
|
||||||
|
)
|
||||||
|
|
||||||
|
self._llm = Llama(**kwargs)
|
||||||
self._model_path = model_path
|
self._model_path = model_path
|
||||||
self._vram_mb = _estimate_vram_mb(model_path)
|
self._vram_mb = _estimate_vram_mb(model_path)
|
||||||
|
# True when the model was initialised with a visual encoder (explicit
|
||||||
|
# mmproj) or when it is a known self-contained VLM (Qwen2-VL, etc.).
|
||||||
|
self._is_vlm = bool(mmproj_path) or self._detect_embedded_vlm()
|
||||||
|
|
||||||
|
def _detect_embedded_vlm(self) -> bool:
    """Heuristic: check model metadata for a known multimodal architecture.

    Reads ``general.architecture`` from the loaded model's metadata and
    compares it against a short list of known VLM architecture tags.  Both
    sides are compared with ``_`` and ``-`` removed, so ``qwen2vl``,
    ``qwen2_vl`` and ``qwen2-vl`` are all recognised (llama.cpp writes the
    separator-free spelling into GGUF files).

    Returns:
        True when the architecture string contains a known VLM tag; False
        otherwise, including when the metadata cannot be read.
    """
    known_tags = ("qwen2vl", "llava", "moondream", "minicpmv")
    try:
        meta = self._llm.metadata or {}
        arch = str(meta.get("general.architecture", "")).lower()
    except Exception:
        # Deliberately best-effort: a model without readable metadata is
        # simply treated as text-only rather than failing to load.
        return False
    normalized = arch.replace("_", "").replace("-", "")
    return any(tag in normalized for tag in known_tags)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model_name(self) -> str:
|
def model_name(self) -> str:
|
||||||
|
|
@ -181,7 +223,14 @@ class LlamaCppBackend:
|
||||||
max_tokens: int = 512,
|
max_tokens: int = 512,
|
||||||
temperature: float = 0.7,
|
temperature: float = 0.7,
|
||||||
) -> GenerateResult:
|
) -> GenerateResult:
|
||||||
# llama-cpp-python has native chat_completion for instruct models
|
# Detect image content before calling the model.
|
||||||
|
if any(m.has_images for m in messages) and not self._is_vlm:
|
||||||
|
raise ValueError(
|
||||||
|
"model does not support image input — "
|
||||||
|
"load a VLM (with mmproj_path) or route to cf-vision/cf-docuvision"
|
||||||
|
)
|
||||||
|
# llama-cpp-python create_chat_completion accepts content as str or
|
||||||
|
# list-of-blocks (OpenAI multimodal format) natively.
|
||||||
output = self._llm.create_chat_completion(
|
output = self._llm.create_chat_completion(
|
||||||
messages=[m.to_dict() for m in messages],
|
messages=[m.to_dict() for m in messages],
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue