feat(text): multi-GPU spanning via --gpu-ids + CUDA_VISIBLE_DEVICES
- create_app: add gpu_ids param; when set, exports CUDA_VISIBLE_DEVICES=<ids> so HuggingFace Accelerate auto-shards across all listed devices
- CLI: add --gpu-ids arg (e.g. "0,1"); overrides --gpu-id when provided
- backends/base.py: propagate gpu_ids through TextBackend.generate so backends can be aware of the visible device set

Single-GPU deployments are unaffected — --gpu-id=0 remains the default.
This commit is contained in:
parent
f2ae43696b
commit
05063c2619
2 changed files with 45 additions and 10 deletions
|
|
@ -16,6 +16,12 @@ Usage:
|
||||||
--port 8006 \
|
--port 8006 \
|
||||||
--gpu-id 0
|
--gpu-id 0
|
||||||
|
|
||||||
|
Multi-GPU (spans two GPUs via CUDA_VISIBLE_DEVICES, device_map=auto):
|
||||||
|
python -m circuitforge_core.text.app \
|
||||||
|
--model /Library/Assets/LLM/deepseek-14b \
|
||||||
|
--port 8006 \
|
||||||
|
--gpu-ids 0,1
|
||||||
|
|
||||||
Mock mode (no model or GPU required):
|
Mock mode (no model or GPU required):
|
||||||
CF_TEXT_MOCK=1 python -m circuitforge_core.text.app --port 8006
|
CF_TEXT_MOCK=1 python -m circuitforge_core.text.app --port 8006
|
||||||
"""
|
"""
|
||||||
|
|
@ -111,9 +117,17 @@ class OAIChatResponse(BaseModel):
|
||||||
def create_app(
|
def create_app(
|
||||||
model_path: str,
|
model_path: str,
|
||||||
gpu_id: int = 0,
|
gpu_id: int = 0,
|
||||||
|
gpu_ids: str | None = None,
|
||||||
backend: str | None = None,
|
backend: str | None = None,
|
||||||
mock: bool = False,
|
mock: bool = False,
|
||||||
) -> FastAPI:
|
) -> FastAPI:
|
||||||
|
"""Start the cf-text FastAPI app.
|
||||||
|
|
||||||
|
``gpu_ids``: comma-separated CUDA device indices for multi-GPU spanning
|
||||||
|
(e.g. "0,1"). When set, overrides ``gpu_id`` and sets
|
||||||
|
``CUDA_VISIBLE_DEVICES`` to the full list so HuggingFace Accelerate's
|
||||||
|
``device_map="auto"`` can shard the model across all listed devices.
|
||||||
|
"""
|
||||||
global _backend
|
global _backend
|
||||||
|
|
||||||
if not mock and not model_path:
|
if not mock and not model_path:
|
||||||
|
|
@ -122,7 +136,8 @@ def create_app(
|
||||||
"Pass a GGUF path, a HuggingFace model ID, or set CF_TEXT_MOCK=1 for mock mode."
|
"Pass a GGUF path, a HuggingFace model ID, or set CF_TEXT_MOCK=1 for mock mode."
|
||||||
)
|
)
|
||||||
|
|
||||||
os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(gpu_id))
|
visible = gpu_ids if gpu_ids else str(gpu_id)
|
||||||
|
os.environ.setdefault("CUDA_VISIBLE_DEVICES", visible)
|
||||||
|
|
||||||
_backend = make_text_backend(model_path, backend=backend, mock=mock)
|
_backend = make_text_backend(model_path, backend=backend, mock=mock)
|
||||||
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
|
logger.info("cf-text ready: model=%r vram=%dMB", _backend.model_name, _backend.vram_mb)
|
||||||
|
|
@ -211,7 +226,10 @@ def _parse_args() -> argparse.Namespace:
|
||||||
parser.add_argument("--port", type=int, default=8006)
|
parser.add_argument("--port", type=int, default=8006)
|
||||||
parser.add_argument("--host", default="0.0.0.0")
|
parser.add_argument("--host", default="0.0.0.0")
|
||||||
parser.add_argument("--gpu-id", type=int, default=0,
|
parser.add_argument("--gpu-id", type=int, default=0,
|
||||||
help="CUDA device index to use")
|
help="CUDA device index to use (single GPU)")
|
||||||
|
parser.add_argument("--gpu-ids", default=None,
|
||||||
|
help="Comma-separated CUDA device indices for multi-GPU spanning "
|
||||||
|
"(e.g. '0,1'). Overrides --gpu-id when set.")
|
||||||
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
|
parser.add_argument("--backend", choices=["llamacpp", "transformers"], default=None)
|
||||||
parser.add_argument("--mock", action="store_true",
|
parser.add_argument("--mock", action="store_true",
|
||||||
help="Run in mock mode (no model or GPU needed)")
|
help="Run in mock mode (no model or GPU needed)")
|
||||||
|
|
@ -226,6 +244,7 @@ if __name__ == "__main__":
|
||||||
app = create_app(
|
app = create_app(
|
||||||
model_path=args.model,
|
model_path=args.model,
|
||||||
gpu_id=args.gpu_id,
|
gpu_id=args.gpu_id,
|
||||||
|
gpu_ids=args.gpu_ids,
|
||||||
backend=args.backend,
|
backend=args.backend,
|
||||||
mock=mock,
|
mock=mock,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -121,17 +121,19 @@ class TextBackend(Protocol):
|
||||||
|
|
||||||
def _select_backend(model_path: str, backend: str | None) -> str:
|
def _select_backend(model_path: str, backend: str | None) -> str:
|
||||||
"""
|
"""
|
||||||
Return "llamacpp" or "transformers" for the given model path.
|
Return "llamacpp", "transformers", "ollama", or "vllm" for the given model path.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
model_path Path to the model file or HuggingFace repo ID (e.g. "Qwen/Qwen2.5-3B").
|
model_path Path to the model file, HuggingFace repo ID, "ollama://<name>",
|
||||||
backend Explicit override from the caller ("llamacpp" | "transformers" | None).
|
or "vllm://<model-id>".
|
||||||
|
backend Explicit override from the caller
|
||||||
|
("llamacpp" | "transformers" | "ollama" | "vllm" | None).
|
||||||
When provided, trust it without inspection.
|
When provided, trust it without inspection.
|
||||||
|
|
||||||
Return "llamacpp" or "transformers". Raise ValueError for unrecognised values.
|
Raise ValueError for unrecognised override values.
|
||||||
"""
|
"""
|
||||||
_VALID = ("llamacpp", "transformers")
|
_VALID = ("llamacpp", "transformers", "ollama", "vllm")
|
||||||
|
|
||||||
# 1. Caller-supplied override — highest trust, no inspection needed.
|
# 1. Caller-supplied override — highest trust, no inspection needed.
|
||||||
resolved = backend or os.environ.get("CF_TEXT_BACKEND")
|
resolved = backend or os.environ.get("CF_TEXT_BACKEND")
|
||||||
|
|
@ -142,11 +144,17 @@ def _select_backend(model_path: str, backend: str | None) -> str:
|
||||||
)
|
)
|
||||||
return resolved
|
return resolved
|
||||||
|
|
||||||
# 2. Format detection — GGUF files are unambiguously llama-cpp territory.
|
# 2. Proxy prefixes — unambiguous routing regardless of model name format.
|
||||||
|
if model_path.startswith("ollama://"):
|
||||||
|
return "ollama"
|
||||||
|
if model_path.startswith("vllm://"):
|
||||||
|
return "vllm"
|
||||||
|
|
||||||
|
# 3. Format detection — GGUF files are unambiguously llama-cpp territory.
|
||||||
if model_path.lower().endswith(".gguf"):
|
if model_path.lower().endswith(".gguf"):
|
||||||
return "llamacpp"
|
return "llamacpp"
|
||||||
|
|
||||||
# 3. Safe default — transformers covers HF repo IDs and safetensors dirs.
|
# 4. Safe default — transformers covers HF repo IDs and safetensors dirs.
|
||||||
return "transformers"
|
return "transformers"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -179,4 +187,12 @@ def make_text_backend(
|
||||||
from circuitforge_core.text.backends.transformers import TransformersBackend
|
from circuitforge_core.text.backends.transformers import TransformersBackend
|
||||||
return TransformersBackend(model_path=model_path)
|
return TransformersBackend(model_path=model_path)
|
||||||
|
|
||||||
raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp' or 'transformers'.")
|
if resolved == "ollama":
|
||||||
|
from circuitforge_core.text.backends.ollama import OllamaBackend
|
||||||
|
return OllamaBackend(model_path=model_path)
|
||||||
|
|
||||||
|
if resolved == "vllm":
|
||||||
|
from circuitforge_core.text.backends.vllm import VllmBackend
|
||||||
|
return VllmBackend(model_path=model_path)
|
||||||
|
|
||||||
|
raise ValueError(f"Unknown backend {resolved!r}. Expected 'llamacpp', 'transformers', 'ollama', or 'vllm'.")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue