App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
160 lines
4.1 KiB
Bash
Executable file
160 lines
4.1 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# scripts/manage-vllm.sh — manage the vLLM inference server
|
|
# Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]
|
|
|
|
set -euo pipefail
|
|
|
|
VLLM_BIN="/devl/miniconda3/envs/vllm/bin/python"
|
|
MODEL_DIR="/Library/Assets/LLM/vllm/models"
|
|
PID_FILE="/tmp/vllm-server.pid"
|
|
LOG_FILE="/tmp/vllm-server.log"
|
|
MODEL_FILE="/tmp/vllm-server.model"
|
|
PORT=8000
|
|
GPU=1
|
|
|
|
_list_model_names() {
|
|
if [[ -d "$MODEL_DIR" ]]; then
|
|
find "$MODEL_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort
|
|
fi
|
|
}
|
|
|
|
is_running() {
|
|
if [[ -f "$PID_FILE" ]]; then
|
|
PID=$(cat "$PID_FILE")
|
|
if kill -0 "$PID" 2>/dev/null; then
|
|
return 0
|
|
fi
|
|
fi
|
|
return 1
|
|
}
|
|
|
|
start() {
|
|
local model_name="${1:-}"
|
|
|
|
if [[ -z "$model_name" ]]; then
|
|
model_name=$(_list_model_names | head -1)
|
|
if [[ -z "$model_name" ]]; then
|
|
echo "No models found in $MODEL_DIR"
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
local model_path
|
|
if [[ "$model_name" == /* ]]; then
|
|
model_path="$model_name"
|
|
model_name=$(basename "$model_path")
|
|
else
|
|
model_path="$MODEL_DIR/$model_name"
|
|
fi
|
|
|
|
if [[ ! -d "$model_path" ]]; then
|
|
echo "Model not found: $model_path"
|
|
exit 1
|
|
fi
|
|
|
|
if is_running; then
|
|
echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
|
|
return 0
|
|
fi
|
|
|
|
echo "Starting vLLM with model: $model_name (GPU $GPU, port $PORT)…"
|
|
echo "$model_name" > "$MODEL_FILE"
|
|
|
|
# Ouro LoopLM uses total_ut_steps=4 which multiplies KV cache by 4x vs a standard
|
|
# transformer. On 8 GiB GPUs: 1.4B models support ~4096 tokens; 2.6B only ~928.
|
|
CUDA_VISIBLE_DEVICES="$GPU" "$VLLM_BIN" -m vllm.entrypoints.openai.api_server \
|
|
--model "$model_path" \
|
|
--trust-remote-code \
|
|
--max-model-len 3072 \
|
|
--gpu-memory-utilization 0.75 \
|
|
--enforce-eager \
|
|
--max-num-seqs 8 \
|
|
--port "$PORT" \
|
|
> "$LOG_FILE" 2>&1 &
|
|
echo $! > "$PID_FILE"
|
|
sleep 3
|
|
|
|
if is_running; then
|
|
echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
|
|
else
|
|
echo "Failed to start. Check logs: $LOG_FILE"
|
|
tail -20 "$LOG_FILE"
|
|
rm -f "$PID_FILE" "$MODEL_FILE"
|
|
exit 1
|
|
fi
|
|
}
|
|
|
|
stop() {
|
|
if ! is_running; then
|
|
echo "Not running."
|
|
rm -f "$PID_FILE"
|
|
return 0
|
|
fi
|
|
|
|
PID=$(cat "$PID_FILE")
|
|
echo "Stopping PID $PID …"
|
|
kill "$PID" 2>/dev/null || true
|
|
sleep 2
|
|
if kill -0 "$PID" 2>/dev/null; then
|
|
kill -9 "$PID" 2>/dev/null || true
|
|
fi
|
|
rm -f "$PID_FILE" "$MODEL_FILE"
|
|
echo "Stopped."
|
|
}
|
|
|
|
restart() {
|
|
local model_name="${1:-}"
|
|
stop
|
|
sleep 1
|
|
start "$model_name"
|
|
}
|
|
|
|
status() {
|
|
if is_running; then
|
|
local model=""
|
|
if [[ -f "$MODEL_FILE" ]]; then
|
|
model=" — model: $(cat "$MODEL_FILE")"
|
|
fi
|
|
echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT$model"
|
|
else
|
|
echo "Not running."
|
|
fi
|
|
}
|
|
|
|
logs() {
|
|
if [[ -f "$LOG_FILE" ]]; then
|
|
tail -50 "$LOG_FILE"
|
|
else
|
|
echo "No log file found at $LOG_FILE"
|
|
fi
|
|
}
|
|
|
|
list() {
|
|
echo "Available models in $MODEL_DIR:"
|
|
_list_model_names | while read -r name; do
|
|
echo " - $name"
|
|
done
|
|
}
|
|
|
|
CMD="${1:-help}"
|
|
case "$CMD" in
|
|
start) start "${2:-}" ;;
|
|
stop) stop ;;
|
|
restart) restart "${2:-}" ;;
|
|
status) status ;;
|
|
logs) logs ;;
|
|
list) list ;;
|
|
*)
|
|
echo "Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]"
|
|
echo ""
|
|
echo " start [model] Start vLLM with the specified model (default: first in $MODEL_DIR)"
|
|
echo " stop Stop the running vLLM server"
|
|
echo " restart [model] Stop then start (pass a new model name to swap)"
|
|
echo " status Show whether it's running and which model is loaded"
|
|
echo " logs Tail the last 50 lines of the log"
|
|
echo " list List available models"
|
|
echo ""
|
|
echo " GPU: $GPU (CUDA_VISIBLE_DEVICES)"
|
|
echo " Port: $PORT"
|
|
;;
|
|
esac
|