peregrine/scripts/manage-vllm.sh

#!/usr/bin/env bash
# scripts/manage-vllm.sh — manage the vLLM inference server
# Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]

set -euo pipefail

VLLM_BIN="/devl/miniconda3/envs/vllm/bin/python"
MODEL_DIR="/Library/Assets/LLM/vllm/models"
PID_FILE="/tmp/vllm-server.pid"
LOG_FILE="/tmp/vllm-server.log"
MODEL_FILE="/tmp/vllm-server.model"
PORT=8000
GPU=1

_list_model_names() {
    if [[ -d "$MODEL_DIR" ]]; then
        find "$MODEL_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort
    fi
}

is_running() {
    if [[ -f "$PID_FILE" ]]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            return 0
        fi
    fi
    return 1
}

start() {
    local model_name="${1:-}"

    if [[ -z "$model_name" ]]; then
        model_name=$(_list_model_names | head -1)
        if [[ -z "$model_name" ]]; then
            echo "No models found in $MODEL_DIR"
            exit 1
        fi
    fi

    local model_path
    if [[ "$model_name" == /* ]]; then
        model_path="$model_name"
        model_name=$(basename "$model_path")
    else
        model_path="$MODEL_DIR/$model_name"
    fi

    if [[ ! -d "$model_path" ]]; then
        echo "Model not found: $model_path"
        exit 1
    fi

    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
        return 0
    fi

    echo "Starting vLLM with model: $model_name (GPU $GPU, port $PORT)…"
    echo "$model_name" > "$MODEL_FILE"

    # Ouro LoopLM uses total_ut_steps=4 which multiplies KV cache by 4x vs a standard
    # transformer. On 8 GiB GPUs: 1.4B models support ~4096 tokens; 2.6B only ~928.
    CUDA_VISIBLE_DEVICES="$GPU" "$VLLM_BIN" -m vllm.entrypoints.openai.api_server \
        --model "$model_path" \
        --trust-remote-code \
        --max-model-len 3072 \
        --gpu-memory-utilization 0.75 \
        --enforce-eager \
        --max-num-seqs 8 \
        --port "$PORT" \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 3

    if is_running; then
        echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
    else
        echo "Failed to start. Check logs: $LOG_FILE"
        tail -20 "$LOG_FILE"
        rm -f "$PID_FILE" "$MODEL_FILE"
        exit 1
    fi
}

stop() {
    if ! is_running; then
        echo "Not running."
        rm -f "$PID_FILE"
        return 0
    fi

    PID=$(cat "$PID_FILE")
    echo "Stopping PID $PID …"
    kill "$PID" 2>/dev/null || true
    sleep 2
    if kill -0 "$PID" 2>/dev/null; then
        kill -9 "$PID" 2>/dev/null || true
    fi
    rm -f "$PID_FILE" "$MODEL_FILE"
    echo "Stopped."
}

restart() {
    local model_name="${1:-}"
    stop
    sleep 1
    start "$model_name"
}

status() {
    if is_running; then
        local model=""
        if [[ -f "$MODEL_FILE" ]]; then
            model=" — model: $(cat "$MODEL_FILE")"
        fi
        echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT$model"
    else
        echo "Not running."
    fi
}

logs() {
    if [[ -f "$LOG_FILE" ]]; then
        tail -50 "$LOG_FILE"
    else
        echo "No log file found at $LOG_FILE"
    fi
}

list() {
    echo "Available models in $MODEL_DIR:"
    _list_model_names | while read -r name; do
        echo "  - $name"
    done
}

CMD="${1:-help}"
case "$CMD" in
    start)   start "${2:-}" ;;
    stop)    stop ;;
    restart) restart "${2:-}" ;;
    status)  status ;;
    logs)    logs ;;
    list)    list ;;
    *)
        echo "Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]"
        echo ""
        echo "  start [model]    Start vLLM with the specified model (default: first in $MODEL_DIR)"
        echo "  stop             Stop the running vLLM server"
        echo "  restart [model]  Stop then start (pass a new model name to swap)"
        echo "  status           Show whether it's running and which model is loaded"
        echo "  logs             Tail the last 50 lines of the log"
        echo "  list             List available models"
        echo ""
        echo "  GPU:  $GPU (CUDA_VISIBLE_DEVICES)"
        echo "  Port: $PORT"
        ;;
esac