#!/bin/bash
# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
# - 3P1D: 3 Prefill servers + 1 Decode server
# - etc.
#
# Configuration can be customized via environment variables:
#   MODEL: Model to serve
#   PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
#   DECODE_GPUS: Comma-separated GPU IDs for decode servers
#   PREFILL_PORTS: Comma-separated ports for prefill servers
#   DECODE_PORTS: Comma-separated ports for decode servers
#   PROXY_PORT: Port the vLLM instances use to reach the proxy when setting up
#               the XpYd connection. Client traffic goes to the proxy's HTTP
#               port (10001), which the benchmark step below targets.
#   TIMEOUT_SECONDS: Server startup timeout
# =============================================================================

# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}

# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}

echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo "  Model: $MODEL"
echo "  Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo "  Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo "  Proxy Port: $PROXY_PORT"
echo "  Timeout: ${TIMEOUT_SECONDS}s"
echo ""

PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

check_required_files() {
    local files=("disagg_proxy_p2p_nccl_xpyd.py")
    for file in "${files[@]}"; do
        if [[ ! -f "$file" ]]; then
            echo "Required file $file not found in $(pwd)"
            exit 1
        fi
    done
}

check_hf_token() {
    if [ -z "$HF_TOKEN" ]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        echo "Example: export HF_TOKEN=your_token_here"
        exit 1
    fi
    if [[ "$HF_TOKEN" != hf_* ]]; then
        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
        exit 1
    fi
    echo "HF_TOKEN is set and valid."
}

check_num_gpus() {
    # Check via nvidia-smi that at least 2 GPUs are available
    num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 GPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus GPUs."
    fi
}

ensure_python_library_installed() {
    echo "Checking if $1 is installed..."
    if ! python3 -c "import $1" > /dev/null 2>&1; then
        echo "$1 is not installed. Please install it via pip install $1."
        exit 1
    else
        echo "$1 is installed."
    fi
}

cleanup() {
    echo "Stopping everything…"
    trap - INT TERM         # prevent re-entrancy
    pkill -9 -f "disagg_proxy_p2p_nccl_xpyd.py"
    kill -- -$$             # negative PID == "this whole process-group"
    wait                    # reap children so we don't leave zombies
    exit 0
}

wait_for_server() {
    local port=$1
    local timeout_seconds=$TIMEOUT_SECONDS
    local start_time=$(date +%s)

    echo "Waiting for server on port $port..."

    while true; do
        if curl -s "localhost:${port}/v1/completions" > /dev/null; then
            echo "Server on port $port is ready."
            return 0
        fi

        local now=$(date +%s)
        if (( now - start_time >= timeout_seconds )); then
            echo "Timeout waiting for server on port $port"
            return 1
        fi

        sleep 1
    done
}

main() {
    check_required_files
    check_hf_token
    check_num_gpus
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm
    ensure_python_library_installed quart

    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

echo"Launching disaggregated serving components..."echo"Please check the log files for detailed output:"echo" - prefill*.log: Prefill server logs"echo" - decode*.log: Decode server logs"echo" - proxy.log: Proxy server log"# =============================================================================# Launch Proxy Server# =============================================================================echo""echo"Starting proxy server on port $PROXY_PORT..."python3disagg_proxy_p2p_nccl_xpyd.py&PIDS+=($!)# Parse GPU and port arraysIFS=','read-raPREFILL_GPU_ARRAY<<<"$PREFILL_GPUS"IFS=','read-raDECODE_GPU_ARRAY<<<"$DECODE_GPUS"IFS=','read-raPREFILL_PORT_ARRAY<<<"$PREFILL_PORTS"IFS=','read-raDECODE_PORT_ARRAY<<<"$DECODE_PORTS"# =============================================================================# Launch Prefill Servers (X Producers)# =============================================================================echo""echo"Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."foriin"${!PREFILL_GPU_ARRAY[@]}";dolocalgpu_id=${PREFILL_GPU_ARRAY[$i]}localport=${PREFILL_PORT_ARRAY[$i]}localkv_port=$((21001+i))echo" Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"CUDA_VISIBLE_DEVICES=$gpu_idvllmserve$MODEL\--enforce-eager\--host0.0.0.0\--port$port\--tensor-parallel-size1\--seed1024\--dtypefloat16\--max-model-len10000\--max-num-batched-tokens10000\--max-num-seqs256\--trust-remote-code\--gpu-memory-utilization0.9\--kv-transfer-config\"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}">prefill$((i+1)).log2>&1&PIDS+=($!)done# =============================================================================# Launch Decode Servers (Y Decoders)# =============================================================================echo""echo"Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."foriin"${!DECODE_GPU_ARRAY[@]}";dolocalgpu_id=${DECODE_GPU_ARRAY[$i]}localport=${DECODE_PORT_ARRAY[$i]}localkv_port=$((22001+i))echo" Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"CUDA_VISIBLE_DEVICES=$gpu_idvllmserve$MODEL\--enforce-eager\--host0.0.0.0\--port$port\--tensor-parallel-size1\--seed1024\--dtypefloat16\--max-model-len10000\--max-num-batched-tokens10000\--max-num-seqs256\--trust-remote-code\--gpu-memory-utilization0.7\--kv-transfer-config\"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}">decode$((i+1)).log2>&1&PIDS+=($!)done# =============================================================================# Wait for All Servers to Start# =============================================================================echo""echo"Waiting for all servers to start..."forportin"${PREFILL_PORT_ARRAY[@]}""${DECODE_PORT_ARRAY[@]}";doif!wait_for_server$port;thenecho"Failed to start server on port $port"cleanup
            exit 1
        fi
    done

    echo ""
    echo "All servers are up. Starting benchmark..."

    # =============================================================================
    # Run Benchmark
    # =============================================================================
    cd ../../../benchmarks/
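
    # Illustrative only (commented out): once all servers report ready, a single
    # request can also be sent directly to the proxy, which fronts an
    # OpenAI-compatible API on port 10001 (the same port the benchmark below
    # targets). The request body sketched here assumes the standard
    # /v1/completions schema.
    #
    # curl -s http://localhost:10001/v1/completions \
    #     -H "Content-Type: application/json" \
    #     -d "{\"model\": \"$MODEL\", \"prompt\": \"San Francisco is a\", \"max_tokens\": 32}"
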
    vllm bench serve --port 10001 --seed "$(date +%s)" \
        --model "$MODEL" \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
echo"Benchmarking done. Cleaning up..."cleanup
}

main
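
# =============================================================================
# Usage notes (illustrative)
# =============================================================================
# The defaults above give a 1P3D deployment. Other XpYd shapes are selected
# purely through the environment variables documented in the header. The
# invocation below is a sketch of a 3P1D layout; the script filename is a
# placeholder for this file, and the GPU/port assignments are examples only.
#
#   PREFILL_GPUS=0,1,2 PREFILL_PORTS=20003,20005,20007 \
#   DECODE_GPUS=3 DECODE_PORTS=20009 \
#   bash disagg_example_p2p_nccl_xpyd.sh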