#!/bin/bash
# This file demonstrates the example usage of disaggregated prefilling.
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.
#
# Environment:
#   HF_MODEL_NAME  Optional model to serve; defaults to
#                  meta-llama/Meta-Llama-3.1-8B-Instruct.
#
# The prefill instance is pinned to GPU 0 (port 8100) and the decode
# instance to GPU 1 (port 8200); the proxy is expected on port 8000.
#
# WARNING: cleanup kills *every* process matching "python" that the current
# user is allowed to signal, not only the ones started by this script.

set -xe

echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1

# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}

# Best-effort termination of all python processes.
# Each step is allowed to fail: once the first command has killed everything,
# the second one finds no match and exits non-zero, which would otherwise
# abort the script under `set -e`.
kill_python_processes() {
  # -r (GNU xargs): do not invoke `kill` at all when pgrep prints nothing.
  pgrep python | xargs -r kill -9 || true
  pkill -f python || true
}

# Cleanup function, invoked by the SIGINT trap below.
cleanup() {
  echo "Caught Ctrl+C, cleaning up..."
  # Cleanup commands
  kill_python_processes
  echo "Cleanup complete. Exiting."
  exit 0
}

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT

# Assign first, export second, so the export does not mask the exit status
# of the command substitution.
VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
export VLLM_HOST_IP

# install quart first -- required for disagg prefill proxy serve
if python3 -c "import quart" &> /dev/null; then
  echo "Quart is already installed."
else
  echo "Quart is not installed. Installing..."
  python3 -m pip install quart
fi

# A function that waits for a vLLM server to start.
# Arguments: $1 - port to poll on localhost
# Returns:   0 once curl can reach /v1/completions on that port,
#            1 if that does not happen within 1200 seconds.
wait_for_server() {
  local port=$1
  if timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done"; then
    return 0
  fi
  return 1
}

# You can also adjust --kv-ip and --kv-port for distributed inference.

# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve "$MODEL_NAME" \
  --port 8100 \
  --max-model-len 100 \
  --gpu-memory-utilization 0.8 \
  --trust-remote-code \
  --kv-transfer-config \
  '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve "$MODEL_NAME" \
  --port 8200 \
  --max-model-len 100 \
  --gpu-memory-utilization 0.8 \
  --trust-remote-code \
  --kv-transfer-config \
  '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

# wait until prefill and decode instances are ready
wait_for_server 8100
wait_for_server 8200

# launch a proxy server that opens the service at port 8000
# the workflow of this proxy:
# - send the request to prefill vLLM instance (port 8100), change max_tokens
#   to 1
# - after the prefill vLLM finishes prefill, send the request to decode vLLM
#   instance
# NOTE: the usage of this API is subject to change --- in the future we will
# introduce "vllm connect" to connect between prefill and decode instances
python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
sleep 1

# serve two example requests
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "'"$MODEL_NAME"'",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}')

output2=$(curl -X POST -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
}')

# Cleanup commands
kill_python_processes

echo ""

sleep 1

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""