Support benchmarking script by using real application trace (#737)
Signed-off-by: Ning Wang <[email protected]>
Co-authored-by: Ning <[email protected]>
nwangfw and wangncs authored Mar 1, 2025
1 parent da4841d commit d6148a4
Showing 6 changed files with 601 additions and 28 deletions.
2 changes: 1 addition & 1 deletion build/container/Dockerfile.runtime
@@ -34,7 +34,7 @@ COPY --from=builder /app/dist/*.whl ./

# Install build dependencies and clean up in one step (avoiding creating another new layer)
RUN apt-get update \
-&& apt-get install -y --no-install-recommends gcc python3-dev \
+&& apt-get install -y --no-install-recommends gcc python3-dev mawk \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& pip install --no-cache-dir ./*.whl \
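(Note: mawk presumably provides the awk binary that the updated benchmark.sh below relies on for its fractional request-rate arithmetic.)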
98 changes: 85 additions & 13 deletions python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -18,8 +18,9 @@
PATH_PREFIX=`dirname "$0"`
OUTPUT_FILE=
MODEL="llama2-7b"
-TOTAL=100
-# TODO: Set your preferred request sizes and rates here.
+TEMPERATURE=0.0
+
+TOTAL=100 # Set your preferred request sizes and rates here.
input_start=4
input_limit=$((2**12)) # 4K
output_start=4
@@ -29,10 +30,39 @@ rate_limit=$((2**6)) # 64
workload=
dry_run=0


+# Function to generate workload for specific input/output lengths
+generate_workload() {
+    local input_len=$1
+    local output_len=$2
+    local api_key=$3
+    local num_prompts=$4
+    local model=$5
+    local workload_path=$6
+    local output_dir=$7
+
+    local prompt_path
+    prompt_path=$(python $PATH_PREFIX/gen_benchmark_prompt.py \
+        $workload_path \
+        --input-tokens "$input_len" \
+        --min-output-tokens "$output_len" \
+        --tolerance "0.2" \
+        --qps "2.0" \
+        --host "localhost" \
+        --port "8010" \
+        --api-key "$api_key" \
+        --total-prompts "$num_prompts" \
+        --model "$model" \
+        --temperature "$TEMPERATURE" \
+        --output-dir "$output_dir" 2>&1 | tail -n 1)
+
+    echo "$prompt_path"
+}
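# Illustrative usage (example only, not part of the commit): callers capture
# the function's output as the generated prompt file path, e.g.
#   WORKLOAD_FILE=$(generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL" "$workload" "$dir_path")
# The path is taken from the last stdout line of gen_benchmark_prompt.py.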

while [[ $# -gt 0 ]]; do
case "$1" in
-m|--model)
MODEL="$2"
MODEL=$2
shift 2
;;
-o|--output)
@@ -71,6 +101,10 @@ while [[ $# -gt 0 ]]; do
            LLM_API_KEY=$2
            shift 2
            ;;
+        --temperature)
+            TEMPERATURE=$2
+            shift 2
+            ;;
        --workload)
            workload="--workload_dataset_file $2"
            shift 2
@@ -82,31 +116,69 @@ while [[ $# -gt 0 ]]; do
    esac
done

-# Make sure the directory exists and clear output file

if [[ -z "$OUTPUT_FILE" ]]; then
echo "Use default output path"
OUTPUT_FILE="${PATH_PREFIX}/result/${MODEL}.jsonl"
fi
-mkdir -p `dirname "$OUTPUT_FILE"`

+dir_path=$(dirname "$OUTPUT_FILE")
+PROMPT_DIR="${dir_path}/prompts"
+
+mkdir -p "$(dirname "$OUTPUT_FILE")"
+mkdir -p "$PROMPT_DIR"
+
+# Clear the workload directory
+echo "Clearing workload directory: $PROMPT_DIR"
+rm -rf "$PROMPT_DIR"/*
+
+# Append mode: uncomment the line below if you want the output file to be emptied on every run
+# > "$OUTPUT_FILE"

# Print the arguments (or use them in your script logic)
echo "Start benchmark $MODEL, input tokens:[$input_start:$input_limit], output tokens:[$output_start:$output_limit], rates:[$rate_start:$rate_limit], save as: $OUTPUT_FILE"
echo "Start benchmark $MODEL, input tokens:[$input_start:$input_limit], output tokens:[$output_start:$output_limit], rates:[$rate_start:$rate_limit], save as: $OUTPUT_FILE", workload: "$workload"


if [[ $dry_run == 1 ]]; then
echo "Dru run enabled, skip profiling."
echo "Dry run enabled, skip profiling."
exit
fi

# Run the benchmark for each combination
echo "Starting benchmark..."
input_len=$input_start
while [[ $input_len -le $input_limit ]]; do
    output_len=$output_start
    while [[ $output_len -le $output_limit ]]; do
-        req_rate=$rate_start
-        while [[ $req_rate -le $rate_limit ]]; do
-            python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$req_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --stream $workload 1>>${OUTPUT_FILE}

+        if [[ -n "$workload" ]]; then
+            # Make sure all arguments are passed in the correct order
+            WORKLOAD_FILE=$(generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL" "$workload" "$dir_path")
+            echo "Workload file: $WORKLOAD_FILE"
+        else
+            echo "Skip workload pattern generation, benchmark with fixed prompts"
+        fi
+
+        # Scale the rates by 1000 and round to integers, since bash's -le cannot compare floats
+        req_rate=$(printf "%.0f\n" "$(echo "$rate_start" | awk '{print $1 * 1000}')")
+        rate_limit_scaled=$(printf "%.0f\n" "$(echo "$rate_limit" | awk '{print $1 * 1000}')")
+        while [[ $req_rate -le $rate_limit_scaled ]]; do
+            actual_rate=$(echo "$req_rate" | awk '{ printf "%.3f", $1 / 1000 }')
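# Worked example (example only, not part of the commit): with rate_start=1 and
# rate_limit=64, req_rate steps through 1000, 2000, ..., 64000 and actual_rate
# through 1.000, 2.000, ..., 64.000, doubling per iteration exactly as the old
# integer loop did while also permitting fractional starting rates.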
+            if [[ -n "$workload" ]]; then
+                # Run the benchmark only when the workload file was actually generated
+                if [[ -f "$WORKLOAD_FILE" ]]; then
+                    echo "run benchmark with workload file: $WORKLOAD_FILE"
+                    python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
+                fi
+            else
+                echo "run benchmark with fixed prompts: input=$input_len, output=$output_len, rate=$actual_rate"
+                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --stream >> "$OUTPUT_FILE"
+            fi
            req_rate=$((req_rate * 2))
        done
        output_len=$((output_len * 2))
    done
-        input_len=$((input_len * 2))
+    input_len=$((input_len * 2))
done

echo "Profiling finished."
echo "Benchmarking finished."