
Commit

update benchmark logic and support using workload file and not using workload file

Signed-off-by: Ning Wang <[email protected]>
nwangfw committed Feb 26, 2025
1 parent b02dabf commit 3db2882
Showing 3 changed files with 38 additions and 32 deletions.
46 changes: 26 additions & 20 deletions python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -39,17 +39,12 @@ generate_workload() {
     local api_key=$3
     local num_prompts=$4
     local model=$5
-
-    echo " input_len: $input_len"
-    echo " output_len: $output_len"
-    echo " api_key: $api_key"
-    echo " num_prompts: $num_prompts"
-    echo " model: $model"
-    echo " temperature: $TEMPERATURE"
-    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE"
+    local workload_path=$6
+
+    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE, workload_path=$workload_path"

     python $PATH_PREFIX/gen_benchmark_prompt.py \
-        $workload \
+        $workload_path \
         --input-tokens "$input_len" \
         --min-output-tokens "$output_len" \
         --tolerance "0.2" \

@@ -148,24 +143,35 @@ input_len=$input_start
 while [[ $input_len -le $input_limit ]]; do
     output_len=$output_start
     while [[ $output_len -le $output_limit ]]; do
-        # Make sure all arguments are passed in the correct order
-        generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL"
-
+        if [[ -n "$workload" ]]; then
+            # Make sure all arguments are passed in the correct order
+            generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL" "$workload"
+        else
+            echo "Skip workload pattern generation, benchmark with fixed prompts"
+        fi
+
-        # Convert rate_start to integer (multiply by 100 and remove decimals)
-        req_rate=$(echo "$rate_start * 100" | bc | cut -d. -f1)
-        rate_limit_scaled=$(echo "$rate_limit * 100" | bc | cut -d. -f1)
+        # Convert rate_start to integer (multiply by 1000 and remove decimals)
+        req_rate=$(echo "$rate_start * 1000" | bc | cut -d. -f1)
+        rate_limit_scaled=$(echo "$rate_limit * 1000" | bc | cut -d. -f1)
         while [[ $req_rate -le $rate_limit_scaled ]]; do
-            actual_rate=$(echo "scale=2; $req_rate / 100" | bc)
-
-            WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
-            if [[ -f "$WORKLOAD_FILE" ]]; then
-                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
+            actual_rate=$(echo "scale=3; $req_rate / 1000" | bc)
+
+            if [[ -n "$workload" ]]; then
+                WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
+                if [[ -f "$WORKLOAD_FILE" ]]; then
+                    # If workload file exists, run the benchmark
+                    python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
+                fi
+            # If workload file does not exist, print the command to run the benchmark
+            else
+                echo "run benchmark with fixed prompts: input=$input_len, output=$output_len, rate=$actual_rate"
+                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --stream >> "$OUTPUT_FILE"
             fi
             req_rate=$((req_rate * 2))
         done
         output_len=$((output_len * 2))
     done
     input_len=$((input_len * 2))
 done

 echo "Profiling finished."
@@ -286,7 +286,8 @@ def main():
     print(f"QPS: {args.qps}")
     print(f"Max candidates: {args.max_candidates}")
     print(f"Model endpoint: http://{args.host}:{args.port}/v1/chat/completions")
-    print(f"Using API key: {'default' if args.api_key == 'any_key' else '****'}")
+    print(f"Using API key: {args.api_key}")
+    print(f"Workload dataset file: {args.workload_dataset_file}")
     print("-" * 80)

     model_endpoint = f"http://{args.host}:{args.port}/v1/chat/completions"
@@ -85,17 +85,16 @@ def sample_requests(
         # print('the least requests: ', requests[len(requests) - 1])
         return requests
     except Exception as e:
-        print_err(
-            f"Warning: Failed to load prompt dataset ({e}), falling back to synthetic prompts"
-        )
-
-    # # Original synthetic prompt generation
-    # requests = []
-    # for _ in range(num_requests):
-    #     synthetic_prompt = "hi " * config_input_len
-    #     # assign timestamp to -1 for all requests
-    #     requests.append((synthetic_prompt, config_input_len, config_output_len, -1))
-    return []
+        print_err(f"Warning: Failed to load prompt dataset ({e})")
+        return []
+    else:
+        # Original synthetic prompt generation
+        requests = []
+        for _ in range(num_requests):
+            synthetic_prompt = "hi " * config_input_len
+            # assign timestamp to -1 for all requests
+            requests.append((synthetic_prompt, config_input_len, config_output_len, -1))
+        return requests


 async def get_request(
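
The sample_requests change splits the two modes: when a workload dataset file is supplied, a loading failure now only warns and returns an empty list, while the previously commented-out synthetic-prompt path is restored for runs without a dataset. A minimal self-contained sketch of that behavior follows; it uses an explicit if/else for clarity, and the function name and the assumed JSON record shape ({"prompt": ...}) are illustrative, not the committed implementation:

import json
from typing import List, Tuple


def sample_requests_sketch(
    workload_dataset_file: str,
    num_requests: int,
    config_input_len: int,
    config_output_len: int,
) -> List[Tuple[str, int, int, int]]:
    """Sketch only; the committed function also handles prompt selection details."""
    if workload_dataset_file:
        try:
            # Assumed on-disk shape: a JSON list of {"prompt": ...} records.
            with open(workload_dataset_file) as f:
                records = json.load(f)[:num_requests]
            # Timestamp field is assumed; -1 marks "send immediately".
            return [
                (r["prompt"], config_input_len, config_output_len, r.get("timestamp", -1))
                for r in records
            ]
        except Exception as e:
            print(f"Warning: Failed to load prompt dataset ({e})")
            return []
    # Original synthetic prompt generation (restored by this commit)
    requests = []
    for _ in range(num_requests):
        synthetic_prompt = "hi " * config_input_len
        # assign timestamp to -1 for all requests
        requests.append((synthetic_prompt, config_input_len, config_output_len, -1))
    return requests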
