diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
index 3b18d7ba..9f6f651f 100755
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -39,17 +39,12 @@ generate_workload() {
     local api_key=$3
     local num_prompts=$4
     local model=$5
-
-    echo " input_len: $input_len"
-    echo " output_len: $output_len"
-    echo " api_key: $api_key"
-    echo " num_prompts: $num_prompts"
-    echo " model: $model"
-    echo " temperature: $TEMPERATURE"
-    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE"
+    local workload_path=$6
+
+    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE, workload_path=$workload_path"
 
     python $PATH_PREFIX/gen_benchmark_prompt.py \
-        $workload \
+        $workload_path \
         --input-tokens "$input_len" \
         --min-output-tokens "$output_len" \
         --tolerance "0.2" \
@@ -148,18 +143,30 @@ input_len=$input_start
 while [[ $input_len -le $input_limit ]]; do
     output_len=$output_start
     while [[ $output_len -le $output_limit ]]; do
-        # Make sure all arguments are passed in the correct order
-        generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL"
+
+        if [[ -n "$workload" ]]; then
+            # Make sure all arguments are passed in the correct order
+            generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL" "$workload"
+        else
+            echo "Skip workload pattern generation, benchmark with fixed prompts"
+        fi
 
-        # Convert rate_start to integer (multiply by 100 and remove decimals)
-        req_rate=$(echo "$rate_start * 100" | bc | cut -d. -f1)
-        rate_limit_scaled=$(echo "$rate_limit * 100" | bc | cut -d. -f1)
+        # Convert rate_start to integer (multiply by 1000 and remove decimals)
+        req_rate=$(echo "$rate_start * 1000" | bc | cut -d. -f1)
+        rate_limit_scaled=$(echo "$rate_limit * 1000" | bc | cut -d. -f1)
         while [[ $req_rate -le $rate_limit_scaled ]]; do
-            actual_rate=$(echo "scale=2; $req_rate / 100" | bc)
-
-            WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
-            if [[ -f "$WORKLOAD_FILE" ]]; then
-                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
+            actual_rate=$(echo "scale=3; $req_rate / 1000" | bc)
+
+            if [[ -n "$workload" ]]; then
+                WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
+                if [[ -f "$WORKLOAD_FILE" ]]; then
+                    # If the workload file exists, run the benchmark against it
+                    python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
+                fi
+            # If no workload was given, benchmark with fixed synthetic prompts
+            else
+                echo "run benchmark with fixed prompts: input=$input_len, output=$output_len, rate=$actual_rate"
+                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --stream >> "$OUTPUT_FILE"
             fi
             req_rate=$((req_rate * 2))
         done
@@ -167,5 +174,4 @@ while [[ $input_len -le $input_limit ]]; do
     done
     input_len=$((input_len * 2))
 done
-
 echo "Profiling finished."
\ No newline at end of file
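The move from a ×100 to a ×1000 fixed-point scale matters because `bc | cut -d. -f1` truncates the fractional part: at ×100 a starting rate of 0.125 req/s would collapse to 0.12, while ×1000 preserves three decimal places. Below is a minimal Python sketch of the equivalent sweep arithmetic; the function name and sample values are illustrative assumptions, not part of the patch.

```python
# Illustrative sketch only: mirrors the fixed-point rate sweep that
# benchmark.sh performs with bc/cut. Names and sample values are assumed.
def rate_sweep(rate_start: float, rate_limit: float) -> list:
    req_rate = int(rate_start * 1000)       # 0.125 -> 125 (three decimals kept)
    limit_scaled = int(rate_limit * 1000)   # 8.0   -> 8000
    rates = []
    while req_rate <= limit_scaled:
        rates.append(req_rate / 1000)       # scale back to requests/second
        req_rate *= 2                       # the script doubles the rate each step
    return rates

print(rate_sweep(0.125, 8))  # [0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0]
```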
diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
index 5baf827a..50824d96 100644
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
@@ -286,7 +286,8 @@ def main():
     print(f"QPS: {args.qps}")
     print(f"Max candidates: {args.max_candidates}")
     print(f"Model endpoint: http://{args.host}:{args.port}/v1/chat/completions")
-    print(f"Using API key: {'default' if args.api_key == 'any_key' else '****'}")
+    print(f"Using API key: {args.api_key}")
+    print(f"Workload dataset file: {args.workload_dataset_file}")
     print("-" * 80)
 
     model_endpoint = f"http://{args.host}:{args.port}/v1/chat/completions"
diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
index aba43b8e..cabe3edb 100644
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
@@ -85,17 +85,16 @@ def sample_requests(
             # print('the least requests: ', requests[len(requests) - 1])
             return requests
         except Exception as e:
-            print_err(
-                f"Warning: Failed to load prompt dataset ({e}), falling back to synthetic prompts"
-            )
-
-    # # Original synthetic prompt generation
-    # requests = []
-    # for _ in range(num_requests):
-    #     synthetic_prompt = "hi " * config_input_len
-    #     # assign timestamp to -1 for all requests
-    #     requests.append((synthetic_prompt, config_input_len, config_output_len, -1))
-    return []
+            print_err(f"Warning: Failed to load prompt dataset ({e})")
+            return []
+    else:
+        # Original synthetic prompt generation
+        requests = []
+        for _ in range(num_requests):
+            synthetic_prompt = "hi " * config_input_len
+            # assign timestamp to -1 for all requests
+            requests.append((synthetic_prompt, config_input_len, config_output_len, -1))
+        return requests
 
 
 async def get_request(
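For reference, the reshaped control flow of `sample_requests()` can be summarized as follows. This is a condensed, hypothetical sketch under assumed simplifications (the JSON load stands in for the real dataset parsing, and the signature is trimmed); it is not the actual module code.

```python
import json

# Hypothetical sketch of the new control flow: a dataset-load failure now
# returns an empty list instead of silently falling back, and synthetic
# prompts are built only when no workload dataset was configured at all.
def sample_requests(dataset_path, num_requests, config_input_len, config_output_len):
    if dataset_path:
        try:
            with open(dataset_path) as f:
                return json.load(f)  # real code parses prompts/timestamps here
        except Exception as e:
            print(f"Warning: Failed to load prompt dataset ({e})")
            return []  # surface the failure to the caller instead of masking it
    else:
        # Synthetic fallback: fixed "hi" prompts; timestamp -1 = no arrival time
        return [
            ("hi " * config_input_len, config_input_len, config_output_len, -1)
            for _ in range(num_requests)
        ]

print(sample_requests(None, 2, 4, 16)[0])  # ('hi hi hi hi ', 4, 16, -1)
```

Returning an empty list on load failure makes the error visible to the caller, rather than silently benchmarking synthetic prompts whose shape may not match the requested workload.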