diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
index cb501879..3b18d7ba 100755
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -18,8 +18,9 @@ PATH_PREFIX=`dirname "$0"`
 FILE_NAME="result"
 
 MODEL="llama2-7b"
+TEMPERATURE=0.0
 
-TOTAL=100
+TOTAL=100 # TODO: Set your preferred request sizes and rates here.
 
 input_start=4
 input_limit=$((2**11)) # 2K
@@ -30,10 +31,6 @@
 rate_limit=$((2**6)) # 64
 workload=
 dry_run=0
 
-debug_print() {
-    echo "[DEBUG] $1"
-}
-
 # Function to generate workload for specific input/output lengths
 generate_workload() {
@@ -48,19 +45,8 @@ generate_workload() {
     echo " api_key: $api_key"
     echo " num_prompts: $num_prompts"
     echo " model: $model"
-    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model"
-
-    echo "python $PATH_PREFIX/gen_benchmark_prompt.py \
-        $workload \
-        --input-tokens \"$input_len\" \
-        --min-output-tokens \"$output_len\" \
-        --tolerance \"0.2\" \
-        --qps \"2.0\" \
-        --host \"localhost\" \
-        --port \"8010\" \
-        --api-key \"$api_key\" \
-        --total-prompts \"$num_prompts\" \
-        --model \"$model\""
+    echo " temperature: $TEMPERATURE"
+    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE"
 
     python $PATH_PREFIX/gen_benchmark_prompt.py \
         $workload \
@@ -72,15 +58,14 @@ generate_workload() {
         --port "8010" \
         --api-key "$api_key" \
         --total-prompts "$num_prompts" \
-        --model "$model"
+        --model "$model" \
+        --temperature "$TEMPERATURE"
 }
 
 while [[ $# -gt 0 ]]; do
-    debug_print "Processing argument: $1"
     case "$1" in
         -m|--model)
             MODEL=$2
-            debug_print "Set MODEL to: $MODEL"
             shift 2
             ;;
         -o|--output)
@@ -89,32 +74,26 @@ while [[ $# -gt 0 ]]; do
             FILE_NAME=$2
             shift 2
             ;;
         --input-start)
             input_start=$2
-            debug_print "Set input_start to: $input_start"
             shift 2
             ;;
         --input-limit)
             input_limit=$2
-            debug_print "Set input_limit to: $input_limit"
             shift 2
             ;;
         --output-start)
             output_start=$2
-            debug_print "Set output_start to: $output_start"
             shift 2
             ;;
         --output-limit)
             output_limit=$2
-            debug_print "Set output_limit to: $output_limit"
             shift 2
             ;;
         --rate-start)
             rate_start=$2
-            debug_print "Set rate_start to: $rate_start"
             shift 2
             ;;
         --rate-limit)
             rate_limit=$2
-            debug_print "Set rate_limit to: $rate_limit"
             shift 2
             ;;
         --dry-run)
@@ -123,12 +102,14 @@ while [[ $# -gt 0 ]]; do
             ;;
         --api-key)
             LLM_API_KEY=$2
-            debug_print "Set LLM_API_KEY to: $LLM_API_KEY"
+            shift 2
+            ;;
+        --temperature)
+            TEMPERATURE=$2
             shift 2
             ;;
         --workload)
             workload="--workload_dataset_file $2"
-            debug_print "Set workload to: $workload"
             shift 2
             ;;
         # *)
@@ -138,14 +119,6 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-# Add debug prints for initial parameters
-debug_print "Initial parameters:"
-debug_print "MODEL: $MODEL"
-debug_print "Input range: $input_start to $input_limit"
-debug_print "Output range: $output_start to $output_limit"
-debug_print "Rate range: $rate_start to $rate_limit"
-debug_print "Workload file: $workload"
-
 # Make sure the directory exists and clear output file
 OUTPUT_FILE="${PATH_PREFIX}/result/${FILE_NAME}.jsonl"
@@ -186,7 +159,7 @@ while [[ $input_len -le $input_limit ]]; do
             WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
             if [[ -f "$WORKLOAD_FILE" ]]; then
-                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --workload_dataset_file $WORKLOAD_FILE >> "$OUTPUT_FILE"
+                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
             fi
             req_rate=$((req_rate * 2))
         done
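For reference, a full sweep with the new knob might be launched as below; the model name, API key, sweep ranges, and workload path are illustrative placeholders, not values taken from this patch:

```bash
# Hypothetical invocation of the updated script; adjust model, key,
# ranges, and workload path for your deployment.
./benchmark.sh \
    --model llama2-7b \
    --api-key "$LLM_API_KEY" \
    --temperature 0.0 \
    --input-start 4 --input-limit 2048 \
    --output-start 4 --output-limit 512 \
    --rate-start 1 --rate-limit 64 \
    --workload /path/to/workload_dataset.jsonl
```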
diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
index fb2898cf..a80fc88c 100644
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
@@ -10,7 +10,7 @@
 def get_tokenizer(
     pretrained_model_name_or_path: str, trust_remote_code: bool
-) -> Any:  # Changed return type to Any since we're returning a different tokenizer type
+) -> Any:
     """Get tiktoken tokenizer."""
     try:
         # Use cl100k_base for ChatGPT-style models
@@ -115,7 +115,7 @@ def find_matching_prompts(self, target_input_tokens: int, min_output_tokens: int
         # Sort candidates by input difference
         candidates.sort(key=lambda x: x[2])
 
-        # If max_candidates is not specified, use all candidates
+        # If max_candidates is specified, keep only the first max_candidates candidates; otherwise use them all
         if max_candidates is not None:
             candidates = candidates[:max_candidates]
 
@@ -131,7 +131,7 @@ def find_matching_prompts(self, target_input_tokens: int, min_output_tokens: int
             if output_tokens and output_tokens >= min_output_tokens:
                 matching_prompts.append((prompt, input_tokens, output_tokens, response_data))
-                break  # We found our match, we can stop
+                break  # Found a match; stop searching
 
         print("-" * 80)
@@ -148,7 +148,6 @@ def save_results(self, matching_prompts: List[Tuple[str, int, int, Dict]],
         # Get the directory where the script is located
         script_dir = os.path.dirname(os.path.abspath(__file__))
-        # Create the prompts directory relative to the script location
         prompts_dir = os.path.join(script_dir, "result", "prompts")
         os.makedirs(prompts_dir, exist_ok=True)
@@ -162,7 +161,7 @@ def save_results(self, matching_prompts: List[Tuple[str, int, int, Dict]],
         for prompt, input_tokens, output_tokens, response_data in matching_prompts:
             for i in range(self.total_prompts):
                 benchmark_format.append({
-                    "Timestamp": base_timestamp + (i * 1000),  # Increment by 1000 for each prompt
+                    "Timestamp": base_timestamp + (i * 1000),
                     "Requests": [{
                         "Prompt": prompt,
                         "Prompt Length": input_tokens,
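The prompt generator can also be exercised on its own. A sketch of a standalone run, mirroring the arguments that benchmark.sh's generate_workload() passes; the dataset path and token counts are illustrative:

```bash
# Mirrors the generate_workload() call in benchmark.sh; values are illustrative.
python gen_benchmark_prompt.py \
    --workload_dataset_file /path/to/workload_dataset.jsonl \
    --input-tokens 512 \
    --min-output-tokens 128 \
    --tolerance 0.2 \
    --qps 2.0 \
    --host localhost \
    --port 8010 \
    --api-key "$LLM_API_KEY" \
    --total-prompts 10 \
    --model llama2-7b \
    --temperature 0.0
```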
"failed" - - # Capture error response for non-200 status codes - if status_code != 200: - error_response = await response.text() - try: - error_json = json.loads(error_response) - error_msg = error_json.get('error', error_response) - except: - error_msg = error_response - print_err(f"Request {idx} failed with status {status_code}: {error_msg}") - break - - chunks = [] - previous_token_time = time.perf_counter() - first = True - - try: - if streaming: - async for chunk, _ in response.content.iter_chunks(): - chunks = [chunk] - now_time = time.perf_counter() - if first: - time_to_first = now_time - previous_token_time - first = False - else: - token_latencies.append(now_time - previous_token_time) - previous_token_time = now_time - # Stream off: Chunks are full response. - # chunks.append(chunk) - - output = b"".join(chunks).decode("utf-8") - santicized = output.rstrip("\n\t ") - else: - time_to_first = time.perf_counter() - previous_token_time - output = await response.text() - santicized = output - except Exception as e: - error_msg = f"Failed to read response: {str(e)}" - print_err(f"Failed to read response for request {idx}: {e}") - break - + async with aiohttp.ClientSession(timeout=timeout) as session: + while True: + # print(f"Sending request: {api_url}:{pload}") + async with session.post(api_url, headers=headers, json=pload) as response: + status_code = response.status + chunks = [] + token_latencies = [] + previous_token_time = time.perf_counter() + first = True try: - ret = load_response(santicized) - if "error" not in ret: - break - error_msg = f"API error: {ret.get('error', 'Unknown error')}" + if streaming: + async for chunk, _ in response.content.iter_chunks(): + # Stream on: Each chunk in the response is the full response so far + chunks = [chunk] + + now_time = time.perf_counter() + if first: + time_to_first = now_time - previous_token_time + first = False + else: + token_latencies.append(now_time - previous_token_time) + previous_token_time = now_time + + # Stream off: Chunks are full response. + # chunks.append(chunk) + + output = b"".join(chunks).decode("utf-8") + santicized = output.rstrip( + "\n\t " + ) # Remove trailing whitespace characters including EOF, and "[DONE]" + else: + time_to_first = time.perf_counter() - previous_token_time + output = await response.text() + santicized = output except Exception as e: - error_msg = f"Failed to parse response: {str(e)}" - print_err(f"Invalid response for request {idx}: {santicized}: {e}") + print_err(f"Failed to read response for request {idx}: {e}") + break + try: + ret = load_response(santicized) + + # Re-send the request if it failed. + if "error" not in ret: break - except Exception as e: - # It's ok to parse failure, santicized output could be jsonl, other format, or internal error. - print_err(f"Invalid response for request {idx}: {santicized}: {e}") - return + except Exception as e: + # It's ok to parse failure, santicized output could be jsonl, other format, or internal error. 
+ print_err(f"Invalid response for request {idx}: {santicized}: {e}") + break request_end_time = time.perf_counter() request_latency = request_end_time - request_start_time @@ -272,18 +255,15 @@ async def send_request( else len(token_latencies) + 1, "timestamp": ts.strftime("%Y-%m-%d %H:%M:%S %Z%z"), "E2E": request_latency, - "status_code": status_code, - "success": status_code == 200 if status_code else False, - # "request_payload": pload + "status_code": status_code, # Add status code to trace + "success": status_code == 200 if status_code else False # Add success flag } - if error_msg: - request_trace["error"] = error_msg if len(token_latencies) > 0: request_trace["TTFT"] = time_to_first - request_trace["TPOT_mean"] = np.mean(token_latencies) - request_trace["TPOT_P50"] = np.percentile(token_latencies, 50) - request_trace["TPOT_P90"] = np.percentile(token_latencies, 90) - request_trace["TPOT_P99"] = np.percentile(token_latencies, 99) + request_trace["TPOT_mean"] = np.mean(token_latencies) # type: ignore + request_trace["TPOT_P50"] = np.percentile(token_latencies, 50) # type: ignore + request_trace["TPOT_P90"] = np.percentile(token_latencies, 90) # type: ignore + request_trace["TPOT_P99"] = np.percentile(token_latencies, 99) # type: ignore print(json.dumps(request_trace)) REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) if len(token_latencies) > 0: @@ -336,6 +316,10 @@ async def benchmark( def main(args: argparse.Namespace): + # Set global temperature from args + global TEMPERATURE + TEMPERATURE = args.temperature + result = {} if args.verbose: print(args) @@ -345,6 +329,7 @@ def main(args: argparse.Namespace): result["request_rate"] = args.request_rate result["seed"] = args.seed result["model"] = args.model + result["temperature"] = args.temperature result["samples"] = args.num_prompts random.seed(args.seed) @@ -524,5 +509,11 @@ def main(args: argparse.Namespace): help="Path to a JSON file containing prompts", ) parser.add_argument("--use-workload-interval", action="store_true") + parser.add_argument( + "--temperature", + type=float, + default=0.0, + help="Temperature for text generation (default: 0.0)", + ) args = parser.parse_args() main(args)