diff --git a/build/container/Dockerfile.runtime b/build/container/Dockerfile.runtime
index 80b18774..1f63e8b2 100644
--- a/build/container/Dockerfile.runtime
+++ b/build/container/Dockerfile.runtime
@@ -34,7 +34,7 @@ COPY --from=builder /app/dist/*.whl ./
 # Install build dependencies and clean up in one step (avoiding creating another new layer)
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends gcc python3-dev \
+    && apt-get install -y --no-install-recommends gcc python3-dev mawk \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* \
     && pip install --no-cache-dir ./*.whl \
diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
index 9f6f651f..07f8b1e6 100755
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -16,7 +16,7 @@
 # Result files will be added to 'PATH_PREFIX' directory.
 PATH_PREFIX=`dirname "$0"`
-FILE_NAME="result"
+OUTPUT_FILE=
 
 MODEL="llama2-7b"
 TEMPERATURE=0.0
@@ -40,8 +40,9 @@ generate_workload() {
     local num_prompts=$4
     local model=$5
    local workload_path=$6
+    local output_dir=$7
 
-    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE, workload_path=$workload_path"
+    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE, workload_path=$workload_path, output_dir=$output_dir"
 
     python $PATH_PREFIX/gen_benchmark_prompt.py \
         $workload_path \
@@ -54,7 +55,8 @@ generate_workload() {
         --api-key "$api_key" \
         --total-prompts "$num_prompts" \
         --model "$model" \
-        --temperature "$TEMPERATURE"
+        --temperature "$TEMPERATURE" \
+        --output-dir "$output_dir"
 }
 
 while [[ $# -gt 0 ]]; do
@@ -64,7 +66,7 @@
         shift 2
         ;;
     -o|--output)
-        FILE_NAME="$2"
+        OUTPUT_FILE="$2"
         shift 2
         ;;
     --input-start)
@@ -115,10 +117,15 @@
 done
 
-# Make sure the directory exists and clear output file
-OUTPUT_FILE="${PATH_PREFIX}/result/${FILE_NAME}.jsonl"
-PROMPT_DIR="${PATH_PREFIX}/result/prompts"
-mkdir -p `dirname "$OUTPUT_FILE"`
+if [[ -z "$OUTPUT_FILE" ]]; then
+  echo "No output file specified; using default output path"
+  OUTPUT_FILE="${PATH_PREFIX}/result/${MODEL}.jsonl"
+fi
+
+dir_path=$(dirname "$OUTPUT_FILE")
+PROMPT_DIR="${dir_path}/prompts"
+
+mkdir -p "$dir_path"
 mkdir -p "$PROMPT_DIR"
 
 # Clear the workload directory
@@ -146,17 +153,16 @@ while [[ $input_len -le $input_limit ]]; do
     if [[ -n "$workload" ]]; then
         # Make sure all arguments are passed in the correct order
-        generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL" "$workload"
+        generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL" "$workload" "$dir_path"
     else
         echo "Skip workload pattern generation, benchmark with fixed prompts"
     fi
 
-    # Convert rate_start to integer (multiply by 1000 and remove decimals)
-    req_rate=$(echo "$rate_start * 1000" | bc | cut -d. -f1)
-    rate_limit_scaled=$(echo "$rate_limit * 1000" | bc | cut -d. -f1)
+    # Convert rates to integers (multiply by 1000 and round) since -le does not work with floats
+    req_rate=$(echo "$rate_start" | awk '{printf "%.0f", $1 * 1000}')
+    rate_limit_scaled=$(echo "$rate_limit" | awk '{printf "%.0f", $1 * 1000}')
 
     while [[ $req_rate -le $rate_limit_scaled ]]; do
-        actual_rate=$(echo "scale=3; $req_rate / 1000" | bc)
-
+        actual_rate=$(echo "$req_rate" | awk '{ printf "%.3f", $1 / 1000 }')
         if [[ -n "$workload" ]]; then
             WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
             if [[ -f "$WORKLOAD_FILE" ]]; then
diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
index 50824d96..4a4c8309 100644
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
@@ -47,6 +47,7 @@ def __init__(
         temperature: float = 0.0,
         api_key: str = "any_key",
         total_prompts: int = 1,
+        output_dir: str = ".",
     ):
         self.trace_file = trace_file
         self.model_endpoint = model_endpoint
@@ -56,6 +57,7 @@ def __init__(
         self.temperature = temperature
         self.api_key = api_key
         self.total_prompts = total_prompts
+        self.output_dir = output_dir
 
     def count_tokens(self, text: str) -> int:
         """Estimate token count using VLLM's tokenizer."""
@@ -163,9 +165,7 @@ def save_results(
             return
 
-        # Get the directory where the script is located
-        script_dir = os.path.dirname(os.path.abspath(__file__))
-        # Create the prompts directory relative to the script location
-        prompts_dir = os.path.join(script_dir, "result", "prompts")
+        # Create the prompts directory under the configured output directory
+        prompts_dir = os.path.join(self.output_dir, "prompts")
         os.makedirs(prompts_dir, exist_ok=True)
 
         filename = os.path.join(
@@ -272,6 +272,12 @@ def parse_args():
         default="deepseek-coder-7b",
         help="Model name to use for completion",
     )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        help="Directory to save output files",
+        default=".",
+    )
 
     return parser.parse_args()
@@ -300,6 +306,7 @@ def main():
         temperature=args.temperature,
         api_key=args.api_key,
         total_prompts=args.total_prompts,
+        output_dir=args.output_dir,
     )
 
     matching_prompts = selector.find_matching_prompts(
@@ -319,12 +326,12 @@
         print(f"Output tokens: {output_tokens}")
         print(f"Complete usage data: {response_data.get('usage', {})}")
         print("-" * 40)
-        print("Prompt:")
-        print(prompt)
-        print("-" * 40)
-        print("Model completion:")
-        if "choices" in response_data:
-            print(response_data["choices"][0].get("message", {}).get("content", ""))
+        # print("Prompt:")
+        # print(prompt)
+        # print("-" * 40)
+        # print("Model completion:")
+        # if "choices" in response_data:
+        #     print(response_data["choices"][0].get("message", {}).get("content", ""))
 
     end_time = time.time()
     print(f"\nTotal execution time: {end_time - start_time:.2f} seconds")
diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
index cabe3edb..e26a4bc2 100644
--- a/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
+++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
@@ -254,8 +254,8 @@ async def send_request(
         else len(token_latencies) + 1,
         "timestamp": ts.strftime("%Y-%m-%d %H:%M:%S %Z%z"),
         "E2E": request_latency,
-        "status_code": status_code,  # Add status code to trace
-        "success": status_code == 200 if status_code else False,  # Add success flag
+        "status_code": status_code,
+        "success": status_code == 200 if status_code else False,
     }
     if len(token_latencies) > 0:
         request_trace["TTFT"] = time_to_first
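
Note: a minimal, self-contained sketch of the integer-scaled rate loop that benchmark.sh now relies on (awk is its only dependency, which is presumably why the runtime image adds mawk). The rate values and the doubling step here are illustrative assumptions, not taken from the script:

    #!/bin/bash
    # [[ ... -le ... ]] compares integers only, so scale the float rates
    # by 1000 and round them to integers before the comparison loop.
    rate_start=0.5   # requests/sec (illustrative value)
    rate_limit=2.0   # requests/sec (illustrative value)

    req_rate=$(echo "$rate_start" | awk '{printf "%.0f", $1 * 1000}')          # -> 500
    rate_limit_scaled=$(echo "$rate_limit" | awk '{printf "%.0f", $1 * 1000}') # -> 2000

    while [[ $req_rate -le $rate_limit_scaled ]]; do
        # Recover the float rate to pass to the benchmark client.
        actual_rate=$(echo "$req_rate" | awk '{ printf "%.3f", $1 / 1000 }')
        echo "running benchmark at request rate ${actual_rate} req/s"
        req_rate=$((req_rate * 2))  # step policy is an assumption; the real script may differ
    done

This prints 0.500, 1.000, and 2.000 before the loop exits. The same scaling is what lets the diff swap bc, which is typically absent from slim base images, for awk.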