From b609c33220440c22073646736a881bab3824f2fc Mon Sep 17 00:00:00 2001
From: Jingyuan Zhang
Date: Mon, 25 Nov 2024 13:58:22 -0800
Subject: [PATCH] Add a40 deployment

---
 docs/development/simulator/Dockerfile         |  4 ++--
 docs/development/simulator/Dockerfile-a40     | 32 +++++++++++++++++++
 docs/development/simulator/Makefile           | 17 ++++++++--
 .../development/simulator/deployment-a40.yaml |  2 +-
 python/aibrix/aibrix/gpuoptimizer/README.md   | 11 +++++--
 .../optimizer/profiling/README.md             |  2 ++
 .../optimizer/profiling/benchmark.sh          |  4 ++--
 7 files changed, 62 insertions(+), 10 deletions(-)
 create mode 100644 docs/development/simulator/Dockerfile-a40

diff --git a/docs/development/simulator/Dockerfile b/docs/development/simulator/Dockerfile
index 0f06ae45..9e36d9da 100644
--- a/docs/development/simulator/Dockerfile
+++ b/docs/development/simulator/Dockerfile
@@ -19,11 +19,11 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the rest of the application code into the container
 COPY ./*.py /simulator/
-COPY ./model_cache /simulator/model_cache
+# COPY ./model_cache /simulator/model_cache
 
 ENV MODEL_NAME=llama2-7b
 
 # Trigger profiling
-RUN python app.py --time_limit 1000
+RUN python app.py --time_limit 1000 --replica_config_device a100
 
 # Expose the port the app runs on
 EXPOSE 8000
diff --git a/docs/development/simulator/Dockerfile-a40 b/docs/development/simulator/Dockerfile-a40
new file mode 100644
index 00000000..ac1c9be0
--- /dev/null
+++ b/docs/development/simulator/Dockerfile-a40
@@ -0,0 +1,32 @@
+# Use the official Python base image
+FROM python:3.10-slim
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV WANDB_MODE=disabled
+
+# Set the working directory
+WORKDIR /simulator
+
+# Copy the requirements file into the container
+COPY requirements.txt /simulator/
+
+# Install dependencies
+RUN apt update && apt install -y curl jq git
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code into the container
+COPY ./*.py /simulator/
+# COPY ./model_cache /simulator/model_cache
+
+ENV MODEL_NAME=llama2-7b
+# Trigger profiling
+RUN python app.py --time_limit 1000 --replica_config_device a40
+
+# Expose the port the app runs on
+EXPOSE 8000
+
+# Run the application
+CMD ["python", "app.py"]
diff --git a/docs/development/simulator/Makefile b/docs/development/simulator/Makefile
index d5d1364b..917164da 100644
--- a/docs/development/simulator/Makefile
+++ b/docs/development/simulator/Makefile
@@ -1,15 +1,26 @@
 all: build
 
-build:
+build-a100:
 	docker build -t aibrix/vllm-simulator:nightly -f Dockerfile .
 
-deploy:
+build-a40:
+	docker build -t aibrix/vllm-simulator-a40:nightly -f Dockerfile-a40 .
+
+build: build-a100
+
+deploy-a100:
 	kubectl apply -f deployment-a100.yaml
+
+deploy-a40:
+	kubectl apply -f deployment-a40.yaml
+
+deploy: deploy-a100
 	sleep 2
 	kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 &
 
 clean:
-	kubectl delete -f deployment.yaml
+	kubectl delete -f deployment-a100.yaml
+	kubectl delete -f deployment-a40.yaml
 	sleep 1
 	curl http://localhost:8000/metrics
 
diff --git a/docs/development/simulator/deployment-a40.yaml b/docs/development/simulator/deployment-a40.yaml
index b3146c8a..7d172142 100644
--- a/docs/development/simulator/deployment-a40.yaml
+++ b/docs/development/simulator/deployment-a40.yaml
@@ -24,7 +24,7 @@ spec:
       automountServiceAccountToken: true # Important!
       containers:
         - name: llmengine-simulator
-          image: aibrix/vllm-simulator:nightly
+          image: aibrix/vllm-simulator-a40:nightly
          command: ["python", "app.py", "--replica_config_device", "a40"]
          ports:
            - containerPort: 8000
diff --git a/python/aibrix/aibrix/gpuoptimizer/README.md b/python/aibrix/aibrix/gpuoptimizer/README.md
index 203e83fb..347a2603 100644
--- a/python/aibrix/aibrix/gpuoptimizer/README.md
+++ b/python/aibrix/aibrix/gpuoptimizer/README.md
@@ -12,9 +12,16 @@ docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .
 make build
 ```
 
-3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. If using CPU based vLLM simulator, a sample profile is included in optimizer/profiling/result.
+3. Prepare a performance benchmark using optimizer/profiling/benchmark.sh; see optimizer/profiling/README.md. You may need to expose the pod interface first:
+```shell
+# Make sure the pod is accessible locally:
+kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
+```
+
+If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.
+
 
-4. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
+4. Generate a profile based on the SLO target using optimizer/profiling/gen-profile.py. If using the CPU-based vLLM simulator, execute
 ```shell
 # Make sure Redis is accessable locally:
 kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
diff --git a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md
index ec0a3975..9afe3066 100644
--- a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md
+++ b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md
@@ -13,6 +13,8 @@ Once your model is up and running, modify `benchmark.sh` to configure the follo
 * output_limit: The ending output length for profling
 * rate_start: The starting request rate for profling
 * rate_limit: The ending request rate for profling
+
+Run `pip install -r requirements.txt` to install dependencies.
 
 Finally, run `benchmark.sh [your deployment name]`, the results will be in the result directory.
 
diff --git a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh
index a79255af..a444c24a 100755
--- a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh
+++ b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh
@@ -17,7 +17,7 @@ mkdir -p `dirname "$OUTPUT_FILE"`
 
 # TODO: Set your preferred request sizes and rates here.
 input_start=8
-input_limit=$((2**14)) # 16K
+input_limit=$((2**12)) # 4K
 output_start=4
 output_limit=$((2**9)) # 512
 rate_start=1
@@ -29,7 +29,7 @@ while [[ $input_len -le $input_limit ]]; do
     while [[ $output_len -le $output_limit ]]; do
         req_rate=$rate_start
         while [[ $req_rate -le $rate_limit ]]; do
-            python $PATH_PREFIX/gpu-benchmark.py --backend=vllm --port 8000 --model=llama2-7b --request-rate=$req_rate --num-prompts=$TOTAL --input_len $input_len --output_len $output_len >> ${OUTPUT_FILE}
+            python $PATH_PREFIX/gpu-benchmark.py --backend=vllm --port 8010 --model=llama2-7b --request-rate=$req_rate --num-prompts=$TOTAL --input_len $input_len --output_len $output_len >> ${OUTPUT_FILE}
            req_rate=$((req_rate * 2))
         done
         output_len=$((output_len * 2))
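The pieces above chain together as follows — a minimal sketch for trying the A40 path end to end, assuming the Makefile targets, Dockerfile, and scripts introduced in this patch. `[pod_name]` and `[your deployment name]` are the placeholders from the READMEs, not real names; substitute values from your cluster.

```shell
# Build the A40 simulator image and deploy it (new Makefile targets)
cd docs/development/simulator
make build-a40
make deploy-a40

# Expose the simulator pod on port 8010, the port benchmark.sh now targets;
# [pod_name] is a placeholder for the llmengine-simulator pod in your cluster
kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &

# Install the benchmark dependencies and run the profiling sweep
cd ../../../python/aibrix/aibrix/gpuoptimizer/optimizer/profiling
pip install -r requirements.txt
./benchmark.sh [your deployment name]
```

Per the profiling README, results land in the result directory; optimizer/profiling/gen-profile.py then turns them into an SLO-aware profile.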