From b609c33220440c22073646736a881bab3824f2fc Mon Sep 17 00:00:00 2001
From: Jingyuan Zhang
Date: Mon, 25 Nov 2024 13:58:22 -0800
Subject: [PATCH] Add a40 deployment

---
 docs/development/simulator/Dockerfile         |  4 ++--
 docs/development/simulator/Dockerfile-a40     | 32 +++++++++++++++++++
 docs/development/simulator/Makefile           | 17 ++++++++--
 .../development/simulator/deployment-a40.yaml |  2 +-
 python/aibrix/aibrix/gpuoptimizer/README.md   | 11 +++++--
 .../optimizer/profiling/README.md             |  2 ++
 .../optimizer/profiling/benchmark.sh          |  4 ++--
 7 files changed, 62 insertions(+), 10 deletions(-)
 create mode 100644 docs/development/simulator/Dockerfile-a40

diff --git a/docs/development/simulator/Dockerfile b/docs/development/simulator/Dockerfile
index 0f06ae45..9e36d9da 100644
--- a/docs/development/simulator/Dockerfile
+++ b/docs/development/simulator/Dockerfile
@@ -19,11 +19,11 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the rest of the application code into the container
 COPY ./*.py /simulator/
-COPY ./model_cache /simulator/model_cache
+# COPY ./model_cache /simulator/model_cache
 
 ENV MODEL_NAME=llama2-7b
 
 # Trigger profiling
-RUN python app.py --time_limit 1000
+RUN python app.py --time_limit 1000 --replica_config_device a100
 
 # Expose the port the app runs on
 EXPOSE 8000
diff --git a/docs/development/simulator/Dockerfile-a40 b/docs/development/simulator/Dockerfile-a40
new file mode 100644
index 00000000..ac1c9be0
--- /dev/null
+++ b/docs/development/simulator/Dockerfile-a40
@@ -0,0 +1,32 @@
+# Use the official Python base image
+FROM python:3.10-slim
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV WANDB_MODE=disabled
+
+# Set the working directory
+WORKDIR /simulator
+
+# Copy the requirements file into the container
+COPY requirements.txt /simulator/
+
+# Install dependencies
+RUN apt update && apt install -y curl jq git
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code into the container
+COPY ./*.py /simulator/
+# COPY ./model_cache /simulator/model_cache
+
+ENV MODEL_NAME=llama2-7b
+# Trigger profiling
+RUN python app.py --time_limit 1000 --replica_config_device a40
+
+# Expose the port the app runs on
+EXPOSE 8000
+
+# Run the application
+CMD ["python", "app.py"]
diff --git a/docs/development/simulator/Makefile b/docs/development/simulator/Makefile
index d5d1364b..917164da 100644
--- a/docs/development/simulator/Makefile
+++ b/docs/development/simulator/Makefile
@@ -1,15 +1,26 @@
 all: build
 
-build:
+build-a100:
 	docker build -t aibrix/vllm-simulator:nightly -f Dockerfile .
 
-deploy:
+build-a40:
+	docker build -t aibrix/vllm-simulator-a40:nightly -f Dockerfile-a40 .
+
+build: build-a100
+
+deploy-a100:
 	kubectl apply -f deployment-a100.yaml
+
+deploy-a40:
+	kubectl apply -f deployment-a40.yaml
+
+deploy: deploy-a100
 	sleep 2
 	kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 &
 
 clean:
-	kubectl delete -f deployment.yaml
+	kubectl delete -f deployment-a100.yaml
+	kubectl delete -f deployment-a40.yaml
 	sleep 1
 	curl http://localhost:8000/metrics
 
diff --git a/docs/development/simulator/deployment-a40.yaml b/docs/development/simulator/deployment-a40.yaml
index b3146c8a..7d172142 100644
--- a/docs/development/simulator/deployment-a40.yaml
+++ b/docs/development/simulator/deployment-a40.yaml
@@ -24,7 +24,7 @@ spec:
       automountServiceAccountToken: true # Important!
       containers:
         - name: llmengine-simulator
-          image: aibrix/vllm-simulator:nightly
+          image: aibrix/vllm-simulator-a40:nightly
          command: ["python", "app.py", "--replica_config_device", "a40"]
          ports:
            - containerPort: 8000
diff --git a/python/aibrix/aibrix/gpuoptimizer/README.md b/python/aibrix/aibrix/gpuoptimizer/README.md
index 203e83fb..347a2603 100644
--- a/python/aibrix/aibrix/gpuoptimizer/README.md
+++ b/python/aibrix/aibrix/gpuoptimizer/README.md
@@ -12,9 +12,16 @@ docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .
 make build
 ```
 
-3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. If using CPU based vLLM simulator, a sample profile is included in optimizer/profiling/result.
+3. Prepare a performance benchmark using optimizer/profiling/benchmark.sh; see optimizer/profiling/README.md. You may need to expose the pod interface first:
+```shell
+# Make sure the pod is accessible locally:
+kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
+```
+
+If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.
+
 
-4. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
+4. Generate a profile based on the SLO target using optimizer/profiling/gen-profile.py. If using the CPU-based vLLM simulator, execute
 ```shell
 # Make sure Redis is accessable locally:
 kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
diff --git a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md
index ec0a3975..9afe3066 100644
--- a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md
+++ b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/README.md
@@ -13,6 +13,8 @@ Once your model is up and running, modify `benchmark.sh` to configure the follo
 * output_limit: The ending output length for profling
 * rate_start: The starting request rate for profling
 * rate_limit: The ending request rate for profling
+
+Run `pip install -r requirements.txt` to install dependencies.
 
 Finally, run `benchmark.sh [your deployment name]`, the results will be in the result directory.
 
diff --git a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh
index a79255af..a444c24a 100755
--- a/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh
+++ b/python/aibrix/aibrix/gpuoptimizer/optimizer/profiling/benchmark.sh
@@ -17,7 +17,7 @@ mkdir -p `dirname "$OUTPUT_FILE"`
 
 # TODO: Set your preferred request sizes and rates here.
 input_start=8
-input_limit=$((2**14)) # 16K
+input_limit=$((2**12)) # 4K
 output_start=4
 output_limit=$((2**9)) # 512
 rate_start=1
@@ -29,7 +29,7 @@ while [[ $input_len -le $input_limit ]]; do
     while [[ $output_len -le $output_limit ]]; do
         req_rate=$rate_start
         while [[ $req_rate -le $rate_limit ]]; do
-            python $PATH_PREFIX/gpu-benchmark.py --backend=vllm --port 8000 --model=llama2-7b --request-rate=$req_rate --num-prompts=$TOTAL --input_len $input_len --output_len $output_len >> ${OUTPUT_FILE}
+            python $PATH_PREFIX/gpu-benchmark.py --backend=vllm --port 8010 --model=llama2-7b --request-rate=$req_rate --num-prompts=$TOTAL --input_len $input_len --output_len $output_len >> ${OUTPUT_FILE}
            req_rate=$((req_rate * 2))
         done
         output_len=$((output_len * 2))
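The pieces above chain together as follows — a minimal sketch for trying the A40 path end to end, assuming the Makefile targets, Dockerfile, and scripts introduced in this patch. `[pod_name]` and `[your deployment name]` are the placeholders from the READMEs, not real names; substitute values from your cluster.

```shell
# Build the A40 simulator image and deploy it (new Makefile targets)
cd docs/development/simulator
make build-a40
make deploy-a40

# Expose the simulator pod on port 8010, the port benchmark.sh now targets;
# [pod_name] is a placeholder for the llmengine-simulator pod in your cluster
kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &

# Install the benchmark dependencies and run the profiling sweep
cd ../../../python/aibrix/aibrix/gpuoptimizer/optimizer/profiling
pip install -r requirements.txt
./benchmark.sh [your deployment name]
```

Per the profiling README, results land in the result directory; optimizer/profiling/gen-profile.py then turns them into an SLO-aware profile.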