Skip to content

Commit

Permalink
Add a40 deployment
Browse files Browse the repository at this point in the history
  • Loading branch information
Jingyuan Zhang committed Nov 25, 2024
1 parent 019afd9 commit b609c33
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 10 deletions.
4 changes: 2 additions & 2 deletions docs/development/simulator/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code into the container
COPY ./*.py /simulator/
COPY ./model_cache /simulator/model_cache
# COPY ./model_cache /simulator/model_cache

ENV MODEL_NAME=llama2-7b
# Trigger profiling
RUN python app.py --time_limit 1000
RUN python app.py --time_limit 1000 --replica_config_device a100

# Expose the port the app runs on
EXPOSE 8000
Expand Down
32 changes: 32 additions & 0 deletions docs/development/simulator/Dockerfile-a40
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Use the official Python base image
FROM python:3.10-slim

# Prevent .pyc files, keep stdout/stderr unbuffered, and disable
# Weights & Biases logging inside the container.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV WANDB_MODE=disabled

# Set the working directory
WORKDIR /simulator

# Copy the requirements file first so the dependency-install layer is
# cached until requirements.txt changes.
COPY requirements.txt /simulator/

# Install OS dependencies. Use apt-get (stable scripting CLI, not `apt`),
# skip recommended packages, and remove the apt lists in the same layer
# so the package index does not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
      curl \
      git \
      jq \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code into the container
COPY ./*.py /simulator/
# COPY ./model_cache /simulator/model_cache

ENV MODEL_NAME=llama2-7b
# Trigger profiling at build time (a40 device) so the generated profile
# ships inside the image instead of being computed on first start.
RUN python app.py --time_limit 1000 --replica_config_device a40

# Expose the port the app runs on (documentation only; publish with -p/Service)
EXPOSE 8000

# NOTE(review): container runs as root; consider adding a non-root USER
# once ownership of the profiling output under /simulator is handled.

# Run the application (exec form: python is PID 1 and receives SIGTERM)
CMD ["python", "app.py"]
17 changes: 14 additions & 3 deletions docs/development/simulator/Makefile
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
all: build

build:
build-a100:
docker build -t aibrix/vllm-simulator:nightly -f Dockerfile .

deploy:
build-a40:
docker build -t aibrix/vllm-simulator-a40:nightly -f Dockerfile-a40 .

build: build-a100

deploy-a100:
kubectl apply -f deployment-a100.yaml

deploy-a40:
kubectl apply -f deployment-a40.yaml

deploy: deploy-a100
sleep 2
kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 &

clean:
kubectl delete -f deployment.yaml
kubectl delete -f deployment-a100.yaml
kubectl delete -f deployment-a40.yaml
sleep 1
curl http://localhost:8000/metrics

Expand Down
2 changes: 1 addition & 1 deletion docs/development/simulator/deployment-a40.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ spec:
automountServiceAccountToken: true # Important!
containers:
- name: llmengine-simulator
image: aibrix/vllm-simulator:nightly
image: aibrix/vllm-simulator-a40:nightly
command: ["python", "app.py", "--replica_config_device", "a40"]
ports:
- containerPort: 8000
Expand Down
11 changes: 9 additions & 2 deletions python/aibrix/aibrix/gpuoptimizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,16 @@ docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .
make build
```

3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. If using CPU based vLLM simulator, a sample profile is included in optimizer/profiling/result.
3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose the pod interface first:
```shell
# Make sure the pod is accessible locally:
kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
```

If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.


4. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
1. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
```shell
# Make sure Redis is accessible locally:
kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ Once your model is up and running, modify `benchmark.sh` to configure the follow
* output_limit: The ending output length for profiling
* rate_start: The starting request rate for profiling
* rate_limit: The ending request rate for profiling

Run `pip install -r requirements.txt` to install the dependencies.

Finally, run `benchmark.sh [your deployment name]`, the results will be in the result directory.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mkdir -p `dirname "$OUTPUT_FILE"`

# TODO: Set your preferred request sizes and rates here.
input_start=8
input_limit=$((2**14)) # 16K
input_limit=$((2**12)) # 4K
output_start=4
output_limit=$((2**9)) # 512
rate_start=1
Expand All @@ -29,7 +29,7 @@ while [[ $input_len -le $input_limit ]]; do
while [[ $output_len -le $output_limit ]]; do
req_rate=$rate_start
while [[ $req_rate -le $rate_limit ]]; do
python $PATH_PREFIX/gpu-benchmark.py --backend=vllm --port 8000 --model=llama2-7b --request-rate=$req_rate --num-prompts=$TOTAL --input_len $input_len --output_len $output_len >> ${OUTPUT_FILE}
python $PATH_PREFIX/gpu-benchmark.py --backend=vllm --port 8010 --model=llama2-7b --request-rate=$req_rate --num-prompts=$TOTAL --input_len $input_len --output_len $output_len >> ${OUTPUT_FILE}
req_rate=$((req_rate * 2))
done
output_len=$((output_len * 2))
Expand Down

0 comments on commit b609c33

Please sign in to comment.