Skip to content

Commit

Permalink
[Feat] Integrate deployment configurations and fix autoscaler/gpu opt…
Browse files Browse the repository at this point in the history
…imizer connectivity (#500)

* Add GPU Optimizer deployment and update configurations

* Fix k8s accessibility regarding namespaces. GPU optimizer now monitors all namespaces with the model label.

* Lint fix

* Deployment clean-up

* Update README.md

---------

Co-authored-by: Ning Wang <[email protected]>
Co-authored-by: Jingyuan Zhang <[email protected]>
  • Loading branch information
3 people authored Dec 7, 2024
1 parent b5b7586 commit dd2aa26
Show file tree
Hide file tree
Showing 16 changed files with 139 additions and 125 deletions.
1 change: 1 addition & 0 deletions config/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ resources:
- ../rbac
- ../manager
- ../gateway
- ../gpu-optimizer
- ../dependency/kuberay-operator
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
Expand Down
26 changes: 26 additions & 0 deletions config/gpu-optimizer/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# GPU Optimizer Deployment: runs the optimizer web app (serves HTTP on 8080)
# that watches model Deployments and exposes scaling metrics for autoscalers.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-optimizer
  namespace: aibrix-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-optimizer
  template:
    metadata:
      labels:
        app: gpu-optimizer
    spec:
      # Service account bound to a ClusterRole (see rbac.yaml) so the app can
      # list/watch Deployments across all namespaces.
      serviceAccountName: gpu-optimizer-sa
      automountServiceAccountToken: true
      containers:
        - name: gpu-optimizer
          # NOTE(review): image is `aibrix/runtime`, but the VKE overlay's image
          # transformer targets `aibrix/gpu-optimizer` — confirm the overlay
          # actually rewrites this image (it will not match as written).
          image: aibrix/runtime:nightly
          # The optimizer is a module of the runtime image.
          command: ["python", "-m", "aibrix.gpu_optimizer.app"]
          ports:
            - containerPort: 8080
          env:
            # Redis holds the workload profiles written by gen_profile.py
            # (see README: redis://.../?model=<name>).
            - name: REDIS_HOST
              value: aibrix-redis-master.aibrix-system.svc.cluster.local
4 changes: 4 additions & 0 deletions config/gpu-optimizer/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Kustomize component for the GPU Optimizer: Deployment + Service + RBAC.
# Included from config/default/kustomization.yaml as ../gpu-optimizer.
resources:
  - deployment.yaml
  - service.yaml
  - rbac.yaml
27 changes: 27 additions & 0 deletions config/gpu-optimizer/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# ServiceAccount mounted by the gpu-optimizer Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-optimizer-sa
  namespace: aibrix-system
---
# A ClusterRole (not a namespaced Role) is required because the optimizer
# lists/watches model Deployments in all namespaces (app.py uses
# list_deployment_for_all_namespaces with the model label selector).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-optimizer-clusterrole
rules:
  # Read-only access to Deployments; no write verbs are granted here.
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["get", "list", "watch"]
---
# Binds the read-only ClusterRole above to the optimizer's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-optimizer-clusterrole-binding
subjects:
  - kind: ServiceAccount
    name: gpu-optimizer-sa
    namespace: aibrix-system
roleRef:
  kind: ClusterRole
  name: gpu-optimizer-clusterrole
  apiGroup: rbac.authorization.k8s.io
13 changes: 13 additions & 0 deletions config/gpu-optimizer/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# ClusterIP Service exposing the optimizer's HTTP API and metrics on 8080.
# NOTE(review): autoscaler configs reference
# aibrix-gpu-optimizer.aibrix-system.svc.cluster.local — presumably kustomize
# adds an `aibrix-` namePrefix; confirm the rendered Service name matches.
apiVersion: v1
kind: Service
metadata:
  name: gpu-optimizer
  namespace: aibrix-system
spec:
  selector:
    app: gpu-optimizer
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP
7 changes: 7 additions & 0 deletions config/overlays/vke/default/gpu-optimizer/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# VKE overlay for the GPU Optimizer: swaps the public image for the
# Volcengine-hosted mirror.
resources:
  - ../../../../gpu-optimizer

images:
  # FIX: the base deployment runs image `aibrix/runtime:nightly`
  # (config/gpu-optimizer/deployment.yaml); the transformer previously
  # targeted `aibrix/gpu-optimizer`, which matches nothing in the base, so
  # the registry override was silently a no-op. Target the image actually
  # used by the base so the mirror substitution takes effect.
  - name: aibrix/runtime
    newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime
    newTag: nightly
1 change: 1 addition & 0 deletions config/overlays/vke/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ resources:
- ../../../rbac
- manager
- gateway
- gpu-optimizer
- ../../../dependency/kuberay-operator


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
kind: Deployment
name: simulator-llama2-7b-a40
metricsSources:
- endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
- endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b-a40
metric: "vllm:deployment_replicas"
targetValue: "1"
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
kind: Deployment
name: simulator-llama2-7b-a100
metricsSources:
- endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
- endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b-a100
metric: "vllm:deployment_replicas"
targetValue: "1"
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
maxReplicas: 10
targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured
metricsSources:
- endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
- endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b
metric: "vllm:deployment_replicas"
targetValue: "1"
Expand Down
24 changes: 8 additions & 16 deletions python/aibrix/aibrix/gpu_optimizer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,6 @@ all: build
DP ?= profiling
DATASET ?= [set your DATASET path]

.PHONY: deploy
deploy:
kubectl apply -f deployment.yaml
sleep 2
kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &

.PHONY: clean
clean:
kubectl delete -f deployment.yaml
sleep 1
curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b

.PHONY: benchmark
benchmark:
optimizer/profiling/benchmark.sh $(DP)
Expand All @@ -33,27 +21,31 @@ debug:

.PHONY: debug-init-simulator
debug-init-simulator:
curl http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
curl http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
-H "Content-Type: application/json" \
-H "Authorization: Bearer any_key" \
-d '{}'

.PHONY: debug-scale-simulator
debug-scale-simulator:
curl http://localhost:8080/scale/aibrix-system/simulator-llama2-7b/2 \
curl http://localhost:8080/scale/default/simulator-llama2-7b-a100/2 \
-H "Content-Type: application/json" \
-H "Authorization: Bearer any_key" \
-d '{}'

.PHONY: debug-stop-simulator
debug-stop-simulator:
curl -X DELETE http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
curl -X DELETE http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
-H "Content-Type: application/json" \
-H "Authorization: Bearer any_key"

.PHONY: debug-update-profile
debug-update-profile:
curl http://localhost:8080/update_profile/llama2-7b

.PHONY: debug-metrics
debug-metrics:
curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
curl http://localhost:8080/metrics/default/simulator-llama2-7b-a100

.PHONY: debug-workload
debug-workload:
Expand Down
36 changes: 16 additions & 20 deletions python/aibrix/aibrix/gpu_optimizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,48 @@

## Run in kubernetes

1. Make sure Aibrix components are up-to-date.

2. For now, build GPU Optimizer base image using Dockerfile within this folder.
```dockerfile
docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .

# Or use make
make build
1. Make sure Aibrix components are up-to-date. In particular, GPU Optimizer can be updated independently by:
```shell
cd ../../../../ && make docker-build-runtime
kubectl create -k config/gpu-optimizer
```

3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose pod interface first:
2. Deploy your vLLM model. If running locally, a CPU-based vLLM simulator is provided. See development/app for details.

3. [Optional] Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose pod interface first:
```shell
# Make sure the pod is accessible locally:
kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
kubectl port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
```

If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.


1. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
4. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
```shell
# Make sure Redis is accessible locally:
kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
# Or use make
make debug-init

python optimizer/profiling/gen-profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
python optimizer/profiling/gen_profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
# Or use make
make DP=simulator-llama2-7b-a100 gen-profile
```
Replace simulator-llama2-7b-a100 with your deployment name.

5. Deploy GPU Optimizer
5. Notify the GPU optimizer that the profiles are ready
```shell
kubectl apply -f deployment.yaml
kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &

# Or use make
make deploy
curl http://localhost:8080/update_profile/llama2-7b
```
Replace llama2-7b with your model name.

4. Deploy your vLLM model. If run locally a CPU based vLLM simulator is provided. See docs/development/simulator for details

5. Start workload and see how model scale. Benchmark toolkit can be used to generate workload as:
```shell
# Make sure gateway's local access, see docs/development/simulator/README.md for details.
python optimizer/profiling/gpu-benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
python optimizer/profiling/gpu_benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
```

6. Observability: visit http://localhost:8080/dash/llama2-7b for workload pattern visualization. An independent visualization demo can be accessed by:
Expand Down
26 changes: 20 additions & 6 deletions python/aibrix/aibrix/gpu_optimizer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from aibrix.gpu_optimizer.load_monitor.visualizer import mount_to as mount_visulizer
from aibrix.gpu_optimizer.utils import ExcludePathsFilter

NAMESPACE = os.getenv("NAMESPACE", "aibrix-system")
MODEL_LABEL = "model.aibrix.ai/name"
MIN_REPLICAS_LABEL = "model.aibrix.ai/min_replicas"
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
Expand Down Expand Up @@ -186,6 +185,22 @@ async def stop_deployment_optimization(request):
)


@app.route("/update_profile/{model_name}")
async def update_profile(request):
    """Reload the workload profiles for an already-monitored model.

    Path params:
        model_name: name of the model whose profiles should be reloaded
            (the README triggers this after gen_profile.py writes to Redis).

    Returns:
        JSONResponse: 200 with a confirmation message on success,
        404 if no monitor exists for ``model_name``,
        500 if the monitor failed to (re)load the profiles.
    """
    model_name = request.path_params["model_name"]
    # model_monitors is the module-level registry of active ModelMonitors.
    monitor = model_monitors.get(model_name, None)
    if monitor is None:
        return JSONResponse({"error": f"{model_name} not monitored"}, status_code=404)

    if monitor.load_profiles():
        return JSONResponse({"message": f"workload profile of {model_name} updated"})
    else:
        return JSONResponse(
            {"error": f"failed to update workload profile of {model_name}"},
            status_code=500,
        )


@app.route("/scale/{namespace}/{deployment_name}/{replicas}", methods=["POST"])
async def scale_deployment(request):
namespace = request.path_params["namespace"]
Expand Down Expand Up @@ -249,9 +264,9 @@ def main(signal, timeout):
apps_v1 = client.AppsV1Api()

# List existing deployments
logger.info(f"Looking for deployments in {NAMESPACE} with {MODEL_LABEL}")
deployments = apps_v1.list_namespaced_deployment(
namespace=NAMESPACE, label_selector=MODEL_LABEL
logger.info(f"Looking for deployments with {MODEL_LABEL}")
deployments = apps_v1.list_deployment_for_all_namespaces(
label_selector=MODEL_LABEL
)
watch_version = deployments.metadata.resource_version
logger.debug(f"last watch version: {watch_version}")
Expand Down Expand Up @@ -284,8 +299,7 @@ def main(signal, timeout):
w = watch.Watch()
signal["watch"] = w
for event in w.stream(
apps_v1.list_namespaced_deployment,
namespace=NAMESPACE,
apps_v1.list_deployment_for_all_namespaces,
label_selector=MODEL_LABEL,
resource_version=watch_version,
timeout_seconds=timeout,
Expand Down
76 changes: 0 additions & 76 deletions python/aibrix/aibrix/gpu_optimizer/deployment.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ def validate(self) -> bool:
if len(self.clusterers) < self.buffer_size:
self.clusterers.append(self.clusterers[current].clone())
self.frontier = len(self.clusterers) - 1
logger.debug("test")
logger.debug(
"moving buffer created: %s, buffers: %s",
self._reason,
Expand Down
Loading

0 comments on commit dd2aa26

Please sign in to comment.