Skip to content

Commit

Permalink
[Feat] Integrate deployment configurations and fix autoscaler/gpu opt…
Browse files Browse the repository at this point in the history
…imizer connectivity (#500)

* Add GPU Optimizer deployment and update configurations

* Fix k8s accessibility regarding namespaces. GPU optimizer now monitors all namespaces with the model label.

* Lint fix

* Deployment clean-up

* Update README.md

---------

Co-authored-by: Ning Wang <[email protected]>
Co-authored-by: Jingyuan Zhang <[email protected]>
  • Loading branch information
3 people authored Dec 7, 2024
1 parent b5b7586 commit dd2aa26
Show file tree
Hide file tree
Showing 16 changed files with 139 additions and 125 deletions.
1 change: 1 addition & 0 deletions config/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ resources:
- ../rbac
- ../manager
- ../gateway
- ../gpu-optimizer
- ../dependency/kuberay-operator
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
Expand Down
26 changes: 26 additions & 0 deletions config/gpu-optimizer/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# GPU Optimizer Deployment: runs the optimizer web app (serves HTTP on 8080)
# that watches model Deployments and exposes scaling metrics for autoscalers.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-optimizer
  namespace: aibrix-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-optimizer
  template:
    metadata:
      labels:
        app: gpu-optimizer
    spec:
      # Service account bound to a ClusterRole (see rbac.yaml) so the app can
      # list/watch Deployments across all namespaces.
      serviceAccountName: gpu-optimizer-sa
      automountServiceAccountToken: true
      containers:
        - name: gpu-optimizer
          # NOTE(review): image is `aibrix/runtime`, but the VKE overlay's image
          # transformer targets `aibrix/gpu-optimizer` — confirm the overlay
          # actually rewrites this image (it will not match as written).
          image: aibrix/runtime:nightly
          # The optimizer is a module of the runtime image.
          command: ["python", "-m", "aibrix.gpu_optimizer.app"]
          ports:
            - containerPort: 8080
          env:
            # Redis holds the workload profiles written by gen_profile.py
            # (see README: redis://.../?model=<name>).
            - name: REDIS_HOST
              value: aibrix-redis-master.aibrix-system.svc.cluster.local
4 changes: 4 additions & 0 deletions config/gpu-optimizer/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Kustomize component for the GPU Optimizer: Deployment + Service + RBAC.
# Included from config/default/kustomization.yaml as ../gpu-optimizer.
resources:
  - deployment.yaml
  - service.yaml
  - rbac.yaml
27 changes: 27 additions & 0 deletions config/gpu-optimizer/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# ServiceAccount mounted by the gpu-optimizer Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-optimizer-sa
  namespace: aibrix-system
---
# A ClusterRole (not a namespaced Role) is required because the optimizer
# lists/watches model Deployments in all namespaces (app.py uses
# list_deployment_for_all_namespaces with the model label selector).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-optimizer-clusterrole
rules:
  # Read-only access to Deployments; no write verbs are granted here.
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["get", "list", "watch"]
---
# Binds the read-only ClusterRole above to the optimizer's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-optimizer-clusterrole-binding
subjects:
  - kind: ServiceAccount
    name: gpu-optimizer-sa
    namespace: aibrix-system
roleRef:
  kind: ClusterRole
  name: gpu-optimizer-clusterrole
  apiGroup: rbac.authorization.k8s.io
13 changes: 13 additions & 0 deletions config/gpu-optimizer/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# ClusterIP Service exposing the optimizer's HTTP API and metrics on 8080.
# NOTE(review): autoscaler configs reference
# aibrix-gpu-optimizer.aibrix-system.svc.cluster.local — presumably kustomize
# adds an `aibrix-` namePrefix; confirm the rendered Service name matches.
apiVersion: v1
kind: Service
metadata:
  name: gpu-optimizer
  namespace: aibrix-system
spec:
  selector:
    app: gpu-optimizer
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP
7 changes: 7 additions & 0 deletions config/overlays/vke/default/gpu-optimizer/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# VKE overlay for the GPU Optimizer: swaps the public image for the
# Volcengine-hosted mirror.
resources:
  - ../../../../gpu-optimizer

images:
  # FIX: the base deployment runs image `aibrix/runtime:nightly`
  # (config/gpu-optimizer/deployment.yaml); the transformer previously
  # targeted `aibrix/gpu-optimizer`, which matches nothing in the base, so
  # the registry override was silently a no-op. Target the image actually
  # used by the base so the mirror substitution takes effect.
  - name: aibrix/runtime
    newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime
    newTag: nightly
1 change: 1 addition & 0 deletions config/overlays/vke/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ resources:
- ../../../rbac
- manager
- gateway
- gpu-optimizer
- ../../../dependency/kuberay-operator


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
kind: Deployment
name: simulator-llama2-7b-a40
metricsSources:
- endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
- endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b-a40
metric: "vllm:deployment_replicas"
targetValue: "1"
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
kind: Deployment
name: simulator-llama2-7b-a100
metricsSources:
- endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
- endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b-a100
metric: "vllm:deployment_replicas"
targetValue: "1"
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
maxReplicas: 10
targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured
metricsSources:
- endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
- endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b
metric: "vllm:deployment_replicas"
targetValue: "1"
Expand Down
24 changes: 8 additions & 16 deletions python/aibrix/aibrix/gpu_optimizer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,6 @@ all: build
DP ?= profiling
DATASET ?= [set your DATASET path]

.PHONY: deploy
deploy:
kubectl apply -f deployment.yaml
sleep 2
kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &

.PHONY: clean
clean:
kubectl delete -f deployment.yaml
sleep 1
curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b

.PHONY: benchmark
benchmark:
optimizer/profiling/benchmark.sh $(DP)
Expand All @@ -33,27 +21,31 @@ debug:

.PHONY: debug-init-simulator
debug-init-simulator:
curl http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
curl http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
-H "Content-Type: application/json" \
-H "Authorization: Bearer any_key" \
-d '{}'

.PHONY: debug-scale-simulator
debug-scale-simulator:
curl http://localhost:8080/scale/aibrix-system/simulator-llama2-7b/2 \
curl http://localhost:8080/scale/default/simulator-llama2-7b-a100/2 \
-H "Content-Type: application/json" \
-H "Authorization: Bearer any_key" \
-d '{}'

.PHONY: debug-stop-simulator
debug-stop-simulator:
curl -X DELETE http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
curl -X DELETE http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
-H "Content-Type: application/json" \
-H "Authorization: Bearer any_key"

.PHONY: debug-update-profile
debug-update-profile:
curl http://localhost:8080/update_profile/llama2-7b

.PHONY: debug-metrics
debug-metrics:
curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
curl http://localhost:8080/metrics/default/simulator-llama2-7b-a100

.PHONY: debug-workload
debug-workload:
Expand Down
36 changes: 16 additions & 20 deletions python/aibrix/aibrix/gpu_optimizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,48 @@

## Run in kubernetes

1. Make sure Aibrix components are up-to-date.

2. For now, build GPU Optimizer base image using Dockerfile within this folder.
```dockerfile
docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .

# Or use make
make build
1. Make sure Aibrix components are up-to-date. In particular, GPU Optimizer can be updated independently by:
```shell
cd ../../../../ && make docker-build-runtime
kubectl create -k config/gpu-optimizer
```

3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose pod interface first:
2. Deploy your vLLM model. If running locally, a CPU-based vLLM simulator is provided. See development/app for details.

3. [Optional] Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose pod interface first:
```shell
# Make sure the pod is accessible locally:
kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
kubectl port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
```

If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.


1. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
4. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
```shell
# Make sure Redis is accessible locally:
kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
# Or use make
make debug-init

python optimizer/profiling/gen-profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
python optimizer/profiling/gen_profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
# Or use make
make DP=simulator-llama2-7b-a100 gen-profile
```
Replace simulator-llama2-7b-a100 with your deployment name.

5. Deploy GPU Optimizer
5. Notify the GPU optimizer that the profiles are ready
```shell
kubectl apply -f deployment.yaml
kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &

# Or use make
make deploy
curl http://localhost:8080/update_profile/llama2-7b
```
Replace llama2-7b with your model name.

4. Deploy your vLLM model. If run locally a CPU based vLLM simulator is provided. See docs/development/simulator for details

5. Start workload and see how model scale. Benchmark toolkit can be used to generate workload as:
```shell
# Make sure gateway's local access, see docs/development/simulator/README.md for details.
python optimizer/profiling/gpu-benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
python optimizer/profiling/gpu_benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
```

6. Observability: visit http://localhost:8080/dash/llama2-7b for workload pattern visualization. An independent visualization demo can be accessed by:
Expand Down
26 changes: 20 additions & 6 deletions python/aibrix/aibrix/gpu_optimizer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from aibrix.gpu_optimizer.load_monitor.visualizer import mount_to as mount_visulizer
from aibrix.gpu_optimizer.utils import ExcludePathsFilter

NAMESPACE = os.getenv("NAMESPACE", "aibrix-system")
MODEL_LABEL = "model.aibrix.ai/name"
MIN_REPLICAS_LABEL = "model.aibrix.ai/min_replicas"
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
Expand Down Expand Up @@ -186,6 +185,22 @@ async def stop_deployment_optimization(request):
)


@app.route("/update_profile/{model_name}")
async def update_profile(request):
    """Reload the workload profiles for an already-monitored model.

    Path params:
        model_name: name of the model whose profiles should be reloaded
            (the README triggers this after gen_profile.py writes to Redis).

    Returns:
        JSONResponse: 200 with a confirmation message on success,
        404 if no monitor exists for ``model_name``,
        500 if the monitor failed to (re)load the profiles.
    """
    model_name = request.path_params["model_name"]
    # model_monitors is the module-level registry of active ModelMonitors.
    monitor = model_monitors.get(model_name, None)
    if monitor is None:
        return JSONResponse({"error": f"{model_name} not monitored"}, status_code=404)

    if monitor.load_profiles():
        return JSONResponse({"message": f"workload profile of {model_name} updated"})
    else:
        return JSONResponse(
            {"error": f"failed to update workload profile of {model_name}"},
            status_code=500,
        )


@app.route("/scale/{namespace}/{deployment_name}/{replicas}", methods=["POST"])
async def scale_deployment(request):
namespace = request.path_params["namespace"]
Expand Down Expand Up @@ -249,9 +264,9 @@ def main(signal, timeout):
apps_v1 = client.AppsV1Api()

# List existing deployments
logger.info(f"Looking for deployments in {NAMESPACE} with {MODEL_LABEL}")
deployments = apps_v1.list_namespaced_deployment(
namespace=NAMESPACE, label_selector=MODEL_LABEL
logger.info(f"Looking for deployments with {MODEL_LABEL}")
deployments = apps_v1.list_deployment_for_all_namespaces(
label_selector=MODEL_LABEL
)
watch_version = deployments.metadata.resource_version
logger.debug(f"last watch version: {watch_version}")
Expand Down Expand Up @@ -284,8 +299,7 @@ def main(signal, timeout):
w = watch.Watch()
signal["watch"] = w
for event in w.stream(
apps_v1.list_namespaced_deployment,
namespace=NAMESPACE,
apps_v1.list_deployment_for_all_namespaces,
label_selector=MODEL_LABEL,
resource_version=watch_version,
timeout_seconds=timeout,
Expand Down
76 changes: 0 additions & 76 deletions python/aibrix/aibrix/gpu_optimizer/deployment.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ def validate(self) -> bool:
if len(self.clusterers) < self.buffer_size:
self.clusterers.append(self.clusterers[current].clone())
self.frontier = len(self.clusterers) - 1
logger.debug("test")
logger.debug(
"moving buffer created: %s, buffers: %s",
self._reason,
Expand Down
Loading

0 comments on commit dd2aa26

Please sign in to comment.