From dd2aa265f1d68dc32382606c857182140a2e6870 Mon Sep 17 00:00:00 2001
From: Jingyuan
Date: Fri, 6 Dec 2024 17:38:00 -0800
Subject: [PATCH] [Feat] Integrate deployment configurations and fix autoscaler/gpu optimizer connectivity (#500)

* Add GPU Optimizer deployment and update configurations

* Fix k8s accessibility regarding namespaces. The GPU optimizer now monitors all namespaces with the model label.

* Lint fix

* Deployment clean-up

* Update README.md

---------

Co-authored-by: Ning Wang
Co-authored-by: Jingyuan Zhang
---
 config/default/kustomization.yaml             |  1 +
 config/gpu-optimizer/deployment.yaml          | 26 +++++++
 config/gpu-optimizer/kustomization.yaml       |  4 +
 config/gpu-optimizer/rbac.yaml                | 27 +++++++
 config/gpu-optimizer/service.yaml             | 13 ++++
 .../default/gpu-optimizer/kustomization.yaml  |  7 ++
 .../overlays/vke/default/kustomization.yaml   |  1 +
 .../patch_podautoscaler_a40.yaml              |  2 +-
 .../simulator/patch_podautoscaler_a100.yaml   |  2 +-
 .../podautoscaler/podautoscaler.yaml          |  2 +-
 python/aibrix/aibrix/gpu_optimizer/Makefile   | 24 ++----
 python/aibrix/aibrix/gpu_optimizer/README.md  | 36 ++++-----
 python/aibrix/aibrix/gpu_optimizer/app.py     | 26 +++++--
 .../aibrix/gpu_optimizer/deployment.yaml      | 76 -------------------
 .../gpu_optimizer/load_monitor/clusterer.py   |  1 -
 .../gpu_optimizer/load_monitor/monitor.py     | 16 +++-
 16 files changed, 139 insertions(+), 125 deletions(-)
 create mode 100644 config/gpu-optimizer/deployment.yaml
 create mode 100644 config/gpu-optimizer/kustomization.yaml
 create mode 100644 config/gpu-optimizer/rbac.yaml
 create mode 100644 config/gpu-optimizer/service.yaml
 create mode 100644 config/overlays/vke/default/gpu-optimizer/kustomization.yaml
 delete mode 100644 python/aibrix/aibrix/gpu_optimizer/deployment.yaml

diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
index f472003f..2402b1b1 100644
--- a/config/default/kustomization.yaml
+++ b/config/default/kustomization.yaml
@@ -23,6 +23,7 @@ resources:
 - ../rbac
 - ../manager
 - ../gateway
+- ../gpu-optimizer
 - ../dependency/kuberay-operator
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
diff --git a/config/gpu-optimizer/deployment.yaml b/config/gpu-optimizer/deployment.yaml
new file mode 100644
index 00000000..796912e4
--- /dev/null
+++ b/config/gpu-optimizer/deployment.yaml
@@ -0,0 +1,26 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gpu-optimizer
+  namespace: aibrix-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gpu-optimizer
+  template:
+    metadata:
+      labels:
+        app: gpu-optimizer
+    spec:
+      serviceAccountName: gpu-optimizer-sa
+      automountServiceAccountToken: true
+      containers:
+        - name: gpu-optimizer
+          image: aibrix/runtime:nightly
+          command: ["python", "-m", "aibrix.gpu_optimizer.app"]
+          ports:
+            - containerPort: 8080
+          env:
+            - name: REDIS_HOST
+              value: aibrix-redis-master.aibrix-system.svc.cluster.local
\ No newline at end of file
diff --git a/config/gpu-optimizer/kustomization.yaml b/config/gpu-optimizer/kustomization.yaml
new file mode 100644
index 00000000..bb0c7530
--- /dev/null
+++ b/config/gpu-optimizer/kustomization.yaml
@@ -0,0 +1,4 @@
+resources:
+- deployment.yaml
+- service.yaml
+- rbac.yaml
\ No newline at end of file
diff --git a/config/gpu-optimizer/rbac.yaml b/config/gpu-optimizer/rbac.yaml
new file mode 100644
index 00000000..dd56e2e1
--- /dev/null
+++ b/config/gpu-optimizer/rbac.yaml
@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gpu-optimizer-sa
+  namespace: aibrix-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: gpu-optimizer-clusterrole
+rules:
+  - apiGroups: ["apps"]
+    resources: ["deployments"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: gpu-optimizer-clusterrole-binding
+subjects:
+  - kind: ServiceAccount
+    name: gpu-optimizer-sa
+    namespace: aibrix-system
+roleRef:
+  kind: ClusterRole
+  name: gpu-optimizer-clusterrole
+  apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
diff --git a/config/gpu-optimizer/service.yaml b/config/gpu-optimizer/service.yaml
new file mode 100644
index 00000000..6968aeed
--- /dev/null
+++ b/config/gpu-optimizer/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-optimizer
+  namespace: aibrix-system
+spec:
+  selector:
+    app: gpu-optimizer
+  ports:
+    - protocol: TCP
+      port: 8080
+      targetPort: 8080
+  type: ClusterIP
\ No newline at end of file
diff --git a/config/overlays/vke/default/gpu-optimizer/kustomization.yaml b/config/overlays/vke/default/gpu-optimizer/kustomization.yaml
new file mode 100644
index 00000000..73c8e670
--- /dev/null
+++ b/config/overlays/vke/default/gpu-optimizer/kustomization.yaml
@@ -0,0 +1,7 @@
+resources:
+  - ../../../../gpu-optimizer
+
+images:
+- name: aibrix/gpu-optimizer
+  newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/gpu-optimizer
+  newTag: nightly
\ No newline at end of file
diff --git a/config/overlays/vke/default/kustomization.yaml b/config/overlays/vke/default/kustomization.yaml
index 4598d51d..249e59ce 100644
--- a/config/overlays/vke/default/kustomization.yaml
+++ b/config/overlays/vke/default/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 - ../../../rbac
 - manager
 - gateway
+- gpu-optimizer
 - ../../../dependency/kuberay-operator
 
diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
index 93b2d37d..339c87a2 100644
--- a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
+++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a40
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b-a40
      metric: "vllm:deployment_replicas"
      targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml
index c04a2c19..09836449 100644
--- a/development/app/config/simulator/patch_podautoscaler_a100.yaml
+++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a100
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b-a100
      metric: "vllm:deployment_replicas"
      targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml
index 945fb13c..a75a60c5 100644
--- a/development/app/config/templates/podautoscaler/podautoscaler.yaml
+++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml
@@ -16,7 +16,7 @@ spec:
   maxReplicas: 10
   targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b
      metric: "vllm:deployment_replicas"
      targetValue: "1"
diff --git a/python/aibrix/aibrix/gpu_optimizer/Makefile b/python/aibrix/aibrix/gpu_optimizer/Makefile
index e4a8ac3a..cbd761cb 100644
--- a/python/aibrix/aibrix/gpu_optimizer/Makefile
+++ b/python/aibrix/aibrix/gpu_optimizer/Makefile
@@ -3,18 +3,6 @@ all: build
 DP ?= profiling
 DATASET ?= [set your DATASET path]
 
-.PHONY: deploy
-deploy:
-	kubectl apply -f deployment.yaml
-	sleep 2
-	kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
-
-.PHONY: clean
-clean:
-	kubectl delete -f deployment.yaml
-	sleep 1
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
-
 .PHONY: benchmark
 benchmark:
 	optimizer/profiling/benchmark.sh $(DP)
@@ -33,27 +21,31 @@ debug:
 
 .PHONY: debug-init-simulator
 debug-init-simulator:
-	curl http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-scale-simulator
debug-scale-simulator:
-	curl http://localhost:8080/scale/aibrix-system/simulator-llama2-7b/2 \
+	curl http://localhost:8080/scale/default/simulator-llama2-7b-a100/2 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-stop-simulator
 debug-stop-simulator:
-	curl -X DELETE http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl -X DELETE http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key"
 
+.PHONY: debug-update-profile
+debug-update-profile:
+	curl http://localhost:8080/update_profile/llama2-7b
+
 .PHONY: debug-metrics
 debug-metrics:
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
+	curl http://localhost:8080/metrics/default/simulator-llama2-7b-a100
 
 .PHONY: debug-workload
 debug-workload:
diff --git a/python/aibrix/aibrix/gpu_optimizer/README.md b/python/aibrix/aibrix/gpu_optimizer/README.md
index 347a2603..fdcf821d 100644
--- a/python/aibrix/aibrix/gpu_optimizer/README.md
+++ b/python/aibrix/aibrix/gpu_optimizer/README.md
@@ -2,52 +2,48 @@
 
 ## Run in kubernetes
 
-1. Make sure Aibrix components are up-to-date.
-
-2. For now, build GPU Optimizer base image using Dockerfile within this folder.
-```dockerfile
-docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .
-
-# Or use make
-make build
+1. Make sure Aibrix components are up-to-date. In particular, the GPU Optimizer can be updated independently:
+```shell
+cd ../../../../ && make docker-build-runtime
+kubectl create -k config/gpu-optimizer
 ```
 
-3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose pod interface first:
+2. Deploy your vLLM model. If running locally, a CPU-based vLLM simulator is provided. See development/app for details.
+
+3. [Optional] Prepare the performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose the pod interface first:
 ```shell
 # Make sure the pod is accessible locally:
-kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
+kubectl port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
 ```
 If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.
-
-1. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
+4. Generate a profile based on the SLO target using optimizer/profiling/gen_profile.py. If using the CPU-based vLLM simulator, execute
 ```shell
 # Make sure Redis is accessible locally:
 kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
 # Or use make
 make debug-init
-python optimizer/profiling/gen-profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
+python optimizer/profiling/gen_profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
 # Or use make
 make DP=simulator-llama2-7b-a100 gen-profile
 ```
+Replace simulator-llama2-7b-a100 with your deployment name.
 
-5. Deploy GPU Optimizer
-```shell
-kubectl apply -f deployment.yaml
-kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
+5. Notify the GPU optimizer that profiles are ready:
+```shell
+kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
 
-# Or use make
-make deploy
+curl http://localhost:8080/update_profile/llama2-7b
 ```
+Replace llama2-7b with your model name.
 
-4. Deploy your vLLM model. If run locally a CPU based vLLM simulator is provided. See docs/development/simulator for details
 6. Start the workload and see how the model scales. The benchmark toolkit can be used to generate workload as follows:
 ```shell
 # Make sure the gateway is accessible locally; see docs/development/simulator/README.md for details.
-python optimizer/profiling/gpu-benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
+python optimizer/profiling/gpu_benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
 ```
 7. Observability: visit http://localhost:8080/dash/llama2-7b for workload pattern visualization. An independent visualization demo can be accessed by:
diff --git a/python/aibrix/aibrix/gpu_optimizer/app.py b/python/aibrix/aibrix/gpu_optimizer/app.py
index 4368a2ac..405abe4e 100644
--- a/python/aibrix/aibrix/gpu_optimizer/app.py
+++ b/python/aibrix/aibrix/gpu_optimizer/app.py
@@ -27,7 +27,6 @@
 from aibrix.gpu_optimizer.load_monitor.visualizer import mount_to as mount_visulizer
 from aibrix.gpu_optimizer.utils import ExcludePathsFilter
 
-NAMESPACE = os.getenv("NAMESPACE", "aibrix-system")
 MODEL_LABEL = "model.aibrix.ai/name"
 MIN_REPLICAS_LABEL = "model.aibrix.ai/min_replicas"
 REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
@@ -186,6 +185,22 @@ async def stop_deployment_optimization(request):
     )
 
 
+@app.route("/update_profile/{model_name}")
+async def update_profile(request):
+    model_name = request.path_params["model_name"]
+    monitor = model_monitors.get(model_name, None)
+    if monitor is None:
+        return JSONResponse({"error": f"{model_name} not monitored"}, status_code=404)
+
+    if monitor.load_profiles():
+        return JSONResponse({"message": f"workload profile of {model_name} updated"})
+    else:
+        return JSONResponse(
+            {"error": f"failed to update workload profile of {model_name}"},
+            status_code=500,
+        )
+
+
 @app.route("/scale/{namespace}/{deployment_name}/{replicas}", methods=["POST"])
 async def scale_deployment(request):
     namespace = request.path_params["namespace"]
@@ -249,9 +264,9 @@ def main(signal, timeout):
     apps_v1 = client.AppsV1Api()
 
     # List existing deployments
-    logger.info(f"Looking for deployments in {NAMESPACE} with {MODEL_LABEL}")
-    deployments = apps_v1.list_namespaced_deployment(
-        namespace=NAMESPACE, label_selector=MODEL_LABEL
+    logger.info(f"Looking for deployments with {MODEL_LABEL}")
+    deployments = apps_v1.list_deployment_for_all_namespaces(
+        label_selector=MODEL_LABEL
     )
     watch_version = deployments.metadata.resource_version
     logger.debug(f"last watch version: {watch_version}")
@@ -284,8 +299,7 @@ def main(signal, timeout):
     w = watch.Watch()
     signal["watch"] = w
     for event in w.stream(
-        apps_v1.list_namespaced_deployment,
-        namespace=NAMESPACE,
+        apps_v1.list_deployment_for_all_namespaces,
         label_selector=MODEL_LABEL,
         resource_version=watch_version,
         timeout_seconds=timeout,
diff --git a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml b/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
deleted file mode 100644
index 1e122b93..00000000
--- a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: pod-autoscaler
-  namespace: aibrix-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
-  namespace: aibrix-system
-  name: deployment-reader
-rules:
-  - apiGroups: ["apps"]
-    resources: ["deployments"]
-    verbs: ["get", "list", "watch"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
-  name: deployment-reader-binding
-  namespace: aibrix-system
-subjects:
-  - kind: ServiceAccount
-    name: pod-autoscaler
-    namespace: aibrix-system
-roleRef:
-  kind: Role
-  name: deployment-reader
-  apiGroup: rbac.authorization.k8s.io
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: gpu-optimizer
-  namespace: aibrix-system
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: gpu-optimizer
-  template:
-    metadata:
-      labels:
-        app: gpu-optimizer
-    spec:
-      serviceAccountName: pod-autoscaler
-      automountServiceAccountToken: true # Important!
-      containers:
-        - name: gpu-optimizer
-          image: aibrix/runtime:nightly
-          command: ["python", "-m", "aibrix.gpu_optimizer.app", "--debug"]
-          ports:
-            - containerPort: 8080
-          env:
-            - name: NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-            - name: REDIS_HOST
-              value: aibrix-redis-master.aibrix-system.svc.cluster.local
----
-# Debug only: Make sure pod can be visited from controller that deployed in mac.
-apiVersion: v1
-kind: Service
-metadata:
-  name: gpu-optimizer
-  namespace: aibrix-system
-spec:
-  selector:
-    app: gpu-optimizer
-  ports:
-    - protocol: TCP
-      port: 8080
-      targetPort: 8080
-      nodePort: 30008
-  type: NodePort
\ No newline at end of file
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
index 5fc887d9..409f23d8 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
@@ -127,7 +127,6 @@ def validate(self) -> bool:
         if len(self.clusterers) < self.buffer_size:
             self.clusterers.append(self.clusterers[current].clone())
             self.frontier = len(self.clusterers) - 1
-            logger.debug("test")
             logger.debug(
                 "moving buffer created: %s, buffers: %s",
                 self._reason,
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
index 091fc152..dbc295ee 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
@@ -137,7 +137,12 @@ def add_deployment(
         profile = self._match_profile(key, deployment_name)
         if profile is not None:
             # No lock required here since the deployment has not been added to deployments.
-            self._optimizer.set_profile(profile)
+            try:
+                self._optimizer.set_profile(profile)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to set GPU profile for {key}. Optimizer will skip the GPU: {e}"
+                )
         else:
             logger.warning(
                 f"No GPU profile found for {key}. Optimizer will skip the GPU."
@@ -197,12 +202,13 @@ def clear_outdated_deployments(self) -> int:
                 del self.deployments[key]
         return len(self.deployments)
 
-    def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
+    def load_profiles(self, profile_reader: Optional[ProfileReader] = None) -> bool:
         """Load profiles from a file"""
         try:
             if profile_reader is None:
                 if self._profile_reader is None:
-                    return
+                    logger.error("Profile reader not initialized")
+                    return False
                 profile_reader = self._profile_reader
             else:
                 self._profile_reader = profile_reader
@@ -211,9 +217,13 @@ def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
             for profile in profiles:
                 if self._update_profile(profile):
                     logger.debug(f"Profile of {profile.gpu} updated.")
+
+            return True
         except Exception as e:
             logger.error(f"Failed to load profiles: {e}")
 
+        return False
+
     def _update_profile(self, profile: GPUProfile) -> bool:
         """Update a profile, will update the formal alias copy, too."""
         key = profile.gpu
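
Note (not part of the patch): the connectivity fix above replaces the namespaced list/watch (`list_namespaced_deployment` plus a `NAMESPACE` env var) with cluster-wide calls filtered by the `model.aibrix.ai/name` label, authorized by the new ClusterRole/ClusterRoleBinding. A minimal sketch of the resulting list-then-watch pattern, assuming the official `kubernetes` Python client and in-cluster credentials:

```python
from kubernetes import client, config, watch

MODEL_LABEL = "model.aibrix.ai/name"

# Assumes the pod runs with the gpu-optimizer-sa service account mounted;
# use config.load_kube_config() instead when running outside the cluster.
config.load_incluster_config()
apps_v1 = client.AppsV1Api()

# Initial cluster-wide list, filtered by the model label.
deployments = apps_v1.list_deployment_for_all_namespaces(label_selector=MODEL_LABEL)
for dep in deployments.items:
    print("found:", dep.metadata.namespace, dep.metadata.name)

# Resume watching from the listed resourceVersion so no events are missed
# between the initial list and the watch stream.
w = watch.Watch()
for event in w.stream(
    apps_v1.list_deployment_for_all_namespaces,
    label_selector=MODEL_LABEL,
    resource_version=deployments.metadata.resource_version,
    timeout_seconds=600,
):
    dep = event["object"]
    print(event["type"], dep.metadata.namespace, dep.metadata.name)
```

Threading the listed `resource_version` into the watch, as `main` does in app.py, is what guarantees the optimizer sees exactly the deployment events that occur after the snapshot.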
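The new `/update_profile/{model_name}` route answers 404 when the model is not yet monitored and 500 when `load_profiles()` returns False, so callers can branch on the status code. A hypothetical client sketch (the `requests` usage and the port-forwarded address are assumptions, mirroring the README's curl example):

```python
import requests

# Assumes the service has been port-forwarded locally, as in the README:
#   kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080
resp = requests.get("http://localhost:8080/update_profile/llama2-7b")
if resp.status_code == 404:
    # The model is not monitored yet; deploy a labeled model deployment first.
    print("not monitored:", resp.json()["error"])
elif resp.status_code == 500:
    # monitor.load_profiles() returned False; check the profile reader / Redis.
    print("reload failed:", resp.json()["error"])
else:
    print(resp.json()["message"])
```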