From d2be10abc0d4c485a05256dd4966e31793a5f9f3 Mon Sep 17 00:00:00 2001
From: Jingyuan Zhang
Date: Thu, 5 Dec 2024 15:37:45 -0800
Subject: [PATCH] Fix k8s accessibility regarding namespaces. GPU optimizer
 now monitors all namespaces with the model label.

---
 development/app/README.md                        |  2 +-
 .../patch_podautoscaler_a40.yaml                 |  2 +-
 .../simulator/patch_podautoscaler_a100.yaml      |  2 +-
 .../podautoscaler/podautoscaler.yaml             |  2 +-
 python/aibrix/aibrix/gpu_optimizer/Makefile      | 16 +++++----
 python/aibrix/aibrix/gpu_optimizer/app.py        | 26 ++++++++++----
 .../aibrix/gpu_optimizer/deployment.yaml         | 36 ++++++++-----------
 .../gpu_optimizer/load_monitor/clusterer.py      |  1 -
 .../gpu_optimizer/load_monitor/monitor.py        | 16 +++++++--
 9 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/development/app/README.md b/development/app/README.md
index f119c518..140b6c11 100644
--- a/development/app/README.md
+++ b/development/app/README.md
@@ -41,7 +41,7 @@ Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for hig
 1. Builder simulator base model image
 
 ```dockerfile
-docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile .
+docker build -t aibrix/vllm-simulator:nightly --build-arg SIMULATION=a100 -f Dockerfile .
 ```
 
 1.b (Optional) Load container image to docker context
diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
index 93b2d37d..339c87a2 100644
--- a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
+++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a40
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
       path: /metrics/default/simulator-llama2-7b-a40
       metric: "vllm:deployment_replicas"
       targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml
index c04a2c19..09836449 100644
--- a/development/app/config/simulator/patch_podautoscaler_a100.yaml
+++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a100
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
       path: /metrics/default/simulator-llama2-7b-a100
       metric: "vllm:deployment_replicas"
       targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml
index 945fb13c..a75a60c5 100644
--- a/development/app/config/templates/podautoscaler/podautoscaler.yaml
+++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml
@@ -16,7 +16,7 @@ spec:
   maxReplicas: 10
   targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b
      metric: "vllm:deployment_replicas"
      targetValue: "1"
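The three autoscaler patches above only repoint metricsSources at the renamed service. A minimal smoke test of that endpoint, assuming the port-forward set up by the Makefile below (localhost:8080) and the default-namespace a100 simulator; the path segments `/metrics/<namespace>/<deployment>` and the metric name come from the patched files themselves:

```python
# Hypothetical smoke test for the renamed metrics endpoint (assumes
# "make deploy" has port-forwarded aibrix-gpu-optimizer to localhost:8080).
import urllib.request

url = "http://localhost:8080/metrics/default/simulator-llama2-7b-a100"
with urllib.request.urlopen(url, timeout=5) as resp:
    body = resp.read().decode()

# The PodAutoscaler reads the "vllm:deployment_replicas" metric from this output.
for line in body.splitlines():
    if line.startswith("vllm:deployment_replicas"):
        print(line)
```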
diff --git a/python/aibrix/aibrix/gpu_optimizer/Makefile b/python/aibrix/aibrix/gpu_optimizer/Makefile
index e4a8ac3a..a7a1842c 100644
--- a/python/aibrix/aibrix/gpu_optimizer/Makefile
+++ b/python/aibrix/aibrix/gpu_optimizer/Makefile
@@ -7,13 +7,13 @@ DATASET ?= [set your DATASET path]
 deploy:
 	kubectl apply -f deployment.yaml
 	sleep 2
-	kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
+	kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
 
 .PHONY: clean
 clean:
 	kubectl delete -f deployment.yaml
 	sleep 1
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
+	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b-a100
 
 .PHONY: benchmark
 benchmark:
@@ -33,27 +33,31 @@ debug:
 
 .PHONY: debug-init-simulator
 debug-init-simulator:
-	curl http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-scale-simulator
 debug-scale-simulator:
-	curl http://localhost:8080/scale/aibrix-system/simulator-llama2-7b/2 \
+	curl http://localhost:8080/scale/default/simulator-llama2-7b-a100/2 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-stop-simulator
 debug-stop-simulator:
-	curl -X DELETE http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl -X DELETE http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key"
 
+.PHONY: debug-update-profile
+debug-update-profile:
+	curl http://localhost:8080/update_profile/llama2-7b
+
 .PHONY: debug-metrics
 debug-metrics:
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
+	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b-a100
 
 .PHONY: debug-workload
 debug-workload:
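The new `debug-update-profile` target is the Makefile-side entry point for the `/update_profile` route added in app.py below. A rough Python equivalent of the debug targets, assuming the same port-forward is active and using the same placeholder `any_key` bearer token the Makefile uses:

```python
# Rough Python equivalent of the Makefile debug targets (assumptions: the
# port-forward from "make deploy" is running; "any_key" is the placeholder
# token carried over from the Makefile, not a real credential).
import json
import urllib.request

BASE = "http://localhost:8080"
HEADERS = {"Content-Type": "application/json", "Authorization": "Bearer any_key"}


def post(path: str, payload: dict) -> str:
    # urllib issues a POST whenever a request body is supplied.
    req = urllib.request.Request(
        f"{BASE}{path}", data=json.dumps(payload).encode(), headers=HEADERS
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        return resp.read().decode()


# debug-init-simulator: start monitoring the a100 simulator in "default".
print(post("/monitor/default/simulator-llama2-7b-a100", {}))

# debug-update-profile: re-read the model's workload profiles (plain GET).
with urllib.request.urlopen(f"{BASE}/update_profile/llama2-7b", timeout=5) as resp:
    print(resp.read().decode())
```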
diff --git a/python/aibrix/aibrix/gpu_optimizer/app.py b/python/aibrix/aibrix/gpu_optimizer/app.py
index 4368a2ac..6bb344dd 100644
--- a/python/aibrix/aibrix/gpu_optimizer/app.py
+++ b/python/aibrix/aibrix/gpu_optimizer/app.py
@@ -27,7 +27,6 @@
 from aibrix.gpu_optimizer.load_monitor.visualizer import mount_to as mount_visulizer
 from aibrix.gpu_optimizer.utils import ExcludePathsFilter
 
-NAMESPACE = os.getenv("NAMESPACE", "aibrix-system")
 MODEL_LABEL = "model.aibrix.ai/name"
 MIN_REPLICAS_LABEL = "model.aibrix.ai/min_replicas"
 REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
@@ -186,6 +185,22 @@ async def stop_deployment_optimization(request):
     )
 
 
+@app.route("/update_profile/{model_name}")
+async def update_profile(request):
+    model_name = request.path_params["model_name"]
+    monitor = model_monitors.get(model_name, None)
+    if monitor is None:
+        return JSONResponse({"error": f"{model_name} not monitored"}, status_code=404)
+
+    if monitor.load_profiles():
+        return JSONResponse({"message": f"workload profile of {model_name} updated"})
+    else:
+        return JSONResponse(
+            {"error": f"failed to update workload profile of {model_name}"},
+            status_code=500,
+        )
+
+
 @app.route("/scale/{namespace}/{deployment_name}/{replicas}", methods=["POST"])
 async def scale_deployment(request):
     namespace = request.path_params["namespace"]
@@ -249,10 +264,8 @@ def main(signal, timeout):
     apps_v1 = client.AppsV1Api()
 
     # List existing deployments
-    logger.info(f"Looking for deployments in {NAMESPACE} with {MODEL_LABEL}")
-    deployments = apps_v1.list_namespaced_deployment(
-        namespace=NAMESPACE, label_selector=MODEL_LABEL
-    )
+    logger.info(f"Looking for deployments with {MODEL_LABEL}")
+    deployments = apps_v1.list_deployment_for_all_namespaces(label_selector=MODEL_LABEL)
     watch_version = deployments.metadata.resource_version
     logger.debug(f"last watch version: {watch_version}")
     for deployment in deployments.items:
@@ -284,8 +297,7 @@ def main(signal, timeout):
     w = watch.Watch()
     signal["watch"] = w
     for event in w.stream(
-        apps_v1.list_namespaced_deployment,
-        namespace=NAMESPACE,
+        apps_v1.list_deployment_for_all_namespaces,
         label_selector=MODEL_LABEL,
         resource_version=watch_version,
         timeout_seconds=timeout,
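The app.py change is the core of the patch: `list_namespaced_deployment` plus a `NAMESPACE` env var is replaced with `list_deployment_for_all_namespaces`, so a single list and watch cover every namespace carrying the model label. A standalone sketch of that pattern with the kubernetes Python client, assuming in-cluster credentials backed by the ClusterRole in deployment.yaml below:

```python
# Standalone sketch of the all-namespaces list/watch pattern the patch adopts
# (assumes the kubernetes Python client and in-cluster credentials; use
# config.load_kube_config() instead when running outside the cluster).
from kubernetes import client, config, watch

MODEL_LABEL = "model.aibrix.ai/name"

config.load_incluster_config()
apps_v1 = client.AppsV1Api()

# One call now covers every namespace; no NAMESPACE env var is needed.
deployments = apps_v1.list_deployment_for_all_namespaces(label_selector=MODEL_LABEL)
watch_version = deployments.metadata.resource_version
for dep in deployments.items:
    print(dep.metadata.namespace, dep.metadata.name)

# The watch reuses the same cluster-wide list function.
w = watch.Watch()
for event in w.stream(
    apps_v1.list_deployment_for_all_namespaces,
    label_selector=MODEL_LABEL,
    resource_version=watch_version,
    timeout_seconds=60,
):
    obj = event["object"]
    print(event["type"], obj.metadata.namespace, obj.metadata.name)
```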
diff --git a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml b/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
index 1e122b93..a2b949cc 100644
--- a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
+++ b/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
@@ -1,61 +1,55 @@
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: pod-autoscaler
+  name: aibrix-gpu-optimizer-sa
   namespace: aibrix-system
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
+kind: ClusterRole
 metadata:
-  namespace: aibrix-system
-  name: deployment-reader
+  name: gpu-optimizer-clusterrole
 rules:
 - apiGroups: ["apps"]
   resources: ["deployments"]
   verbs: ["get", "list", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
+kind: ClusterRoleBinding
 metadata:
-  name: deployment-reader-binding
-  namespace: aibrix-system
+  name: aibrix-gpu-optimizer-clusterrole-binding
 subjects:
 - kind: ServiceAccount
-  name: pod-autoscaler
+  name: aibrix-gpu-optimizer-sa
   namespace: aibrix-system
 roleRef:
-  kind: Role
-  name: deployment-reader
+  kind: ClusterRole
+  name: gpu-optimizer-clusterrole
   apiGroup: rbac.authorization.k8s.io
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: gpu-optimizer
+  name: aibrix-gpu-optimizer
   namespace: aibrix-system
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: gpu-optimizer
+      app: aibrix-gpu-optimizer
   template:
     metadata:
       labels:
-        app: gpu-optimizer
+        app: aibrix-gpu-optimizer
     spec:
-      serviceAccountName: pod-autoscaler
+      serviceAccountName: aibrix-gpu-optimizer-sa
       automountServiceAccountToken: true # Important!
       containers:
-        - name: gpu-optimizer
+        - name: aibrix-gpu-optimizer
           image: aibrix/runtime:nightly
           command: ["python", "-m", "aibrix.gpu_optimizer.app", "--debug"]
           ports:
             - containerPort: 8080
           env:
-            - name: NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
             - name: REDIS_HOST
               value: aibrix-redis-master.aibrix-system.svc.cluster.local
 ---
@@ -63,11 +57,11 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: gpu-optimizer
+  name: aibrix-gpu-optimizer
   namespace: aibrix-system
 spec:
   selector:
-    app: gpu-optimizer
+    app: aibrix-gpu-optimizer
   ports:
     - protocol: TCP
       port: 8080
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
index 5fc887d9..409f23d8 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
@@ -127,7 +127,6 @@ def validate(self) -> bool:
         if len(self.clusterers) < self.buffer_size:
             self.clusterers.append(self.clusterers[current].clone())
             self.frontier = len(self.clusterers) - 1
-            logger.debug("test")
             logger.debug(
                 "moving buffer created: %s, buffers: %s",
                 self._reason,
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
index 091fc152..dbc295ee 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
@@ -137,7 +137,12 @@ def add_deployment(
         profile = self._match_profile(key, deployment_name)
         if profile is not None:
             # No lock required here since the deployment has not been added to deployments.
-            self._optimizer.set_profile(profile)
+            try:
+                self._optimizer.set_profile(profile)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to set GPU profile for {key}. Optimizer will skip the GPU: {e}"
+                )
         else:
             logger.warning(
                 f"No GPU profile found for {key}. Optimizer will skip the GPU."
@@ -197,12 +202,13 @@ def clear_outdated_deployments(self) -> int:
                 del self.deployments[key]
         return len(self.deployments)
 
-    def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
+    def load_profiles(self, profile_reader: Optional[ProfileReader] = None) -> bool:
         """Load profiles from a file"""
         try:
             if profile_reader is None:
                 if self._profile_reader is None:
-                    return
+                    logger.error("Profile reader not initialized")
+                    return False
                 profile_reader = self._profile_reader
             else:
                 self._profile_reader = profile_reader
@@ -211,9 +217,13 @@ def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
             for profile in profiles:
                 if self._update_profile(profile):
                     logger.debug(f"Profile of {profile.gpu} updated.")
+
+            return True
         except Exception as e:
             logger.error(f"Failed to load profiles: {e}")
 
+        return False
+
     def _update_profile(self, profile: GPUProfile) -> bool:
         """Update a profile, will update the formal alias copy, too."""
         key = profile.gpu
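The RBAC switch in deployment.yaml above is what makes the app.py change work: a namespaced Role cannot grant list/watch across namespaces, so the patch moves to a ClusterRole plus ClusterRoleBinding. A sketch for verifying the binding after applying deployment.yaml, assuming kubectl access to the target cluster; it impersonates the optimizer's new service account via `kubectl auth can-i`:

```python
# Post-deploy RBAC check (assumes kubectl is configured for the target
# cluster; "kubectl auth can-i --as" impersonates the optimizer's new
# service account to confirm the ClusterRoleBinding took effect).
import subprocess

SA = "system:serviceaccount:aibrix-system:aibrix-gpu-optimizer-sa"

for verb in ("get", "list", "watch"):
    result = subprocess.run(
        ["kubectl", "auth", "can-i", verb, "deployments",
         "--all-namespaces", f"--as={SA}"],
        capture_output=True,
        text=True,
    )
    # Expect "yes" for all three verbs now that a ClusterRole is bound.
    print(f"{verb}: {result.stdout.strip()}")
```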