From d2be10abc0d4c485a05256dd4966e31793a5f9f3 Mon Sep 17 00:00:00 2001
From: Jingyuan Zhang
Date: Thu, 5 Dec 2024 15:37:45 -0800
Subject: [PATCH] Fix k8s accessibility regarding namespaces. GPU optimizer
 now monitors all namespaces with the model label.

---
 development/app/README.md                        |  2 +-
 .../patch_podautoscaler_a40.yaml                 |  2 +-
 .../simulator/patch_podautoscaler_a100.yaml      |  2 +-
 .../podautoscaler/podautoscaler.yaml             |  2 +-
 python/aibrix/aibrix/gpu_optimizer/Makefile      | 16 +++++----
 python/aibrix/aibrix/gpu_optimizer/app.py        | 26 ++++++++++----
 .../aibrix/gpu_optimizer/deployment.yaml         | 36 ++++++++-----------
 .../gpu_optimizer/load_monitor/clusterer.py      |  1 -
 .../gpu_optimizer/load_monitor/monitor.py        | 16 +++++++--
 9 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/development/app/README.md b/development/app/README.md
index f119c518..140b6c11 100644
--- a/development/app/README.md
+++ b/development/app/README.md
@@ -41,7 +41,7 @@ Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for hig
 1. Builder simulator base model image
 
 ```dockerfile
-docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile .
+docker build -t aibrix/vllm-simulator:nightly --build-arg SIMULATION=a100 -f Dockerfile .
 ```
 
 1.b (Optional) Load container image to docker context
diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
index 93b2d37d..339c87a2 100644
--- a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
+++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a40
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
       path: /metrics/default/simulator-llama2-7b-a40
       metric: "vllm:deployment_replicas"
       targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml
index c04a2c19..09836449 100644
--- a/development/app/config/simulator/patch_podautoscaler_a100.yaml
+++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a100
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
       path: /metrics/default/simulator-llama2-7b-a100
       metric: "vllm:deployment_replicas"
       targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml
index 945fb13c..a75a60c5 100644
--- a/development/app/config/templates/podautoscaler/podautoscaler.yaml
+++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml
@@ -16,7 +16,7 @@ spec:
   maxReplicas: 10
   targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b
      metric: "vllm:deployment_replicas"
      targetValue: "1"
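The three autoscaler patches above only repoint metricsSources at the renamed service. A minimal smoke test of that endpoint, assuming the port-forward set up by the Makefile below (localhost:8080) and the default-namespace a100 simulator; the path segments `/metrics/<namespace>/<deployment>` and the metric name come from the patched files themselves:

```python
# Hypothetical smoke test for the renamed metrics endpoint (assumes
# "make deploy" has port-forwarded aibrix-gpu-optimizer to localhost:8080).
import urllib.request

url = "http://localhost:8080/metrics/default/simulator-llama2-7b-a100"
with urllib.request.urlopen(url, timeout=5) as resp:
    body = resp.read().decode()

# The PodAutoscaler reads the "vllm:deployment_replicas" metric from this output.
for line in body.splitlines():
    if line.startswith("vllm:deployment_replicas"):
        print(line)
```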
diff --git a/python/aibrix/aibrix/gpu_optimizer/Makefile b/python/aibrix/aibrix/gpu_optimizer/Makefile
index e4a8ac3a..a7a1842c 100644
--- a/python/aibrix/aibrix/gpu_optimizer/Makefile
+++ b/python/aibrix/aibrix/gpu_optimizer/Makefile
@@ -7,13 +7,13 @@ DATASET ?= [set your DATASET path]
 deploy:
 	kubectl apply -f deployment.yaml
 	sleep 2
-	kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
+	kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
 
 .PHONY: clean
 clean:
 	kubectl delete -f deployment.yaml
 	sleep 1
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
+	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b-a100
 
 .PHONY: benchmark
 benchmark:
@@ -33,27 +33,31 @@ debug:
 
 .PHONY: debug-init-simulator
 debug-init-simulator:
-	curl http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-scale-simulator
 debug-scale-simulator:
-	curl http://localhost:8080/scale/aibrix-system/simulator-llama2-7b/2 \
+	curl http://localhost:8080/scale/default/simulator-llama2-7b-a100/2 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-stop-simulator
 debug-stop-simulator:
-	curl -X DELETE http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl -X DELETE http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key"
 
+.PHONY: debug-update-profile
+debug-update-profile:
+	curl http://localhost:8080/update_profile/llama2-7b
+
 .PHONY: debug-metrics
 debug-metrics:
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
+	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b-a100
 
 .PHONY: debug-workload
 debug-workload:
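The new `debug-update-profile` target is the Makefile-side entry point for the `/update_profile` route added in app.py below. A rough Python equivalent of the debug targets, assuming the same port-forward is active and using the same placeholder `any_key` bearer token the Makefile uses:

```python
# Rough Python equivalent of the Makefile debug targets (assumptions: the
# port-forward from "make deploy" is running; "any_key" is the placeholder
# token carried over from the Makefile, not a real credential).
import json
import urllib.request

BASE = "http://localhost:8080"
HEADERS = {"Content-Type": "application/json", "Authorization": "Bearer any_key"}


def post(path: str, payload: dict) -> str:
    # urllib issues a POST whenever a request body is supplied.
    req = urllib.request.Request(
        f"{BASE}{path}", data=json.dumps(payload).encode(), headers=HEADERS
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        return resp.read().decode()


# debug-init-simulator: start monitoring the a100 simulator in "default".
print(post("/monitor/default/simulator-llama2-7b-a100", {}))

# debug-update-profile: re-read the model's workload profiles (plain GET).
with urllib.request.urlopen(f"{BASE}/update_profile/llama2-7b", timeout=5) as resp:
    print(resp.read().decode())
```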
diff --git a/python/aibrix/aibrix/gpu_optimizer/app.py b/python/aibrix/aibrix/gpu_optimizer/app.py
index 4368a2ac..6bb344dd 100644
--- a/python/aibrix/aibrix/gpu_optimizer/app.py
+++ b/python/aibrix/aibrix/gpu_optimizer/app.py
@@ -27,7 +27,6 @@
 from aibrix.gpu_optimizer.load_monitor.visualizer import mount_to as mount_visulizer
 from aibrix.gpu_optimizer.utils import ExcludePathsFilter
 
-NAMESPACE = os.getenv("NAMESPACE", "aibrix-system")
 MODEL_LABEL = "model.aibrix.ai/name"
 MIN_REPLICAS_LABEL = "model.aibrix.ai/min_replicas"
 REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
@@ -186,6 +185,22 @@ async def stop_deployment_optimization(request):
     )
 
 
+@app.route("/update_profile/{model_name}")
+async def update_profile(request):
+    model_name = request.path_params["model_name"]
+    monitor = model_monitors.get(model_name, None)
+    if monitor is None:
+        return JSONResponse({"error": f"{model_name} not monitored"}, status_code=404)
+
+    if monitor.load_profiles():
+        return JSONResponse({"message": f"workload profile of {model_name} updated"})
+    else:
+        return JSONResponse(
+            {"error": f"failed to update workload profile of {model_name}"},
+            status_code=500,
+        )
+
+
 @app.route("/scale/{namespace}/{deployment_name}/{replicas}", methods=["POST"])
 async def scale_deployment(request):
     namespace = request.path_params["namespace"]
@@ -249,10 +264,8 @@ def main(signal, timeout):
     apps_v1 = client.AppsV1Api()
 
     # List existing deployments
-    logger.info(f"Looking for deployments in {NAMESPACE} with {MODEL_LABEL}")
-    deployments = apps_v1.list_namespaced_deployment(
-        namespace=NAMESPACE, label_selector=MODEL_LABEL
-    )
+    logger.info(f"Looking for deployments with {MODEL_LABEL}")
+    deployments = apps_v1.list_deployment_for_all_namespaces(label_selector=MODEL_LABEL)
     watch_version = deployments.metadata.resource_version
     logger.debug(f"last watch version: {watch_version}")
     for deployment in deployments.items:
@@ -284,8 +297,7 @@ def main(signal, timeout):
     w = watch.Watch()
     signal["watch"] = w
     for event in w.stream(
-        apps_v1.list_namespaced_deployment,
-        namespace=NAMESPACE,
+        apps_v1.list_deployment_for_all_namespaces,
         label_selector=MODEL_LABEL,
         resource_version=watch_version,
         timeout_seconds=timeout,
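The app.py change is the core of the patch: `list_namespaced_deployment` plus a `NAMESPACE` env var is replaced with `list_deployment_for_all_namespaces`, so a single list and watch cover every namespace carrying the model label. A standalone sketch of that pattern with the kubernetes Python client, assuming in-cluster credentials backed by the ClusterRole in deployment.yaml below:

```python
# Standalone sketch of the all-namespaces list/watch pattern the patch adopts
# (assumes the kubernetes Python client and in-cluster credentials; use
# config.load_kube_config() instead when running outside the cluster).
from kubernetes import client, config, watch

MODEL_LABEL = "model.aibrix.ai/name"

config.load_incluster_config()
apps_v1 = client.AppsV1Api()

# One call now covers every namespace; no NAMESPACE env var is needed.
deployments = apps_v1.list_deployment_for_all_namespaces(label_selector=MODEL_LABEL)
watch_version = deployments.metadata.resource_version
for dep in deployments.items:
    print(dep.metadata.namespace, dep.metadata.name)

# The watch reuses the same cluster-wide list function.
w = watch.Watch()
for event in w.stream(
    apps_v1.list_deployment_for_all_namespaces,
    label_selector=MODEL_LABEL,
    resource_version=watch_version,
    timeout_seconds=60,
):
    obj = event["object"]
    print(event["type"], obj.metadata.namespace, obj.metadata.name)
```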
diff --git a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml b/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
index 1e122b93..a2b949cc 100644
--- a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
+++ b/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
@@ -1,61 +1,55 @@
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: pod-autoscaler
+  name: aibrix-gpu-optimizer-sa
   namespace: aibrix-system
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
+kind: ClusterRole
 metadata:
-  namespace: aibrix-system
-  name: deployment-reader
+  name: gpu-optimizer-clusterrole
 rules:
 - apiGroups: ["apps"]
   resources: ["deployments"]
   verbs: ["get", "list", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
+kind: ClusterRoleBinding
 metadata:
-  name: deployment-reader-binding
-  namespace: aibrix-system
+  name: aibrix-gpu-optimizer-clusterrole-binding
 subjects:
 - kind: ServiceAccount
-  name: pod-autoscaler
+  name: aibrix-gpu-optimizer-sa
   namespace: aibrix-system
 roleRef:
-  kind: Role
-  name: deployment-reader
+  kind: ClusterRole
+  name: gpu-optimizer-clusterrole
   apiGroup: rbac.authorization.k8s.io
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: gpu-optimizer
+  name: aibrix-gpu-optimizer
   namespace: aibrix-system
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: gpu-optimizer
+      app: aibrix-gpu-optimizer
   template:
     metadata:
       labels:
-        app: gpu-optimizer
+        app: aibrix-gpu-optimizer
     spec:
-      serviceAccountName: pod-autoscaler
+      serviceAccountName: aibrix-gpu-optimizer-sa
       automountServiceAccountToken: true # Important!
       containers:
-        - name: gpu-optimizer
+        - name: aibrix-gpu-optimizer
           image: aibrix/runtime:nightly
           command: ["python", "-m", "aibrix.gpu_optimizer.app", "--debug"]
           ports:
             - containerPort: 8080
           env:
-            - name: NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
             - name: REDIS_HOST
               value: aibrix-redis-master.aibrix-system.svc.cluster.local
 ---
@@ -63,11 +57,11 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: gpu-optimizer
+  name: aibrix-gpu-optimizer
   namespace: aibrix-system
 spec:
   selector:
-    app: gpu-optimizer
+    app: aibrix-gpu-optimizer
   ports:
     - protocol: TCP
       port: 8080
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
index 5fc887d9..409f23d8 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
@@ -127,7 +127,6 @@ def validate(self) -> bool:
         if len(self.clusterers) < self.buffer_size:
             self.clusterers.append(self.clusterers[current].clone())
             self.frontier = len(self.clusterers) - 1
-            logger.debug("test")
             logger.debug(
                 "moving buffer created: %s, buffers: %s",
                 self._reason,
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
index 091fc152..dbc295ee 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
@@ -137,7 +137,12 @@ def add_deployment(
         profile = self._match_profile(key, deployment_name)
         if profile is not None:
             # No lock required here since the deployment has not been added to deployments.
-            self._optimizer.set_profile(profile)
+            try:
+                self._optimizer.set_profile(profile)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to set GPU profile for {key}. Optimizer will skip the GPU: {e}"
+                )
         else:
             logger.warning(
                 f"No GPU profile found for {key}. Optimizer will skip the GPU."
@@ -197,12 +202,13 @@ def clear_outdated_deployments(self) -> int:
                 del self.deployments[key]
         return len(self.deployments)
 
-    def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
+    def load_profiles(self, profile_reader: Optional[ProfileReader] = None) -> bool:
         """Load profiles from a file"""
         try:
             if profile_reader is None:
                 if self._profile_reader is None:
-                    return
+                    logger.error("Profile reader not initialized")
+                    return False
                 profile_reader = self._profile_reader
             else:
                 self._profile_reader = profile_reader
@@ -211,9 +217,13 @@ def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
             for profile in profiles:
                 if self._update_profile(profile):
                     logger.debug(f"Profile of {profile.gpu} updated.")
+
+            return True
         except Exception as e:
             logger.error(f"Failed to load profiles: {e}")
 
+        return False
+
     def _update_profile(self, profile: GPUProfile) -> bool:
         """Update a profile, will update the formal alias copy, too."""
         key = profile.gpu
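The RBAC switch in deployment.yaml above is what makes the app.py change work: a namespaced Role cannot grant list/watch across namespaces, so the patch moves to a ClusterRole plus ClusterRoleBinding. A sketch for verifying the binding after applying deployment.yaml, assuming kubectl access to the target cluster; it impersonates the optimizer's new service account via `kubectl auth can-i`:

```python
# Post-deploy RBAC check (assumes kubectl is configured for the target
# cluster; "kubectl auth can-i --as" impersonates the optimizer's new
# service account to confirm the ClusterRoleBinding took effect).
import subprocess

SA = "system:serviceaccount:aibrix-system:aibrix-gpu-optimizer-sa"

for verb in ("get", "list", "watch"):
    result = subprocess.run(
        ["kubectl", "auth", "can-i", verb, "deployments",
         "--all-namespaces", f"--as={SA}"],
        capture_output=True,
        text=True,
    )
    # Expect "yes" for all three verbs now that a ClusterRole is bound.
    print(f"{verb}: {result.stdout.strip()}")
```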