From dd2aa265f1d68dc32382606c857182140a2e6870 Mon Sep 17 00:00:00 2001
From: Jingyuan
Date: Fri, 6 Dec 2024 17:38:00 -0800
Subject: [PATCH] [Feat] Integrate deployment configurations and fix autoscaler/gpu optimizer connectivity (#500)

* Add GPU Optimizer deployment and update configurations

* Fix k8s accessibility regarding namespaces. The GPU optimizer now monitors all namespaces with the model label.

* Lint fix

* Deployment clean-up

* Update README.md

---------

Co-authored-by: Ning Wang
Co-authored-by: Jingyuan Zhang
---
 config/default/kustomization.yaml             |  1 +
 config/gpu-optimizer/deployment.yaml          | 26 +++++++
 config/gpu-optimizer/kustomization.yaml       |  4 +
 config/gpu-optimizer/rbac.yaml                | 27 +++++++
 config/gpu-optimizer/service.yaml             | 13 ++++
 .../default/gpu-optimizer/kustomization.yaml  |  7 ++
 .../overlays/vke/default/kustomization.yaml   |  1 +
 .../patch_podautoscaler_a40.yaml              |  2 +-
 .../simulator/patch_podautoscaler_a100.yaml   |  2 +-
 .../podautoscaler/podautoscaler.yaml          |  2 +-
 python/aibrix/aibrix/gpu_optimizer/Makefile   | 24 ++----
 python/aibrix/aibrix/gpu_optimizer/README.md  | 36 ++++-----
 python/aibrix/aibrix/gpu_optimizer/app.py     | 26 +++++--
 .../aibrix/gpu_optimizer/deployment.yaml      | 76 -------------------
 .../gpu_optimizer/load_monitor/clusterer.py   |  1 -
 .../gpu_optimizer/load_monitor/monitor.py     | 16 +++-
 16 files changed, 139 insertions(+), 125 deletions(-)
 create mode 100644 config/gpu-optimizer/deployment.yaml
 create mode 100644 config/gpu-optimizer/kustomization.yaml
 create mode 100644 config/gpu-optimizer/rbac.yaml
 create mode 100644 config/gpu-optimizer/service.yaml
 create mode 100644 config/overlays/vke/default/gpu-optimizer/kustomization.yaml
 delete mode 100644 python/aibrix/aibrix/gpu_optimizer/deployment.yaml

diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
index f472003f..2402b1b1 100644
--- a/config/default/kustomization.yaml
+++ b/config/default/kustomization.yaml
@@ -23,6 +23,7 @@ resources:
 - ../rbac
 - ../manager
 - ../gateway
+- ../gpu-optimizer
 - ../dependency/kuberay-operator
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
diff --git a/config/gpu-optimizer/deployment.yaml b/config/gpu-optimizer/deployment.yaml
new file mode 100644
index 00000000..796912e4
--- /dev/null
+++ b/config/gpu-optimizer/deployment.yaml
@@ -0,0 +1,26 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gpu-optimizer
+  namespace: aibrix-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gpu-optimizer
+  template:
+    metadata:
+      labels:
+        app: gpu-optimizer
+    spec:
+      serviceAccountName: gpu-optimizer-sa
+      automountServiceAccountToken: true
+      containers:
+        - name: gpu-optimizer
+          image: aibrix/runtime:nightly
+          command: ["python", "-m", "aibrix.gpu_optimizer.app"]
+          ports:
+            - containerPort: 8080
+          env:
+            - name: REDIS_HOST
+              value: aibrix-redis-master.aibrix-system.svc.cluster.local
\ No newline at end of file
diff --git a/config/gpu-optimizer/kustomization.yaml b/config/gpu-optimizer/kustomization.yaml
new file mode 100644
index 00000000..bb0c7530
--- /dev/null
+++ b/config/gpu-optimizer/kustomization.yaml
@@ -0,0 +1,4 @@
+resources:
+- deployment.yaml
+- service.yaml
+- rbac.yaml
\ No newline at end of file
diff --git a/config/gpu-optimizer/rbac.yaml b/config/gpu-optimizer/rbac.yaml
new file mode 100644
index 00000000..dd56e2e1
--- /dev/null
+++ b/config/gpu-optimizer/rbac.yaml
@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gpu-optimizer-sa
+  namespace: aibrix-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: gpu-optimizer-clusterrole
+rules:
+  - apiGroups: ["apps"]
+    resources: ["deployments"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: gpu-optimizer-clusterrole-binding
+subjects:
+  - kind: ServiceAccount
+    name: gpu-optimizer-sa
+    namespace: aibrix-system
+roleRef:
+  kind: ClusterRole
+  name: gpu-optimizer-clusterrole
+  apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
diff --git a/config/gpu-optimizer/service.yaml b/config/gpu-optimizer/service.yaml
new file mode 100644
index 00000000..6968aeed
--- /dev/null
+++ b/config/gpu-optimizer/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-optimizer
+  namespace: aibrix-system
+spec:
+  selector:
+    app: gpu-optimizer
+  ports:
+    - protocol: TCP
+      port: 8080
+      targetPort: 8080
+  type: ClusterIP
\ No newline at end of file
diff --git a/config/overlays/vke/default/gpu-optimizer/kustomization.yaml b/config/overlays/vke/default/gpu-optimizer/kustomization.yaml
new file mode 100644
index 00000000..73c8e670
--- /dev/null
+++ b/config/overlays/vke/default/gpu-optimizer/kustomization.yaml
@@ -0,0 +1,7 @@
+resources:
+  - ../../../../gpu-optimizer
+
+images:
+- name: aibrix/gpu-optimizer
+  newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/gpu-optimizer
+  newTag: nightly
\ No newline at end of file
diff --git a/config/overlays/vke/default/kustomization.yaml b/config/overlays/vke/default/kustomization.yaml
index 4598d51d..249e59ce 100644
--- a/config/overlays/vke/default/kustomization.yaml
+++ b/config/overlays/vke/default/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 - ../../../rbac
 - manager
 - gateway
+- gpu-optimizer
 - ../../../dependency/kuberay-operator
 
diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
index 93b2d37d..339c87a2 100644
--- a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
+++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a40
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b-a40
      metric: "vllm:deployment_replicas"
      targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml
index c04a2c19..09836449 100644
--- a/development/app/config/simulator/patch_podautoscaler_a100.yaml
+++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: simulator-llama2-7b-a100
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b-a100
      metric: "vllm:deployment_replicas"
      targetValue: "1"
\ No newline at end of file
diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml
index 945fb13c..a75a60c5 100644
--- a/development/app/config/templates/podautoscaler/podautoscaler.yaml
+++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml
@@ -16,7 +16,7 @@ spec:
   maxReplicas: 10
   targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured
   metricsSources:
-    - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      path: /metrics/default/simulator-llama2-7b
      metric: "vllm:deployment_replicas"
      targetValue: "1"
diff --git a/python/aibrix/aibrix/gpu_optimizer/Makefile b/python/aibrix/aibrix/gpu_optimizer/Makefile
index e4a8ac3a..cbd761cb 100644
--- a/python/aibrix/aibrix/gpu_optimizer/Makefile
+++ b/python/aibrix/aibrix/gpu_optimizer/Makefile
@@ -3,18 +3,6 @@ all: build
 DP ?= profiling
 DATASET ?= [set your DATASET path]
 
-.PHONY: deploy
-deploy:
-	kubectl apply -f deployment.yaml
-	sleep 2
-	kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
-
-.PHONY: clean
-clean:
-	kubectl delete -f deployment.yaml
-	sleep 1
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
-
 .PHONY: benchmark
 benchmark:
 	optimizer/profiling/benchmark.sh $(DP)
@@ -33,27 +21,31 @@ debug:
 
 .PHONY: debug-init-simulator
 debug-init-simulator:
-	curl http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-scale-simulator
debug-scale-simulator:
-	curl http://localhost:8080/scale/aibrix-system/simulator-llama2-7b/2 \
+	curl http://localhost:8080/scale/default/simulator-llama2-7b-a100/2 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key" \
 		-d '{}'
 
 .PHONY: debug-stop-simulator
 debug-stop-simulator:
-	curl -X DELETE http://localhost:8080/monitor/aibrix-system/simulator-llama2-7b \
+	curl -X DELETE http://localhost:8080/monitor/default/simulator-llama2-7b-a100 \
 		-H "Content-Type: application/json" \
 		-H "Authorization: Bearer any_key"
 
+.PHONY: debug-update-profile
+debug-update-profile:
+	curl http://localhost:8080/update_profile/llama2-7b
+
 .PHONY: debug-metrics
 debug-metrics:
-	curl http://localhost:8080/metrics/aibrix-system/simulator-llama2-7b
+	curl http://localhost:8080/metrics/default/simulator-llama2-7b-a100
 
 .PHONY: debug-workload
 debug-workload:
diff --git a/python/aibrix/aibrix/gpu_optimizer/README.md b/python/aibrix/aibrix/gpu_optimizer/README.md
index 347a2603..fdcf821d 100644
--- a/python/aibrix/aibrix/gpu_optimizer/README.md
+++ b/python/aibrix/aibrix/gpu_optimizer/README.md
@@ -2,52 +2,48 @@
 
 ## Run in kubernetes
 
-1. Make sure Aibrix components are up-to-date.
-
-2. For now, build GPU Optimizer base image using Dockerfile within this folder.
-```dockerfile
-docker build -t aibrix/gpu-optimizer:nightly -f Dockerfile .
-
-# Or use make
-make build
+1. Make sure Aibrix components are up-to-date. In particular, the GPU Optimizer can be updated independently:
+```shell
+cd ../../../../ && make docker-build-runtime
+kubectl create -k config/gpu-optimizer
 ```
 
-3. Prepare performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose pod interface first:
+2. Deploy your vLLM model. If running locally, a CPU-based vLLM simulator is provided. See development/app for details.
+
+3. [Optional] Prepare the performance benchmark using optimizer/profiling/benchmark.sh. See optimizer/profiling/README.md. You may need to expose the pod interface first:
 ```shell
 # Make sure the pod is accessible locally:
-kubectl -n aibrix-system port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
+kubectl port-forward [pod_name] 8010:8000 1>/dev/null 2>&1 &
 ```
 If using the CPU-based vLLM simulator, sample profiles are included in optimizer/profiling/result.
-
-1. Generate profile based on SLO target using optimizer/profiling/gen-profile.py. If using CPU based vLLM simulator, execute
+4. Generate a profile based on the SLO target using optimizer/profiling/gen_profile.py. If using the CPU-based vLLM simulator, execute
 ```shell
 # Make sure Redis is accessible locally:
 kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
 # Or use make
 make debug-init
-python optimizer/profiling/gen-profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
+python optimizer/profiling/gen_profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
 # Or use make
 make DP=simulator-llama2-7b-a100 gen-profile
 ```
+Replace simulator-llama2-7b-a100 with your deployment name.
 
-5. Deploy GPU Optimizer
-```shell
-kubectl apply -f deployment.yaml
-kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
+5. Notify the GPU optimizer that profiles are ready:
+```shell
+kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
 
-# Or use make
-make deploy
+curl http://localhost:8080/update_profile/llama2-7b
 ```
+Replace llama2-7b with your model name.
 
-4. Deploy your vLLM model. If run locally a CPU based vLLM simulator is provided. See docs/development/simulator for details
 6. Start the workload and see how the model scales. The benchmark toolkit can be used to generate workload as follows:
 ```shell
 # Make sure the gateway is accessible locally; see docs/development/simulator/README.md for details.
-python optimizer/profiling/gpu-benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
+python optimizer/profiling/gpu_benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
 ```
 7. Observability: visit http://localhost:8080/dash/llama2-7b for workload pattern visualization. An independent visualization demo can be accessed by:
diff --git a/python/aibrix/aibrix/gpu_optimizer/app.py b/python/aibrix/aibrix/gpu_optimizer/app.py
index 4368a2ac..405abe4e 100644
--- a/python/aibrix/aibrix/gpu_optimizer/app.py
+++ b/python/aibrix/aibrix/gpu_optimizer/app.py
@@ -27,7 +27,6 @@
 from aibrix.gpu_optimizer.load_monitor.visualizer import mount_to as mount_visulizer
 from aibrix.gpu_optimizer.utils import ExcludePathsFilter
 
-NAMESPACE = os.getenv("NAMESPACE", "aibrix-system")
 MODEL_LABEL = "model.aibrix.ai/name"
 MIN_REPLICAS_LABEL = "model.aibrix.ai/min_replicas"
 REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
@@ -186,6 +185,22 @@ async def stop_deployment_optimization(request):
     )
 
 
+@app.route("/update_profile/{model_name}")
+async def update_profile(request):
+    model_name = request.path_params["model_name"]
+    monitor = model_monitors.get(model_name, None)
+    if monitor is None:
+        return JSONResponse({"error": f"{model_name} not monitored"}, status_code=404)
+
+    if monitor.load_profiles():
+        return JSONResponse({"message": f"workload profile of {model_name} updated"})
+    else:
+        return JSONResponse(
+            {"error": f"failed to update workload profile of {model_name}"},
+            status_code=500,
+        )
+
+
 @app.route("/scale/{namespace}/{deployment_name}/{replicas}", methods=["POST"])
 async def scale_deployment(request):
     namespace = request.path_params["namespace"]
@@ -249,9 +264,9 @@ def main(signal, timeout):
     apps_v1 = client.AppsV1Api()
 
     # List existing deployments
-    logger.info(f"Looking for deployments in {NAMESPACE} with {MODEL_LABEL}")
-    deployments = apps_v1.list_namespaced_deployment(
-        namespace=NAMESPACE, label_selector=MODEL_LABEL
+    logger.info(f"Looking for deployments with {MODEL_LABEL}")
+    deployments = apps_v1.list_deployment_for_all_namespaces(
+        label_selector=MODEL_LABEL
     )
     watch_version = deployments.metadata.resource_version
     logger.debug(f"last watch version: {watch_version}")
@@ -284,8 +299,7 @@ def main(signal, timeout):
     w = watch.Watch()
     signal["watch"] = w
     for event in w.stream(
-        apps_v1.list_namespaced_deployment,
-        namespace=NAMESPACE,
+        apps_v1.list_deployment_for_all_namespaces,
         label_selector=MODEL_LABEL,
         resource_version=watch_version,
         timeout_seconds=timeout,
diff --git a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml b/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
deleted file mode 100644
index 1e122b93..00000000
--- a/python/aibrix/aibrix/gpu_optimizer/deployment.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: pod-autoscaler
-  namespace: aibrix-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
-  namespace: aibrix-system
-  name: deployment-reader
-rules:
-  - apiGroups: ["apps"]
-    resources: ["deployments"]
-    verbs: ["get", "list", "watch"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
-  name: deployment-reader-binding
-  namespace: aibrix-system
-subjects:
-  - kind: ServiceAccount
-    name: pod-autoscaler
-    namespace: aibrix-system
-roleRef:
-  kind: Role
-  name: deployment-reader
-  apiGroup: rbac.authorization.k8s.io
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: gpu-optimizer
-  namespace: aibrix-system
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: gpu-optimizer
-  template:
-    metadata:
-      labels:
-        app: gpu-optimizer
-    spec:
-      serviceAccountName: pod-autoscaler
-      automountServiceAccountToken: true # Important!
-      containers:
-        - name: gpu-optimizer
-          image: aibrix/runtime:nightly
-          command: ["python", "-m", "aibrix.gpu_optimizer.app", "--debug"]
-          ports:
-            - containerPort: 8080
-          env:
-            - name: NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-            - name: REDIS_HOST
-              value: aibrix-redis-master.aibrix-system.svc.cluster.local
----
-# Debug only: Make sure pod can be visited from controller that deployed in mac.
-apiVersion: v1
-kind: Service
-metadata:
-  name: gpu-optimizer
-  namespace: aibrix-system
-spec:
-  selector:
-    app: gpu-optimizer
-  ports:
-    - protocol: TCP
-      port: 8080
-      targetPort: 8080
-      nodePort: 30008
-  type: NodePort
\ No newline at end of file
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
index 5fc887d9..409f23d8 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/clusterer.py
@@ -127,7 +127,6 @@ def validate(self) -> bool:
         if len(self.clusterers) < self.buffer_size:
             self.clusterers.append(self.clusterers[current].clone())
             self.frontier = len(self.clusterers) - 1
-            logger.debug("test")
             logger.debug(
                 "moving buffer created: %s, buffers: %s",
                 self._reason,
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
index 091fc152..dbc295ee 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
@@ -137,7 +137,12 @@ def add_deployment(
         profile = self._match_profile(key, deployment_name)
         if profile is not None:
             # No lock required here since the deployment has not been added to deployments.
-            self._optimizer.set_profile(profile)
+            try:
+                self._optimizer.set_profile(profile)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to set GPU profile for {key}. Optimizer will skip the GPU: {e}"
+                )
         else:
             logger.warning(
                 f"No GPU profile found for {key}. Optimizer will skip the GPU."
@@ -197,12 +202,13 @@ def clear_outdated_deployments(self) -> int:
                 del self.deployments[key]
         return len(self.deployments)
 
-    def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
+    def load_profiles(self, profile_reader: Optional[ProfileReader] = None) -> bool:
         """Load profiles from a file"""
         try:
             if profile_reader is None:
                 if self._profile_reader is None:
-                    return
+                    logger.error("Profile reader not initialized")
+                    return False
                 profile_reader = self._profile_reader
             else:
                 self._profile_reader = profile_reader
@@ -211,9 +217,13 @@ def load_profiles(self, profile_reader: Optional[ProfileReader] = None):
             for profile in profiles:
                 if self._update_profile(profile):
                     logger.debug(f"Profile of {profile.gpu} updated.")
+
+            return True
         except Exception as e:
             logger.error(f"Failed to load profiles: {e}")
 
+        return False
+
     def _update_profile(self, profile: GPUProfile) -> bool:
         """Update a profile, will update the formal alias copy, too."""
         key = profile.gpu
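
Note (not part of the patch): the connectivity fix above replaces the namespaced list/watch (`list_namespaced_deployment` plus a `NAMESPACE` env var) with cluster-wide calls filtered by the `model.aibrix.ai/name` label, authorized by the new ClusterRole/ClusterRoleBinding. A minimal sketch of the resulting list-then-watch pattern, assuming the official `kubernetes` Python client and in-cluster credentials:

```python
from kubernetes import client, config, watch

MODEL_LABEL = "model.aibrix.ai/name"

# Assumes the pod runs with the gpu-optimizer-sa service account mounted;
# use config.load_kube_config() instead when running outside the cluster.
config.load_incluster_config()
apps_v1 = client.AppsV1Api()

# Initial cluster-wide list, filtered by the model label.
deployments = apps_v1.list_deployment_for_all_namespaces(label_selector=MODEL_LABEL)
for dep in deployments.items:
    print("found:", dep.metadata.namespace, dep.metadata.name)

# Resume watching from the listed resourceVersion so no events are missed
# between the initial list and the watch stream.
w = watch.Watch()
for event in w.stream(
    apps_v1.list_deployment_for_all_namespaces,
    label_selector=MODEL_LABEL,
    resource_version=deployments.metadata.resource_version,
    timeout_seconds=600,
):
    dep = event["object"]
    print(event["type"], dep.metadata.namespace, dep.metadata.name)
```

Threading the listed `resource_version` into the watch, as `main` does in app.py, is what guarantees the optimizer sees exactly the deployment events that occur after the snapshot.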
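The new `/update_profile/{model_name}` route answers 404 when the model is not yet monitored and 500 when `load_profiles()` returns False, so callers can branch on the status code. A hypothetical client sketch (the `requests` usage and the port-forwarded address are assumptions, mirroring the README's curl example):

```python
import requests

# Assumes the service has been port-forwarded locally, as in the README:
#   kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080
resp = requests.get("http://localhost:8080/update_profile/llama2-7b")
if resp.status_code == 404:
    # The model is not monitored yet; deploy a labeled model deployment first.
    print("not monitored:", resp.json()["error"])
elif resp.status_code == 500:
    # monitor.load_profiles() returned False; check the profile reader / Redis.
    print("reload failed:", resp.json()["error"])
else:
    print(resp.json()["message"])
```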