Merge branch 'gpu-optimizer-orchestration' into issues/484_Controller_failed_to_fetch_metrics_from_MetricSource

# Conflicts:
#	development/simulator/deployment-a100.yaml
#	development/simulator/deployment-a40.yaml
Jingyuan Zhang committed Dec 6, 2024
2 parents d2be10a + 90cd690 commit e544c12
Showing 7 changed files with 75 additions and 3 deletions.
1 change: 1 addition & 0 deletions config/default/kustomization.yaml
@@ -23,6 +23,7 @@ resources:
- ../rbac
- ../manager
- ../gateway
- ../gpu-optimizer
- ../dependency/kuberay-operator
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
26 changes: 26 additions & 0 deletions config/gpu-optimizer/deployment.yaml
@@ -0,0 +1,26 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-optimizer
namespace: aibrix-system
spec:
replicas: 1
selector:
matchLabels:
app: gpu-optimizer
template:
metadata:
labels:
app: gpu-optimizer
spec:
serviceAccountName: gpu-optimizer-sa
automountServiceAccountToken: true
containers:
- name: gpu-optimizer
image: aibrix/runtime:nightly
command: ["python", "-m", "aibrix.gpu_optimizer.app"]
ports:
- containerPort: 8080
env:
- name: REDIS_HOST
value: aibrix-redis-master.aibrix-system.svc.cluster.local
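Once this Deployment is applied, a quick sanity check is to wait for the rollout and inspect the pod it creates. This is a sketch that assumes `kubectl` access to the target cluster with the manifests above already applied:

```shell
# Wait for the gpu-optimizer rollout to complete (up to 2 minutes).
kubectl -n aibrix-system rollout status deployment/gpu-optimizer --timeout=120s

# List the optimizer pod via its app label and confirm it is Running.
kubectl -n aibrix-system get pods -l app=gpu-optimizer
```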
4 changes: 4 additions & 0 deletions config/gpu-optimizer/kustomization.yaml
@@ -0,0 +1,4 @@
resources:
- deployment.yaml
- service.yaml
- rbac.yaml
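This kustomization can be rendered locally without a cluster connection, which is a cheap way to catch YAML mistakes before applying. A sketch, assuming the repository root as the working directory:

```shell
# Render the combined gpu-optimizer manifests to stdout.
kubectl kustomize config/gpu-optimizer

# Equivalent, using the standalone kustomize binary.
kustomize build config/gpu-optimizer
```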
27 changes: 27 additions & 0 deletions config/gpu-optimizer/rbac.yaml
@@ -0,0 +1,27 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: gpu-optimizer-sa
namespace: aibrix-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: gpu-optimizer-clusterrole
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: gpu-optimizer-clusterrole-binding
subjects:
- kind: ServiceAccount
name: gpu-optimizer-sa
namespace: aibrix-system
roleRef:
kind: ClusterRole
name: gpu-optimizer-clusterrole
apiGroup: rbac.authorization.k8s.io
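The binding above grants the service account read-only access to Deployments cluster-wide. One way to verify the grant behaves as intended, once the manifests are applied (a sketch; assumes `kubectl` access):

```shell
# Should print "yes": the ClusterRole allows get/list/watch on Deployments.
kubectl auth can-i list deployments.apps \
  --as=system:serviceaccount:aibrix-system:gpu-optimizer-sa

# Should print "no": the role grants no write verbs.
kubectl auth can-i delete deployments.apps \
  --as=system:serviceaccount:aibrix-system:gpu-optimizer-sa
```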
13 changes: 13 additions & 0 deletions config/gpu-optimizer/service.yaml
@@ -0,0 +1,13 @@
apiVersion: v1
kind: Service
metadata:
name: gpu-optimizer
namespace: aibrix-system
spec:
selector:
app: gpu-optimizer
ports:
- protocol: TCP
port: 8080
targetPort: 8080
type: ClusterIP
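With the ClusterIP Service in place, the optimizer's HTTP port can be reached from a dev machine via port-forward. A sketch, using the Service name from this manifest; note the deployed name may carry a release prefix (the README in this commit uses `svc/aibrix-gpu-optimizer`), and the dashboard path mirrors the README's `llama2-7b` example:

```shell
# Forward local port 8080 to the Service in the background.
kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &

# Hit the workload-pattern dashboard for the example model.
curl http://localhost:8080/dash/llama2-7b
```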
1 change: 1 addition & 0 deletions config/overlays/vke/default/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
- ../../../rbac
- manager
- gateway
- ../../../gpu-optimizer
- ../../../dependency/kuberay-operator


6 changes: 3 additions & 3 deletions python/aibrix/aibrix/gpu_optimizer/README.md
@@ -28,15 +28,15 @@ kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/n
# Or use make
make debug-init

-python optimizer/profiling/gen-profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
+python optimizer/profiling/gen_profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
# Or use make
make DP=simulator-llama2-7b-a100 gen-profile
```

5. Deploy GPU Optimizer
```shell
kubectl apply -f deployment.yaml
-kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
+kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &

# Or use make
make deploy
@@ -47,7 +47,7 @@ make deploy
5. Start the workload and watch how the model scales. The benchmark toolkit can be used to generate a workload:
```shell
# Ensure the gateway is accessible locally; see docs/development/simulator/README.md for details.
-python optimizer/profiling/gpu-benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
+python optimizer/profiling/gpu_benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
```

6. Observability: visit http://localhost:8080/dash/llama2-7b for workload pattern visualization. An independent visualization demo can be accessed by: