Append ray head label selector in PodAutoscaler (#789)
It restricts the autoscaler to the engine (head) pod for multi-node inference. The Ray worker does not run an HTTP server, so it cannot expose application metrics, only resource metrics. For resource metrics, since we use tensor parallelism, GPU utilization is expected to be the same across all GPUs, so the head pod's metrics are representative.

Signed-off-by: Jiaxin Shan <[email protected]>
Jeffwan authored Mar 4, 2025
1 parent 3ad08a1 commit c73e063
Showing 3 changed files with 46 additions and 18 deletions.
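
For context, a minimal standalone Go sketch of what the appended requirement does, using the same k8s.io/apimachinery packages as the controller change below; the model label value is illustrative and the snippet is not part of the commit:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/selection"
)

func main() {
	// Selector the PodAutoscaler derives from the scale target's pods.
	selector, err := labels.Parse("model.aibrix.ai/name=facebook-opt-13b")
	if err != nil {
		panic(err)
	}

	// The requirement this commit appends for RayClusterFleet targets.
	head, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
	if err != nil {
		panic(err)
	}
	selector = selector.Add(*head)

	// Only the head pod matches; worker pods are ignored by the autoscaler.
	fmt.Println(selector.Matches(labels.Set{"model.aibrix.ai/name": "facebook-opt-13b", "ray.io/node-type": "head"}))   // true
	fmt.Println(selector.Matches(labels.Set{"model.aibrix.ai/name": "facebook-opt-13b", "ray.io/node-type": "worker"})) // false
}
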
3 changes: 2 additions & 1 deletion development/app/requirements.txt
@@ -8,4 +8,5 @@ ddsketch
plotly_express
fasteners
transformers
git+https://github.com/zhangjyr/vidur.git
git+https://github.com/zhangjyr/vidur.git
ray[default]
38 changes: 21 additions & 17 deletions development/tutorials/distributed/fleet.yaml
@@ -3,7 +3,7 @@ kind: RayClusterFleet
metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
model.aibrix.ai/name: facebook-opt-13b
name: facebook-opt-13b
spec:
replicas: 1
@@ -20,13 +20,17 @@ spec:
labels:
model.aibrix.ai/name: facebook-opt-13b
annotations:
ray.io/overwrite-container-cmd: "true"
ray.io/overwrite-container-cmd: "true"
spec:
rayVersion: '2.10.0' # should match the Ray version in the image of the containers
rayVersion: "2.10.0"
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'false'
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
spec:
containers:
- name: ray-head
@@ -40,35 +44,35 @@ spec:
name: client
- containerPort: 8000
name: service
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD"; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray]
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; sleep 600"]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
cpu: 1000m
requests:
cpu: "200m"
nvidia.com/gpu: 1
cpu: 200m
workerGroupSpecs:
# the pod replicas in this group typed worker
- replicas: 1
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: small-group
rayStartParams: {}
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
spec:
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:2.10.0
- name: ray-worker
image: 'rayproject/ray:2.10.0'
command: [ "/bin/bash", "-lc", "--" ]
args: [ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;" ]
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
cpu: 1000m
requests:
cpu: "200m"
nvidia.com/gpu: 1
cpu: 200m
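
Once KubeRay reconciles the fleet's clusters, the head pod carries both the model label from the head group template above and the node-type label that KubeRay sets automatically; this combination is what the PodAutoscaler's new selector matches. A sketch of the relevant head pod labels (illustrative, not part of the diff):

labels:
  model.aibrix.ai/name: facebook-opt-13b   # from the head group template above
  ray.io/node-type: head                   # set by KubeRay; matched by the new selector
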
23 changes: 23 additions & 0 deletions pkg/controller/podautoscaler/podautoscaler_controller.go
@@ -22,8 +22,11 @@ import (
"time"

autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
"github.com/vllm-project/aibrix/pkg/config"
"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/metrics"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"

"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/scaler"
podutil "github.com/vllm-project/aibrix/pkg/utils"
@@ -619,6 +622,16 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
return 0, "", currentTimestamp, err
}

// Append the Ray head node requirement to the label selector
if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
if err != nil {
klog.ErrorS(err, "Failed to add new requirements ray.io/node-type: head to label selector")
return 0, "", currentTimestamp, err
}
labelsSelector = labelsSelector.Add(*newRequirement)
}

originalReadyPodsCount, err := scaler.GetReadyPodsCount(ctx, r.Client, pa.Namespace, labelsSelector)

if err != nil {
@@ -700,6 +713,16 @@ func (r *PodAutoscalerReconciler) updateMetricsForScale(ctx context.Context, pa
return err
}

// Append the Ray head node requirement to the label selector
if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
if err != nil {
klog.ErrorS(err, "Failed to add new requirements ray.io/node-type: head to label selector")
return err
}
labelsSelector = labelsSelector.Add(*newRequirement)
}

// Get pod list managed by scaleTargetRef
podList, err := podutil.GetPodListByLabelSelector(ctx, r.Client, pa.Namespace, labelsSelector)
if err != nil {
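
Note that the same guard-and-append block is duplicated in computeReplicasForMetrics and updateMetricsForScale, so it could be factored into a small helper. A sketch, assuming scale is an *unstructured.Unstructured and reusing the file's existing imports plus k8s.io/apimachinery/pkg/apis/meta/v1/unstructured (the helper name is hypothetical, not part of this commit):

// appendRayHeadRequirement narrows the selector to the Ray head pod for
// RayClusterFleet scale targets and returns it unchanged otherwise.
func appendRayHeadRequirement(scale *unstructured.Unstructured, selector labels.Selector) (labels.Selector, error) {
	if scale.GetAPIVersion() != orchestrationv1alpha1.GroupVersion.String() || scale.GetKind() != "RayClusterFleet" {
		return selector, nil
	}
	req, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
	if err != nil {
		return nil, err
	}
	return selector.Add(*req), nil
}

Both call sites would then reduce to labelsSelector, err = appendRayHeadRequirement(scale, labelsSelector).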