From 17b528c9b9ce5ef1d7fd0dd83f12c4a26b9fc358 Mon Sep 17 00:00:00 2001
From: Jiaxin Shan
Date: Mon, 3 Mar 2025 16:37:44 -0800
Subject: [PATCH] Append Ray head label selector in PodAutoscaler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This ensures only the engine (head) pod is considered for multi-node
inference. Ray workers do not run an HTTP server, so they can expose
only resource metrics, not application metrics. For resource metrics,
since we use tensor parallelism, we assume utilization is the same
across all GPUs.

Signed-off-by: Jiaxin Shan
---
 development/app/requirements.txt              |  3 +-
 development/tutorials/distributed/fleet.yaml  | 38 ++++++++++---------
 .../podautoscaler/podautoscaler_controller.go | 23 +++++
 3 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/development/app/requirements.txt b/development/app/requirements.txt
index e529cf21..8f587987 100644
--- a/development/app/requirements.txt
+++ b/development/app/requirements.txt
@@ -8,4 +8,5 @@ ddsketch
 plotly_express
 fasteners
 transformers
-git+https://github.com/zhangjyr/vidur.git
\ No newline at end of file
+git+https://github.com/zhangjyr/vidur.git
+ray[default]
\ No newline at end of file
diff --git a/development/tutorials/distributed/fleet.yaml b/development/tutorials/distributed/fleet.yaml
index 1333c304..f08e77ad 100644
--- a/development/tutorials/distributed/fleet.yaml
+++ b/development/tutorials/distributed/fleet.yaml
@@ -3,7 +3,7 @@ kind: RayClusterFleet
 metadata:
   labels:
     app.kubernetes.io/name: aibrix
-    app.kubernetes.io/managed-by: kustomize
+    model.aibrix.ai/name: facebook-opt-13b
   name: facebook-opt-13b
 spec:
   replicas: 1
@@ -20,13 +20,17 @@ spec:
       labels:
         model.aibrix.ai/name: facebook-opt-13b
       annotations:
-        ray.io/overwrite-container-cmd: "true"
+        ray.io/overwrite-container-cmd: "true"
     spec:
-      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      rayVersion: "2.10.0"
       headGroupSpec:
         rayStartParams:
           dashboard-host: '0.0.0.0'
+          block: 'false'
         template:
+          metadata:
+            labels:
+              model.aibrix.ai/name: facebook-opt-13b
           spec:
             containers:
             - name: ray-head
@@ -40,35 +44,35 @@ spec:
               name: client
             - containerPort: 8000
               name: service
-              command: ["/bin/bash", "-lc", "--"]
-              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
+              command: ["/bin/bash", "-lc", "--"]
+              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; sleep 600"]
               resources:
                 limits:
-                  cpu: "1000m"
-                  nvidia.com/gpu: 1
+                  cpu: 1000m
                 requests:
-                  cpu: "200m"
-                  nvidia.com/gpu: 1
+                  cpu: 200m
       workerGroupSpecs:
-      # the pod replicas in this group typed worker
-      - replicas: 1
+      - replicas: 2
         minReplicas: 1
         maxReplicas: 5
         groupName: small-group
         rayStartParams: {}
         template:
+          metadata:
+            labels:
+              model.aibrix.ai/name: facebook-opt-13b
           spec:
             containers:
-            - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
-              image: rayproject/ray:2.10.0
+            - name: ray-worker
+              image: 'rayproject/ray:2.10.0'
+              command: [ "/bin/bash", "-lc", "--" ]
+              args: [ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;" ]
               lifecycle:
                 preStop:
                   exec:
                     command: [ "/bin/sh","-c","ray stop" ]
               resources:
                 limits:
-                  cpu: "1000m"
-                  nvidia.com/gpu: 1
+                  cpu: 1000m
                 requests:
-                  cpu: "200m"
-                  nvidia.com/gpu: 1
+                  cpu: 200m
diff --git a/pkg/controller/podautoscaler/podautoscaler_controller.go b/pkg/controller/podautoscaler/podautoscaler_controller.go
index 60cf72d0..75eeb433 100644
--- a/pkg/controller/podautoscaler/podautoscaler_controller.go
+++ b/pkg/controller/podautoscaler/podautoscaler_controller.go
@@ -22,8 +22,11 @@ import (
 	"time"

 	autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
+	orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
 	"github.com/vllm-project/aibrix/pkg/config"
 	"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/metrics"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/selection"

 	"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/scaler"
 	podutil "github.com/vllm-project/aibrix/pkg/utils"
@@ -619,6 +622,16 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
 		return 0, "", currentTimestamp, err
 	}

+	// Append the Ray head requirement to the label selector so that only
+	// the engine (head) pod is considered for multi-node inference.
+	if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
+		newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
+		if err != nil {
+			klog.ErrorS(err, "Failed to add requirement ray.io/node-type=head to label selector")
+			return 0, "", currentTimestamp, err
+		}
+		labelsSelector = labelsSelector.Add(*newRequirement)
+	}
+
 	originalReadyPodsCount, err := scaler.GetReadyPodsCount(ctx, r.Client, pa.Namespace, labelsSelector)

 	if err != nil {
@@ -700,6 +713,16 @@ func (r *PodAutoscalerReconciler) updateMetricsForScale(ctx context.Context, pa
 		return err
 	}

+	// Append the Ray head requirement to the label selector so that only
+	// the engine (head) pod is considered for multi-node inference.
+	if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
+		newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
+		if err != nil {
+			klog.ErrorS(err, "Failed to add requirement ray.io/node-type=head to label selector")
+			return err
+		}
+		labelsSelector = labelsSelector.Add(*newRequirement)
+	}
+
 	// Get pod list managed by scaleTargetRef
 	podList, err := podutil.GetPodListByLabelSelector(ctx, r.Client, pa.Namespace, labelsSelector)
 	if err != nil {
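
For illustration, a minimal standalone sketch of what the appended
requirement does to the selector. The base selector string and the pod
label sets below are assumptions matching the fleet example above; in
the controller the base selector is derived from the scale target's
.spec.selector, and KubeRay itself sets the ray.io/node-type label on
head and worker pods.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/selection"
)

func main() {
	// Assumed base selector; the controller derives this from the target.
	base, err := labels.Parse("model.aibrix.ai/name=facebook-opt-13b")
	if err != nil {
		panic(err)
	}

	// Same construction as in the patch: require the Ray head node type.
	req, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
	if err != nil {
		panic(err)
	}
	selector := base.Add(*req)

	fmt.Println(selector.String())
	// model.aibrix.ai/name=facebook-opt-13b,ray.io/node-type=head

	// KubeRay labels head and worker pods; only the head matches now.
	head := labels.Set{"model.aibrix.ai/name": "facebook-opt-13b", "ray.io/node-type": "head"}
	worker := labels.Set{"model.aibrix.ai/name": "facebook-opt-13b", "ray.io/node-type": "worker"}
	fmt.Println(selector.Matches(head), selector.Matches(worker)) // true false
}

With the requirement appended, GetReadyPodsCount and the metrics fetch
in updateMetricsForScale see only the head pod, so scaling decisions
are driven by the engine's application metrics rather than by Ray
workers, which expose resource metrics only.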