Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Append ray head label selector in PodAutoscaler #789

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion development/app/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ ddsketch
plotly_express
fasteners
transformers
git+https://github.com/zhangjyr/vidur.git
git+https://github.com/zhangjyr/vidur.git
ray[default]
38 changes: 21 additions & 17 deletions development/tutorials/distributed/fleet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: RayClusterFleet
metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
model.aibrix.ai/name: facebook-opt-13b
name: facebook-opt-13b
spec:
replicas: 1
Expand All @@ -20,13 +20,17 @@ spec:
labels:
model.aibrix.ai/name: facebook-opt-13b
annotations:
ray.io/overwrite-container-cmd: "true"
ray.io/overwrite-container-cmd: "true"
spec:
rayVersion: '2.10.0' # should match the Ray version in the image of the containers
rayVersion: "2.10.0"
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'false'
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
spec:
containers:
- name: ray-head
Expand All @@ -40,35 +44,35 @@ spec:
name: client
- containerPort: 8000
name: service
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD"; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray]
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; sleep 600"]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
cpu: 1000m
requests:
cpu: "200m"
nvidia.com/gpu: 1
cpu: 200m
workerGroupSpecs:
# the number of pod replicas in this worker group
- replicas: 1
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: small-group
rayStartParams: {}
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
spec:
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:2.10.0
- name: ray-worker
image: 'rayproject/ray:2.10.0'
command: [ "/bin/bash", "-lc", "--" ]
args: [ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;" ]
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
cpu: 1000m
requests:
cpu: "200m"
nvidia.com/gpu: 1
cpu: 200m
23 changes: 23 additions & 0 deletions pkg/controller/podautoscaler/podautoscaler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@ import (
"time"

autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
"github.com/vllm-project/aibrix/pkg/config"
"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/metrics"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"

"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/scaler"
podutil "github.com/vllm-project/aibrix/pkg/utils"
Expand Down Expand Up @@ -619,6 +622,16 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
return 0, "", currentTimestamp, err
}

// Append ray head worker requirement for label selector
if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
if err != nil {
klog.ErrorS(err, "Failed to add new requirements ray.io/node-type: head to label selector")
return 0, "", currentTimestamp, err
}
labelsSelector = labelsSelector.Add(*newRequirement)
}

originalReadyPodsCount, err := scaler.GetReadyPodsCount(ctx, r.Client, pa.Namespace, labelsSelector)

if err != nil {
Expand Down Expand Up @@ -700,6 +713,16 @@ func (r *PodAutoscalerReconciler) updateMetricsForScale(ctx context.Context, pa
return err
}

// Append ray head worker requirement for label selector
if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
if err != nil {
klog.ErrorS(err, "Failed to add new requirements ray.io/node-type: head to label selector")
return err
}
labelsSelector = labelsSelector.Add(*newRequirement)
}

// Get pod list managed by scaleTargetRef
podList, err := podutil.GetPodListByLabelSelector(ctx, r.Client, pa.Namespace, labelsSelector)
if err != nil {
Expand Down
Loading