From 17b528c9b9ce5ef1d7fd0dd83f12c4a26b9fc358 Mon Sep 17 00:00:00 2001
From: Jiaxin Shan
Date: Mon, 3 Mar 2025 16:37:44 -0800
Subject: [PATCH] Append Ray head label selector in PodAutoscaler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This ensures only the engine (head) pod is considered for multi-node
inference. Ray workers do not run an HTTP server, so they can expose
only resource metrics, not application metrics. For resource metrics,
since we use tensor parallelism, we assume utilization is the same
across all GPUs.

Signed-off-by: Jiaxin Shan
---
 development/app/requirements.txt              |  3 +-
 development/tutorials/distributed/fleet.yaml  | 38 ++++++++++---------
 .../podautoscaler/podautoscaler_controller.go | 23 +++++
 3 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/development/app/requirements.txt b/development/app/requirements.txt
index e529cf21..8f587987 100644
--- a/development/app/requirements.txt
+++ b/development/app/requirements.txt
@@ -8,4 +8,5 @@ ddsketch
 plotly_express
 fasteners
 transformers
-git+https://github.com/zhangjyr/vidur.git
\ No newline at end of file
+git+https://github.com/zhangjyr/vidur.git
+ray[default]
\ No newline at end of file
diff --git a/development/tutorials/distributed/fleet.yaml b/development/tutorials/distributed/fleet.yaml
index 1333c304..f08e77ad 100644
--- a/development/tutorials/distributed/fleet.yaml
+++ b/development/tutorials/distributed/fleet.yaml
@@ -3,7 +3,7 @@ kind: RayClusterFleet
 metadata:
   labels:
     app.kubernetes.io/name: aibrix
-    app.kubernetes.io/managed-by: kustomize
+    model.aibrix.ai/name: facebook-opt-13b
   name: facebook-opt-13b
 spec:
   replicas: 1
@@ -20,13 +20,17 @@ spec:
       labels:
         model.aibrix.ai/name: facebook-opt-13b
       annotations:
-        ray.io/overwrite-container-cmd: "true"
+        ray.io/overwrite-container-cmd: "true"
     spec:
-      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      rayVersion: "2.10.0"
       headGroupSpec:
         rayStartParams:
           dashboard-host: '0.0.0.0'
+          block: 'false'
         template:
+          metadata:
+            labels:
+              model.aibrix.ai/name: facebook-opt-13b
           spec:
             containers:
             - name: ray-head
@@ -40,35 +44,35 @@ spec:
               name: client
             - containerPort: 8000
               name: service
-              command: ["/bin/bash", "-lc", "--"]
-              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
+              command: ["/bin/bash", "-lc", "--"]
+              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; sleep 600"]
               resources:
                 limits:
-                  cpu: "1000m"
-                  nvidia.com/gpu: 1
+                  cpu: 1000m
                 requests:
-                  cpu: "200m"
-                  nvidia.com/gpu: 1
+                  cpu: 200m
       workerGroupSpecs:
-      # the pod replicas in this group typed worker
-      - replicas: 1
+      - replicas: 2
         minReplicas: 1
         maxReplicas: 5
         groupName: small-group
         rayStartParams: {}
         template:
+          metadata:
+            labels:
+              model.aibrix.ai/name: facebook-opt-13b
           spec:
             containers:
-            - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
-              image: rayproject/ray:2.10.0
+            - name: ray-worker
+              image: 'rayproject/ray:2.10.0'
+              command: [ "/bin/bash", "-lc", "--" ]
+              args: [ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;" ]
               lifecycle:
                 preStop:
                   exec:
                     command: [ "/bin/sh","-c","ray stop" ]
               resources:
                 limits:
-                  cpu: "1000m"
-                  nvidia.com/gpu: 1
+                  cpu: 1000m
                 requests:
-                  cpu: "200m"
-                  nvidia.com/gpu: 1
+                  cpu: 200m
diff --git a/pkg/controller/podautoscaler/podautoscaler_controller.go b/pkg/controller/podautoscaler/podautoscaler_controller.go
index 60cf72d0..75eeb433 100644
--- a/pkg/controller/podautoscaler/podautoscaler_controller.go
+++ b/pkg/controller/podautoscaler/podautoscaler_controller.go
@@ -22,8 +22,11 @@ import (
 	"time"

 	autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
+	orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
 	"github.com/vllm-project/aibrix/pkg/config"
 	"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/metrics"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/selection"

 	"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/scaler"
 	podutil "github.com/vllm-project/aibrix/pkg/utils"
@@ -619,6 +622,16 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
 		return 0, "", currentTimestamp, err
 	}

+	// Append the Ray head requirement to the label selector so that only
+	// the engine (head) pod is considered for multi-node inference.
+	if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
+		newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
+		if err != nil {
+			klog.ErrorS(err, "Failed to add requirement ray.io/node-type=head to label selector")
+			return 0, "", currentTimestamp, err
+		}
+		labelsSelector = labelsSelector.Add(*newRequirement)
+	}
+
 	originalReadyPodsCount, err := scaler.GetReadyPodsCount(ctx, r.Client, pa.Namespace, labelsSelector)

 	if err != nil {
@@ -700,6 +713,16 @@ func (r *PodAutoscalerReconciler) updateMetricsForScale(ctx context.Context, pa
 		return err
 	}

+	// Append the Ray head requirement to the label selector so that only
+	// the engine (head) pod is considered for multi-node inference.
+	if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
+		newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
+		if err != nil {
+			klog.ErrorS(err, "Failed to add requirement ray.io/node-type=head to label selector")
+			return err
+		}
+		labelsSelector = labelsSelector.Add(*newRequirement)
+	}
+
 	// Get pod list managed by scaleTargetRef
 	podList, err := podutil.GetPodListByLabelSelector(ctx, r.Client, pa.Namespace, labelsSelector)
 	if err != nil {
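
For illustration, a minimal standalone sketch of what the appended
requirement does to the selector. The base selector string and the pod
label sets below are assumptions matching the fleet example above; in
the controller the base selector is derived from the scale target's
.spec.selector, and KubeRay itself sets the ray.io/node-type label on
head and worker pods.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/selection"
)

func main() {
	// Assumed base selector; the controller derives this from the target.
	base, err := labels.Parse("model.aibrix.ai/name=facebook-opt-13b")
	if err != nil {
		panic(err)
	}

	// Same construction as in the patch: require the Ray head node type.
	req, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
	if err != nil {
		panic(err)
	}
	selector := base.Add(*req)

	fmt.Println(selector.String())
	// model.aibrix.ai/name=facebook-opt-13b,ray.io/node-type=head

	// KubeRay labels head and worker pods; only the head matches now.
	head := labels.Set{"model.aibrix.ai/name": "facebook-opt-13b", "ray.io/node-type": "head"}
	worker := labels.Set{"model.aibrix.ai/name": "facebook-opt-13b", "ray.io/node-type": "worker"}
	fmt.Println(selector.Matches(head), selector.Matches(worker)) // true false
}

With the requirement appended, GetReadyPodsCount and the metrics fetch
in updateMetricsForScale see only the head pod, so scaling decisions
are driven by the engine's application metrics rather than by Ray
workers, which expose resource metrics only.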