Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Append ray head label selector in PodAutoscaler #789

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion development/app/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ ddsketch
plotly_express
fasteners
transformers
git+https://github.com/zhangjyr/vidur.git
git+https://github.com/zhangjyr/vidur.git
ray[default]
38 changes: 21 additions & 17 deletions development/tutorials/distributed/fleet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: RayClusterFleet
metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
model.aibrix.ai/name: facebook-opt-13b
name: facebook-opt-13b
spec:
replicas: 1
Expand All @@ -20,13 +20,17 @@ spec:
labels:
model.aibrix.ai/name: facebook-opt-13b
annotations:
ray.io/overwrite-container-cmd: "true"
ray.io/overwrite-container-cmd: "true"
spec:
rayVersion: '2.10.0' # should match the Ray version in the image of the containers
rayVersion: "2.10.0"
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
block: 'false'
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
spec:
containers:
- name: ray-head
Expand All @@ -40,35 +44,35 @@ spec:
name: client
- containerPort: 8000
name: service
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD"; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray]
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; sleep 600"]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
cpu: 1000m
requests:
cpu: "200m"
nvidia.com/gpu: 1
cpu: 200m
workerGroupSpecs:
# the number of pod replicas in this worker group
- replicas: 1
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: small-group
rayStartParams: {}
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
spec:
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:2.10.0
- name: ray-worker
image: 'rayproject/ray:2.10.0'
command: [ "/bin/bash", "-lc", "--" ]
args: [ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;" ]
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
cpu: 1000m
requests:
cpu: "200m"
nvidia.com/gpu: 1
cpu: 200m
23 changes: 23 additions & 0 deletions pkg/controller/podautoscaler/podautoscaler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@ import (
"time"

autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
"github.com/vllm-project/aibrix/pkg/config"
"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/metrics"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"

"github.com/vllm-project/aibrix/pkg/controller/podautoscaler/scaler"
podutil "github.com/vllm-project/aibrix/pkg/utils"
Expand Down Expand Up @@ -619,6 +622,16 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
return 0, "", currentTimestamp, err
}

// Append ray head worker requirement for label selector
if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
if err != nil {
klog.ErrorS(err, "Failed to add new requirements ray.io/node-type: head to label selector")
return 0, "", currentTimestamp, err
}
labelsSelector = labelsSelector.Add(*newRequirement)
}

originalReadyPodsCount, err := scaler.GetReadyPodsCount(ctx, r.Client, pa.Namespace, labelsSelector)

if err != nil {
Expand Down Expand Up @@ -700,6 +713,16 @@ func (r *PodAutoscalerReconciler) updateMetricsForScale(ctx context.Context, pa
return err
}

// Append ray head worker requirement for label selector
if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
if err != nil {
klog.ErrorS(err, "Failed to add new requirements ray.io/node-type: head to label selector")
return err
}
labelsSelector = labelsSelector.Add(*newRequirement)
}

// Get pod list managed by scaleTargetRef
podList, err := podutil.GetPodListByLabelSelector(ctx, r.Client, pa.Namespace, labelsSelector)
if err != nil {
Expand Down
Loading