Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 1 addition & 53 deletions .github/workflows/reusable-nightly-e2e-cks-helmfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ jobs:
yq e '(select(.decode.containers != null) | .decode.containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
yq e '(select(.prefill.containers != null) | .prefill.containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
yq e '(select(.containers != null) | .containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
# Un-comment priorityClassName lines: drop a leading '#' that is followed by
# whitespace and "priorityClassName", keeping the whitespace as indentation.
# "$file" is quoted (like the yq calls above) so a path containing spaces or
# glob characters reaches sed as a single argument.
sed -E "s/^#([[:space:]]+)priorityClassName/\1priorityClassName/g" -i "$file"
done
fi
env:
Expand Down Expand Up @@ -650,59 +651,6 @@ jobs:
echo "::warning::HTTPRoute file $HTTPROUTE not found — skipping"
fi

- name: Set nightly GPU priority on workloads
  if: inputs.allow_gpu_preemption && inputs.required_gpus > 0
  run: |
    # Give every GPU-requesting Deployment/StatefulSet (and every
    # LeaderWorkerSet) in $NAMESPACE the nightly-gpu-critical priority class,
    # then wait for the rollouts triggered by those patches.
    echo "Setting priorityClassName=nightly-gpu-critical on GPU workloads in $NAMESPACE..."
    for kind in deployment statefulset; do
      for name in $(kubectl get "$kind" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
        # Check if this workload requests GPUs: sum the nvidia.com/gpu limits
        # over all containers, counting a missing limit as 0.
        GPU_REQ=$(kubectl get "$kind" "$name" -n "$NAMESPACE" -o json | \
          jq '[.spec.template.spec.containers[]?.resources.limits["nvidia.com/gpu"] // "0" | tonumber] | add // 0')
        if [ "$GPU_REQ" -gt 0 ]; then
          echo " Patching $kind/$name (requests $GPU_REQ GPU(s))..."
          if [ "$kind" = "deployment" ]; then
            # Use Recreate strategy for GPU deployments to avoid rolling update deadlock.
            # With RollingUpdate, new pods can't start because old pods hold the GPUs.
            # Use merge patch with rollingUpdate:null to clear the existing rollingUpdate
            # config, which Kubernetes rejects when strategy type is Recreate.
            kubectl patch "$kind" "$name" -n "$NAMESPACE" --type=merge -p \
              '{"spec":{"strategy":{"type":"Recreate","rollingUpdate":null},"template":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}'
          else
            kubectl patch "$kind" "$name" -n "$NAMESPACE" --type=strategic -p \
              '{"spec":{"template":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}'
          fi
        fi
      done
    done
    # Also patch LeaderWorkerSets if present.
    # LeaderWorkerSet is a custom resource, and kubectl does not support
    # strategic merge patches for custom resources, so a JSON merge patch is
    # used. stderr is left visible so a failed patch reports its real cause;
    # the step still continues (best effort).
    # NOTE(review): only workerTemplate is patched; a separately defined
    # leaderTemplate would presumably keep its old priority - confirm.
    for name in $(kubectl get leaderworkersets -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
      echo " Patching leaderworkerset/$name..."
      kubectl patch leaderworkerset "$name" -n "$NAMESPACE" --type=merge -p \
        '{"spec":{"leaderWorkerTemplate":{"workerTemplate":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}}' || \
        echo " Could not patch LWS $name (see kubectl error above)"
    done
    echo "Priority patching complete"

    # Wait for rolling updates triggered by the priority patch to stabilize.
    # Without this, kubectl wait sees pods from both old and new ReplicaSets,
    # causing "NotFound" errors as old pods are deleted during rollout.
    echo "Waiting for rollouts to stabilize after priority patching..."
    for kind in deployment statefulset; do
      for name in $(kubectl get "$kind" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
        echo " Waiting for $kind/$name rollout..."
        kubectl rollout status "$kind/$name" -n "$NAMESPACE" --timeout=10m 2>/dev/null || \
          echo " Rollout status check timed out for $kind/$name (will retry in pod wait)"
      done
    done
    # Also wait for LeaderWorkerSet rollouts, since they were patched above.
    for name in $(kubectl get leaderworkersets -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
      echo " Waiting for leaderworkerset/$name rollout..."
      kubectl rollout status "leaderworkerset/$name" -n "$NAMESPACE" --timeout=10m 2>/dev/null || \
        echo " Rollout status check failed for leaderworkerset/$name (kubectl may not support rollout for this resource; will rely on pod wait)"
    done
    echo "Rollouts stabilized"

- name: Show deployment status
run: |
echo "=== Deployments ==="
Expand Down
54 changes: 1 addition & 53 deletions .github/workflows/reusable-nightly-e2e-gke-helmfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ jobs:
yq e '(select(.decode.containers != null) | .decode.containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
yq e '(select(.prefill.containers != null) | .prefill.containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
yq e '(select(.containers != null) | .containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
# Un-comment priorityClassName lines: drop a leading '#' that is followed by
# whitespace and "priorityClassName", keeping the whitespace as indentation.
# "$file" is quoted (like the yq calls above) so a path containing spaces or
# glob characters reaches sed as a single argument.
sed -E "s/^#([[:space:]]+)priorityClassName/\1priorityClassName/g" -i "$file"
done
fi
env:
Expand Down Expand Up @@ -471,59 +472,6 @@ jobs:
"$NAMESPACE" || true
done

#- name: Set nightly GPU priority on workloads
# if: inputs.allow_gpu_preemption && inputs.required_gpus > 0
# run: |
# echo "Setting priorityClassName=nightly-gpu-critical on GPU workloads in $NAMESPACE..."
# for kind in deployment statefulset; do
# for name in $(kubectl get "$kind" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
# # Check if this workload requests GPUs
# GPU_REQ=$(kubectl get "$kind" "$name" -n "$NAMESPACE" -o json | \
# jq '[.spec.template.spec.containers[]?.resources.limits["nvidia.com/gpu"] // "0" | tonumber] | add // 0')
# if [ "$GPU_REQ" -gt 0 ]; then
# echo " Patching $kind/$name (requests $GPU_REQ GPU(s))..."
# if [ "$kind" = "deployment" ]; then
# # Use Recreate strategy for GPU deployments to avoid rolling update deadlock.
# # With RollingUpdate, new pods can't start because old pods hold the GPUs.
# # Use merge patch with rollingUpdate:null to clear the existing rollingUpdate
# # config, which Kubernetes rejects when strategy type is Recreate.
# kubectl patch "$kind" "$name" -n "$NAMESPACE" --type=merge -p \
# '{"spec":{"strategy":{"type":"Recreate","rollingUpdate":null},"template":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}'
# else
# kubectl patch "$kind" "$name" -n "$NAMESPACE" --type=strategic -p \
# '{"spec":{"template":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}'
# fi
# fi
# done
# done
# # Also patch LeaderWorkerSets if present
# for name in $(kubectl get leaderworkersets -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
# echo " Patching leaderworkerset/$name..."
# kubectl patch leaderworkerset "$name" -n "$NAMESPACE" --type=strategic -p \
# '{"spec":{"leaderWorkerTemplate":{"workerTemplate":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}}' 2>/dev/null || \
# echo " Could not patch LWS $name (may not support strategic merge)"
# done
# echo "Priority patching complete"

# # Wait for rolling updates triggered by the priority patch to stabilize.
# # Without this, kubectl wait sees pods from both old and new ReplicaSets,
# # causing "NotFound" errors as old pods are deleted during rollout.
# echo "Waiting for rollouts to stabilize after priority patching..."
# for kind in deployment statefulset; do
# for name in $(kubectl get "$kind" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
# echo " Waiting for $kind/$name rollout..."
# kubectl rollout status "$kind/$name" -n "$NAMESPACE" --timeout=10m 2>/dev/null || \
# echo " Rollout status check timed out for $kind/$name (will retry in pod wait)"
# done
# done
# # Also wait for LeaderWorkerSet rollouts, since they were patched above.
# for name in $(kubectl get leaderworkersets -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
# echo " Waiting for leaderworkerset/$name rollout..."
# kubectl rollout status "leaderworkerset/$name" -n "$NAMESPACE" --timeout=10m 2>/dev/null || \
# echo " Rollout status check failed for leaderworkerset/$name (kubectl may not support rollout for this resource; will rely on pod wait)"
# done
# echo "Rollouts stabilized"

- name: Show deployment status
run: |
echo "=== Deployments ==="
Expand Down
56 changes: 2 additions & 54 deletions .github/workflows/reusable-nightly-e2e-openshift-helmfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ jobs:
run: |
echo "Creating namespace $NAMESPACE..."
kubectl create namespace "$NAMESPACE" || echo "Namespace already exists"

echo "Granting \"anyuid\" Security Context Constraint to serviceAccount \"default\" on namespace \"${NAMESPACE}\""
oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE}
echo "Granting \"privileged\" Security Context Constraint to serviceAccount \"default\" on namespace \"${NAMESPACE}\""
Expand Down Expand Up @@ -567,6 +567,7 @@ jobs:
yq e '(select(.decode.containers != null) | .decode.containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
yq e '(select(.prefill.containers != null) | .prefill.containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
yq e '(select(.containers != null) | .containers[].image | select(test("^ghcr\.io/llm-d/llm-d-cuda"))) = strenv(IMAGE_OVERRIDE)' -i "$file"
# Un-comment priorityClassName lines: drop a leading '#' that is followed by
# whitespace and "priorityClassName", keeping the whitespace as indentation.
# "$file" is quoted (like the yq calls above) so a path containing spaces or
# glob characters reaches sed as a single argument.
sed -E "s/^#([[:space:]]+)priorityClassName/\1priorityClassName/g" -i "$file"
done
fi
env:
Expand Down Expand Up @@ -670,59 +671,6 @@ jobs:
echo "::warning::HTTPRoute file $HTTPROUTE not found — skipping"
fi

- name: Set nightly GPU priority on workloads
  if: inputs.allow_gpu_preemption && inputs.required_gpus > 0
  run: |
    # Give every GPU-requesting Deployment/StatefulSet (and every
    # LeaderWorkerSet) in $NAMESPACE the nightly-gpu-critical priority class,
    # then wait for the rollouts triggered by those patches.
    echo "Setting priorityClassName=nightly-gpu-critical on GPU workloads in $NAMESPACE..."
    for kind in deployment statefulset; do
      for name in $(kubectl get "$kind" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
        # Check if this workload requests GPUs: sum the nvidia.com/gpu limits
        # over all containers, counting a missing limit as 0.
        GPU_REQ=$(kubectl get "$kind" "$name" -n "$NAMESPACE" -o json | \
          jq '[.spec.template.spec.containers[]?.resources.limits["nvidia.com/gpu"] // "0" | tonumber] | add // 0')
        if [ "$GPU_REQ" -gt 0 ]; then
          echo " Patching $kind/$name (requests $GPU_REQ GPU(s))..."
          if [ "$kind" = "deployment" ]; then
            # Use Recreate strategy for GPU deployments to avoid rolling update deadlock.
            # With RollingUpdate, new pods can't start because old pods hold the GPUs.
            # Use merge patch with rollingUpdate:null to clear the existing rollingUpdate
            # config, which Kubernetes rejects when strategy type is Recreate.
            kubectl patch "$kind" "$name" -n "$NAMESPACE" --type=merge -p \
              '{"spec":{"strategy":{"type":"Recreate","rollingUpdate":null},"template":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}'
          else
            kubectl patch "$kind" "$name" -n "$NAMESPACE" --type=strategic -p \
              '{"spec":{"template":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}'
          fi
        fi
      done
    done
    # Also patch LeaderWorkerSets if present.
    # LeaderWorkerSet is a custom resource, and kubectl does not support
    # strategic merge patches for custom resources, so a JSON merge patch is
    # used. stderr is left visible so a failed patch reports its real cause;
    # the step still continues (best effort).
    # NOTE(review): only workerTemplate is patched; a separately defined
    # leaderTemplate would presumably keep its old priority - confirm.
    for name in $(kubectl get leaderworkersets -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
      echo " Patching leaderworkerset/$name..."
      kubectl patch leaderworkerset "$name" -n "$NAMESPACE" --type=merge -p \
        '{"spec":{"leaderWorkerTemplate":{"workerTemplate":{"spec":{"priorityClassName":"nightly-gpu-critical"}}}}}' || \
        echo " Could not patch LWS $name (see kubectl error above)"
    done
    echo "Priority patching complete"

    # Wait for rolling updates triggered by the priority patch to stabilize.
    # Without this, kubectl wait sees pods from both old and new ReplicaSets,
    # causing "NotFound" errors as old pods are deleted during rollout.
    echo "Waiting for rollouts to stabilize after priority patching..."
    for kind in deployment statefulset; do
      for name in $(kubectl get "$kind" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
        echo " Waiting for $kind/$name rollout..."
        kubectl rollout status "$kind/$name" -n "$NAMESPACE" --timeout=10m 2>/dev/null || \
          echo " Rollout status check timed out for $kind/$name (will retry in pod wait)"
      done
    done
    # Also wait for LeaderWorkerSet rollouts, since they were patched above.
    for name in $(kubectl get leaderworkersets -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
      echo " Waiting for leaderworkerset/$name rollout..."
      kubectl rollout status "leaderworkerset/$name" -n "$NAMESPACE" --timeout=10m 2>/dev/null || \
        echo " Rollout status check failed for leaderworkerset/$name (kubectl may not support rollout for this resource; will rely on pod wait)"
    done
    echo "Rollouts stabilized"

- name: Show deployment status
run: |
echo "=== Deployments ==="
Expand Down
Loading