From 4ebf0ff2c73c76b6b3c5e3bfaf5bd9172df6e425 Mon Sep 17 00:00:00 2001 From: Jingyuan Date: Mon, 9 Dec 2024 15:49:30 -0800 Subject: [PATCH] [Bug] Fix the way how podautoscaler handle 0 pods. (#508) * Bug fix * Make APA comfortable with 0 originalReadyPodsCount for completeness. * Verify ActivationScale > 1 settings. Keep readyPodsCount value intact and fix affected code. * Bug fix * Document the new logic in comments. --------- Co-authored-by: Jingyuan Zhang --- .../podautoscaler/podautoscaler_controller.go | 2 +- pkg/controller/podautoscaler/scaler/apa.go | 5 +++ pkg/controller/podautoscaler/scaler/kpa.go | 36 ++++++++++++++----- pkg/utils/pod.go | 2 +- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/pkg/controller/podautoscaler/podautoscaler_controller.go b/pkg/controller/podautoscaler/podautoscaler_controller.go index 6f9ea028..4a668011 100644 --- a/pkg/controller/podautoscaler/podautoscaler_controller.go +++ b/pkg/controller/podautoscaler/podautoscaler_controller.go @@ -368,7 +368,7 @@ func (r *PodAutoscalerReconciler) reconcileCustomPA(ctx context.Context, pa auto // computeReplicasForMetrics gives // TODO: check why it return the metrics name here? 
metricDesiredReplicas, metricName, metricTimestamp, err := r.computeReplicasForMetrics(ctx, pa, scale, metricKey) - if err != nil && metricDesiredReplicas == -1 { + if err != nil { r.setCurrentReplicasAndMetricsInStatus(&pa, currentReplicas) if err := r.updateStatusIfNeeded(ctx, paStatusOriginal, &pa); err != nil { return ctrl.Result{}, fmt.Errorf("failed to update the resource status") diff --git a/pkg/controller/podautoscaler/scaler/apa.go b/pkg/controller/podautoscaler/scaler/apa.go index 484a1565..c3583272 100644 --- a/pkg/controller/podautoscaler/scaler/apa.go +++ b/pkg/controller/podautoscaler/scaler/apa.go @@ -147,6 +147,11 @@ func (a *ApaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name return ScaleResult{} } + if originalReadyPodsCount == 0 { + klog.Errorf("Unexpected pod count for %s: %d", metricKey, originalReadyPodsCount) + return ScaleResult{} + } + currentUsePerPod := observedValue / float64(originalReadyPodsCount) spec.SetCurrentUsePerPod(currentUsePerPod) diff --git a/pkg/controller/podautoscaler/scaler/kpa.go b/pkg/controller/podautoscaler/scaler/kpa.go index 29c83091..9fe78473 100644 --- a/pkg/controller/podautoscaler/scaler/kpa.go +++ b/pkg/controller/podautoscaler/scaler/kpa.go @@ -249,10 +249,23 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name return ScaleResult{} } - // Use 1 if there are zero current pods. - readyPodsCount := math.Max(1, float64(originalReadyPodsCount)) - maxScaleUp := math.Ceil(spec.MaxScaleUpRate * readyPodsCount) - maxScaleDown := math.Floor(readyPodsCount / spec.MaxScaleDownRate) + // Old logic: + // readyPodsCount = max(1, originalReadyPodsCount) + // maxScaleUp = ceil(spec.MaxScaleUpRate*readyPodsCount) + // maxScaleDown = floor(readyPodsCount / spec.MaxScaleDownRate) + // + // The problems with the old way were: + // 1. readyPodsCount did not reflect real pods count in "KPA Details" log. + // 2. 
If originalReadyPodsCount == 0 and spec.MaxScaleDownRate == 1, maxScaleDown will reset to 1, + preventing down scale to 0. + // + // New implementation does the following: + // 1. readyPodsCount now reflects real pods count. + // 2. maxScaleUp is at least 1 to ensure 0 to 1 activation. + // 3. maxScaleDown will remain 0 in case readyPodsCount == 0 + readyPodsCount := math.Max(0, float64(originalReadyPodsCount)) // A little sanitizing. + maxScaleUp := math.Max(1, math.Ceil(spec.MaxScaleUpRate*readyPodsCount)) // Keep scale up non zero + maxScaleDown := math.Floor(readyPodsCount / spec.MaxScaleDownRate) // Make scale down zero-able dspc := math.Ceil(observedStableValue / spec.TargetValue) dppc := math.Ceil(observedPanicValue / spec.TargetValue) @@ -263,15 +276,17 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name // If ActivationScale > 1, then adjust the desired pod counts if k.scalingContext.ActivationScale > 1 { - if k.scalingContext.ActivationScale > desiredStablePodCount { + // ActivationScale only makes sense if activated (desired > 0) + if k.scalingContext.ActivationScale > desiredStablePodCount && desiredStablePodCount > 0 { desiredStablePodCount = k.scalingContext.ActivationScale } - if k.scalingContext.ActivationScale > desiredPanicPodCount { + if k.scalingContext.ActivationScale > desiredPanicPodCount && desiredPanicPodCount > 0 { desiredPanicPodCount = k.scalingContext.ActivationScale } } - isOverPanicThreshold := dppc/readyPodsCount >= spec.PanicThreshold + // Now readyPodsCount can be 0, use max(1, readyPodsCount) to prevent division by zero. 
+ isOverPanicThreshold := dppc/math.Max(1, readyPodsCount) >= spec.PanicThreshold klog.V(4).InfoS("--- KPA Details", "readyPodsCount", readyPodsCount, "MaxScaleUpRate", spec.MaxScaleUpRate, "MaxScaleDownRate", spec.MaxScaleDownRate, @@ -313,7 +328,7 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name } desiredPodCount = k.maxPanicPods } else { - klog.InfoS("Operating in stable mode.") + klog.InfoS("Operating in stable mode.", "desiredPodCount", desiredPodCount) } // Delay scale down decisions, if a ScaleDownDelay was specified. @@ -322,8 +337,9 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name // not the same in the case where two Scale()s happen in the same time // interval (because the largest will be picked rather than the most recent // in that case). - klog.V(4).InfoS("DelayWindow details", "delayWindow", k.delayWindow.String()) if k.delayWindow != nil { + klog.V(4).InfoS("DelayWindow details", "delayWindow", k.delayWindow.String()) + // the actual desiredPodCount will be recorded, but return the max replicas during passed delayWindow k.delayWindow.Record(now, float64(desiredPodCount)) delayedPodCount, err := k.delayWindow.Max() @@ -338,6 +354,8 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name ) desiredPodCount = int32(delayedPodCount) } + } else { + klog.V(4).InfoS("No DelayWindow set") } // Compute excess burst capacity diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index 82debc09..6be24188 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -125,7 +125,7 @@ func GetPodListByLabelSelector(ctx context.Context, podLister client.Client, nam func CountReadyPods(podList *v1.PodList) (int64, error) { if podList == nil || len(podList.Items) == 0 { - return 0, fmt.Errorf("no pods provided or list is empty") + return 0, nil } readyPodCount := 0