From 4ebf0ff2c73c76b6b3c5e3bfaf5bd9172df6e425 Mon Sep 17 00:00:00 2001 From: Jingyuan Date: Mon, 9 Dec 2024 15:49:30 -0800 Subject: [PATCH] [Bug] Fix the way how podautoscaler handle 0 pods. (#508) * Bug fix * Make APA comfortable with 0 originalReadyPodsCount for completeness. * Verify ActivationScale > 1 settings. Keep readyPodsCount value intact and fix affected code. * Bug fix * Document the new logic in comments. --------- Co-authored-by: Jingyuan Zhang --- .../podautoscaler/podautoscaler_controller.go | 2 +- pkg/controller/podautoscaler/scaler/apa.go | 5 +++ pkg/controller/podautoscaler/scaler/kpa.go | 36 ++++++++++++++----- pkg/utils/pod.go | 2 +- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/pkg/controller/podautoscaler/podautoscaler_controller.go b/pkg/controller/podautoscaler/podautoscaler_controller.go index 6f9ea028..4a668011 100644 --- a/pkg/controller/podautoscaler/podautoscaler_controller.go +++ b/pkg/controller/podautoscaler/podautoscaler_controller.go @@ -368,7 +368,7 @@ func (r *PodAutoscalerReconciler) reconcileCustomPA(ctx context.Context, pa auto // computeReplicasForMetrics gives // TODO: check why it return the metrics name here? 
metricDesiredReplicas, metricName, metricTimestamp, err := r.computeReplicasForMetrics(ctx, pa, scale, metricKey) - if err != nil && metricDesiredReplicas == -1 { + if err != nil { r.setCurrentReplicasAndMetricsInStatus(&pa, currentReplicas) if err := r.updateStatusIfNeeded(ctx, paStatusOriginal, &pa); err != nil { return ctrl.Result{}, fmt.Errorf("failed to update the resource status") diff --git a/pkg/controller/podautoscaler/scaler/apa.go b/pkg/controller/podautoscaler/scaler/apa.go index 484a1565..c3583272 100644 --- a/pkg/controller/podautoscaler/scaler/apa.go +++ b/pkg/controller/podautoscaler/scaler/apa.go @@ -147,6 +147,11 @@ func (a *ApaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name return ScaleResult{} } + if originalReadyPodsCount == 0 { + klog.Errorf("Unexpected pod count for %s: %d", metricKey, originalReadyPodsCount) + return ScaleResult{} + } + currentUsePerPod := observedValue / float64(originalReadyPodsCount) spec.SetCurrentUsePerPod(currentUsePerPod) diff --git a/pkg/controller/podautoscaler/scaler/kpa.go b/pkg/controller/podautoscaler/scaler/kpa.go index 29c83091..9fe78473 100644 --- a/pkg/controller/podautoscaler/scaler/kpa.go +++ b/pkg/controller/podautoscaler/scaler/kpa.go @@ -249,10 +249,23 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name return ScaleResult{} } - // Use 1 if there are zero current pods. - readyPodsCount := math.Max(1, float64(originalReadyPodsCount)) - maxScaleUp := math.Ceil(spec.MaxScaleUpRate * readyPodsCount) - maxScaleDown := math.Floor(readyPodsCount / spec.MaxScaleDownRate) + // Old logic: + // readyPodsCount = max(1, originalReadyPodsCount) + // maxScaleUp = ceil(spec.MaxScaleUpRate*readyPodsCount) + // maxScaleDown = floor(readyPodsCount / spec.MaxScaleDownRate) + // + // The problems with the old way were: + // 1. readyPodsCount did not reflect real pods count in "KPA Details" log. + // 2. 
If originalReadyPodsCount == 0 and spec.MaxScaleDownRate == 1, maxScaleDown will reset to 1, + preventing down scale to 0. + // + // New implementation does the following: + // 1. readyPodsCount now reflects real pods count. + // 2. maxScaleUp is at least 1 to ensure 0 to 1 activation. + // 3. maxScaleDown will remain 0 in case readyPodsCount == 0 + readyPodsCount := math.Max(0, float64(originalReadyPodsCount)) // A little sanitizing. + maxScaleUp := math.Max(1, math.Ceil(spec.MaxScaleUpRate*readyPodsCount)) // Keep scale up non zero + maxScaleDown := math.Floor(readyPodsCount / spec.MaxScaleDownRate) // Make scale down zero-able dspc := math.Ceil(observedStableValue / spec.TargetValue) dppc := math.Ceil(observedPanicValue / spec.TargetValue) @@ -263,15 +276,17 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name // If ActivationScale > 1, then adjust the desired pod counts if k.scalingContext.ActivationScale > 1 { - if k.scalingContext.ActivationScale > desiredStablePodCount { + // ActivationScale only makes sense if activated (desired > 0) + if k.scalingContext.ActivationScale > desiredStablePodCount && desiredStablePodCount > 0 { desiredStablePodCount = k.scalingContext.ActivationScale } - if k.scalingContext.ActivationScale > desiredPanicPodCount { + if k.scalingContext.ActivationScale > desiredPanicPodCount && desiredPanicPodCount > 0 { desiredPanicPodCount = k.scalingContext.ActivationScale } } - isOverPanicThreshold := dppc/readyPodsCount >= spec.PanicThreshold + // Now readyPodsCount can be 0, use max(1, readyPodsCount) to prevent division by zero. 
+ isOverPanicThreshold := dppc/math.Max(1, readyPodsCount) >= spec.PanicThreshold klog.V(4).InfoS("--- KPA Details", "readyPodsCount", readyPodsCount, "MaxScaleUpRate", spec.MaxScaleUpRate, "MaxScaleDownRate", spec.MaxScaleDownRate, @@ -313,7 +328,7 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name } desiredPodCount = k.maxPanicPods } else { - klog.InfoS("Operating in stable mode.") + klog.InfoS("Operating in stable mode.", "desiredPodCount", desiredPodCount) } // Delay scale down decisions, if a ScaleDownDelay was specified. @@ -322,8 +337,9 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name // not the same in the case where two Scale()s happen in the same time // interval (because the largest will be picked rather than the most recent // in that case). - klog.V(4).InfoS("DelayWindow details", "delayWindow", k.delayWindow.String()) if k.delayWindow != nil { + klog.V(4).InfoS("DelayWindow details", "delayWindow", k.delayWindow.String()) + // the actual desiredPodCount will be recorded, but return the max replicas during passed delayWindow k.delayWindow.Record(now, float64(desiredPodCount)) delayedPodCount, err := k.delayWindow.Max() @@ -338,6 +354,8 @@ func (k *KpaAutoscaler) Scale(originalReadyPodsCount int, metricKey metrics.Name ) desiredPodCount = int32(delayedPodCount) } + } else { + klog.V(4).InfoS("No DelayWindow set") } // Compute excess burst capacity diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index 82debc09..6be24188 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -125,7 +125,7 @@ func GetPodListByLabelSelector(ctx context.Context, podLister client.Client, nam func CountReadyPods(podList *v1.PodList) (int64, error) { if podList == nil || len(podList.Items) == 0 { - return 0, fmt.Errorf("no pods provided or list is empty") + return 0, nil } readyPodCount := 0