Add pod diagnostics before scaling down to zero in scaler #15326

Closed · wants to merge 2 commits
6 changes: 6 additions & 0 deletions pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go
@@ -215,6 +215,12 @@ func (pas *PodAutoscalerStatus) MarkScaleTargetInitialized() {
podCondSet.Manage(pas).MarkTrue(PodAutoscalerConditionScaleTargetInitialized)
}

// MarkScaleTargetNotInitialized marks the PA's PodAutoscalerConditionScaleTargetInitialized
// condition false with the given reason and message.
func (pas *PodAutoscalerStatus) MarkScaleTargetNotInitialized(reason, message string) {
podCondSet.Manage(pas).MarkFalse(PodAutoscalerConditionScaleTargetInitialized, reason, message)
}

Comment on lines +220 to +223
Member

We should double check usages of this condition. Because before it would always be Unknown=>(True|False) and then remain unchanged.

I can't recall if there's code that assumes that it never changes.

Member

Maybe we want to introduce a new condition, to surface subsequent scaling issues.

Contributor Author (@skonto, Jun 26, 2024)

> I can't recall if there's code that assumes that it never changes.

Aren't tests covering revision transitions? I checked the PA status propagation for the revision reconciliation; we have specific cases where this matters, but they don't seem affected. I can take another look if there is a scenario where this might be a problem. In general we should be able to set this to False (for whatever reason), since it is a legitimate value, and any reconciliation should then take that condition into account and adjust. Here we go from True to False.
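To make the concern concrete, here is a minimal test sketch (not part of this PR; it assumes the existing `InitializeConditions` and `IsScaleTargetInitialized` helpers in `pa_lifecycle.go`) that pins down the True-to-False transition under discussion:

```go
package v1alpha1

import "testing"

// Sketch only: verifies that ScaleTargetInitialized, once True, can be
// flipped back to False by the new helper added in this PR.
func TestScaleTargetInitializedCanFlipToFalse(t *testing.T) {
	pas := &PodAutoscalerStatus{}
	pas.InitializeConditions()

	pas.MarkScaleTargetInitialized()
	if !pas.IsScaleTargetInitialized() {
		t.Fatal("expected ScaleTargetInitialized to be True after MarkScaleTargetInitialized")
	}

	// Simulate all pods being stuck in a waiting state (e.g. ImagePullBackOff).
	pas.MarkScaleTargetNotInitialized("ImagePullBackOff", "Back-off pulling image")
	if pas.IsScaleTargetInitialized() {
		t.Fatal("expected ScaleTargetInitialized to be False after MarkScaleTargetNotInitialized")
	}
}
```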

// MarkSKSReady marks the PA condition denoting that SKS is ready.
func (pas *PodAutoscalerStatus) MarkSKSReady() {
podCondSet.Manage(pas).MarkTrue(PodAutoscalerConditionSKSReady)
9 changes: 7 additions & 2 deletions pkg/reconciler/autoscaling/kpa/kpa.go
@@ -114,10 +114,16 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pa *autoscalingv1alpha1.
if err := c.ReconcileMetric(ctx, pa, resolveScrapeTarget(ctx, pa)); err != nil {
return fmt.Errorf("error reconciling Metric: %w", err)
}
podCounter := resourceutil.NewPodAccessor(c.podsLister, pa.Namespace, pa.Labels[serving.RevisionLabelKey])

pod, err := podCounter.GetAnyPod()
Member

Should be getting a pod that isn't ready - e.g. you could have min scale = 10 and the last pod can't be scheduled (due to resource constraints).

Contributor Author (@skonto, Jun 26, 2024)

We are not targeting all pods; we are targeting the scenario with the image issue. If someone wants to cover all cases they can extend the work here later. Maybe I should change the PR title: here we are only adding pod diagnostics for the image issue, or similar issues where all pods are stuck and the deployment reconciliation cannot catch it due to known K8s limitations (the progress deadline cannot catch all cases).
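If the broader scenario were in scope, an accessor that prefers a not-Ready pod could be added next to `GetAnyPod` in `pkg/resources/pods.go`. This is only an illustrative sketch; `GetAnyNotReadyPod` is a hypothetical name and is not part of this PR (it assumes the same `podsLister`/`selector` fields and `corev1` import that `GetAnyPod` uses):

```go
// GetAnyNotReadyPod is a hypothetical variant of GetAnyPod: it prefers a pod
// that is not Ready (e.g. unschedulable or stuck pulling its image), and
// returns nil when every pod is Ready or no pods exist.
func (pa PodAccessor) GetAnyNotReadyPod() (*corev1.Pod, error) {
	pods, err := pa.podsLister.List(pa.selector)
	if err != nil {
		return nil, err
	}
	for _, p := range pods {
		ready := false
		for _, cond := range p.Status.Conditions {
			if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
				ready = true
				break
			}
		}
		if !ready {
			return p, nil
		}
	}
	return nil, nil
}
```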

if err != nil {
return fmt.Errorf("error getting a pod for the revision: %w", err)
}
Comment on lines +119 to +122
Member

Fetching a pod here seems premature

Contributor Author (@skonto, Jun 26, 2024)

Could you elaborate? We already do that in this function via the pod accessor, for getting the state a few lines below. We test for the scale-to-zero case and only check the pod status if we have to.
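As an alternative addressing the "premature" concern, the lookup could be deferred into the scaler and performed only when the scale-to-zero diagnostic branch is taken. The fragment below is a sketch only, not what this PR does; it assumes a pod accessor (here named `podAccessor`) is reachable from inside `scaler.scale`, and it reuses names already in scope there:

```go
// Sketch only: fetch the pod lazily inside scaler.scale instead of receiving
// podForErrorChecking from ReconcileKind. The podAccessor wiring is assumed.
if desiredScale == 0 && pa.Status.IsActivating() {
	pod, err := podAccessor.GetAnyPod()
	if err != nil {
		return desiredScale, fmt.Errorf("error getting a pod for the revision: %w", err)
	}
	if pod != nil {
		if err := checkAvailabilityBeforeScalingDown(ctx, logger, pod, pa, client); err != nil {
			return desiredScale, fmt.Errorf("failed to check availability for target %v: %w", pa.Spec.ScaleTargetRef, err)
		}
	}
}
```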


// Get the appropriate current scale from the metric, and right size
// the scaleTargetRef based on it.
want, err := c.scaler.scale(ctx, pa, sks, decider.Status.DesiredScale)
want, err := c.scaler.scale(ctx, pa, sks, decider.Status.DesiredScale, c.Client, pod)
if err != nil {
return fmt.Errorf("error scaling target: %w", err)
}
@@ -145,7 +151,6 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pa *autoscalingv1alpha1.
}

// Compare the desired and observed resources to determine our situation.
podCounter := resourceutil.NewPodAccessor(c.podsLister, pa.Namespace, pa.Labels[serving.RevisionLabelKey])
ready, notReady, pending, terminating, err := podCounter.PodCountsByState()
if err != nil {
return fmt.Errorf("error getting pod counts: %w", err)
39 changes: 38 additions & 1 deletion pkg/reconciler/autoscaling/kpa/scaler.go
@@ -35,16 +35,21 @@ import (
"knative.dev/serving/pkg/activator"
autoscalingv1alpha1 "knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
"knative.dev/serving/pkg/autoscaler/config/autoscalerconfig"
clientset "knative.dev/serving/pkg/client/clientset/versioned"
"knative.dev/serving/pkg/reconciler/autoscaling/config"
kparesources "knative.dev/serving/pkg/reconciler/autoscaling/kpa/resources"
aresources "knative.dev/serving/pkg/reconciler/autoscaling/resources"
revresurces "knative.dev/serving/pkg/reconciler/revision/resources"
"knative.dev/serving/pkg/resources"

"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/retry"
)

const (
@@ -326,7 +331,7 @@ func (ks *scaler) applyScale(ctx context.Context, pa *autoscalingv1alpha1.PodAut
}

// scale attempts to scale the given PA's target reference to the desired scale.
func (ks *scaler) scale(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscaler, sks *netv1alpha1.ServerlessService, desiredScale int32) (int32, error) {
func (ks *scaler) scale(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscaler, sks *netv1alpha1.ServerlessService, desiredScale int32, client clientset.Interface, podForErrorChecking *corev1.Pod) (int32, error) {
asConfig := config.FromContext(ctx).Autoscaler
logger := logging.FromContext(ctx)

@@ -371,6 +376,38 @@ func (ks *scaler) scale(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscal
return desiredScale, nil
}

// Before we apply scale to zero, and since we have failed to activate, check whether any pod is in a waiting state.
// This should capture cases where we scale from zero and the regular progressDeadline expiration is never hit, a
// known K8s limitation when we are not in a deployment rollout.
if desiredScale == 0 && pa.Status.IsActivating() && podForErrorChecking != nil {
if err = checkAvailabilityBeforeScalingDown(ctx, logger, podForErrorChecking, pa, client); err != nil {
return desiredScale, fmt.Errorf("failed to check availability for target %v: %w", pa.Spec.ScaleTargetRef, err)
}
}

logger.Infof("Scaling from %d to %d", currentScale, desiredScale)
return desiredScale, ks.applyScale(ctx, pa, desiredScale, ps)
}

// checkAvailabilityBeforeScalingDown inspects the given pod's non-queue-proxy container statuses and, if a container
// is stuck in a waiting state, marks the PA's scale target as not initialized and the revision's resources as unavailable.
func checkAvailabilityBeforeScalingDown(ctx context.Context, logger *zap.SugaredLogger, pod *corev1.Pod, pa *autoscalingv1alpha1.PodAutoscaler, client clientset.Interface) error {
for _, status := range pod.Status.ContainerStatuses {
if status.Name != revresurces.QueueContainerName {
if w := status.State.Waiting; w != nil {
logger.Debugf("marking scale target not initialized: %s: %s", w.Reason, w.Message)
pa.Status.MarkScaleTargetNotInitialized(w.Reason, w.Message)
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
rev, err := client.ServingV1().Revisions(pa.Namespace).Get(ctx, pa.Name, metav1.GetOptions{})
if err != nil {
return err
}
rev.Status.MarkResourcesAvailableFalse(w.Reason, w.Message)
if _, err = client.ServingV1().Revisions(pa.Namespace).UpdateStatus(ctx, rev, metav1.UpdateOptions{}); err != nil {
return err
}
return nil
})
}
}
Comment on lines +398 to +410
Member

this is sorta violating our abstractions - if we wanted to propagate this error message to the revision we would have to do it through a PodAutoscaler condition.

Contributor Author (@skonto, Jun 26, 2024)

> have to do it through a PodAutoscaler condition.

Normally yes, but due to the distributed status logic (unclear and undocumented) and how things are implemented, this is safer imho, as it makes the decision locally and avoids influencing anything else down the code path. 🤷 I can try to change it, but it would require propagating this decision down to the PA status update (not ideal, as that code is many lines below); I had it that way previously. Let's see.

}
return nil
}
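For reference, the abstraction-preserving alternative the reviewer describes could look roughly like the fragment below, placed in the revision reconciler's PA-status propagation. It is a sketch under the assumption that `GetCondition` and `MarkResourcesAvailableFalse` behave as defined in the respective lifecycle helpers; the propagation site itself is not part of this PR:

```go
// Sketch: the scaler only marks the PodAutoscaler condition, and the revision
// reconciler translates it into the Revision's ResourcesAvailable condition.
cond := pa.Status.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized)
if cond != nil && cond.Status == corev1.ConditionFalse {
	rev.Status.MarkResourcesAvailableFalse(cond.Reason, cond.Message)
}
```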
4 changes: 2 additions & 2 deletions pkg/reconciler/autoscaling/kpa/scaler_test.go
@@ -556,7 +556,7 @@ func TestScaler(t *testing.T) {
test.configMutator(cfg)
}
ctx = config.ToContext(ctx, cfg)
desiredScale, err := revisionScaler.scale(ctx, pa, sks, test.scaleTo)
desiredScale, err := revisionScaler.scale(ctx, pa, sks, test.scaleTo, fakeservingclient.Get(ctx), nil)
if err != nil {
t.Error("Scale got an unexpected error:", err)
}
@@ -647,7 +647,7 @@ func TestDisableScaleToZero(t *testing.T) {
conf := defaultConfig()
conf.Autoscaler.EnableScaleToZero = false
ctx = config.ToContext(ctx, conf)
desiredScale, err := revisionScaler.scale(ctx, pa, nil /*sks doesn't matter in this test*/, test.scaleTo)
desiredScale, err := revisionScaler.scale(ctx, pa, nil /*sks doesn't matter in this test*/, test.scaleTo, fakeservingclient.Get(ctx), nil)

if err != nil {
t.Error("Scale got an unexpected error:", err)
12 changes: 12 additions & 0 deletions pkg/resources/pods.go
@@ -44,6 +44,18 @@ func NewPodAccessor(lister corev1listers.PodLister, namespace, revisionName stri
}
}

// GetAnyPod returns an arbitrary pod for the revision backing this accessor, or a nil pod
// (and a nil error) when the revision currently has no pods, so callers must nil-check the result.
func (pa PodAccessor) GetAnyPod() (pod *corev1.Pod, err error) {
pods, err := pa.podsLister.List(pa.selector)
if err != nil {
return nil, err
}
if len(pods) != 0 {
return pods[0], nil
}
return nil, nil
}
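Because the helper returns a nil pod together with a nil error when the revision has no pods, callers need an explicit nil check. A minimal usage sketch mirroring the reconciler's call site (names such as `podsLister`, `namespace`, and `revisionName` stand in for whatever the caller has in scope):

```go
podCounter := resources.NewPodAccessor(podsLister, namespace, revisionName)
pod, err := podCounter.GetAnyPod()
if err != nil {
	return fmt.Errorf("error getting a pod for the revision: %w", err)
}
if pod == nil {
	// The revision currently has no pods (e.g. already scaled to zero); nothing to inspect.
	return nil
}
// Safe to inspect pod.Status from here on.
```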

// PodCountsByState returns number of pods for the revision grouped by their state, that is
// of interest to knative (e.g. ignoring failed or terminated pods).
func (pa PodAccessor) PodCountsByState() (ready, notReady, pending, terminating int, err error) {