vpa-recommender: Add support for configuring global max allowed resources #7560

Open · wants to merge 4 commits into base: master
14 changes: 14 additions & 0 deletions vertical-pod-autoscaler/docs/examples.md
@@ -11,6 +11,7 @@
- [Controlling eviction behavior based on scaling direction and resource](#controlling-eviction-behavior-based-on-scaling-direction-and-resource)
- [Limiting which namespaces are used](#limiting-which-namespaces-are-used)
- [Setting the webhook failurePolicy](#setting-the-webhook-failurepolicy)
- [Specifying global maximum allowed resources to prevent pods from being unschedulable](#specifying-global-maximum-allowed-resources-to-prevent-pods-from-being-unschedulable)

## Keeping limit proportional to request

@@ -108,3 +109,16 @@ These options cannot be used together and are mutually exclusive.
It is possible to set the failurePolicy of the webhook to `Fail` by passing `--webhook-failure-policy-fail=true` to the VPA admission controller.
Please use this option with caution as it may be possible to break Pod creation if there is a failure with the VPA.
Use it in conjunction with `--ignored-vpa-object-namespaces=kube-system` or `--vpa-object-namespace` to reduce this risk.

### Specifying global maximum allowed resources to prevent pods from being unschedulable
Member:

What do you think about moving this section to the "features.md" page?

Contributor Author:

There are many sections in examples.md which actually describe features of VPA (Starting multiple recommenders, Custom memory bump-up after OOMKill, Using CPU management with static policy, Controlling eviction behavior based on scaling direction and resource, etc.). I don't think the newly introduced section is different from those existing sections.
IMO, examples and features are overlapping conceptually. If you describe a feature, you usually also add example(s) of how the feature can be used. examples.md and features.md should be merged IMO. This is out-of-scope of the existing PR.

Member:

> IMO, examples and features are overlapping conceptually. If you describe a feature, you usually also add example(s) of how the feature can be used. examples.md and features.md should be merged IMO. This is out-of-scope of the existing PR.

I agree with you here. The features page is new, and I want to start moving the examples across to the features page, with more description about that feature.

I thought it would be nice for the "global max" feature to be added to features from the start, since it's a better fit there, and will require work to move at a later stage.

Member:

(but it's up to you though, the documentation needs a lot of work in general)

Contributor Author:

I would prefer to keep it consistent with the existing doc. In another PR, the whole examples doc can be reworked into features sections.


The [Known limitations document](./known-limitations.md) outlines that VPA (the vpa-recommender in particular) is not aware of the cluster's maximum allocatable resources and can recommend resources that will not fit even the largest node in the cluster. This issue occurs even when the cluster uses the [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#basics): if the vpa-recommender's recommendation exceeds the allocatable of the largest node, pods will be unschedulable (stuck in the `Pending` state) and will not fit the cluster even after the Cluster Autoscaler adds a new node.
It is possible to mitigate this issue by specifying the `--container-recommendation-max-allowed-cpu` and `--container-recommendation-max-allowed-memory` flags of the vpa-recommender. These flags represent the global maximum amount of cpu/memory that will be recommended **for a container**. If the VerticalPodAutoscaler already defines a max allowed (`.spec.resourcePolicy.containerPolicies.maxAllowed`), it takes precedence over the global max allowed. If the VerticalPodAutoscaler's max allowed specifies only one of cpu or memory, the global max allowed is merged with it to fill in the missing resource. If the VerticalPodAutoscaler does not specify a max allowed at all, the global max allowed is used.
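
For illustration, a hypothetical merge of the two sources of max allowed (the flag values and the VPA policy below are invented for this example):
```
flags:            --container-recommendation-max-allowed-cpu=8
                  --container-recommendation-max-allowed-memory=32Gi
VPA maxAllowed:   {memory: 16Gi}           # cpu is not set in the VPA policy
effective max:    {cpu: 8, memory: 16Gi}   # the VPA's memory wins; the global cpu fills the gap
```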

The recommended way to compute the `--container-recommendation-max-allowed-cpu` and `--container-recommendation-max-allowed-memory` values for your cluster is to take the largest node's allocatable (the `.status.allocatable` field of the Node) and subtract the resource requests of the DaemonSet pods and a safety margin:
```
<max allowed> = <largest node's allocatable> - <resource requests of DaemonSet pods> - <safety margin>
```
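
For example, assuming a hypothetical cluster whose largest node has 64 CPU and 256Gi allocatable, whose DaemonSet pods request 2 CPU and 4Gi in total, and with a safety margin of 2 CPU and 4Gi:
```
<max allowed cpu>    = 64    - 2   - 2   = 60
<max allowed memory> = 256Gi - 4Gi - 4Gi = 248Gi
```
which translates to `--container-recommendation-max-allowed-cpu=60` and `--container-recommendation-max-allowed-memory=248Gi`.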

> [!WARNING]
> Note that `--container-recommendation-max-allowed-cpu` and `--container-recommendation-max-allowed-memory` are **container-level** flags. A pod that enables autoscaling for more than one container can theoretically still become unschedulable if the sum of the containers' resource recommendations exceeds the largest Node's allocatable. In practice this case is unlikely: usually a single container in a pod is the main one, and the others are sidecars that either do not need autoscaling or do not have high resource requests.
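
A hypothetical illustration of that edge case, with invented numbers:
```
largest node allocatable:      8 CPU
recommendation, container A:   5 CPU   (individually within the 8 CPU global max allowed)
recommendation, container B:   4 CPU   (individually within the 8 CPU global max allowed)
pod total:                     9 CPU   -> exceeds the node's allocatable; the pod stays Pending
```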
2 changes: 2 additions & 0 deletions vertical-pod-autoscaler/docs/faq.md
@@ -212,6 +212,8 @@ Name | Type | Description | Default
`container-pod-name-label` | String | Label name to look for container pod names | "pod_name"
`container-name-label` | String | Label name to look for container names | "name"
`vpa-object-namespace` | String | Namespace to search for VPA objects and pod stats. Empty means all namespaces will be used. | apiv1.NamespaceAll
`container-recommendation-max-allowed-cpu` | Quantity | Maximum amount of CPU that will be recommended for a container. VerticalPodAutoscaler-level maximum allowed takes precedence over the global maximum allowed. | Empty (no max allowed cpu by default)
`container-recommendation-max-allowed-memory` | Quantity | Maximum amount of memory that will be recommended for a container. VerticalPodAutoscaler-level maximum allowed takes precedence over the global maximum allowed. | Empty (no max allowed memory by default)
`memory-aggregation-interval` | Duration | The length of a single interval, for which the peak memory usage is computed. Memory usage peaks are aggregated in multiples of this interval. In other words, there is one memory usage sample per interval (the maximum usage over that interval). | model.DefaultMemoryAggregationInterval
`memory-aggregation-interval-count` | Int64 | The number of consecutive memory-aggregation-intervals which make up the MemoryAggregationWindowLength which in turn is the period for memory usage aggregation by VPA. In other words, MemoryAggregationWindowLength = memory-aggregation-interval * memory-aggregation-interval-count. | model.DefaultMemoryAggregationIntervalCount
`memory-histogram-decay-half-life` | Duration | The amount of time it takes a historical memory usage sample to lose half of its weight. In other words, a fresh usage sample is twice as 'important' as one with age equal to the half life period. | model.DefaultMemoryHistogramDecayHalfLife
4 changes: 3 additions & 1 deletion vertical-pod-autoscaler/docs/known-limitations.md
@@ -21,7 +21,9 @@
- VPA performance has not been tested in large clusters.
- VPA recommendation might exceed available resources (e.g. Node size, available
size, available quota) and cause **pods to go pending**. This can be partly
addressed by using VPA together with [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#basics).
addressed by:
* using VPA together with [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#basics) - the drawback of this approach is that pods can still become unschedulable if the recommendation exceeds the largest Node's allocatable.
* specifying the `--container-recommendation-max-allowed-cpu` and `--container-recommendation-max-allowed-memory` flags - the drawback of this approach is that a pod can still become unschedulable if more than one container in the pod is scaled by VPA and the sum of the container recommendations exceeds the largest Node's allocatable.
- Multiple VPA resources matching the same pod have undefined behavior.
- Running the vpa-recommender with leader election enabled (`--leader-elect=true`) in a GKE cluster
causes contention with a lease called `vpa-recommender` held by the GKE system component of the
23 changes: 22 additions & 1 deletion vertical-pod-autoscaler/pkg/recommender/main.go
@@ -25,6 +25,7 @@ import (

"github.com/spf13/pflag"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/client-go/informers"
@@ -103,6 +104,8 @@ var (
var (
// CPU as integer to benefit for CPU management Static Policy ( https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy )
postProcessorCPUasInteger = flag.Bool("cpu-integer-post-processor-enabled", false, "Enable the cpu-integer recommendation post processor. The post processor will round up CPU recommendations to a whole CPU for pods which were opted in by setting an appropriate label on VPA object (experimental)")
maxAllowedCPU = resource.QuantityValue{}
maxAllowedMemory = resource.QuantityValue{}
)

const (
@@ -115,6 +118,11 @@ const (
defaultResyncPeriod time.Duration = 10 * time.Minute
)

func init() {
flag.Var(&maxAllowedCPU, "container-recommendation-max-allowed-cpu", "Maximum amount of CPU that will be recommended for a container. VerticalPodAutoscaler-level maximum allowed takes precedence over the global maximum allowed.")
flag.Var(&maxAllowedMemory, "container-recommendation-max-allowed-memory", "Maximum amount of memory that will be recommended for a container. VerticalPodAutoscaler-level maximum allowed takes precedence over the global maximum allowed.")
}

func main() {
commonFlags := common.InitCommonFlags()
klog.InitFlags(nil)
@@ -215,8 +223,9 @@ func run(healthCheck *metrics.HealthCheck, commonFlag *common.CommonFlags) {
postProcessors = append(postProcessors, &routines.IntegerCPUPostProcessor{})
}

globalMaxAllowed := initGlobalMaxAllowed()
// CappingPostProcessor, should always come in the last position for post-processing
postProcessors = append(postProcessors, &routines.CappingPostProcessor{})
postProcessors = append(postProcessors, routines.NewCappingRecommendationProcessor(globalMaxAllowed))
var source input_metrics.PodMetricsLister
if *useExternalMetrics {
resourceMetrics := map[apiv1.ResourceName]string{}
@@ -307,3 +316,15 @@ func run(healthCheck *metrics.HealthCheck, commonFlag *common.CommonFlags) {
healthCheck.UpdateLastActivity()
}
}

func initGlobalMaxAllowed() apiv1.ResourceList {
result := make(apiv1.ResourceList)
if !maxAllowedCPU.Quantity.IsZero() {
result[apiv1.ResourceCPU] = maxAllowedCPU.Quantity
}
if !maxAllowedMemory.Quantity.IsZero() {
result[apiv1.ResourceMemory] = maxAllowedMemory.Quantity
}

return result
}
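
As a side note, `resource.QuantityValue` implements the `pflag.Value`/`flag.Value` interface, which is what makes the `flag.Var` calls in `init()` work. A minimal, self-contained sketch (a hypothetical standalone program, not part of this PR) of how such flags parse into an `apiv1.ResourceList`:

```go
package main

import (
	"flag"
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	var maxCPU, maxMemory resource.QuantityValue
	flag.Var(&maxCPU, "max-cpu", "maximum CPU quantity, e.g. 60 or 500m")
	flag.Var(&maxMemory, "max-memory", "maximum memory quantity, e.g. 248Gi")
	flag.Parse()

	// Mirrors initGlobalMaxAllowed above: only resources that were actually
	// set end up in the list, so an unset flag imposes no global cap.
	globalMaxAllowed := make(apiv1.ResourceList)
	if !maxCPU.Quantity.IsZero() {
		globalMaxAllowed[apiv1.ResourceCPU] = maxCPU.Quantity
	}
	if !maxMemory.Quantity.IsZero() {
		globalMaxAllowed[apiv1.ResourceMemory] = maxMemory.Quantity
	}
	for name, quantity := range globalMaxAllowed {
		fmt.Printf("%s: %s\n", name, quantity.String()) // e.g. "cpu: 60", "memory: 248Gi"
	}
}
```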
vertical-pod-autoscaler/pkg/recommender/routines/capping_post_processor.go
@@ -19,20 +19,29 @@ package routines
import (
"k8s.io/klog/v2"

apiv1 "k8s.io/api/core/v1"
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
vpa_utils "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/vpa"
)

// CappingPostProcessor ensures that the policy is applied to the recommendation.
// It applies the policy for the fields MinAllowed and MaxAllowed.
type CappingPostProcessor struct{}
type cappingPostProcessor struct {
globalMaxAllowed apiv1.ResourceList
}

var _ RecommendationPostProcessor = &cappingPostProcessor{}

var _ RecommendationPostProcessor = &CappingPostProcessor{}
// NewCappingRecommendationProcessor constructs new RecommendationPostProcessor that adjusts recommendation
// for given pod to obey VPA resources policy and a global max allowed configuration.
func NewCappingRecommendationProcessor(globalMaxAllowed apiv1.ResourceList) RecommendationPostProcessor {
return &cappingPostProcessor{
globalMaxAllowed: globalMaxAllowed,
}
}

// Process applies the capping post-processing to the recommendation. (used to be the function getCappedRecommendation)
func (c CappingPostProcessor) Process(vpa *vpa_types.VerticalPodAutoscaler, recommendation *vpa_types.RecommendedPodResources) *vpa_types.RecommendedPodResources {
func (c cappingPostProcessor) Process(vpa *vpa_types.VerticalPodAutoscaler, recommendation *vpa_types.RecommendedPodResources) *vpa_types.RecommendedPodResources {
// TODO: maybe rename the vpa_utils.ApplyVPAPolicy to something that mention that it is doing capping only
cappedRecommendation, err := vpa_utils.ApplyVPAPolicy(recommendation, vpa.Spec.ResourcePolicy)
cappedRecommendation, err := vpa_utils.ApplyVPAPolicy(recommendation, vpa.Spec.ResourcePolicy, c.globalMaxAllowed)
if err != nil {
klog.ErrorS(err, "Failed to apply policy for VPA", "vpa", klog.KObj(vpa))
return recommendation
38 changes: 25 additions & 13 deletions vertical-pod-autoscaler/pkg/utils/vpa/capping.go
@@ -184,22 +184,37 @@ func applyVPAPolicy(recommendation apiv1.ResourceList, policy *vpa_types.Contain

func applyVPAPolicyForContainer(containerName string,
containerRecommendation *vpa_types.RecommendedContainerResources,
policy *vpa_types.PodResourcePolicy) (*vpa_types.RecommendedContainerResources, error) {
policy *vpa_types.PodResourcePolicy,
globalMaxAllowed apiv1.ResourceList) (*vpa_types.RecommendedContainerResources, error) {
if containerRecommendation == nil {
return nil, fmt.Errorf("no recommendation available for container name %v", containerName)
}
cappedRecommendations := containerRecommendation.DeepCopy()
// containerPolicy can be nil (user does not have to configure it).
containerPolicy := GetContainerResourcePolicy(containerName, policy)
if containerPolicy == nil {
return cappedRecommendations, nil
}

process := func(recommendation apiv1.ResourceList) {
for resourceName, recommended := range recommendation {
cappedToMin, _ := maybeCapToPolicyMin(recommended, resourceName, containerPolicy)
recommendation[resourceName] = cappedToMin
cappedToMax, _ := maybeCapToPolicyMax(cappedToMin, resourceName, containerPolicy)
var maxAllowed apiv1.ResourceList
// containerPolicy can be nil (user does not have to configure it).
if containerPolicy != nil {
cappedToMin, _ := maybeCapToPolicyMin(recommended, resourceName, containerPolicy)
recommendation[resourceName] = cappedToMin

maxAllowed = containerPolicy.MaxAllowed
}

if maxAllowed == nil {
maxAllowed = globalMaxAllowed
} else {
// Set resources from the global maxAllowed if the VPA maxAllowed is missing them.
for resourceName, quantity := range globalMaxAllowed {
if _, ok := maxAllowed[resourceName]; !ok {
maxAllowed[resourceName] = quantity
}
}
}

cappedToMax, _ := maybeCapToMax(recommendation[resourceName], resourceName, maxAllowed)
recommendation[resourceName] = cappedToMax
}
}
@@ -241,19 +256,16 @@ func maybeCapToMin(recommended resource.Quantity, resourceName apiv1.ResourceNam

// ApplyVPAPolicy returns a recommendation, adjusted to obey policy.
func ApplyVPAPolicy(podRecommendation *vpa_types.RecommendedPodResources,
policy *vpa_types.PodResourcePolicy) (*vpa_types.RecommendedPodResources, error) {
policy *vpa_types.PodResourcePolicy, globalMaxAllowed apiv1.ResourceList) (*vpa_types.RecommendedPodResources, error) {
if podRecommendation == nil {
return nil, nil
}
if policy == nil {
return podRecommendation, nil
}

updatedRecommendations := []vpa_types.RecommendedContainerResources{}
for _, containerRecommendation := range podRecommendation.ContainerRecommendations {
containerName := containerRecommendation.ContainerName
updatedContainerResources, err := applyVPAPolicyForContainer(containerName,
&containerRecommendation, policy)
&containerRecommendation, policy, globalMaxAllowed)
if err != nil {
return nil, fmt.Errorf("cannot apply policy on recommendation for container name %v", containerName)
}
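
The net effect of the new capping logic in `applyVPAPolicyForContainer` is: per resource, the VPA's own `maxAllowed` takes precedence, and the global max allowed only fills in resources the container policy leaves unset. A standalone sketch of that merge rule (the helper name is invented here, not part of the PR; it assumes `apiv1 "k8s.io/api/core/v1"`):

```go
package sketch // hypothetical package, for illustration only

import apiv1 "k8s.io/api/core/v1"

// effectiveMaxAllowed returns the max allowed list that the capping above
// effectively enforces: the container policy's maxAllowed wins per resource,
// and the global max allowed fills in any resource the policy does not set.
func effectiveMaxAllowed(policyMax, globalMax apiv1.ResourceList) apiv1.ResourceList {
	if policyMax == nil {
		return globalMax
	}
	merged := policyMax.DeepCopy()
	for name, quantity := range globalMax {
		if _, ok := merged[name]; !ok {
			merged[name] = quantity
		}
	}
	return merged
}
```

Unlike the in-place merge in the diff, the sketch copies the policy map before merging, so the caller's VPA resource policy is not mutated.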