Limit node repair by nodepool
engedaam committed Nov 20, 2024
1 parent c221747 commit a10ef64
Showing 2 changed files with 101 additions and 0 deletions.
46 changes: 46 additions & 0 deletions pkg/controllers/node/health/controller.go
@@ -18,10 +18,12 @@ package health

import (
"context"
"fmt"
"time"

"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
controllerruntime "sigs.k8s.io/controller-runtime"
@@ -70,6 +72,15 @@ func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcil
return reconcile.Result{}, nodeutils.IgnoreNodeClaimNotFoundError(err)
}

nodePoolHealthy, err := c.isNodePoolHealthy(ctx, nodeClaim)
if err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}
if !nodePoolHealthy {
log.FromContext(ctx).WithValues("nodepool", nodeClaim.Labels[v1.NodePoolLabelKey]).V(1).Info("nodepool is unhealthy, skipping node repair")
return reconcile.Result{RequeueAfter: 5 * time.Minute}, nil
}

unhealthyNodeCondition, policyTerminationDuration := c.findUnhealthyConditions(node)
if unhealthyNodeCondition == nil {
return reconcile.Result{}, nil
@@ -130,3 +141,38 @@ func (c *Controller) annotateTerminationGracePeriod(ctx context.Context, nodeCla

return nil
}

// Consider a nodepool unhealthy if more than 20% of the nodes owned by the nodepool have an unhealthy condition
// defined by the cloud provider
func (c *Controller) isNodePoolHealthy(ctx context.Context, nodeClaim *v1.NodeClaim) (bool, error) {
nodePoolName, found := nodeClaim.Labels[v1.NodePoolLabelKey]
if !found {
return false, fmt.Errorf("nodepool label not found on nodeclaim")
}

nodeList := &corev1.NodeList{}
if err := c.kubeClient.List(ctx, nodeList, client.MatchingLabels(map[string]string{v1.NodePoolLabelKey: nodePoolName})); err != nil {
return false, err
}

for _, policy := range c.cloudProvider.RepairPolicies() {
unhealthyNodeCount := lo.CountBy(nodeList.Items, func(node corev1.Node) bool {
nodeCondition := nodeutils.GetCondition(lo.ToPtr(node), policy.ConditionType)
return nodeCondition.Status == policy.ConditionStatus
})

// The threshold is rounded up to the nearest whole number, so the tolerated share of unhealthy
// nodes can sometimes exceed 20%. This mirrors how Kubernetes handles maxUnavailable with PDBs.
// For example, with a 20% threshold but only 3 nodes, Karpenter opts to allow 1 unhealthy node
// in the nodepool rather than blocking all node repair for the nodepool.
threshold, err := intstr.GetScaledValueFromIntOrPercent(lo.ToPtr(intstr.FromString("20%")), len(nodeList.Items), true)
if err != nil {
return false, err
}
if unhealthyNodeCount > threshold {
return false, nil
}
}
return true, nil
}
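
As a quick illustration of the rounding behavior described in isNodePoolHealthy, the standalone sketch below (not part of this commit; the nodepool sizes are arbitrary examples) calls the same intstr.GetScaledValueFromIntOrPercent helper with roundUp set to true and prints the unhealthy-node threshold for a few nodepool sizes:

package main

import (
    "fmt"

    "github.com/samber/lo"
    "k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
    // For each hypothetical nodepool size, compute the threshold the controller compares
    // the unhealthy-node count against: 20% of the nodes, rounded up to a whole number.
    for _, total := range []int{3, 5, 10, 20} {
        threshold, err := intstr.GetScaledValueFromIntOrPercent(lo.ToPtr(intstr.FromString("20%")), total, true)
        if err != nil {
            panic(err)
        }
        // Node repair stays enabled while the unhealthy-node count is <= threshold.
        fmt.Printf("nodes=%d  allowed unhealthy=%d\n", total, threshold)
    }
}

With 3 nodes the threshold rounds up to 1, which is why the second test below still repairs a single unhealthy node in a three-node nodepool, while 3 unhealthy nodes out of 10 exceed the threshold of 2 in the first test and block repair.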
55 changes: 55 additions & 0 deletions pkg/controllers/node/health/suite_test.go
@@ -274,6 +274,61 @@ var _ = Describe("Node Health", func() {
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.DeletionTimestamp).ToNot(BeNil())
})
It("should ignore unhealthy nodes if more then 20% of the nodes are unhealthy", func() {
ExpectApplied(ctx, env.Client, nodePool)
nodeClaims := []*v1.NodeClaim{}
nodes := []*corev1.Node{}
for i := range 10 {
nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1.TerminationFinalizer}}})
if i < 3 {
node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{
Type: "BadNode",
Status: corev1.ConditionFalse,
LastTransitionTime: metav1.Time{Time: fakeClock.Now()},
})
}
node.Labels[v1.NodePoolLabelKey] = nodePool.Name
nodeClaim.Labels[v1.NodePoolLabelKey] = nodePool.Name
nodeClaims = append(nodeClaims, nodeClaim)
nodes = append(nodes, node)
ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node)
}

fakeClock.Step(60 * time.Minute)

// Reconcile each node and verify that no nodeclaims are deleted, since more than 20% of the nodepool is unhealthy
for i := range 4 {
res := ExpectObjectReconciled(ctx, env.Client, healthController, nodes[i])
nodeClaim = ExpectExists(ctx, env.Client, nodeClaims[i])
Expect(nodeClaim.DeletionTimestamp).To(BeNil())
Expect(res.RequeueAfter).To(BeNumerically("~", time.Minute*5, time.Second))
}
})
It("should allow at one node to be unhealthy in nodepool", func() {
nodeClaims := []*v1.NodeClaim{}
nodes := []*corev1.Node{}
for i := range 3 {
nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1.TerminationFinalizer}}})
if i == 0 {
node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{
Type: "BadNode",
Status: corev1.ConditionFalse,
LastTransitionTime: metav1.Time{Time: fakeClock.Now()},
})
}
node.Labels[v1.NodePoolLabelKey] = nodePool.Name
nodeClaim.Labels[v1.NodePoolLabelKey] = nodePool.Name
nodeClaims = append(nodeClaims, nodeClaim)
nodes = append(nodes, node)
ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node)
}

fakeClock.Step(60 * time.Minute)
// Reconcile the unhealthy node and verify that its nodeclaim is deleted, since the nodepool is within the unhealthy threshold
ExpectObjectReconciled(ctx, env.Client, healthController, nodes[0])
nodeClaim = ExpectExists(ctx, env.Client, nodeClaims[0])
Expect(nodeClaim.DeletionTimestamp).ToNot(BeNil())
})
})
Context("Metrics", func() {
It("should fire a karpenter_nodeclaims_disrupted_total metric when unhealthy", func() {
