diff --git a/pkg/controllers/node/health/controller.go b/pkg/controllers/node/health/controller.go index db4e96f932..acd23a08f0 100644 --- a/pkg/controllers/node/health/controller.go +++ b/pkg/controllers/node/health/controller.go @@ -23,6 +23,7 @@ import ( "github.com/samber/lo" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -120,10 +121,18 @@ func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcil if err := c.annotateTerminationGracePeriod(ctx, nodeClaim); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } + + return c.deleteNodeClaim(ctx, nodeClaim, node, unhealthyNodeCondition) +} + +// deleteNodeClaim removes the NodeClaim from the api-server +func (c *Controller) deleteNodeClaim(ctx context.Context, nodeClaim *v1.NodeClaim, node *corev1.Node, unhealthyNodeCondition *corev1.NodeCondition) (reconcile.Result, error) { + if !nodeClaim.DeletionTimestamp.IsZero() { + return reconcile.Result{}, nil + } if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - // The deletion timestamp has successfully been set for the Node, update relevant metrics. log.FromContext(ctx).V(1).Info("deleting unhealthy node") metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ @@ -155,11 +164,22 @@ func (c *Controller) findUnhealthyConditions(node *corev1.Node) (nc *corev1.Node } func (c *Controller) annotateTerminationGracePeriod(ctx context.Context, nodeClaim *v1.NodeClaim) error { + if expirationTimeString, exists := nodeClaim.ObjectMeta.Annotations[v1.NodeClaimTerminationTimestampAnnotationKey]; exists { + expirationTime, err := time.Parse(time.RFC3339, expirationTimeString) + if err == nil && expirationTime.Before(c.clock.Now()) { + return nil + } + } + stored := nodeClaim.DeepCopy() - nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: c.clock.Now().Format(time.RFC3339)}) + terminationTime := c.clock.Now().Format(time.RFC3339) + nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: terminationTime}) - if err := c.kubeClient.Patch(ctx, nodeClaim, client.MergeFrom(stored)); err != nil { - return err + if !equality.Semantic.DeepEqual(stored, nodeClaim) { + if err := c.kubeClient.Patch(ctx, nodeClaim, client.MergeFrom(stored)); err != nil { + return err + } + log.FromContext(ctx).WithValues(v1.NodeClaimTerminationTimestampAnnotationKey, terminationTime).Info("annotated nodeclaim") } return nil diff --git a/pkg/controllers/node/health/suite_test.go b/pkg/controllers/node/health/suite_test.go index ea2a368b20..5bfae95547 100644 --- a/pkg/controllers/node/health/suite_test.go +++ b/pkg/controllers/node/health/suite_test.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/samber/lo" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clock "k8s.io/utils/clock/testing" @@ -171,7 +172,7 @@ var _ = Describe("Node Health", func() { Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1.NodeClaimTerminationTimestampAnnotationKey, fakeClock.Now().Format(time.RFC3339))) }) It("should not respect termination grace period if set on the nodepool", func() { - nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Minute} + nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: fakeClock.Now().Add(120 * time.Minute).Format(time.RFC3339)}) node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{ Type: "BadNode", Status: corev1.ConditionFalse, @@ -186,6 +187,23 @@ var _ = Describe("Node Health", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1.NodeClaimTerminationTimestampAnnotationKey, fakeClock.Now().Format(time.RFC3339))) }) + It("should not update termination grace period if set before the current time", func() { + terminationTime := fakeClock.Now().Add(-3 * time.Minute).Format(time.RFC3339) + nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: terminationTime}) + node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{ + Type: "BadNode", + Status: corev1.ConditionFalse, + // We expect the last transition for HealthyNode condition to wait 30 minutes + LastTransitionTime: metav1.Time{Time: time.Now()}, + }) + fakeClock.Step(60 * time.Minute) + ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node) + // Determine to delete unhealthy nodes + ExpectObjectReconciled(ctx, env.Client, healthController, node) + + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1.NodeClaimTerminationTimestampAnnotationKey, terminationTime)) + }) It("should return the requeue interval for the condition closest to its terminationDuration", func() { cloudProvider.RepairPolicy = []cloudprovider.RepairPolicy{ {