Skip to content

Commit

Permalink
Avoid multiple logs messages for the node health controller
Browse files Browse the repository at this point in the history
  • Loading branch information
engedaam committed Dec 20, 2024
1 parent e339ada commit ef19da3
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 5 deletions.
28 changes: 24 additions & 4 deletions pkg/controllers/node/health/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (

"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"

Expand Down Expand Up @@ -120,10 +121,18 @@ func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcil
if err := c.annotateTerminationGracePeriod(ctx, nodeClaim); err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}

return c.deleteNodeClaim(ctx, nodeClaim, node, unhealthyNodeCondition)
}

// deleteNodeClaim removes the NodeClaim from the api-server
func (c *Controller) deleteNodeClaim(ctx context.Context, nodeClaim *v1.NodeClaim, node *corev1.Node, unhealthyNodeCondition *corev1.NodeCondition) (reconcile.Result, error) {
if !nodeClaim.DeletionTimestamp.IsZero() {
return reconcile.Result{}, nil
}
if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}

// The deletion timestamp has successfully been set for the Node, update relevant metrics.
log.FromContext(ctx).V(1).Info("deleting unhealthy node")
metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{
Expand Down Expand Up @@ -155,11 +164,22 @@ func (c *Controller) findUnhealthyConditions(node *corev1.Node) (nc *corev1.Node
}

func (c *Controller) annotateTerminationGracePeriod(ctx context.Context, nodeClaim *v1.NodeClaim) error {
if expirationTimeString, exists := nodeClaim.ObjectMeta.Annotations[v1.NodeClaimTerminationTimestampAnnotationKey]; exists {
expirationTime, err := time.Parse(time.RFC3339, expirationTimeString)
if err == nil && expirationTime.Before(c.clock.Now()) {
return nil
}
}

stored := nodeClaim.DeepCopy()
nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: c.clock.Now().Format(time.RFC3339)})
terminationTime := c.clock.Now().Format(time.RFC3339)
nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: terminationTime})

if err := c.kubeClient.Patch(ctx, nodeClaim, client.MergeFrom(stored)); err != nil {
return err
if !equality.Semantic.DeepEqual(stored, nodeClaim) {
if err := c.kubeClient.Patch(ctx, nodeClaim, client.MergeFrom(stored)); err != nil {
return err
}
log.FromContext(ctx).WithValues(v1.NodeClaimTerminationTimestampAnnotationKey, terminationTime).Info("annotated nodeclaim")
}

return nil
Expand Down
20 changes: 19 additions & 1 deletion pkg/controllers/node/health/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clock "k8s.io/utils/clock/testing"
Expand Down Expand Up @@ -171,7 +172,7 @@ var _ = Describe("Node Health", func() {
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1.NodeClaimTerminationTimestampAnnotationKey, fakeClock.Now().Format(time.RFC3339)))
})
It("should not respect termination grace period if set on the nodepool", func() {
nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Minute}
nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: fakeClock.Now().Add(120 * time.Minute).Format(time.RFC3339)})
node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{
Type: "BadNode",
Status: corev1.ConditionFalse,
Expand All @@ -186,6 +187,23 @@ var _ = Describe("Node Health", func() {
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1.NodeClaimTerminationTimestampAnnotationKey, fakeClock.Now().Format(time.RFC3339)))
})
It("should not update termination grace period if set before the current time", func() {
terminationTime := fakeClock.Now().Add(-3 * time.Minute).Format(time.RFC3339)
nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: terminationTime})
node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{
Type: "BadNode",
Status: corev1.ConditionFalse,
// We expect the last transition for HealthyNode condition to wait 30 minutes
LastTransitionTime: metav1.Time{Time: time.Now()},
})
fakeClock.Step(60 * time.Minute)
ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node)
// Determine to delete unhealthy nodes
ExpectObjectReconciled(ctx, env.Client, healthController, node)

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1.NodeClaimTerminationTimestampAnnotationKey, terminationTime))
})
It("should return the requeue interval for the condition closest to its terminationDuration", func() {
cloudProvider.RepairPolicy = []cloudprovider.RepairPolicy{
{
Expand Down

0 comments on commit ef19da3

Please sign in to comment.