From 1fbc7a9d487f23456ccce554cade76f4ae7a70c0 Mon Sep 17 00:00:00 2001 From: Vijay Bhargav Eshappa Date: Fri, 11 Jul 2025 15:43:18 +0530 Subject: [PATCH] Handle Out of host capacity scenario in OCI nodepools --- .../oci/nodepools/oci_manager.go | 22 +++++---------- .../oci/nodepools/oci_manager_test.go | 17 ++++-------- .../oci/nodepools/oci_node_pool.go | 27 ++++++++++++++----- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go index d433a322a51c..fa1756f145c3 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go @@ -461,12 +461,6 @@ func (m *ociManagerImpl) GetExistingNodePoolSizeViaCompute(np NodePool) (int, er if !strings.HasPrefix(*item.DisplayName, displayNamePrefix) { continue } - // A node pool can fail to scale up if there's no capacity in the region. In that case, the node pool will be - // returned by the API, but it will not actually exist or have an ID, so we don't want to tell the autoscaler about it. - if *item.Id == "" { - klog.V(4).Infof("skipping node as it doesn't have a scaled-up instance") - continue - } switch item.LifecycleState { case core.InstanceLifecycleStateStopped, core.InstanceLifecycleStateTerminated: klog.V(4).Infof("skipping instance is in stopped/terminated state: %q", *item.Id) @@ -525,25 +519,23 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance nodePool, err := m.nodePoolCache.get(np.Id()) if err != nil { + klog.Error(err, "error while performing GetNodePoolNodes call") return nil, err } var instances []cloudprovider.Instance for _, node := range nodePool.Nodes { - // A node pool can fail to scale up if there's no capacity in the region. In that case, the node pool will be - // returned by the API, but it will not actually exist or have an ID, so we don't want to tell the autoscaler about it. - if *node.Id == "" { - klog.V(4).Infof("skipping node as it doesn't have a scaled-up instance") - continue - } - if node.NodeError != nil { + // We should move away from the approach of determining a node error as a Out of host capacity + // through string comparison. An error code specifically for Out of host capacity must be set + // and returned in the API response. errorClass := cloudprovider.OtherErrorClass if *node.NodeError.Code == "LimitExceeded" || - (*node.NodeError.Code == "InternalServerError" && - strings.Contains(*node.NodeError.Message, "quota")) { + *node.NodeError.Code == "QuotaExceeded" || + (*node.NodeError.Code == "InternalError" && + strings.Contains(*node.NodeError.Message, "Out of host capacity")) { errorClass = cloudprovider.OutOfResourcesErrorClass } diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go index 5328bd794720..0eebb4e593b6 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go @@ -6,12 +6,11 @@ package nodepools import ( "context" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts" "net/http" "reflect" "testing" - "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts" - apiv1 "k8s.io/api/core/v1" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/vendor-internal/github.com/oracle/oci-go-sdk/v65/common" @@ -120,16 +119,10 @@ func TestGetNodePoolNodes(t *testing.T) { { Id: common.String("node8"), NodeError: &oke.NodeError{ - Code: common.String("InternalServerError"), - Message: common.String("blah blah quota exceeded blah blah"), + Code: common.String("InternalError"), + Message: common.String("blah blah Out of host capacity blah blah"), }, }, - { - // This case happens if a node fails to scale up due to lack of capacity in the region. - // It's not a real node, so we shouldn't return it in the list of nodes. - Id: common.String(""), - LifecycleState: oke.NodeLifecycleStateCreating, - }, }, } @@ -186,8 +179,8 @@ func TestGetNodePoolNodes(t *testing.T) { State: cloudprovider.InstanceCreating, ErrorInfo: &cloudprovider.InstanceErrorInfo{ ErrorClass: cloudprovider.OutOfResourcesErrorClass, - ErrorCode: "InternalServerError", - ErrorMessage: "blah blah quota exceeded blah blah", + ErrorCode: "InternalError", + ErrorMessage: "blah blah Out of host capacity blah blah", }, }, }, diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go index d31618f71811..30684f305970 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go @@ -214,6 +214,27 @@ func (np *nodePool) DecreaseTargetSize(delta int) error { } } klog.V(4).Infof("DECREASE_TARGET_CHECK_VIA_COMPUTE: %v", decreaseTargetCheckViaComputeBool) + np.manager.InvalidateAndRefreshCache() + nodes, err := np.manager.GetNodePoolNodes(np) + if err != nil { + klog.V(4).Error(err, "error while performing GetNodePoolNodes call") + return err + } + // We do not have an OCI API that allows us to delete a node with a compute instance. So we rely on + // the below approach to determine the number running instance in a nodepool from the compute API and + //update the size of the nodepool accordingly. We should move away from this approach once we have an API + // to delete a specific node without a compute instance. + if !decreaseTargetCheckViaComputeBool { + for _, node := range nodes { + if node.Status != nil && node.Status.ErrorInfo != nil { + if node.Status.ErrorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass { + klog.Infof("Using Compute to calculate nodepool size as nodepool may contain nodes without a compute instance.") + decreaseTargetCheckViaComputeBool = true + break + } + } + } + } var nodesLen int if decreaseTargetCheckViaComputeBool { nodesLen, err = np.manager.GetExistingNodePoolSizeViaCompute(np) @@ -222,12 +243,6 @@ func (np *nodePool) DecreaseTargetSize(delta int) error { return err } } else { - np.manager.InvalidateAndRefreshCache() - nodes, err := np.manager.GetNodePoolNodes(np) - if err != nil { - klog.V(4).Error(err, "error while performing GetNodePoolNodes call") - return err - } nodesLen = len(nodes) }