From 74d03e577d865c607023e9da6cdc695066f1fe73 Mon Sep 17 00:00:00 2001
From: Abhishek Malvankar
Date: Thu, 15 Jun 2023 09:03:07 -0400
Subject: [PATCH 1/2] only add resources from nodes whose Ready condition is
 True

---
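Note for reviewers: updateState() previously summed every schedulable node
into the cluster totals. With this patch it first requires a Ready condition
with status True, so NotReady or condition-less nodes no longer contribute
capacity. Below is a minimal standalone sketch of that predicate; the
isNodeReady helper is illustrative only (the patch inlines the loop in
updateState) and assumes nothing beyond k8s.io/api/core/v1.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// isNodeReady reports whether the node carries a Ready condition whose
// status is True; updateState() now applies the same test per node.
func isNodeReady(node *v1.Node) bool {
	for _, cond := range node.Status.Conditions {
		if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue {
			return true
		}
	}
	return false
}

func main() {
	ready := &v1.Node{Status: v1.NodeStatus{Conditions: []v1.NodeCondition{
		{Type: v1.NodeReady, Status: v1.ConditionTrue},
	}}}
	notReady := &v1.Node{Status: v1.NodeStatus{Conditions: []v1.NodeCondition{
		{Type: v1.NodeReady, Status: v1.ConditionFalse},
		{Type: v1.NodeDiskPressure, Status: v1.ConditionTrue},
	}}}

	fmt.Println(isNodeReady(ready))    // true: node is counted
	fmt.Println(isNodeReady(notReady)) // false: node is skipped
}

Checking readiness in a dedicated pass before any accounting also keeps
continue applying to the node loop rather than to an inner conditions loop.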
 pkg/controller/clusterstate/cache/cache.go | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/pkg/controller/clusterstate/cache/cache.go b/pkg/controller/clusterstate/cache/cache.go
index 4ef2f77ec..00f682a07 100644
--- a/pkg/controller/clusterstate/cache/cache.go
+++ b/pkg/controller/clusterstate/cache/cache.go
@@ -33,11 +33,12 @@ import (
 	"context"
 	"fmt"
-	"github.com/golang/protobuf/proto"
-	dto "github.com/prometheus/client_model/go"
 	"sync"
 	"time"
 
+	"github.com/golang/protobuf/proto"
+	dto "github.com/prometheus/client_model/go"
+
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/informers"
@@ -210,7 +211,6 @@ func (sc *ClusterStateCache) GetUnallocatedResources() *api.Resource {
 	return r.Add(sc.availableResources)
 }
 
-
 func (sc *ClusterStateCache) GetUnallocatedHistograms() map[string]*dto.Metric {
 	sc.Mutex.Lock()
 	defer sc.Mutex.Unlock()
@@ -238,7 +238,7 @@ func (sc *ClusterStateCache) GetResourceCapacities() *api.Resource {
 
 // Save the cluster state.
 func (sc *ClusterStateCache) saveState(available *api.Resource, capacity *api.Resource,
-	availableHistogram *api.ResourceHistogram) error {
+	availableHistogram *api.ResourceHistogram) error {
 	klog.V(12).Infof("Saving Cluster State")
 
 	sc.Mutex.Lock()
@@ -261,8 +261,22 @@ func (sc *ClusterStateCache) updateState() error {
 	idleMin := api.EmptyResource()
 	idleMax := api.EmptyResource()
 
 	firstNode := true
 	for _, value := range cluster.Nodes {
+		// Only count nodes whose Ready condition reports status True.
+		nodeReady := false
+		for _, cond := range value.Node.Status.Conditions {
+			if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue {
+				nodeReady = true
+				break
+			}
+		}
+		if !nodeReady {
+			klog.V(6).Infof("[updateState] %s is not Ready. Total: %v, Used: %v, and Idle: %v will not be included in cluster state calculation.",
+				value.Name, value.Allocatable, value.Used, value.Idle)
+			continue
+		}
+
 		// Do not use any Unschedulable nodes in calculations
 		if value.Unschedulable == true {
 			klog.V(6).Infof("[updateState] %s is marked as unschedulable node Total: %v, Used: %v, and Idle: %v will not be included in cluster state calculation.",

From a698b900d94e8d24225c4497737320427b748a49 Mon Sep 17 00:00:00 2001
From: Abhishek Malvankar
Date: Thu, 15 Jun 2023 11:10:45 -0400
Subject: [PATCH 2/2] add unit test

---
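Note for reviewers: TestNodeStatus covers the case of a node with no Ready
condition. A table-driven variant could cover the remaining combinations; the
sketch below is hypothetical, reuses the helpers in this file (buildNode,
buildResourceList, addNode, updateState), and assumes availableResources
reports zero MilliCPU when no node is counted.

func TestNodeStatusTable(t *testing.T) {
	cases := []struct {
		name      string
		condition v1.NodeCondition
		counted   bool
	}{
		{"ready", v1.NodeCondition{Type: v1.NodeReady, Status: v1.ConditionTrue}, true},
		{"not-ready", v1.NodeCondition{Type: v1.NodeReady, Status: v1.ConditionFalse}, false},
		{"no-ready-condition", v1.NodeCondition{Type: v1.NodeDiskPressure, Status: v1.ConditionTrue}, false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			cache := &ClusterStateCache{Nodes: make(map[string]*api.NodeInfo)}
			cache.addNode(buildNode("n1", buildResourceList("2000m", "10G"), tc.condition))
			if err := cache.updateState(); err != nil {
				t.Fatalf("updateState failed: %v", err)
			}
			counted := cache.availableResources != nil && cache.availableResources.MilliCPU > 0
			if counted != tc.counted {
				t.Errorf("node counted in cluster state = %v, want %v", counted, tc.counted)
			}
		})
	}
}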
 .../clusterstate/cache/cache_test.go          | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/pkg/controller/clusterstate/cache/cache_test.go b/pkg/controller/clusterstate/cache/cache_test.go
index 30714de91..325dd231c 100644
--- a/pkg/controller/clusterstate/cache/cache_test.go
+++ b/pkg/controller/clusterstate/cache/cache_test.go
@@ -35,7 +35,7 @@ import (
 	"reflect"
 	"testing"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
@@ -76,7 +76,7 @@ func cacheEqual(l, r *ClusterStateCache) bool {
 		jobsEqual(l.Jobs, r.Jobs)
 }
 
-func buildNode(name string, alloc v1.ResourceList) *v1.Node {
+func buildNode(name string, alloc v1.ResourceList, condition v1.NodeCondition) *v1.Node {
 	return &v1.Node{
 		ObjectMeta: metav1.ObjectMeta{
 			UID:  types.UID(name),
@@ -85,6 +85,7 @@ func buildNode(name string, alloc v1.ResourceList) *v1.Node {
 		Status: v1.NodeStatus{
 			Capacity:    alloc,
 			Allocatable: alloc,
+			Conditions:  []v1.NodeCondition{condition},
 		},
 	}
 }
@@ -155,7 +156,7 @@ func TestAddPod(t *testing.T) {
 	j1.AddTaskInfo(pi1)
 	j1.AddTaskInfo(pi2)
 
-	node1 := buildNode("n1", buildResourceList("2000m", "10G"))
+	node1 := buildNode("n1", buildResourceList("2000m", "10G"), v1.NodeCondition{Type: v1.NodeReady, Status: v1.ConditionTrue})
 	ni1 := api.NewNodeInfo(node1)
 	ni1.AddTask(pi2)
 
@@ -199,3 +200,33 @@ func TestAddPod(t *testing.T) {
 	}
 }
 
+func TestNodeStatus(t *testing.T) {
+	var memory float64
+	var millicpu float64
+	var gpu int64
+
+	// A node that reports DiskPressure but no Ready condition must be
+	// excluded from the cluster state calculation.
+	node1 := buildNode("n1", buildResourceList("2000m", "10G"), v1.NodeCondition{Type: v1.NodeDiskPressure, Status: v1.ConditionTrue})
+
+	cache := &ClusterStateCache{
+		Nodes: make(map[string]*api.NodeInfo),
+	}
+
+	cache.addNode(node1)
+
+	if err := cache.updateState(); err != nil {
+		t.Fatalf("updateState failed: %v", err)
+	}
+
+	if cache.availableResources != nil {
+		memory = cache.availableResources.Memory
+		millicpu = cache.availableResources.MilliCPU
+		gpu = cache.availableResources.GPU
+	}
+
+	if memory != 0 || millicpu != 0 || gpu != 0 {
+		t.Errorf("expected all values to be 0, got memory: %v, millicpu: %v, gpu: %v",
+			memory, millicpu, gpu)
+	}
+}
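To run just the new test from the repository root (a usage sketch, assuming
the module builds as-is):

go test ./pkg/controller/clusterstate/cache/ -run TestNodeStatus -v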