Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a metric of AppWrappers counts per state #675

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ require (
github.com/prometheus/client_model v0.3.0
github.com/stretchr/testify v1.8.2
k8s.io/api v0.26.2
k8s.io/apiextensions-apiserver v0.25.1
k8s.io/apiextensions-apiserver v0.26.1
k8s.io/apimachinery v0.26.2
k8s.io/apiserver v0.26.2
k8s.io/client-go v0.26.2
k8s.io/klog/v2 v2.90.1
k8s.io/metrics v0.26.2
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
sigs.k8s.io/controller-runtime v0.14.6
sigs.k8s.io/custom-metrics-apiserver v0.0.0
sigs.k8s.io/structured-merge-diff/v4 v4.2.3
)

replace sigs.k8s.io/custom-metrics-apiserver => sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3
Expand Down Expand Up @@ -85,15 +88,15 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.19.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.1.0 // indirect
golang.org/x/net v0.8.0 // indirect
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
golang.org/x/sync v0.1.0 // indirect
golang.org/x/sys v0.6.0 // indirect
golang.org/x/term v0.6.0 // indirect
golang.org/x/text v0.8.0 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21 // indirect
google.golang.org/grpc v1.49.0 // indirect
Expand All @@ -107,9 +110,7 @@ require (
k8s.io/component-base v0.26.2 // indirect
k8s.io/kms v0.26.2 // indirect
k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d // indirect
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
19 changes: 9 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kd
github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 h1:yL7+Jz0jTC6yykIK/Wh74gnTJnrGr5AyrNMXuA0gves=
github.com/antlr/antlr4/runtime/Go/antlr v1.4.10/go.mod h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down Expand Up @@ -388,13 +387,12 @@ go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJP
go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA=
go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo=
go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE=
go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
Expand Down Expand Up @@ -556,8 +554,8 @@ golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44=
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
Expand All @@ -573,7 +571,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
Expand Down Expand Up @@ -741,8 +738,8 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ=
k8s.io/api v0.26.2/go.mod h1:1kjMQsFE+QHPfskEcVNgL3+Hp88B80uj0QtSOlj8itU=
k8s.io/apiextensions-apiserver v0.25.1 h1:HEIKlxj6oHaDwHgotEIX/Ld5K/RGuOFwN/TWMiQ5s5s=
k8s.io/apiextensions-apiserver v0.25.1/go.mod h1:67sgnMs2yIO2iV4DpCdS91vlP+pdnVIsG/mz60qRn44=
k8s.io/apiextensions-apiserver v0.26.1 h1:cB8h1SRk6e/+i3NOrQgSFij1B2S0Y0wDoNl66bn8RMI=
k8s.io/apiextensions-apiserver v0.26.1/go.mod h1:AptjOSXDGuE0JICx/Em15PaoO7buLwTs0dGleIHixSM=
k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ=
k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I=
k8s.io/apiserver v0.26.2 h1:Pk8lmX4G14hYqJd1poHGC08G03nIHVqdJMR0SD3IH3o=
Expand All @@ -766,6 +763,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 h1:+xBL5uTc+BkPBwmMi3vYfUJjq+N3K+H6PXeETwf5cPI=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35/go.mod h1:WxjusMwXlKzfAs4p9km6XJRndVt2FROgMVCE4cdohFo=
sigs.k8s.io/controller-runtime v0.14.6 h1:oxstGVvXGNnMvY7TAESYk+lzr6S3V5VFxQ6d92KcwQA=
sigs.k8s.io/controller-runtime v0.14.6/go.mod h1:WqIdsAY6JBsjfc/CqO0CORmNtoCtE4S6qbPc9s68h+0=
sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3 h1:puQ5YlyBjhxg+OQ1YPMJXwtk7WhC4E6AlWIQ9pC8jws=
sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3/go.mod h1:9nUXR/EgdYZto1aQ6yhwOksPR7J979jSyOqic1IgaOo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
Expand Down
123 changes: 123 additions & 0 deletions pkg/controller/queuejob/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
Copyright 2023 The Multi-Cluster App Dispatcher Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queuejob

import (
arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog/v2"
"reflect"
"runtime"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"time"
)

var (
allocatableCapacityCpu = prometheus.NewGauge(prometheus.GaugeOpts{
Subsystem: "mcad",
Name: "allocatable_capacity_cpu",
Help: "Allocatable CPU Capacity (in millicores)",
})
allocatableCapacityMemory = prometheus.NewGauge(prometheus.GaugeOpts{
Subsystem: "mcad",
Name: "allocatable_capacity_memory",
Help: "Allocatable Memory Capacity",
})
allocatableCapacityGpu = prometheus.NewGauge(prometheus.GaugeOpts{
Subsystem: "mcad",
Name: "allocatable_capacity_gpu",
Help: "Allocatable GPU Capacity",
})
appWrappersCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: "mcad",
Name: "appwrappers_count",
Help: "AppWrappers count per state",
}, []string{"state"})
)

func init() {
klog.V(10).Infof("Registering metrics")
metrics.Registry.MustRegister(
allocatableCapacityCpu,
allocatableCapacityMemory,
allocatableCapacityGpu,
appWrappersCount,
)
}

func updateMetricsLoop(controller *XController, stopCh <-chan struct{}) {
updateMetricsLoopGeneric(controller, stopCh, time.Minute*1, updateAllocatableCapacity)
updateMetricsLoopGeneric(controller, stopCh, time.Second*5, updateQueue)
}

func getFuncName(f interface{}) string {
return runtime.FuncForPC(reflect.ValueOf(f).Pointer()).Name()
}

func updateMetricsLoopGeneric(controller *XController, stopCh <-chan struct{}, d time.Duration, updateFunc func(xController *XController)) {
ticker := time.NewTicker(d)
go func() {
updateFunc(controller)
for {
select {
case <-ticker.C:
if klog.V(10).Enabled() {
klog.Infof("[updateMetricsLoopGeneric] Update metrics loop tick: %v", getFuncName(updateFunc))
}
updateFunc(controller)
case <-stopCh:
if klog.V(10).Enabled() {
klog.Infof("[updateMetricsLoopGeneric] Exiting update metrics loop: %v", getFuncName(updateFunc))
}
ticker.Stop()
return
}
}
}()
}

func updateAllocatableCapacity(controller *XController) {
res := controller.GetAllocatableCapacity()
allocatableCapacityCpu.Set(res.MilliCPU)
allocatableCapacityMemory.Set(res.Memory)
allocatableCapacityGpu.Set(float64(res.GPU))
}

func updateQueue(controller *XController) {
awList, err := controller.appWrapperLister.List(labels.Everything())

if err != nil {
klog.Errorf("[updateQueue] Unable to obtain the list of AppWrappers: %+v", err)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of error, should we update the counters to 0 or leave them as is?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should leave them as is and retry to update with new value.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This update is done periodically. So instead of retrying, we can simply wait for the next update on the next tick.
I dug deeper into the implementation of controller.appWrapperLister.List(labels.Everything()), and it seems that since labels.Everything().Empty() returns true, there is no way that controller.appWrapperLister.List() will return an error. So the discussion seems to be purely theoretical.

return
}
stateToAppWrapperCount := map[arbv1.AppWrapperState]int{
arbv1.AppWrapperStateEnqueued: 0,
arbv1.AppWrapperStateActive: 0,
arbv1.AppWrapperStateDeleted: 0,
arbv1.AppWrapperStateFailed: 0,
arbv1.AppWrapperStateCompleted: 0,
arbv1.AppWrapperStateRunningHoldCompletion: 0,
}
for _, aw := range awList {
state := aw.Status.State
stateToAppWrapperCount[state]++
}
for state, count := range stateToAppWrapperCount {
appWrappersCount.WithLabelValues(string(state)).Set(float64(count))
}
}
5 changes: 5 additions & 0 deletions pkg/controller/queuejob/queuejob_controller_ex.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ func (qjm *XController) allocatableCapacity() *clusterstateapi.Resource {
return capacity
}

func (qjm *XController) GetAllocatableCapacity() *clusterstateapi.Resource {
return qjm.allocatableCapacity()
}

// NewJobController create new AppWrapper Controller
func NewJobController(restConfig *rest.Config, mcadConfig *config.MCADConfiguration, extConfig *config.MCADConfigurationExtended) *XController {
cc := &XController{
Expand Down Expand Up @@ -1456,6 +1460,7 @@ func (cc *XController) Run(stopCh <-chan struct{}) {
go cc.appwrapperInformer.Informer().Run(stopCh)

cache.WaitForCacheSync(stopCh, cc.appWrapperSynced)
updateMetricsLoop(cc, stopCh)

if cc.isDispatcher {
go wait.Until(cc.UpdateAgent, 2*time.Second, stopCh) // In the Agent?
Expand Down