From d7f8907792e8ee01fcb0a1490802c16e9e6e2ad7 Mon Sep 17 00:00:00 2001 From: Edwinhr716 Date: Thu, 11 Apr 2024 22:31:07 +0000 Subject: [PATCH 1/3] added registeration of rolling_update_duration and recreate_group_times, and functionality of the latter --- Dockerfile | 5 +-- cmd/main.go | 3 ++ go.mod | 2 +- pkg/controllers/pod_controller.go | 2 ++ pkg/metrics/metrics.go | 57 +++++++++++++++++++++++++++++++ 5 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 pkg/metrics/metrics.go diff --git a/Dockerfile b/Dockerfile index 10c68f79..0884bd33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,10 +19,7 @@ RUN go mod download # Copy the go source COPY cmd/main.go cmd/main.go COPY api/ api/ -COPY pkg/controllers/ pkg/controllers/ -COPY pkg/cert/ pkg/cert/ -COPY pkg/webhooks/ pkg/webhooks/ -COPY pkg/utils pkg/utils +COPY pkg/ pkg/ # Build # the GOARCH has not a default value to allow the binary be built according to the host where the command diff --git a/cmd/main.go b/cmd/main.go index 243c17a2..dd41ed72 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,6 +35,7 @@ import ( leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1" "sigs.k8s.io/lws/pkg/cert" "sigs.k8s.io/lws/pkg/controllers" + "sigs.k8s.io/lws/pkg/metrics" "sigs.k8s.io/lws/pkg/webhooks" //+kubebuilder:scaffold:imports ) @@ -76,6 +77,8 @@ func main() { kubeConfig.QPS = float32(qps) kubeConfig.Burst = burst + metrics.Register() + mgr, err := ctrl.NewManager(kubeConfig, ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{BindAddress: metricsAddr}, diff --git a/go.mod b/go.mod index c8513c5a..7d29f820 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/onsi/ginkgo/v2 v2.17.1 github.com/onsi/gomega v1.32.0 github.com/open-policy-agent/cert-controller v0.10.1 + github.com/prometheus/client_golang v1.18.0 k8s.io/api v0.29.3 k8s.io/apiextensions-apiserver v0.29.3 k8s.io/apimachinery v0.29.3 @@ -48,7 +49,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.18.0 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect diff --git a/pkg/controllers/pod_controller.go b/pkg/controllers/pod_controller.go index d86e61e1..85a31bf6 100644 --- a/pkg/controllers/pod_controller.go +++ b/pkg/controllers/pod_controller.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1" + "sigs.k8s.io/lws/pkg/metrics" acceleratorutils "sigs.k8s.io/lws/pkg/utils/accelerators" podutils "sigs.k8s.io/lws/pkg/utils/pod" statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset" @@ -174,6 +175,7 @@ func (r *PodReconciler) handleRestartPolicy(ctx context.Context, pod corev1.Pod, }); err != nil { return false, err } + metrics.RecreatingGroup(leader.Name) return true, nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 00000000..ee6ff1a1 --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,57 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + rollingUpdateDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "lws", + Name: "rolling_update_duration", + Help: "Duration of rolling updates", + }, []string{"hash"}, + ) + + recreateGroupTimes = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: "lws", + Name: "recreate_group_times", + Help: "number of times a group has been recreated", + }, []string{"leadername"}, + ) +) + +func RollingUpdate(hash string, duration time.Duration) { + rollingUpdateDuration.WithLabelValues(hash).Observe(duration.Seconds()) +} + +func RecreatingGroup(leaderName string) { + recreateGroupTimes.WithLabelValues(leaderName).Inc() +} + +func Register() { + metrics.Registry.MustRegister( + rollingUpdateDuration, + recreateGroupTimes, + ) +} From c901648d25ec08354b5cecf10c95be200d71acf7 Mon Sep 17 00:00:00 2001 From: Edwinhr716 Date: Tue, 16 Apr 2024 22:55:25 +0000 Subject: [PATCH 2/3] added ReplicaReadyStatus metric, added unit tests --- pkg/controllers/leaderworkerset_controller.go | 14 +++++++++- pkg/metrics/metrics.go | 13 +++++++++ pkg/metrics/metrics_test.go | 28 +++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 pkg/metrics/metrics_test.go diff --git a/pkg/controllers/leaderworkerset_controller.go b/pkg/controllers/leaderworkerset_controller.go index e449d686..feab0d77 100644 --- a/pkg/controllers/leaderworkerset_controller.go +++ b/pkg/controllers/leaderworkerset_controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strconv" + "time" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -41,6 +42,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1" + "sigs.k8s.io/lws/pkg/metrics" "sigs.k8s.io/lws/pkg/utils" podutils "sigs.k8s.io/lws/pkg/utils/pod" statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset" @@ -367,7 +369,8 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l } if podutils.PodRunningAndReady(leaderPod) { readyCount++ - + waitTime := getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws) + metrics.ReplicaReadyStatus(sts.Name, time.Since(waitTime.Time)) if sts.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash && leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash { updatedCount++ } @@ -569,3 +572,12 @@ func templateUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerS func replicasUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerSet) bool { return *sts.Spec.Replicas != *lws.Spec.Replicas } + +func getLastTransitionTime(conditionType string, lws *leaderworkerset.LeaderWorkerSet) metav1.Time { + for _, condition := range lws.Status.Conditions { + if condition.Type == conditionType { + return condition.LastTransitionTime + } + } + return metav1.Now() +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index ee6ff1a1..9282a18e 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -39,6 +39,14 @@ var ( Help: "number of times a group has been recreated", }, []string{"leadername"}, ) + + replicaReadyStatusDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "lws", + Name: "replica_ready_status_duration", + Help: "latency for each replica to be scheduled and become ready", + }, []string{"leadername"}, + ) ) func RollingUpdate(hash string, duration time.Duration) { @@ -49,9 +57,14 @@ func RecreatingGroup(leaderName string) { recreateGroupTimes.WithLabelValues(leaderName).Inc() } +func ReplicaReadyStatus(leaderName string, time time.Duration) { + replicaReadyStatusDuration.WithLabelValues(leaderName).Observe(time.Seconds()) +} + func Register() { metrics.Registry.MustRegister( rollingUpdateDuration, recreateGroupTimes, + replicaReadyStatusDuration, ) } diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go new file mode 100644 index 00000000..5443e62b --- /dev/null +++ b/pkg/metrics/metrics_test.go @@ -0,0 +1,28 @@ +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestRecreatingGroup(t *testing.T) { + prometheus.MustRegister(recreateGroupTimes) + + RecreatingGroup("lws-sample-0") + RecreatingGroup("lws-sample-1") + RecreatingGroup("lws-sample-0") + + if count := testutil.CollectAndCount(recreateGroupTimes); count != 2 { + t.Errorf("Expecting %d metrics, got: %d", 2, count) + } + + if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-0")); count != float64(2) { + t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-0", 2, count) + } + + if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-1")); count != float64(1) { + t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-1", 1, count) + } +} From 59a987a01a829a741710cb169ec5e081ed3f6567 Mon Sep 17 00:00:00 2001 From: Edwinhr716 Date: Wed, 17 Apr 2024 17:31:32 +0000 Subject: [PATCH 3/3] fixed where the readystatus is reported --- pkg/controllers/leaderworkerset_controller.go | 9 +++++---- pkg/metrics/metrics.go | 12 ++++++++++-- pkg/utils/pod/pod_utils.go | 5 +++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pkg/controllers/leaderworkerset_controller.go b/pkg/controllers/leaderworkerset_controller.go index feab0d77..7677f689 100644 --- a/pkg/controllers/leaderworkerset_controller.go +++ b/pkg/controllers/leaderworkerset_controller.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "strconv" - "time" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -350,7 +349,7 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l readyCount := 0 updatedCount := 0 templateHash := utils.LeaderWorkerTemplateHash(lws) - + var readyLeaderPods []corev1.Pod // Iterate through all statefulsets. for _, sts := range lwssts.Items { if sts.Name == lws.Name { @@ -369,8 +368,7 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l } if podutils.PodRunningAndReady(leaderPod) { readyCount++ - waitTime := getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws) - metrics.ReplicaReadyStatus(sts.Name, time.Since(waitTime.Time)) + readyLeaderPods = append(readyLeaderPods, leaderPod) if sts.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash && leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash { updatedCount++ } @@ -392,6 +390,9 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l updateCondition := setCondition(lws, condition) // if condition changed, record events if updateCondition { + if updatedCount == int(*lws.Spec.Replicas) { + metrics.ReplicaReadyStatus(readyLeaderPods, getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws)) + } r.Record.Eventf(lws, corev1.EventTypeNormal, condition.Reason, condition.Message+fmt.Sprintf(", with %d groups ready of total %d groups", readyCount, int(*lws.Spec.Replicas))) } return updateStatus || updateCondition, nil diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 9282a18e..2854cc94 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -19,6 +19,10 @@ package metrics import ( "time" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + podutils "sigs.k8s.io/lws/pkg/utils/pod" + "github.com/prometheus/client_golang/prometheus" "sigs.k8s.io/controller-runtime/pkg/metrics" ) @@ -57,8 +61,12 @@ func RecreatingGroup(leaderName string) { recreateGroupTimes.WithLabelValues(leaderName).Inc() } -func ReplicaReadyStatus(leaderName string, time time.Duration) { - replicaReadyStatusDuration.WithLabelValues(leaderName).Observe(time.Seconds()) +func ReplicaReadyStatus(readyPods []corev1.Pod, startTime metav1.Time) { + for _, pod := range readyPods { + readyTime := podutils.PodReadyConditionLastTransitionTime(pod).Time + latency := readyTime.Sub(startTime.Time) + replicaReadyStatusDuration.WithLabelValues(pod.Name).Observe(latency.Seconds()) + } } func Register() { diff --git a/pkg/utils/pod/pod_utils.go b/pkg/utils/pod/pod_utils.go index 046436ed..d7356950 100644 --- a/pkg/utils/pod/pod_utils.go +++ b/pkg/utils/pod/pod_utils.go @@ -18,6 +18,7 @@ package pod import ( corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1" ) @@ -56,6 +57,10 @@ func PodRunningAndReady(pod corev1.Pod) bool { return pod.Status.Phase == corev1.PodRunning && podReady(pod) } +func PodReadyConditionLastTransitionTime(pod corev1.Pod) metav1.Time { + return getPodReadyCondition(pod.Status).LastTransitionTime +} + func podReady(pod corev1.Pod) bool { return podReadyConditionTrue(pod.Status) }