diff --git a/pkg/controllers/leaderworkerset_controller.go b/pkg/controllers/leaderworkerset_controller.go
index e449d686..feab0d77 100644
--- a/pkg/controllers/leaderworkerset_controller.go
+++ b/pkg/controllers/leaderworkerset_controller.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"strconv"
+	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
@@ -41,6 +42,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
+	"sigs.k8s.io/lws/pkg/metrics"
 	"sigs.k8s.io/lws/pkg/utils"
 	podutils "sigs.k8s.io/lws/pkg/utils/pod"
 	statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset"
@@ -367,7 +369,13 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l
 		}
 		if podutils.PodRunningAndReady(leaderPod) {
 			readyCount++
-
+			// Record readiness latency only when a Progressing condition exists to
+			// measure from; without one there is no start time to subtract.
+			// NOTE(review): this runs each time updateConditions sees a ready leader
+			// pod, so one replica may be observed repeatedly — confirm intended.
+			if waitTime, ok := getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws); ok {
+				metrics.ReplicaReadyStatus(sts.Name, time.Since(waitTime.Time))
+			}
 			if sts.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash && leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash {
 				updatedCount++
 			}
@@ -569,3 +577,15 @@ func templateUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerS
 func replicasUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerSet) bool {
 	return *sts.Spec.Replicas != *lws.Spec.Replicas
 }
+
+// getLastTransitionTime returns the LastTransitionTime of the first condition in
+// lws.Status.Conditions whose Type equals conditionType. The boolean result is
+// false, and the returned time is the zero value, when no such condition exists.
+func getLastTransitionTime(conditionType string, lws *leaderworkerset.LeaderWorkerSet) (metav1.Time, bool) {
+	for _, condition := range lws.Status.Conditions {
+		if condition.Type == conditionType {
+			return condition.LastTransitionTime, true
+		}
+	}
+	return metav1.Time{}, false
+}
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index ee6ff1a1..9282a18e 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -39,6 +39,18 @@ var (
 			Help:      "number of times a group has been recreated",
 		}, []string{"leadername"},
 	)
+
+	// replicaReadyStatusDuration records, per leader name, the durations in
+	// seconds passed to ReplicaReadyStatus.
+	// NOTE(review): Buckets is unset, so the client library's default buckets
+	// apply — confirm their range fits replica start-up latency.
+	replicaReadyStatusDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "lws",
+			Name:      "replica_ready_status_duration",
+			Help:      "latency for each replica to be scheduled and become ready",
+		}, []string{"leadername"},
+	)
 )
 
 func RollingUpdate(hash string, duration time.Duration) {
@@ -49,9 +61,16 @@ func RecreatingGroup(leaderName string) {
 	recreateGroupTimes.WithLabelValues(leaderName).Inc()
 }
 
+// ReplicaReadyStatus observes duration, converted to seconds, on the
+// replica_ready_status_duration histogram under the given leader name label.
+func ReplicaReadyStatus(leaderName string, duration time.Duration) {
+	replicaReadyStatusDuration.WithLabelValues(leaderName).Observe(duration.Seconds())
+}
+
 func Register() {
 	metrics.Registry.MustRegister(
 		rollingUpdateDuration,
 		recreateGroupTimes,
+		replicaReadyStatusDuration,
 	)
 }
diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go
new file mode 100644
index 00000000..5443e62b
--- /dev/null
+++ b/pkg/metrics/metrics_test.go
@@ -0,0 +1,28 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+)
+
+func TestRecreatingGroup(t *testing.T) {
+	prometheus.MustRegister(recreateGroupTimes)
+
+	RecreatingGroup("lws-sample-0")
+	RecreatingGroup("lws-sample-1")
+	RecreatingGroup("lws-sample-0")
+
+	if count := testutil.CollectAndCount(recreateGroupTimes); count != 2 {
+		t.Errorf("Expecting %d metrics, got: %d", 2, count)
+	}
+
+	if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-0")); count != float64(2) {
+		t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-0", 2, count)
+	}
+
+	if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-1")); count != float64(1) {
+		t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-1", 1, count)
+	}
+}