From b43cb9bc01cb16c810035da51dad8358d9432a7d Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Fri, 23 Jun 2023 11:36:23 +0200 Subject: [PATCH 1/4] scale-only: manager.yaml: GOMAXPROCS --- bootstrap/kubeadm/config/manager/manager.yaml | 3 +++ config/manager/manager.yaml | 2 ++ controlplane/kubeadm/config/manager/manager.yaml | 2 ++ test/infrastructure/docker/config/manager/manager.yaml | 2 ++ test/infrastructure/inmemory/config/manager/manager.yaml | 2 ++ 5 files changed, 11 insertions(+) diff --git a/bootstrap/kubeadm/config/manager/manager.yaml b/bootstrap/kubeadm/config/manager/manager.yaml index d54386179159..3f7f357af8db 100644 --- a/bootstrap/kubeadm/config/manager/manager.yaml +++ b/bootstrap/kubeadm/config/manager/manager.yaml @@ -25,6 +25,9 @@ spec: - "--bootstrap-token-ttl=${KUBEADM_BOOTSTRAP_TOKEN_TTL:=15m}" image: controller:latest name: manager + env: + - name: GOMAXPROCS + value: "16" ports: - containerPort: 9440 name: healthz diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 8a2cd9c3c23d..2cfbf6b3c298 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -38,6 +38,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.uid + - name: GOMAXPROCS + value: "16" ports: - containerPort: 9440 name: healthz diff --git a/controlplane/kubeadm/config/manager/manager.yaml b/controlplane/kubeadm/config/manager/manager.yaml index 34e85f86e86d..14ae8ecec1d2 100644 --- a/controlplane/kubeadm/config/manager/manager.yaml +++ b/controlplane/kubeadm/config/manager/manager.yaml @@ -37,6 +37,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.uid + - name: GOMAXPROCS + value: "16" ports: - containerPort: 9440 name: healthz diff --git a/test/infrastructure/docker/config/manager/manager.yaml b/test/infrastructure/docker/config/manager/manager.yaml index 0a6fcafe94b8..bb1f5e9fb134 100644 --- a/test/infrastructure/docker/config/manager/manager.yaml +++ b/test/infrastructure/docker/config/manager/manager.yaml @@ -37,6 +37,8 @@ spec: fieldPath: metadata.uid - name: DOCKER_HOST value: ${CAPD_DOCKER_HOST:=""} + - name: GOMAXPROCS + value: "16" ports: - containerPort: 9440 name: healthz diff --git a/test/infrastructure/inmemory/config/manager/manager.yaml b/test/infrastructure/inmemory/config/manager/manager.yaml index 3c412cf85ccd..2daa31acdde2 100644 --- a/test/infrastructure/inmemory/config/manager/manager.yaml +++ b/test/infrastructure/inmemory/config/manager/manager.yaml @@ -28,6 +28,8 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP + - name: GOMAXPROCS + value: "16" ports: - containerPort: 9440 name: healthz From 3d0c9d31ca390bd4fc47d0afc69692e6d7dd6b70 Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Fri, 23 Jun 2023 11:48:06 +0200 Subject: [PATCH 2/4] scale-only: go.mod: use CR with more client metrics --- go.mod | 2 ++ go.sum | 4 ++-- hack/tools/go.mod | 2 ++ hack/tools/go.sum | 4 ++-- test/go.mod | 2 ++ test/go.sum | 4 ++-- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 45579a397bca..df5add4b77c9 100644 --- a/go.mod +++ b/go.mod @@ -154,3 +154,5 @@ require ( github.com/russross/blackfriday/v2 v2.1.0 // indirect golang.org/x/tools v0.9.3 // indirect ) + +replace sigs.k8s.io/controller-runtime => github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599 diff --git a/go.sum b/go.sum index a08904f42944..fe6e092f62e1 100644 --- a/go.sum +++ b/go.sum @@ -446,6 +446,8 @@ github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf 
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599 h1:ToG0t2Q94/N5asZiY27UfEK8ipaGKQUare1Q29r+GRE= +github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= @@ -968,8 +970,6 @@ k8s.io/utils v0.0.0-20230209194617-a36077c30491/go.mod h1:OLgZIPagt7ERELqWJFomSt rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/controller-runtime v0.15.0 h1:ML+5Adt3qZnMSYxZ7gAverBLNPSMQEibtzAgp0UPojU= -sigs.k8s.io/controller-runtime v0.15.0/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/kustomize/api v0.13.2 h1:kejWfLeJhUsTGioDoFNJET5LQe/ajzXhJGYoU+pJsiA= diff --git a/hack/tools/go.mod b/hack/tools/go.mod index e6adc69f1916..ded1856e2e9c 100644 --- a/hack/tools/go.mod +++ b/hack/tools/go.mod @@ -159,3 +159,5 @@ require ( sigs.k8s.io/kustomize/kyaml v0.14.1 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect ) + +replace sigs.k8s.io/controller-runtime => github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599 diff --git a/hack/tools/go.sum b/hack/tools/go.sum index 3fb823479db7..201b88d3d505 100644 --- a/hack/tools/go.sum +++ b/hack/tools/go.sum @@ -472,6 +472,8 @@ github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjR github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599 h1:ToG0t2Q94/N5asZiY27UfEK8ipaGKQUare1Q29r+GRE= +github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= @@ -991,8 +993,6 @@ oras.land/oras-go v1.2.2/go.mod h1:Apa81sKoZPpP7CDciE006tSZ0x3Q3+dOoBcMZ/aNxvw= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod 
h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/controller-runtime v0.15.0 h1:ML+5Adt3qZnMSYxZ7gAverBLNPSMQEibtzAgp0UPojU= -sigs.k8s.io/controller-runtime v0.15.0/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= sigs.k8s.io/controller-tools v0.12.0 h1:TY6CGE6+6hzO7hhJFte65ud3cFmmZW947jajXkuDfBw= sigs.k8s.io/controller-tools v0.12.0/go.mod h1:rXlpTfFHZMpZA8aGq9ejArgZiieHd+fkk/fTatY8A2M= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= diff --git a/test/go.mod b/test/go.mod index 0f70f3741629..9cf91aeb5f5d 100644 --- a/test/go.mod +++ b/test/go.mod @@ -157,3 +157,5 @@ require ( sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect ) + +replace sigs.k8s.io/controller-runtime => github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599 diff --git a/test/go.sum b/test/go.sum index 3da9ee5d1e64..f4010fd4137a 100644 --- a/test/go.sum +++ b/test/go.sum @@ -456,6 +456,8 @@ github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjR github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599 h1:ToG0t2Q94/N5asZiY27UfEK8ipaGKQUare1Q29r+GRE= +github.com/sbueringer/controller-runtime v0.2.0-beta.1.0.20230629121040-af02ca513599/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= @@ -1011,8 +1013,6 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.1.2 h1:trsWhjU5jZrx6UvFu4WzQDrN7Pga4a7Qg+zcfcj64PA= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.1.2/go.mod h1:+qG7ISXqCDVVcyO8hLn12AKVYYUjM7ftlqsqmrhMZE0= -sigs.k8s.io/controller-runtime v0.15.0 h1:ML+5Adt3qZnMSYxZ7gAverBLNPSMQEibtzAgp0UPojU= -sigs.k8s.io/controller-runtime v0.15.0/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/kind v0.20.0 h1:f0sc3v9mQbGnjBUaqSFST1dwIuiikKVGgoTwpoP33a8= From 27d625715fc498391c0a3d2598b8541bd80850fd Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Wed, 28 Jun 2023 16:22:03 +0200 Subject: [PATCH 3/4] scale-only: Define scenarios in scale_test.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stefan Büringer buringerst@vmware.com --- test/e2e/scale_test.go | 96 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 11 deletions(-) diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 6478ae2ed2e7..5065fbdcd3db 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -24,21 +24,95 @@ import ( "k8s.io/utils/pointer" ) -var _ 
= Describe("When scale testing using in-memory provider [Scale]", func() { +// Stefan +var _ = Describe("When scale testing using in-memory provider [Scale] [Small workload cluster]", func() { scaleSpec(ctx, func() scaleSpecInput { return scaleSpecInput{ - E2EConfig: e2eConfig, - ClusterctlConfigPath: clusterctlConfigPath, - InfrastructureProvider: pointer.String("in-memory"), - BootstrapClusterProxy: bootstrapClusterProxy, - ArtifactFolder: artifactFolder, - ClusterCount: pointer.Int64(10), + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + InfrastructureProvider: pointer.String("in-memory"), + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + FailFast: false, + SkipWaitForCreation: false, + SkipCleanup: true, + Flavor: pointer.String(""), + // per Scenario + Concurrency: pointer.Int64(50), + ClusterCount: pointer.Int64(2000), + ControlPlaneMachineCount: pointer.Int64(1), + MachineDeploymentCount: pointer.Int64(0), + WorkerMachineCount: pointer.Int64(0), + DeployClusterInSeparateNamespaces: true, + } + }) +}) + +// Yuvaraj +var _ = Describe("When scale testing using in-memory provider [Scale] [Small medium workload cluster]", func() { + scaleSpec(ctx, func() scaleSpecInput { + return scaleSpecInput{ + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + InfrastructureProvider: pointer.String("in-memory"), + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + FailFast: false, + SkipWaitForCreation: false, + SkipCleanup: true, + Flavor: pointer.String(""), + // per Scenario + Concurrency: pointer.Int64(20), + ClusterCount: pointer.Int64(200), + ControlPlaneMachineCount: pointer.Int64(3), + MachineDeploymentCount: pointer.Int64(1), + WorkerMachineCount: pointer.Int64(10), + } + }) +}) + +// Yuvaraj +var _ = Describe("When scale testing using in-memory provider [Scale] [Medium workload cluster]", func() { + scaleSpec(ctx, func() scaleSpecInput { + return scaleSpecInput{ + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + InfrastructureProvider: pointer.String("in-memory"), + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + FailFast: false, + SkipWaitForCreation: false, + SkipCleanup: true, + Flavor: pointer.String(""), + // per Scenario Concurrency: pointer.Int64(5), - Flavor: pointer.String(""), - ControlPlaneMachineCount: pointer.Int64(1), + ClusterCount: pointer.Int64(40), + ControlPlaneMachineCount: pointer.Int64(3), MachineDeploymentCount: pointer.Int64(1), - WorkerMachineCount: pointer.Int64(3), - SkipCleanup: skipCleanup, + WorkerMachineCount: pointer.Int64(50), + } + }) +}) + +// Yuvaraj +var _ = Describe("When scale testing using in-memory provider [Scale] [Large workload cluster]", func() { + scaleSpec(ctx, func() scaleSpecInput { + return scaleSpecInput{ + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + InfrastructureProvider: pointer.String("in-memory"), + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + FailFast: false, + SkipWaitForCreation: false, + SkipCleanup: true, + Flavor: pointer.String(""), + // per Scenario + Concurrency: pointer.Int64(1), + ClusterCount: pointer.Int64(1), + ControlPlaneMachineCount: pointer.Int64(3), + MachineDeploymentCount: pointer.Int64(500), + WorkerMachineCount: pointer.Int64(2), } }) }) From 9fde59c9c3ba0326aec62f5a9d8529feec8a0e3b Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Fri, 23 Jun 2023 12:11:36 +0200 Subject: [PATCH 4/4] 
follow-up: implement tracing --- controllers/alias.go | 6 + controllers/external/util.go | 7 + controllers/remote/cluster_cache_tracker.go | 12 + controlplane/kubeadm/controllers/alias.go | 3 + controlplane/kubeadm/internal/cluster.go | 16 + .../kubeadm/internal/control_plane.go | 7 + .../internal/controllers/controller.go | 38 +- .../kubeadm/internal/controllers/helpers.go | 16 + .../internal/controllers/remediation.go | 7 + .../kubeadm/internal/controllers/scale.go | 10 + .../kubeadm/internal/controllers/status.go | 4 + .../kubeadm/internal/controllers/upgrade.go | 4 + .../kubeadm/internal/etcd_client_generator.go | 7 + .../kubeadm/internal/workload_cluster.go | 34 ++ .../internal/workload_cluster_conditions.go | 16 + .../internal/workload_cluster_coredns.go | 19 + .../kubeadm/internal/workload_cluster_etcd.go | 25 + .../kubeadm/internal/workload_cluster_rbac.go | 14 + controlplane/kubeadm/main.go | 32 +- go.mod | 13 + go.sum | 39 ++ .../dashboards/cluster-api-traces.json | 434 ++++++++++++++++++ hack/observability/grafana/kustomization.yaml | 1 + hack/tools/go.mod | 13 + hack/tools/go.sum | 32 ++ .../controllers/machine/machine_controller.go | 19 +- .../machine/machine_controller_noderef.go | 4 + .../machine/machine_controller_phases.go | 15 +- .../controllers/topology/cluster/blueprint.go | 4 + .../topology/cluster/cluster_controller.go | 12 +- .../topology/cluster/current_state.go | 16 +- .../topology/cluster/desired_state.go | 4 + .../topology/cluster/reconcile_state.go | 37 ++ internal/util/ssa/managedfields.go | 7 + internal/util/ssa/patch.go | 4 + internal/util/trace/trace.go | 157 +++++++ main.go | 33 +- util/conversion/conversion.go | 4 + util/kubeconfig/kubeconfig.go | 7 + util/patch/patch.go | 4 + util/secret/certificates.go | 5 + 41 files changed, 1132 insertions(+), 9 deletions(-) create mode 100644 hack/observability/grafana/dashboards/cluster-api-traces.json create mode 100644 internal/util/trace/trace.go diff --git a/controllers/alias.go b/controllers/alias.go index a48394c425fa..f89b409034cc 100644 --- a/controllers/alias.go +++ b/controllers/alias.go @@ -20,6 +20,7 @@ import ( "context" "time" + oteltrace "go.opentelemetry.io/otel/trace" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -65,6 +66,7 @@ type MachineReconciler struct { UnstructuredCachingClient client.Client APIReader client.Reader Tracker *remote.ClusterCacheTracker + TraceProvider oteltrace.TracerProvider // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -79,6 +81,7 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag UnstructuredCachingClient: r.UnstructuredCachingClient, APIReader: r.APIReader, Tracker: r.Tracker, + TraceProvider: r.TraceProvider, WatchFilterValue: r.WatchFilterValue, NodeDrainClientTimeout: r.NodeDrainClientTimeout, }).SetupWithManager(ctx, mgr, options) @@ -150,6 +153,8 @@ type ClusterTopologyReconciler struct { RuntimeClient runtimeclient.Client + TraceProvider oteltrace.TracerProvider + // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -164,6 +169,7 @@ func (r *ClusterTopologyReconciler) SetupWithManager(ctx context.Context, mgr ct APIReader: r.APIReader, RuntimeClient: r.RuntimeClient, UnstructuredCachingClient: r.UnstructuredCachingClient, + TraceProvider: r.TraceProvider, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) } diff --git a/controllers/external/util.go b/controllers/external/util.go index 89647454417d..85fbf154755c 100644 --- a/controllers/external/util.go +++ b/controllers/external/util.go @@ -28,10 +28,14 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) // Get uses the client and reference to get an external, unstructured object. func Get(ctx context.Context, c client.Reader, ref *corev1.ObjectReference, namespace string) (*unstructured.Unstructured, error) { + ctx, span := traceutil.Start(ctx, "external.Get") + defer span.End() + if ref == nil { return nil, errors.Errorf("cannot get object - object reference not set") } @@ -48,6 +52,9 @@ func Get(ctx context.Context, c client.Reader, ref *corev1.ObjectReference, name // Delete uses the client and reference to delete an external, unstructured object. func Delete(ctx context.Context, c client.Writer, ref *corev1.ObjectReference) error { + ctx, span := traceutil.Start(ctx, "external.Delete") + defer span.End() + obj := new(unstructured.Unstructured) obj.SetAPIVersion(ref.APIVersion) obj.SetKind(ref.Kind) diff --git a/controllers/remote/cluster_cache_tracker.go b/controllers/remote/cluster_cache_tracker.go index 191e09db85eb..eed03fc24216 100644 --- a/controllers/remote/cluster_cache_tracker.go +++ b/controllers/remote/cluster_cache_tracker.go @@ -27,6 +27,7 @@ import ( "github.com/go-logr/logr" "github.com/pkg/errors" + oteltrace "go.opentelemetry.io/otel/trace" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -97,6 +98,8 @@ type ClusterCacheTracker struct { // This information will be used to detected if the controller is running on a workload cluster, so // that we can then access the apiserver directly. controllerPodMetadata *metav1.ObjectMeta + + traceProvider oteltrace.TracerProvider } // ClusterCacheTrackerOptions defines options to configure @@ -121,6 +124,8 @@ type ClusterCacheTrackerOptions struct { // This is used to calculate the user agent string. // If not set, it defaults to "cluster-cache-tracker". ControllerName string + + TraceProvider oteltrace.TracerProvider } func setDefaultOptions(opts *ClusterCacheTrackerOptions) { @@ -135,6 +140,10 @@ func setDefaultOptions(opts *ClusterCacheTrackerOptions) { &corev1.Secret{}, } } + + if opts.TraceProvider == nil { + opts.TraceProvider = oteltrace.NewNoopTracerProvider() + } } // NewClusterCacheTracker creates a new ClusterCacheTracker. 
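Note on the defaulting just above: the tracker only gains an optional TraceProvider, and setDefaultOptions falls back to a no-op provider when the caller leaves it nil, so span creation stays essentially free unless a real provider is wired up. A minimal, self-contained sketch of that pattern follows; the Options/doWork names are illustrative only and not part of the patch.

package example

import (
	"context"

	oteltrace "go.opentelemetry.io/otel/trace"
)

// Options mirrors the shape of ClusterCacheTrackerOptions for illustration only.
type Options struct {
	// TraceProvider is optional; leave it nil to disable tracing.
	TraceProvider oteltrace.TracerProvider
}

func setDefaults(opts *Options) {
	if opts.TraceProvider == nil {
		// A no-op provider hands out spans whose End() does nothing, so callers
		// never need to check whether tracing is enabled.
		opts.TraceProvider = oteltrace.NewNoopTracerProvider()
	}
}

func doWork(ctx context.Context, opts Options) {
	setDefaults(&opts)
	// Use the returned context for downstream calls so child spans nest correctly.
	ctx, span := opts.TraceProvider.Tracer("example").Start(ctx, "doWork")
	defer span.End()
	_ = ctx
}

The same nil-check-then-noop defaulting appears again in KubeadmControlPlaneReconciler.SetupWithManager further down.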
@@ -166,6 +175,7 @@ func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOpt controllerPodMetadata: controllerPodMetadata, log: *options.Log, clientUncachedObjects: options.ClientUncachedObjects, + traceProvider: options.TraceProvider, client: manager.GetClient(), secretCachingClient: options.SecretCachingClient, scheme: manager.GetScheme(), @@ -294,6 +304,8 @@ func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster cl if err != nil { return nil, errors.Wrapf(err, "error fetching REST client config for remote cluster %q", cluster.String()) } + // FIXME: this seems to lead to problems with spans (random 10s spans in the trace) + // config.Wrap(tracing.WrapperFor(t.traceProvider)) //nolint:gocritic // Create a client and a cache for the cluster. c, uncachedClient, cache, err := t.createClient(ctx, config, cluster, indexes) diff --git a/controlplane/kubeadm/controllers/alias.go b/controlplane/kubeadm/controllers/alias.go index 7b03abff618a..8ba2f0b53231 100644 --- a/controlplane/kubeadm/controllers/alias.go +++ b/controlplane/kubeadm/controllers/alias.go @@ -20,6 +20,7 @@ import ( "context" "time" + "go.opentelemetry.io/otel/trace" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -33,6 +34,7 @@ type KubeadmControlPlaneReconciler struct { Client client.Client SecretCachingClient client.Client Tracker *remote.ClusterCacheTracker + TraceProvider trace.TracerProvider EtcdDialTimeout time.Duration EtcdCallTimeout time.Duration @@ -47,6 +49,7 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg Client: r.Client, SecretCachingClient: r.SecretCachingClient, Tracker: r.Tracker, + TraceProvider: r.TraceProvider, EtcdDialTimeout: r.EtcdDialTimeout, EtcdCallTimeout: r.EtcdCallTimeout, WatchFilterValue: r.WatchFilterValue, diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 3f4b7721ccba..e9c6ec68912f 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -32,6 +32,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/secret" ) @@ -84,11 +85,17 @@ func (m *Management) List(ctx context.Context, list client.ObjectList, opts ...c // GetMachinesForCluster returns a list of machines that can be filtered or not. // If no filter is supplied then all machines associated with the target cluster are returned. func (m *Management) GetMachinesForCluster(ctx context.Context, cluster *clusterv1.Cluster, filters ...collections.Func) (collections.Machines, error) { + ctx, span := traceutil.Start(ctx, "Management.GetMachinesForCluster") + defer span.End() + return collections.GetFilteredMachinesForCluster(ctx, m.Client, cluster, filters...) } // GetMachinePoolsForCluster returns a list of machine pools owned by the cluster. 
func (m *Management) GetMachinePoolsForCluster(ctx context.Context, cluster *clusterv1.Cluster) (*expv1.MachinePoolList, error) { + ctx, span := traceutil.Start(ctx, "Management.GetMachinesForCluster") + defer span.End() + selectors := []client.ListOption{ client.InNamespace(cluster.GetNamespace()), client.MatchingLabels{ @@ -103,6 +110,9 @@ func (m *Management) GetMachinePoolsForCluster(ctx context.Context, cluster *clu // GetWorkloadCluster builds a cluster object. // The cluster comes with an etcd client generator to connect to any etcd pod living on a managed machine. func (m *Management) GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) { + ctx, span := traceutil.Start(ctx, "Management.GetWorkloadCluster") + defer span.End() + // TODO(chuckha): Inject this dependency. // TODO(chuckha): memoize this function. The workload client only exists as long as a reconciliation loop. restConfig, err := m.Tracker.GetRESTConfig(ctx, clusterKey) @@ -178,6 +188,9 @@ func (m *Management) GetWorkloadCluster(ctx context.Context, clusterKey client.O } func (m *Management) getEtcdCAKeyPair(ctx context.Context, clusterKey client.ObjectKey) ([]byte, []byte, error) { + ctx, span := traceutil.Start(ctx, "Management.getEtcdCAKeyPair") + defer span.End() + etcdCASecret := &corev1.Secret{} etcdCAObjectKey := client.ObjectKey{ Namespace: clusterKey.Namespace, @@ -207,6 +220,9 @@ func (m *Management) getEtcdCAKeyPair(ctx context.Context, clusterKey client.Obj } func (m *Management) getAPIServerEtcdClientCert(ctx context.Context, clusterKey client.ObjectKey) (tls.Certificate, error) { + ctx, span := traceutil.Start(ctx, "Management.getAPIServerEtcdClientCert") + defer span.End() + apiServerEtcdClientCertificateSecret := &corev1.Secret{} apiServerEtcdClientCertificateObjectKey := client.ObjectKey{ Namespace: clusterKey.Namespace, diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 958aea89cbd6..792710579823 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -30,6 +30,7 @@ import ( bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controllers/external" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/failuredomains" "sigs.k8s.io/cluster-api/util/patch" @@ -58,6 +59,9 @@ type ControlPlane struct { // NewControlPlane returns an instantiated ControlPlane. func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, client client.Client, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, ownedMachines collections.Machines) (*ControlPlane, error) { + ctx, span := traceutil.Start(ctx, "NewControlPlane") + defer span.End() + infraObjects, err := getInfraResources(ctx, client, ownedMachines) if err != nil { return nil, err @@ -255,6 +259,9 @@ func (c *ControlPlane) HasUnhealthyMachine() bool { // PatchMachines patches all the machines conditions. 
func (c *ControlPlane) PatchMachines(ctx context.Context) error { + ctx, span := traceutil.Start(ctx, "ControlPlane.PatchMachines") + defer span.End() + errList := []error{} for i := range c.Machines { machine := c.Machines[i] diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index 3e6bc71af9ea..349542c272d1 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -24,6 +24,7 @@ import ( "github.com/blang/semver" "github.com/pkg/errors" + oteltrace "go.opentelemetry.io/otel/trace" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -47,6 +48,7 @@ import ( "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/util/ssa" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/collections" @@ -76,6 +78,7 @@ type KubeadmControlPlaneReconciler struct { controller controller.Controller recorder record.EventRecorder Tracker *remote.ClusterCacheTracker + TraceProvider oteltrace.TracerProvider EtcdDialTimeout time.Duration EtcdCallTimeout time.Duration @@ -95,6 +98,10 @@ type KubeadmControlPlaneReconciler struct { } func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { + if r.TraceProvider == nil { + r.TraceProvider = oteltrace.NewNoopTracerProvider() + } + tr := traceutil.Reconciler(r, r.TraceProvider, "kubeadmcontrolplane", "KubeadmControlPlane") c, err := ctrl.NewControllerManagedBy(mgr). For(&controlplanev1.KubeadmControlPlane{}). Owns(&clusterv1.Machine{}). @@ -109,7 +116,7 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg predicates.ClusterUnpausedAndInfrastructureReady(ctrl.LoggerFrom(ctx)), ), ), - ).Build(r) + ).Build(tr) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } @@ -301,6 +308,9 @@ func (r *KubeadmControlPlaneReconciler) initControlPlaneScope(ctx context.Contex } func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kcp *controlplanev1.KubeadmControlPlane) error { + ctx, span := traceutil.Start(ctx, "patchKubeadmControlPlane") + defer span.End() + // Always update the readyCondition by summarizing the state of other conditions. conditions.SetSummary(kcp, conditions.WithConditions( @@ -332,6 +342,9 @@ func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kc // reconcile handles KubeadmControlPlane reconciliation. func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPlane *internal.ControlPlane) (res ctrl.Result, reterr error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcile") + defer span.End() + log := ctrl.LoggerFrom(ctx) log.Info("Reconcile KubeadmControlPlane") @@ -507,6 +520,9 @@ func (r *KubeadmControlPlaneReconciler) reconcileClusterCertificates(ctx context // The implementation does not take non-control plane workloads into consideration. This may or may not change in the future. // Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064. 
func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileDelete") + defer span.End() + log := ctrl.LoggerFrom(ctx) log.Info("Reconcile KubeadmControlPlane deletion") @@ -594,6 +610,9 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(_ context.C // Otherwise, fields would be co-owned by our "old" "manager" and "capi-kubeadmcontrolplane" and then we would not be // able to e.g. drop labels and annotations. func (r *KubeadmControlPlaneReconciler) syncMachines(ctx context.Context, controlPlane *internal.ControlPlane) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.syncMachines") + defer span.End() + patchHelpers := map[string]*patch.Helper{} for machineName := range controlPlane.Machines { m := controlPlane.Machines[machineName] @@ -677,6 +696,9 @@ func (r *KubeadmControlPlaneReconciler) syncMachines(ctx context.Context, contro // reconcileControlPlaneConditions is responsible of reconciling conditions reporting the status of static pods and // the status of the etcd cluster. func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneConditions(ctx context.Context, controlPlane *internal.ControlPlane) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileControlPlaneConditions") + defer span.End() + // If the cluster is not yet initialized, there is no way to connect to the workload cluster and fetch information // for updating conditions. Return early. if !controlPlane.KCP.Status.Initialized { @@ -706,6 +728,9 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneConditions(ctx cont // // NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this. func (r *KubeadmControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context, controlPlane *internal.ControlPlane) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileEtcdMembers") + defer span.End() + log := ctrl.LoggerFrom(ctx) // If etcd is not managed by KCP this is a no-op. @@ -758,6 +783,9 @@ func (r *KubeadmControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context } func (r *KubeadmControlPlaneReconciler) reconcileCertificateExpiries(ctx context.Context, controlPlane *internal.ControlPlane) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileCertificateExpiries") + defer span.End() + log := ctrl.LoggerFrom(ctx) // Return if there are no KCP-owned control-plane machines. 
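Note on the SetupWithManager hunk earlier in this file: the controller is now built with tr := traceutil.Reconciler(r, r.TraceProvider, "kubeadmcontrolplane", "KubeadmControlPlane") and .Build(tr) instead of .Build(r). The wrapper lives in the new internal/util/trace package, whose body is not part of this excerpt, so the following is only a hedged sketch of what such a wrapper could look like; every name below is an assumption rather than the patch's actual implementation. The idea: open one root span per Reconcile call and delegate to the wrapped reconciler.

package trace

import (
	"context"
	"fmt"

	oteltrace "go.opentelemetry.io/otel/trace"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

// tracingReconciler is a hypothetical wrapper; the real type in internal/util/trace may differ.
type tracingReconciler struct {
	inner    reconcile.Reconciler
	tracer   oteltrace.Tracer
	spanName string
}

// Reconciler wraps inner so that every Reconcile call runs inside its own root span,
// named after the reconciled kind (for example "KubeadmControlPlane.Reconcile").
func Reconciler(inner reconcile.Reconciler, tp oteltrace.TracerProvider, controllerName, kind string) reconcile.Reconciler {
	return &tracingReconciler{
		inner:    inner,
		tracer:   tp.Tracer("sigs.k8s.io/cluster-api/" + controllerName),
		spanName: fmt.Sprintf("%s.Reconcile", kind),
	}
}

func (t *tracingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	ctx, span := t.tracer.Start(ctx, t.spanName)
	defer span.End()
	// Spans started by the wrapped reconciler become children of this root span.
	return t.inner.Reconcile(ctx, req)
}

Because controller-runtime's Build accepts any reconcile.Reconciler, a wrapper like this slots in without touching the reconciler itself.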
@@ -828,6 +856,9 @@ func (r *KubeadmControlPlaneReconciler) reconcileCertificateExpiries(ctx context } func (r *KubeadmControlPlaneReconciler) adoptMachines(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, machines collections.Machines, cluster *clusterv1.Cluster) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.adoptMachines") + defer span.End() + // We do an uncached full quorum read against the KCP to avoid re-adopting Machines the garbage collector just intentionally orphaned // See https://github.com/kubernetes/kubernetes/issues/42639 uncached := controlplanev1.KubeadmControlPlane{} @@ -905,6 +936,9 @@ func (r *KubeadmControlPlaneReconciler) adoptMachines(ctx context.Context, kcp * } func (r *KubeadmControlPlaneReconciler) adoptOwnedSecrets(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, currentOwner *bootstrapv1.KubeadmConfig, clusterName string) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.adoptOwnedSecrets") + defer span.End() + secrets := corev1.SecretList{} if err := r.Client.List(ctx, &secrets, client.InNamespace(kcp.Namespace), client.MatchingLabels{clusterv1.ClusterNameLabel: clusterName}); err != nil { return errors.Wrap(err, "error finding secrets for adoption") @@ -941,6 +975,8 @@ func (r *KubeadmControlPlaneReconciler) adoptOwnedSecrets(ctx context.Context, k // ensureCertificatesOwnerRef ensures an ownerReference to the owner is added on the Secrets holding certificates. func (r *KubeadmControlPlaneReconciler) ensureCertificatesOwnerRef(ctx context.Context, certificates secret.Certificates, owner metav1.OwnerReference) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.ensureCertificatesOwnerRef") + defer span.End() for _, c := range certificates { if c.Secret == nil { continue diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index 49431f90a729..74af4da9c676 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -38,6 +38,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" "sigs.k8s.io/cluster-api/internal/util/ssa" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/certs" "sigs.k8s.io/cluster-api/util/conditions" @@ -48,6 +49,9 @@ import ( ) func (r *KubeadmControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileKubeconfig") + defer span.End() + log := ctrl.LoggerFrom(ctx) endpoint := controlPlane.Cluster.Spec.ControlPlaneEndpoint @@ -102,6 +106,9 @@ func (r *KubeadmControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, // Ensure the KubeadmConfigSecret has an owner reference to the control plane if it is not a user-provided secret. 
func (r *KubeadmControlPlaneReconciler) adoptKubeconfigSecret(ctx context.Context, configSecret *corev1.Secret, kcp *controlplanev1.KubeadmControlPlane) (reterr error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.adoptKubeconfigSecret") + defer span.End() + patchHelper, err := patch.NewHelper(configSecret, r.Client) if err != nil { return errors.Wrap(err, "failed to create patch helper for the kubeconfig secret") @@ -132,6 +139,9 @@ func (r *KubeadmControlPlaneReconciler) adoptKubeconfigSecret(ctx context.Contex } func (r *KubeadmControlPlaneReconciler) reconcileExternalReference(ctx context.Context, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileExternalReference") + defer span.End() + if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) { return nil } @@ -278,6 +288,9 @@ func (r *KubeadmControlPlaneReconciler) generateKubeadmConfig(ctx context.Contex // updateExternalObject updates the external object with the labels and annotations from KCP. func (r *KubeadmControlPlaneReconciler) updateExternalObject(ctx context.Context, obj client.Object, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.updateExternalObject") + defer span.End() + updatedObject := &unstructured.Unstructured{} updatedObject.SetGroupVersionKind(obj.GetObjectKind().GroupVersionKind()) updatedObject.SetNamespace(obj.GetNamespace()) @@ -312,6 +325,9 @@ func (r *KubeadmControlPlaneReconciler) createMachine(ctx context.Context, kcp * } func (r *KubeadmControlPlaneReconciler) updateMachine(ctx context.Context, machine *clusterv1.Machine, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) (*clusterv1.Machine, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.updateMachine") + defer span.End() + updatedMachine, err := r.computeDesiredMachine( kcp, cluster, &machine.Spec.InfrastructureRef, machine.Spec.Bootstrap.ConfigRef, diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index d1c8fd44efe9..1e225a20be73 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -33,6 +33,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" @@ -41,6 +42,9 @@ import ( // reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines // based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.reconcileUnhealthyMachines") + defer span.End() + log := ctrl.LoggerFrom(ctx) reconciliationTime := time.Now().UTC() @@ -351,6 +355,9 @@ func max(x, y time.Duration) time.Duration { // NOTE: this func assumes the list of members in sync with the list of 
machines/nodes, it is required to call reconcileEtcdMembers // as well as reconcileControlPlaneConditions before this. func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.canSafelyRemoveEtcdMember") + defer span.End() + log := ctrl.LoggerFrom(ctx) workloadCluster, err := controlPlane.GetWorkloadCluster(ctx) diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 4ed1682c210b..8006ca1da3ff 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -31,11 +31,15 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" ) func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.initializeControlPlane") + defer span.End() + logger := ctrl.LoggerFrom(ctx) bootstrapSpec := controlPlane.InitialControlPlaneConfig() @@ -51,6 +55,9 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte } func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.scaleUpControlPlane") + defer span.End() + logger := ctrl.LoggerFrom(ctx) // Run preflight checks to ensure that the control plane is stable before proceeding with a scale up/scale down operation; if not, wait. @@ -76,6 +83,9 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( controlPlane *internal.ControlPlane, outdatedMachines collections.Machines, ) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.scaleDownControlPlane") + defer span.End() + logger := ctrl.LoggerFrom(ctx) // Pick the Machine that we should scale down. diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go index f135baf0aae2..c49cc61d500f 100644 --- a/controlplane/kubeadm/internal/controllers/status.go +++ b/controlplane/kubeadm/internal/controllers/status.go @@ -24,6 +24,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" ) @@ -31,6 +32,9 @@ import ( // updateStatus is called after every reconcilitation loop in a defer statement to always make sure we have the // resource status subresourcs up-to-date. func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, controlPlane *internal.ControlPlane) error { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.updateStatus") + defer span.End() + selector := collections.ControlPlaneSelectorForCluster(controlPlane.Cluster.Name) // Copy label selector to its status counterpart in string format. 
// This is necessary for CRDs including scale subresources. diff --git a/controlplane/kubeadm/internal/controllers/upgrade.go b/controlplane/kubeadm/internal/controllers/upgrade.go index b1528283d959..83313c912993 100644 --- a/controlplane/kubeadm/internal/controllers/upgrade.go +++ b/controlplane/kubeadm/internal/controllers/upgrade.go @@ -25,6 +25,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/version" @@ -35,6 +36,9 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane( controlPlane *internal.ControlPlane, machinesRequireUpgrade collections.Machines, ) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "kubeadmcontrolplane.Reconciler.upgradeControlPlane") + defer span.End() + logger := ctrl.LoggerFrom(ctx) if controlPlane.KCP.Spec.RolloutStrategy == nil || controlPlane.KCP.Spec.RolloutStrategy.RollingUpdate == nil { diff --git a/controlplane/kubeadm/internal/etcd_client_generator.go b/controlplane/kubeadm/internal/etcd_client_generator.go index f67fc04281d3..de2e97c87d11 100644 --- a/controlplane/kubeadm/internal/etcd_client_generator.go +++ b/controlplane/kubeadm/internal/etcd_client_generator.go @@ -29,6 +29,7 @@ import ( "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/proxy" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) // EtcdClientGenerator generates etcd clients that connect to specific etcd members on particular control plane nodes. @@ -47,6 +48,9 @@ func NewEtcdClientGenerator(restConfig *rest.Config, tlsConfig *tls.Config, etcd ecg := &EtcdClientGenerator{restConfig: restConfig, tlsConfig: tlsConfig} ecg.createClient = func(ctx context.Context, endpoint string) (*etcd.Client, error) { + ctx, span := traceutil.Start(ctx, "EtcdClientGenerator.createClient") + defer span.End() + p := proxy.Proxy{ Kind: "pods", Namespace: metav1.NamespaceSystem, @@ -67,6 +71,9 @@ func NewEtcdClientGenerator(restConfig *rest.Config, tlsConfig *tls.Config, etcd // forFirstAvailableNode takes a list of nodes and returns a client for the first one that connects. func (c *EtcdClientGenerator) forFirstAvailableNode(ctx context.Context, nodeNames []string) (*etcd.Client, error) { + ctx, span := traceutil.Start(ctx, "EtcdClientGenerator.forFirstAvailableNode") + defer span.End() + // This is an additional safeguard for avoiding this func to return nil, nil. 
if len(nodeNames) == 0 { return nil, errors.New("invalid argument: forLeader can't be called with an empty list of nodes") diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index 4112106f570e..ba308ab0ed18 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -48,6 +48,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/proxy" "sigs.k8s.io/cluster-api/internal/util/kubeadm" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/certs" containerutil "sigs.k8s.io/cluster-api/util/container" @@ -136,6 +137,9 @@ type Workload struct { var _ WorkloadCluster = &Workload{} func (w *Workload) getControlPlaneNodes(ctx context.Context) (*corev1.NodeList, error) { + ctx, span := traceutil.Start(ctx, "Workload.getControlPlaneNodes") + defer span.End() + controlPlaneNodes := &corev1.NodeList{} controlPlaneNodeNames := sets.Set[string]{} @@ -173,6 +177,9 @@ func (w *Workload) getConfigMap(ctx context.Context, configMap ctrlclient.Object // UpdateImageRepositoryInKubeadmConfigMap updates the image repository in the kubeadm config map. func (w *Workload) UpdateImageRepositoryInKubeadmConfigMap(ctx context.Context, imageRepository string, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateImageRepositoryInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { if imageRepository == "" { return @@ -183,6 +190,9 @@ func (w *Workload) UpdateImageRepositoryInKubeadmConfigMap(ctx context.Context, // UpdateKubernetesVersionInKubeadmConfigMap updates the kubernetes version in the kubeadm config map. func (w *Workload) UpdateKubernetesVersionInKubeadmConfigMap(ctx context.Context, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateKubernetesVersionInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { c.KubernetesVersion = fmt.Sprintf("v%s", version.String()) }, version) @@ -191,6 +201,9 @@ func (w *Workload) UpdateKubernetesVersionInKubeadmConfigMap(ctx context.Context // UpdateKubeletConfigMap will create a new kubelet-config-1.x config map for a new version of the kubelet. // This is a necessary process for upgrades. func (w *Workload) UpdateKubeletConfigMap(ctx context.Context, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateKubeletConfigMap") + defer span.End() + // Check if the desired configmap already exists desiredKubeletConfigMapName := generateKubeletConfigName(version) configMapKey := ctrlclient.ObjectKey{Name: desiredKubeletConfigMapName, Namespace: metav1.NamespaceSystem} @@ -270,6 +283,9 @@ func (w *Workload) UpdateKubeletConfigMap(ctx context.Context, version semver.Ve // UpdateAPIServerInKubeadmConfigMap updates api server configuration in kubeadm config map. 
func (w *Workload) UpdateAPIServerInKubeadmConfigMap(ctx context.Context, apiServer bootstrapv1.APIServer, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateAPIServerInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { c.APIServer = apiServer }, version) @@ -277,6 +293,9 @@ func (w *Workload) UpdateAPIServerInKubeadmConfigMap(ctx context.Context, apiSer // UpdateControllerManagerInKubeadmConfigMap updates controller manager configuration in kubeadm config map. func (w *Workload) UpdateControllerManagerInKubeadmConfigMap(ctx context.Context, controllerManager bootstrapv1.ControlPlaneComponent, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateControllerManagerInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { c.ControllerManager = controllerManager }, version) @@ -284,6 +303,9 @@ func (w *Workload) UpdateControllerManagerInKubeadmConfigMap(ctx context.Context // UpdateSchedulerInKubeadmConfigMap updates scheduler configuration in kubeadm config map. func (w *Workload) UpdateSchedulerInKubeadmConfigMap(ctx context.Context, scheduler bootstrapv1.ControlPlaneComponent, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateSchedulerInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { c.Scheduler = scheduler }, version) @@ -301,6 +323,9 @@ func (w *Workload) RemoveMachineFromKubeadmConfigMap(ctx context.Context, machin // RemoveNodeFromKubeadmConfigMap removes the entry for the node from the kubeadm configmap. func (w *Workload) RemoveNodeFromKubeadmConfigMap(ctx context.Context, name string, v semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.RemoveNodeFromKubeadmConfigMap") + defer span.End() + if version.Compare(v, minKubernetesVersionWithoutClusterStatus, version.WithoutPreReleases()) >= 0 { return nil } @@ -400,6 +425,9 @@ type ClusterStatus struct { // ClusterStatus returns the status of the cluster. func (w *Workload) ClusterStatus(ctx context.Context) (ClusterStatus, error) { + ctx, span := traceutil.Start(ctx, "Workload.ClusterStatus") + defer span.End() + status := ClusterStatus{} // count the control plane nodes @@ -430,6 +458,9 @@ func (w *Workload) ClusterStatus(ctx context.Context) (ClusterStatus, error) { // GetAPIServerCertificateExpiry returns the certificate expiry of the apiserver on the given node. func (w *Workload) GetAPIServerCertificateExpiry(ctx context.Context, kubeadmConfig *bootstrapv1.KubeadmConfig, nodeName string) (*time.Time, error) { + ctx, span := traceutil.Start(ctx, "Workload.GetAPIServerCertificateExpiry") + defer span.End() + // Create a proxy. p := proxy.Proxy{ Kind: "pods", @@ -539,6 +570,9 @@ func staticPodName(component, nodeName string) string { // UpdateKubeProxyImageInfo updates kube-proxy image in the kube-proxy DaemonSet. func (w *Workload) UpdateKubeProxyImageInfo(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateKubeProxyImageInfo") + defer span.End() + // Return early if we've been asked to skip kube-proxy upgrades entirely. 
if _, ok := kcp.Annotations[controlplanev1.SkipKubeProxyAnnotation]; ok { return nil diff --git a/controlplane/kubeadm/internal/workload_cluster_conditions.go b/controlplane/kubeadm/internal/workload_cluster_conditions.go index c54b1cdf617d..0bb8ad716a33 100644 --- a/controlplane/kubeadm/internal/workload_cluster_conditions.go +++ b/controlplane/kubeadm/internal/workload_cluster_conditions.go @@ -32,6 +32,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" etcdutil "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd/util" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" ) @@ -40,6 +41,9 @@ import ( // This operation is best effort, in the sense that in case of problems in retrieving member status, it sets // the condition to Unknown state without returning any error. func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) { + ctx, span := traceutil.Start(ctx, "Workload.UpdateEtcdConditions") + defer span.End() + if controlPlane.IsEtcdManaged() { w.updateManagedEtcdConditions(ctx, controlPlane) return @@ -57,6 +61,9 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane } func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) { + ctx, span := traceutil.Start(ctx, "Workload.updateManagedEtcdConditions") + defer span.End() + // NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd // as ultimate source of truth for the list of members and for their health. controlPlaneNodes, err := w.getControlPlaneNodes(ctx) @@ -170,6 +177,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane } func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) { + ctx, span := traceutil.Start(ctx, "Workload.getCurrentEtcdMembers") + defer span.End() + // Create the etcd Client for the etcd Pod scheduled on the Node etcdClient, err := w.etcdClientGenerator.forFirstAvailableNode(ctx, []string{nodeName}) if err != nil { @@ -244,6 +254,9 @@ func compareMachinesAndMembers(controlPlane *ControlPlane, members []*etcd.Membe // components running in a static pod generated by kubeadm. This operation is best effort, in the sense that in case // of problems in retrieving the pod status, it sets the condition to Unknown state without returning any error. func (w *Workload) UpdateStaticPodConditions(ctx context.Context, controlPlane *ControlPlane) { + ctx, span := traceutil.Start(ctx, "Workload.UpdateStaticPodConditions") + defer span.End() + allMachinePodConditions := []clusterv1.ConditionType{ controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.MachineControllerManagerPodHealthyCondition, @@ -372,6 +385,9 @@ func nodeHasUnreachableTaint(node corev1.Node) bool { // in a static pod generated by kubeadm. This operation is best effort, in the sense that in case of problems // in retrieving the pod status, it sets the condition to Unknown state without returning any error. 
func (w *Workload) updateStaticPodCondition(ctx context.Context, machine *clusterv1.Machine, node corev1.Node, component string, staticPodCondition clusterv1.ConditionType) { + ctx, span := traceutil.Start(ctx, "Workload.updateStaticPodCondition") + defer span.End() + // If node ready is unknown there is a good chance that kubelet is not updating mirror pods, so we consider pod status // to be unknown as well without further investigations. if nodeReadyUnknown(node) { diff --git a/controlplane/kubeadm/internal/workload_cluster_coredns.go b/controlplane/kubeadm/internal/workload_cluster_coredns.go index 9ebf7eda9950..8e31b83a1dc6 100644 --- a/controlplane/kubeadm/internal/workload_cluster_coredns.go +++ b/controlplane/kubeadm/internal/workload_cluster_coredns.go @@ -36,6 +36,7 @@ import ( bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/internal/util/kubeadm" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" containerutil "sigs.k8s.io/cluster-api/util/container" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/version" @@ -105,6 +106,9 @@ type coreDNSInfo struct { // UpdateCoreDNS updates the kubeadm configmap, coredns corefile and coredns // deployment. func (w *Workload) UpdateCoreDNS(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateCoreDNS") + defer span.End() + // Return early if we've been asked to skip CoreDNS upgrades entirely. if _, ok := kcp.Annotations[controlplanev1.SkipCoreDNSAnnotation]; ok { return nil @@ -160,6 +164,9 @@ func (w *Workload) UpdateCoreDNS(ctx context.Context, kcp *controlplanev1.Kubead // getCoreDNSInfo returns all necessary coredns based information. func (w *Workload) getCoreDNSInfo(ctx context.Context, clusterConfig *bootstrapv1.ClusterConfiguration, version semver.Version) (*coreDNSInfo, error) { + ctx, span := traceutil.Start(ctx, "Workload.getCoreDNSInfo") + defer span.End() + // Get the coredns configmap and corefile. key := ctrlclient.ObjectKey{Name: coreDNSKey, Namespace: metav1.NamespaceSystem} cm, err := w.getConfigMap(ctx, key) @@ -254,6 +261,9 @@ func (w *Workload) getCoreDNSInfo(ctx context.Context, clusterConfig *bootstrapv // imageRepo:imageTag in the KCP dns. It will also ensure the volume of the // deployment uses the Corefile key of the coredns configmap. func (w *Workload) updateCoreDNSDeployment(ctx context.Context, info *coreDNSInfo, kubernetesVersion semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.updateCoreDNSDeployment") + defer span.End() + helper, err := patch.NewHelper(info.Deployment, w.Client) if err != nil { return err @@ -271,6 +281,9 @@ func (w *Workload) updateCoreDNSDeployment(ctx context.Context, info *coreDNSInf // updateCoreDNSImageInfoInKubeadmConfigMap updates the kubernetes version in the kubeadm config map. 
func (w *Workload) updateCoreDNSImageInfoInKubeadmConfigMap(ctx context.Context, dns *bootstrapv1.DNS, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.updateCoreDNSImageInfoInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { c.DNS.ImageRepository = dns.ImageRepository c.DNS.ImageTag = dns.ImageTag @@ -282,6 +295,9 @@ func (w *Workload) updateCoreDNSImageInfoInKubeadmConfigMap(ctx context.Context, // To support Kubernetes clusters >= 1.22 (which have been initialized with kubeadm < 1.22) with CoreDNS versions >= 1.8.1 // we have to update the ClusterRole accordingly. func (w *Workload) updateCoreDNSClusterRole(ctx context.Context, kubernetesVersion semver.Version, info *coreDNSInfo) error { + ctx, span := traceutil.Start(ctx, "Workload.updateCoreDNSClusterRole") + defer span.End() + // Do nothing for Kubernetes < 1.22. if version.Compare(kubernetesVersion, semver.Version{Major: 1, Minor: 22, Patch: 0}, version.WithoutPreReleases()) < 0 { return nil @@ -359,6 +375,9 @@ func generateClusterRolePolicies(policyRules []rbacv1.PolicyRule) map[string]map // in version number. It also creates a corefile backup and patches the // deployment to point to the backup corefile before migrating. func (w *Workload) updateCoreDNSCorefile(ctx context.Context, info *coreDNSInfo) error { + ctx, span := traceutil.Start(ctx, "Workload.updateCoreDNSCorefile") + defer span.End() + // Run the CoreDNS migration tool first because if it cannot migrate the // corefile, then there's no point in continuing further. updatedCorefile, err := w.CoreDNSMigrator.Migrate(info.CurrentMajorMinorPatch, info.TargetMajorMinorPatch, info.Corefile, false) diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 3b3662a29f01..81efec985491 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -27,6 +27,7 @@ import ( bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" etcdutil "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd/util" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) type etcdClientFor interface { @@ -37,6 +38,9 @@ type etcdClientFor interface { // ReconcileEtcdMembers iterates over all etcd members and finds members that do not have corresponding nodes. // If there are any such members, it deletes them from etcd and removes their nodes from the kubeadm configmap so that kubeadm does not run etcd health checks on them. 
func (w *Workload) ReconcileEtcdMembers(ctx context.Context, nodeNames []string, version semver.Version) ([]string, error) { + ctx, span := traceutil.Start(ctx, "Workload.ReconcileEtcdMembers") + defer span.End() + allRemovedMembers := []string{} allErrs := []error{} for _, nodeName := range nodeNames { @@ -49,6 +53,9 @@ func (w *Workload) ReconcileEtcdMembers(ctx context.Context, nodeNames []string, } func (w *Workload) reconcileEtcdMember(ctx context.Context, nodeNames []string, nodeName string, version semver.Version) ([]string, []error) { + ctx, span := traceutil.Start(ctx, "Workload.reconcileEtcdMember") + defer span.End() + // Create the etcd Client for the etcd Pod scheduled on the Node etcdClient, err := w.etcdClientGenerator.forFirstAvailableNode(ctx, []string{nodeName}) if err != nil { @@ -94,6 +101,9 @@ loopmembers: // UpdateEtcdVersionInKubeadmConfigMap sets the imageRepository or the imageTag or both in the kubeadm config map. func (w *Workload) UpdateEtcdVersionInKubeadmConfigMap(ctx context.Context, imageRepository, imageTag string, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateEtcdVersionInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { if c.Etcd.Local != nil { c.Etcd.Local.ImageRepository = imageRepository @@ -104,6 +114,9 @@ func (w *Workload) UpdateEtcdVersionInKubeadmConfigMap(ctx context.Context, imag // UpdateEtcdExtraArgsInKubeadmConfigMap sets extraArgs in the kubeadm config map. func (w *Workload) UpdateEtcdExtraArgsInKubeadmConfigMap(ctx context.Context, extraArgs map[string]string, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.UpdateEtcdExtraArgsInKubeadmConfigMap") + defer span.End() + return w.updateClusterConfiguration(ctx, func(c *bootstrapv1.ClusterConfiguration) { if c.Etcd.Local != nil { c.Etcd.Local.ExtraArgs = extraArgs @@ -114,6 +127,9 @@ func (w *Workload) UpdateEtcdExtraArgsInKubeadmConfigMap(ctx context.Context, ex // RemoveEtcdMemberForMachine removes the etcd member from the target cluster's etcd cluster. // Removing the last remaining member of the cluster is not supported. func (w *Workload) RemoveEtcdMemberForMachine(ctx context.Context, machine *clusterv1.Machine) error { + ctx, span := traceutil.Start(ctx, "Workload.RemoveEtcdMemberForMachine") + defer span.End() + if machine == nil || machine.Status.NodeRef == nil { // Nothing to do, no node for Machine return nil @@ -122,6 +138,9 @@ func (w *Workload) RemoveEtcdMemberForMachine(ctx context.Context, machine *clus } func (w *Workload) removeMemberForNode(ctx context.Context, name string) error { + ctx, span := traceutil.Start(ctx, "Workload.removeMemberForNode") + defer span.End() + controlPlaneNodes, err := w.getControlPlaneNodes(ctx) if err != nil { return err @@ -164,6 +183,9 @@ func (w *Workload) removeMemberForNode(ctx context.Context, name string) error { // ForwardEtcdLeadership forwards etcd leadership to the first follower. func (w *Workload) ForwardEtcdLeadership(ctx context.Context, machine *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error { + ctx, span := traceutil.Start(ctx, "Workload.ForwardEtcdLeadership") + defer span.End() + if machine == nil || machine.Status.NodeRef == nil { return nil } @@ -222,6 +244,9 @@ type EtcdMemberStatus struct { // but then it relies on etcd as ultimate source of truth for the list of members. // This is intended to allow informed decisions on actions impacting etcd quorum. 
func (w *Workload) EtcdMembers(ctx context.Context) ([]string, error) { + ctx, span := traceutil.Start(ctx, "Workload.EtcdMembers") + defer span.End() + nodes, err := w.getControlPlaneNodes(ctx) if err != nil { return nil, errors.Wrap(err, "failed to list control plane nodes") diff --git a/controlplane/kubeadm/internal/workload_cluster_rbac.go b/controlplane/kubeadm/internal/workload_cluster_rbac.go index d714c6329c5f..771be03fbd84 100644 --- a/controlplane/kubeadm/internal/workload_cluster_rbac.go +++ b/controlplane/kubeadm/internal/workload_cluster_rbac.go @@ -26,6 +26,8 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) const ( @@ -50,6 +52,9 @@ const ( // EnsureResource creates a resoutce if the target resource doesn't exist. If the resource exists already, this function will ignore the resource instead. func (w *Workload) EnsureResource(ctx context.Context, obj client.Object) error { + ctx, span := traceutil.Start(ctx, "Workload.EnsureResource") + defer span.End() + testObj := obj.DeepCopyObject().(client.Object) key := client.ObjectKeyFromObject(obj) if err := w.Client.Get(ctx, key, testObj); err != nil && !apierrors.IsNotFound(err) { @@ -68,6 +73,9 @@ func (w *Workload) EnsureResource(ctx context.Context, obj client.Object) error // AllowBootstrapTokensToGetNodes creates RBAC rules to allow Node Bootstrap Tokens to list nodes. func (w *Workload) AllowBootstrapTokensToGetNodes(ctx context.Context) error { + ctx, span := traceutil.Start(ctx, "Workload.AllowBootstrapTokensToGetNodes") + defer span.End() + if err := w.EnsureResource(ctx, &rbacv1.ClusterRole{ ObjectMeta: metav1.ObjectMeta{ Name: GetNodesClusterRoleName, @@ -118,6 +126,9 @@ func generateKubeletConfigRoleName(version semver.Version) string { // ReconcileKubeletRBACBinding will create a RoleBinding for the new kubelet version during upgrades. // If the role binding already exists this function is a no-op. func (w *Workload) ReconcileKubeletRBACBinding(ctx context.Context, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.ReconcileKubeletRBACBinding") + defer span.End() + roleName := generateKubeletConfigRoleName(version) return w.EnsureResource(ctx, &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ @@ -147,6 +158,9 @@ func (w *Workload) ReconcileKubeletRBACBinding(ctx context.Context, version semv // ReconcileKubeletRBACRole will create a Role for the new kubelet version during upgrades. // If the role already exists this function is a no-op. 
func (w *Workload) ReconcileKubeletRBACRole(ctx context.Context, version semver.Version) error { + ctx, span := traceutil.Start(ctx, "Workload.ReconcileKubeletRBACRole") + defer span.End() + return w.EnsureResource(ctx, &rbacv1.Role{ ObjectMeta: metav1.ObjectMeta{ Name: generateKubeletConfigRoleName(version), diff --git a/controlplane/kubeadm/main.go b/controlplane/kubeadm/main.go index 4b4e40670acb..d3adbc211a9d 100644 --- a/controlplane/kubeadm/main.go +++ b/controlplane/kubeadm/main.go @@ -28,6 +28,7 @@ import ( // +kubebuilder:scaffold:imports "github.com/spf13/pflag" + oteltrace "go.opentelemetry.io/otel/trace" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" @@ -40,7 +41,10 @@ import ( "k8s.io/component-base/logs" logsv1 "k8s.io/component-base/logs/api/v1" _ "k8s.io/component-base/logs/json/register" + "k8s.io/component-base/tracing" + tracingapi "k8s.io/component-base/tracing/api/v1" "k8s.io/klog/v2" + "k8s.io/utils/pointer" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" @@ -57,6 +61,7 @@ import ( "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" kcpwebhooks "sigs.k8s.io/cluster-api/controlplane/kubeadm/webhooks" "sigs.k8s.io/cluster-api/feature" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/flags" "sigs.k8s.io/cluster-api/version" ) @@ -65,6 +70,7 @@ var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") controllerName = "cluster-api-kubeadm-control-plane-manager" + appName = "capi-kubeadm-control-plane-controller-manager" ) func init() { @@ -101,6 +107,9 @@ var ( etcdCallTimeout time.Duration tlsOptions = flags.TLSOptions{} logOptions = logs.NewOptions() + + tracingEndpoint string + tracingSamplingRatePerMillion int32 ) // InitFlags initializes the flags. 
@@ -155,6 +164,13 @@ func InitFlags(fs *pflag.FlagSet) { fs.StringVar(&healthAddr, "health-addr", ":9440", "The address the health endpoint binds to.") + // FIXME: re-work flags + fs.StringVar(&tracingEndpoint, "tracing-endpoint", "", + "endpoint to send traces to") + + fs.Int32Var(&tracingSamplingRatePerMillion, "tracing-sampling-rate", 0, + "sample rate per million for tracing") + fs.DurationVar(&etcdDialTimeout, "etcd-dial-timeout-duration", 10*time.Second, "Duration that the etcd client waits at most to establish a connection with etcd") @@ -184,6 +200,16 @@ func main() { restConfig.Burst = restConfigBurst restConfig.UserAgent = remote.DefaultClusterAPIUserAgent(controllerName) + tp, err := traceutil.NewProvider(&tracingapi.TracingConfiguration{ + Endpoint: pointer.String(tracingEndpoint), + SamplingRatePerMillion: pointer.Int32(tracingSamplingRatePerMillion), + }, controllerName, appName) + if err != nil { + setupLog.Error(err, "unable to create tracing provider") + os.Exit(1) + } + restConfig.Wrap(tracing.WrapperFor(tp)) + tlsOptionOverrides, err := flags.GetTLSOptionOverrideFuncs(tlsOptions) if err != nil { setupLog.Error(err, "unable to add TLS settings to the webhook server") @@ -257,7 +283,7 @@ func main() { ctx := ctrl.SetupSignalHandler() setupChecks(mgr) - setupReconcilers(ctx, mgr) + setupReconcilers(ctx, mgr, tp) setupWebhooks(mgr) // +kubebuilder:scaffold:builder @@ -280,7 +306,7 @@ func setupChecks(mgr ctrl.Manager) { } } -func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { +func setupReconcilers(ctx context.Context, mgr ctrl.Manager, tp oteltrace.TracerProvider) { secretCachingClient, err := client.New(mgr.GetConfig(), client.Options{ HTTPClient: mgr.GetHTTPClient(), Cache: &client.CacheOptions{ @@ -306,6 +332,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { &appsv1.Deployment{}, &appsv1.DaemonSet{}, }, + TraceProvider: tp, }) if err != nil { setupLog.Error(err, "unable to create cluster cache tracker") @@ -324,6 +351,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { Client: mgr.GetClient(), SecretCachingClient: secretCachingClient, Tracker: tracker, + TraceProvider: tp, WatchFilterValue: watchFilterValue, EtcdDialTimeout: etcdDialTimeout, EtcdCallTimeout: etcdCallTimeout, diff --git a/go.mod b/go.mod index df5add4b77c9..f2ae9baa9539 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,10 @@ require ( github.com/valyala/fastjson v1.6.4 go.etcd.io/etcd/api/v3 v3.5.9 go.etcd.io/etcd/client/v3 v3.5.9 + go.opentelemetry.io/otel v1.10.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0 + go.opentelemetry.io/otel/sdk v1.10.0 + go.opentelemetry.io/otel/trace v1.10.0 golang.org/x/net v0.11.0 // indirect golang.org/x/oauth2 v0.9.0 google.golang.org/grpc v1.55.0 @@ -145,13 +149,22 @@ require ( require ( github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.1.3 // indirect github.com/emicklei/go-restful/v3 v3.9.0 // indirect + github.com/felixge/httpsnoop v1.0.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect github.com/google/gnostic v0.6.9 // indirect github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.0.8 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect + 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0 // indirect + go.opentelemetry.io/otel/metric v0.31.0 // indirect + go.opentelemetry.io/proto/otlp v0.19.0 // indirect golang.org/x/tools v0.9.3 // indirect ) diff --git a/go.sum b/go.sum index fe6e092f62e1..688615e97629 100644 --- a/go.sum +++ b/go.sum @@ -83,8 +83,11 @@ github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnweb github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= +github.com/cenkalti/backoff/v4 v4.1.3 h1:cFAlzYUlVYDysBEH2T5hyJZMh3+5+WCBvSnK6Q8UtC4= +github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.2 h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= @@ -96,7 +99,11 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= +github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4/go.mod h1:6pvJx4me5XPnfI9Z40ddWsdw2W/uZgQLFXToKeRcDiI= github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20210922020428-25de7278fc84/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/coredns/caddy v1.1.0 h1:ezvsPrT/tA/7pYDBZxu0cT0VmWk75AfIaf6GSYCNMf0= github.com/coredns/caddy v1.1.0/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4= github.com/coredns/corefile-migration v1.0.20 h1:MdOkT6F3ehju/n9tgxlGct8XAajOX2vN+wG7To4BWSI= @@ -135,6 +142,7 @@ github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1m github.com/envoyproxy/go-control-plane v0.9.7/go.mod h1:cwu0lG7PUMfa9snN8LXBig5ynNVH9qI8YYLbd1fK2po= github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= +github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.mod h1:AFq3mo9L8Lqqiid3OhADV3RfLJnjiw63cSpi+fDTRC0= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= 
github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= @@ -145,6 +153,8 @@ github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d/go.mod h1:ZZM github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= +github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= +github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flatcar/container-linux-config-transpiler v0.9.4 h1:yXQ0NB8PeNrKJPrZvbv5/DV63PNhTqt8vaf8YxmX/RA= github.com/flatcar/container-linux-config-transpiler v0.9.4/go.mod h1:LxanhPvXkWgHG9PrkT4rX/p7YhUPdDGGsUdkNpV3L5U= github.com/flatcar/ignition v0.36.2 h1:xGHgScUe0P4Fkprjqv7L2CE58emiQgP833OCCn9z2v4= @@ -166,8 +176,11 @@ github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2 github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo= github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA= github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= @@ -188,6 +201,8 @@ github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zV github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= +github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -237,6 +252,7 @@ github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod 
h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v48 v48.2.0 h1:68puzySE6WqUY9KWmpOsDEQfDZsso98rT6pZcz9HqcE= @@ -279,6 +295,8 @@ github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmg github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 h1:BZHcxBETFHIdVyhyEfOvn/RdU/QGdLI4y34qQGjGWO0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -535,7 +553,25 @@ go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1 h1:sxoY9kG1s1WpSYNyzm24rlwH4lnRYFXUVVBmKMBfRgw= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1/go.mod h1:9NiG9I2aHTKkcxqCILhjtyNA1QEiCjdBACv4IvrFQ+c= +go.opentelemetry.io/otel v1.10.0 h1:Y7DTJMR6zs1xkS/upamJYk0SxxN4C9AqRd77jmZnyY4= +go.opentelemetry.io/otel v1.10.0/go.mod h1:NbvWjCthWHKBEUMpf0/v8ZRZlni86PpGFEMA9pnQSnQ= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0 h1:TaB+1rQhddO1sF71MpZOZAuSPW1klK2M8XxfrBMfK7Y= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0/go.mod h1:78XhIg8Ht9vR4tbLNUhXsiOnE2HOuSeKAiAcoVQEpOY= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0 h1:pDDYmo0QadUPal5fwXoY1pmMpFcdyhXOmL5drCrI3vU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0/go.mod h1:Krqnjl22jUJ0HgMzw5eveuCvFDXY4nSYb4F8t5gdrag= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0 h1:KtiUEhQmj/Pa874bVYKGNVdq8NPKiacPbaRRtgXi+t4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0/go.mod h1:OfUCyyIiDvNXHWpcWgbF+MWvqPZiNa3YDEnivcnYsV0= +go.opentelemetry.io/otel/metric v0.31.0 h1:6SiklT+gfWAwWUR0meEMxQBtihpiEs4c+vL9spDTqUs= +go.opentelemetry.io/otel/metric v0.31.0/go.mod h1:ohmwj9KTSIeBnDBm/ZwH2PSZxZzoOaG2xZeekTRzL5A= +go.opentelemetry.io/otel/sdk v1.10.0 h1:jZ6K7sVn04kk/3DNUdJ4mqRlGDiXAVuIG+MMENpTNdY= +go.opentelemetry.io/otel/sdk v1.10.0/go.mod h1:vO06iKzD5baltJz1zarxMCNHFpUlUiOy4s65ECtn6kE= +go.opentelemetry.io/otel/trace v1.10.0 h1:npQMbR8o7mum8uF95yFbOEJffhs1sbCOfDh8zAJiH5E= +go.opentelemetry.io/otel/trace v1.10.0/go.mod h1:Sij3YYczqAdz+EhmGhE6TpTxUO5/F/AzrK+kxfGqySM= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= +go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJPI1Nnw= +go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee331t6JAXeK2bcyhLOOc= go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5/go.mod h1:nmDLcffg48OtT/PSW0Hg7FvpRQsQh5OSqIylirxKC7o= go.uber.org/atomic v1.4.0/go.mod 
h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= @@ -655,6 +691,7 @@ golang.org/x/oauth2 v0.0.0-20200902213428-5d25da1a8d43/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 v0.0.0-20201109201403-9fd604954f58/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20201208152858-08078c50e5b5/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.9.0 h1:BPpt2kU7oMRq3kCHAA1tbSEshXRw1LpG2ztgDwrzuAs= golang.org/x/oauth2 v0.9.0/go.mod h1:qYgFZaFiu6Wg24azG8bdV52QJXJGbZzIIsRCdVKzbLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -869,6 +906,7 @@ google.golang.org/genproto v0.0.0-20201210142538-e3217bee35cc/go.mod h1:FWY/as6D google.golang.org/genproto v0.0.0-20201214200347-8c77b98c765d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210108203827-ffc7fda8c3d7/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210226172003-ab064af71705/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20211118181313-81c1377c94b1/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20220107163113-42d7afdf6368/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= @@ -891,6 +929,7 @@ google.golang.org/grpc v1.34.0/go.mod h1:WotjhfgOW/POjDeRt8vscBtXq+2VjORFy659qA5 google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= +google.golang.org/grpc v1.42.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= google.golang.org/grpc v1.55.0 h1:3Oj82/tFSCeUrRTg/5E/7d/W5A1tj6Ky1ABAuZuv5ag= google.golang.org/grpc v1.55.0/go.mod h1:iYEXKGkEBhg1PjZQvoYEVPTDkHo1/bjTnfwTeGONTY8= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= diff --git a/hack/observability/grafana/dashboards/cluster-api-traces.json b/hack/observability/grafana/dashboards/cluster-api-traces.json new file mode 100644 index 000000000000..2a0bf3cebfbf --- /dev/null +++ b/hack/observability/grafana/dashboards/cluster-api-traces.json @@ -0,0 +1,434 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sort_desc(rate(traces_spanmetrics_latency_sum{span_name!~\"HTTP.*\",span_name!~\"topology.*\",span_name!~\"machine.*\"}[5m]) / rate(traces_spanmetrics_latency_count[5m]))", + "legendFormat": "{{span_name}}", + "range": true, + "refId": "A" + } + ], + "title": "KCP average", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sort_desc(histogram_quantile(0.50, sum by(le,span_name) (rate(traces_spanmetrics_latency_bucket{span_name=~\"$Span\",span_name!~\"HTTP.*\",span_name!~\"topology.*\",span_name!~\"machine.*\"}[5m])))) > 0.01", + "legendFormat": "{{span_name}}", + "range": true, + "refId": "A" + } + ], + "title": "KCP 50% percentile", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, 
+ "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sort_desc(histogram_quantile(0.90, sum by(le,span_name) (rate(traces_spanmetrics_latency_bucket{span_name=~\"$Span\",span_name!~\"HTTP.*\",span_name!~\"topology.*\",span_name!~\"machine.*\"}[5m])))) > 0.01", + "legendFormat": "{{span_name}}", + "range": true, + "refId": "A" + } + ], + "title": "KCP 90% percentile", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 2, + "options": { + "calculate": false, + "cellGap": 2, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": true, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(traces_spanmetrics_latency_bucket{span_name=~\"$Span\",span_name!~\"HTTP.*\",span_name!~\"topology.*\",span_name!~\"machine.*\"}[$__rate_interval])", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "KCP Traces", + "type": "heatmap" + } + ], + "refresh": "", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(traces_spanmetrics_latency_sum,span_name)", + "hide": 0, + "includeAll": true, + "label": "Span", + "multi": true, + "name": "Span", + "options": [], + "query": { + "query": "label_values(traces_spanmetrics_latency_sum,span_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Cluster API Traces", + "uid": "b85270a6-5a83-45da-8ee8-1d4352e38bf4", + "version": 1, + "weekStart": "" +} diff --git a/hack/observability/grafana/kustomization.yaml b/hack/observability/grafana/kustomization.yaml index 03c85aef7cba..61a893eda807 100644 --- a/hack/observability/grafana/kustomization.yaml +++ b/hack/observability/grafana/kustomization.yaml @@ -15,5 +15,6 @@ configMapGenerator: - dashboards/cluster-api-mgmt-apiserver-requests.json - 
dashboards/cluster-api-performance.json - dashboards/cluster-api-state.json + - dashboards/cluster-api-traces.json - dashboards/cluster-api-wl-apiserver-requests.json - dashboards/controller-runtime.json diff --git a/hack/tools/go.mod b/hack/tools/go.mod index ded1856e2e9c..7ba09c419dc0 100644 --- a/hack/tools/go.mod +++ b/hack/tools/go.mod @@ -48,6 +48,7 @@ require ( github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.1.3 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/containerd/containerd v1.6.18 // indirect github.com/coredns/caddy v1.1.0 // indirect @@ -64,9 +65,11 @@ require ( github.com/emicklei/go-restful/v3 v3.9.0 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect github.com/evanphx/json-patch/v5 v5.6.0 // indirect + github.com/felixge/httpsnoop v1.0.3 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/go-errors/errors v1.4.2 // indirect github.com/go-logr/logr v1.2.4 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect github.com/go-openapi/jsonreference v0.20.1 // indirect github.com/go-openapi/swag v0.22.3 // indirect @@ -88,6 +91,7 @@ require ( github.com/googleapis/gax-go/v2 v2.8.0 // indirect github.com/gorilla/mux v1.8.0 // indirect github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect @@ -130,6 +134,15 @@ require ( github.com/subosito/gotenv v1.4.2 // indirect github.com/xlab/treeprint v1.1.0 // indirect go.opencensus.io v0.24.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1 // indirect + go.opentelemetry.io/otel v1.10.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0 // indirect + go.opentelemetry.io/otel/metric v0.31.0 // indirect + go.opentelemetry.io/otel/sdk v1.10.0 // indirect + go.opentelemetry.io/otel/trace v1.10.0 // indirect + go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect golang.org/x/crypto v0.10.0 // indirect golang.org/x/mod v0.10.0 // indirect diff --git a/hack/tools/go.sum b/hack/tools/go.sum index 201b88d3d505..3431008aa0ea 100644 --- a/hack/tools/go.sum +++ b/hack/tools/go.sum @@ -92,6 +92,8 @@ github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx2 github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd h1:rFt+Y/IK1aEZkEHchZRSq9OQbsSzIT/OrI8YFFmRIng= github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b h1:otBG+dV+YK+Soembjv71DPz3uX/V/6MMlSyD9JBQ6kQ= github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0 h1:nvj0OLI3YqYXer/kZD8Ri1aaunCxIEsOst1BVJswV0o= +github.com/cenkalti/backoff/v4 v4.1.3 h1:cFAlzYUlVYDysBEH2T5hyJZMh3+5+WCBvSnK6Q8UtC4= +github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= 
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -166,6 +168,7 @@ github.com/evanphx/json-patch/v5 v5.6.0 h1:b91NhWfaz02IuVxO9faSllyAtNXHMPkC5J8sJ github.com/evanphx/json-patch/v5 v5.6.0/go.mod h1:G79N1coSVB93tBe7j6PhzjmR3/2VvlbKOFpnXhI9Bw4= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= +github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flowstack/go-jsonschema v0.1.1/go.mod h1:yL7fNggx1o8rm9RlgXv7hTBWxdBM0rVwpMwimd3F3N0= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= @@ -182,8 +185,11 @@ github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2 github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo= github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= @@ -200,6 +206,8 @@ github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zV github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= +github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -251,6 +259,7 @@ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v48 v48.2.0 h1:68puzySE6WqUY9KWmpOsDEQfDZsso98rT6pZcz9HqcE= @@ -303,6 +312,8 @@ 
github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmg github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 h1:BZHcxBETFHIdVyhyEfOvn/RdU/QGdLI4y34qQGjGWO0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -553,7 +564,25 @@ go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1 h1:sxoY9kG1s1WpSYNyzm24rlwH4lnRYFXUVVBmKMBfRgw= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1/go.mod h1:9NiG9I2aHTKkcxqCILhjtyNA1QEiCjdBACv4IvrFQ+c= +go.opentelemetry.io/otel v1.10.0 h1:Y7DTJMR6zs1xkS/upamJYk0SxxN4C9AqRd77jmZnyY4= +go.opentelemetry.io/otel v1.10.0/go.mod h1:NbvWjCthWHKBEUMpf0/v8ZRZlni86PpGFEMA9pnQSnQ= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0 h1:TaB+1rQhddO1sF71MpZOZAuSPW1klK2M8XxfrBMfK7Y= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0/go.mod h1:78XhIg8Ht9vR4tbLNUhXsiOnE2HOuSeKAiAcoVQEpOY= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0 h1:pDDYmo0QadUPal5fwXoY1pmMpFcdyhXOmL5drCrI3vU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0/go.mod h1:Krqnjl22jUJ0HgMzw5eveuCvFDXY4nSYb4F8t5gdrag= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0 h1:KtiUEhQmj/Pa874bVYKGNVdq8NPKiacPbaRRtgXi+t4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0/go.mod h1:OfUCyyIiDvNXHWpcWgbF+MWvqPZiNa3YDEnivcnYsV0= +go.opentelemetry.io/otel/metric v0.31.0 h1:6SiklT+gfWAwWUR0meEMxQBtihpiEs4c+vL9spDTqUs= +go.opentelemetry.io/otel/metric v0.31.0/go.mod h1:ohmwj9KTSIeBnDBm/ZwH2PSZxZzoOaG2xZeekTRzL5A= +go.opentelemetry.io/otel/sdk v1.10.0 h1:jZ6K7sVn04kk/3DNUdJ4mqRlGDiXAVuIG+MMENpTNdY= +go.opentelemetry.io/otel/sdk v1.10.0/go.mod h1:vO06iKzD5baltJz1zarxMCNHFpUlUiOy4s65ECtn6kE= +go.opentelemetry.io/otel/trace v1.10.0 h1:npQMbR8o7mum8uF95yFbOEJffhs1sbCOfDh8zAJiH5E= +go.opentelemetry.io/otel/trace v1.10.0/go.mod h1:Sij3YYczqAdz+EhmGhE6TpTxUO5/F/AzrK+kxfGqySM= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= +go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJPI1Nnw= +go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee331t6JAXeK2bcyhLOOc= go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5/go.mod h1:nmDLcffg48OtT/PSW0Hg7FvpRQsQh5OSqIylirxKC7o= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= @@ -666,6 +695,7 @@ golang.org/x/oauth2 v0.0.0-20200902213428-5d25da1a8d43/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 
v0.0.0-20201109201403-9fd604954f58/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20201208152858-08078c50e5b5/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.9.0 h1:BPpt2kU7oMRq3kCHAA1tbSEshXRw1LpG2ztgDwrzuAs= golang.org/x/oauth2 v0.9.0/go.mod h1:qYgFZaFiu6Wg24azG8bdV52QJXJGbZzIIsRCdVKzbLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -888,6 +918,7 @@ google.golang.org/genproto v0.0.0-20201210142538-e3217bee35cc/go.mod h1:FWY/as6D google.golang.org/genproto v0.0.0-20201214200347-8c77b98c765d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210108203827-ffc7fda8c3d7/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210226172003-ab064af71705/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20211118181313-81c1377c94b1/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20220107163113-42d7afdf6368/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= @@ -910,6 +941,7 @@ google.golang.org/grpc v1.34.0/go.mod h1:WotjhfgOW/POjDeRt8vscBtXq+2VjORFy659qA5 google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= +google.golang.org/grpc v1.42.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ= google.golang.org/grpc v1.55.0 h1:3Oj82/tFSCeUrRTg/5E/7d/W5A1tj6Ky1ABAuZuv5ag= google.golang.org/grpc v1.55.0/go.mod h1:iYEXKGkEBhg1PjZQvoYEVPTDkHo1/bjTnfwTeGONTY8= diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 7ffe710dff5c..cba6dbeb26f2 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -22,6 +22,7 @@ import ( "time" "github.com/pkg/errors" + oteltrace "go.opentelemetry.io/otel/trace" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -48,6 +49,7 @@ import ( "sigs.k8s.io/cluster-api/controllers/noderefutil" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/util/ssa" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/collections" @@ -78,6 +80,7 @@ type Reconciler struct { UnstructuredCachingClient client.Client APIReader client.Reader Tracker *remote.ClusterCacheTracker + TraceProvider oteltrace.TracerProvider // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -105,6 +108,10 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt r.nodeDeletionRetryTimeout = 10 * time.Second } + if r.TraceProvider == nil { + r.TraceProvider = oteltrace.NewNoopTracerProvider() + } + tr := traceutil.Reconciler(r, r.TraceProvider, "machine", "Machine") c, err := ctrl.NewControllerManagedBy(mgr). For(&clusterv1.Machine{}). WithOptions(options). @@ -122,7 +129,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue), ), )). - Build(r) + Build(tr) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } @@ -268,6 +275,9 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust } func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "machine.Reconciler.reconcile") + defer span.End() + // If the machine is a stand-alone one, meaning not originated from a MachineDeployment, then set it as directly // owned by the Cluster (if not already present). if r.shouldAdopt(m) { @@ -288,6 +298,10 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, res := ctrl.Result{} errs := []error{} + + ctx, span = traceutil.Start(ctx, "machine.Reconciler.reconcile.phases") + defer span.End() + s := &scope{ cluster: cluster, machine: m, @@ -326,6 +340,9 @@ type scope struct { } func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) { //nolint:gocyclo + ctx, span := traceutil.Start(ctx, "machine.Reconciler.reconcileDelete") + defer span.End() + log := ctrl.LoggerFrom(ctx) err := r.isDeleteNodeAllowed(ctx, cluster, m) diff --git a/internal/controllers/machine/machine_controller_noderef.go b/internal/controllers/machine/machine_controller_noderef.go index 7f32c72c48c5..9fe1d9758e81 100644 --- a/internal/controllers/machine/machine_controller_noderef.go +++ b/internal/controllers/machine/machine_controller_noderef.go @@ -32,6 +32,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" "sigs.k8s.io/cluster-api/internal/util/taints" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" @@ -43,6 +44,9 @@ var ( ) func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "machine.Reconciler.reconcileNode") + defer span.End() + log := ctrl.LoggerFrom(ctx) cluster := s.cluster machine := s.machine diff --git a/internal/controllers/machine/machine_controller_phases.go b/internal/controllers/machine/machine_controller_phases.go index 5e34f8904444..b184c3866b29 100644 --- a/internal/controllers/machine/machine_controller_phases.go +++ b/internal/controllers/machine/machine_controller_phases.go @@ -36,6 +36,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/controllers/external" capierrors "sigs.k8s.io/cluster-api/errors" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" @@ -89,6 +90,9 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) { // reconcileExternal handles 
generic unstructured objects referenced by a Machine. func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) { + ctx, span := traceutil.Start(ctx, "machine.Reconciler.reconcileExternal") + defer span.End() + log := ctrl.LoggerFrom(ctx) if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil { @@ -167,6 +171,9 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C // reconcileBootstrap reconciles the Spec.Bootstrap.ConfigRef object on a Machine. func (r *Reconciler) reconcileBootstrap(ctx context.Context, s *scope) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "machine.Reconciler.reconcileBootstrap") + defer span.End() + log := ctrl.LoggerFrom(ctx) cluster := s.cluster m := s.machine @@ -240,6 +247,9 @@ func (r *Reconciler) reconcileBootstrap(ctx context.Context, s *scope) (ctrl.Res // reconcileInfrastructure reconciles the Spec.InfrastructureRef object on a Machine. func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "machine.Reconciler.reconcileInfrastructure") + defer span.End() + log := ctrl.LoggerFrom(ctx) cluster := s.cluster m := s.machine @@ -322,7 +332,10 @@ func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctr return ctrl.Result{}, nil } -func (r *Reconciler) reconcileCertificateExpiry(_ context.Context, s *scope) (ctrl.Result, error) { +func (r *Reconciler) reconcileCertificateExpiry(ctx context.Context, s *scope) (ctrl.Result, error) { + _, span := traceutil.Start(ctx, "machine.Reconciler.reconcileCertificateExpiry") + defer span.End() + m := s.machine var annotations map[string]string diff --git a/internal/controllers/topology/cluster/blueprint.go b/internal/controllers/topology/cluster/blueprint.go index a455beb17aa4..5c8ba2328916 100644 --- a/internal/controllers/topology/cluster/blueprint.go +++ b/internal/controllers/topology/cluster/blueprint.go @@ -24,12 +24,16 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/scope" tlog "sigs.k8s.io/cluster-api/internal/log" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) // getBlueprint gets a ClusterBlueprint with the ClusterClass and the referenced templates to be used for a managed Cluster topology. // It also converts and patches all ObjectReferences in ClusterClass and ControlPlane to the latest apiVersion of the current contract. // NOTE: This function assumes that cluster.Spec.Topology.Class is set. 
func (r *Reconciler) getBlueprint(ctx context.Context, cluster *clusterv1.Cluster, clusterClass *clusterv1.ClusterClass) (_ *scope.ClusterBlueprint, reterr error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.getBlueprint") + defer span.End() + blueprint := &scope.ClusterBlueprint{ Topology: cluster.Spec.Topology, ClusterClass: clusterClass, diff --git a/internal/controllers/topology/cluster/cluster_controller.go b/internal/controllers/topology/cluster/cluster_controller.go index 64a2bb36c442..3ea4f0100a0a 100644 --- a/internal/controllers/topology/cluster/cluster_controller.go +++ b/internal/controllers/topology/cluster/cluster_controller.go @@ -22,6 +22,7 @@ import ( "time" "github.com/pkg/errors" + oteltrace "go.opentelemetry.io/otel/trace" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" kerrors "k8s.io/apimachinery/pkg/util/errors" @@ -46,6 +47,7 @@ import ( tlog "sigs.k8s.io/cluster-api/internal/log" runtimeclient "sigs.k8s.io/cluster-api/internal/runtime/client" "sigs.k8s.io/cluster-api/internal/util/ssa" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/internal/webhooks" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -70,6 +72,8 @@ type Reconciler struct { RuntimeClient runtimeclient.Client + TraceProvider oteltrace.TracerProvider + // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -87,6 +91,10 @@ type Reconciler struct { } func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { + if r.TraceProvider == nil { + r.TraceProvider = oteltrace.NewNoopTracerProvider() + } + tr := traceutil.Reconciler(r, r.TraceProvider, "topology/cluster", "Cluster") c, err := ctrl.NewControllerManagedBy(mgr). For(&clusterv1.Cluster{}, builder.WithPredicates( // Only reconcile Cluster with topology. @@ -105,7 +113,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt ). WithOptions(options). WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)). - Build(r) + Build(tr) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") @@ -198,6 +206,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re // reconcile handles cluster reconciliation. func (r *Reconciler) reconcile(ctx context.Context, s *scope.Scope) (ctrl.Result, error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcile") + defer span.End() var err error // Get ClusterClass. diff --git a/internal/controllers/topology/cluster/current_state.go b/internal/controllers/topology/cluster/current_state.go index 9302aa425237..4a301923b534 100644 --- a/internal/controllers/topology/cluster/current_state.go +++ b/internal/controllers/topology/cluster/current_state.go @@ -31,12 +31,16 @@ import ( "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/scope" tlog "sigs.k8s.io/cluster-api/internal/log" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/labels" ) // getCurrentState gets information about the current state of a Cluster by inspecting the state of the InfrastructureCluster, // the ControlPlane, and the MachineDeployments associated with the Cluster. 
func (r *Reconciler) getCurrentState(ctx context.Context, s *scope.Scope) (*scope.ClusterState, error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.getCurrentState") + defer span.End() + // NOTE: current scope has been already initialized with the Cluster. currentState := s.Current @@ -74,7 +78,11 @@ func (r *Reconciler) getCurrentState(ctx context.Context, s *scope.Scope) (*scop // getCurrentInfrastructureClusterState looks for the state of the InfrastructureCluster. If a reference is set but not // found, either from an error or the object not being found, an error is thrown. -func (r *Reconciler) getCurrentInfrastructureClusterState(ctx context.Context, blueprintInfrastructureClusterTemplate *unstructured.Unstructured, cluster *clusterv1.Cluster) (*unstructured.Unstructured, error) { +func (r *Reconciler) getCurrentInfrastructureClusterState(ctx context.Context, blueprintInfrastructureClusterTemplate *unstructured.Unstructured, + cluster *clusterv1.Cluster) (*unstructured.Unstructured, error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.getCurrentInfrastructureClusterState") + defer span.End() + ref, err := alignRefAPIVersion(blueprintInfrastructureClusterTemplate, cluster.Spec.InfrastructureRef) if err != nil { return nil, errors.Wrapf(err, "failed to read %s", tlog.KRef{Ref: cluster.Spec.InfrastructureRef}) @@ -96,6 +104,9 @@ func (r *Reconciler) getCurrentInfrastructureClusterState(ctx context.Context, b // an error is thrown. If the ControlPlane requires MachineInfrastructure according to its ClusterClass an error will be // thrown if the ControlPlane has no MachineTemplates. func (r *Reconciler) getCurrentControlPlaneState(ctx context.Context, blueprintControlPlane *scope.ControlPlaneBlueprint, blueprintHasControlPlaneInfrastructureMachine bool, cluster *clusterv1.Cluster) (*scope.ControlPlaneState, error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.getCurrentControlPlaneState") + defer span.End() + var err error res := &scope.ControlPlaneState{} @@ -158,6 +169,9 @@ func (r *Reconciler) getCurrentControlPlaneState(ctx context.Context, blueprintC // expected on first reconcile. If MachineDeployments are found for the Cluster their Infrastructure and Bootstrap references // are inspected. Where these are not found the function will throw an error. func (r *Reconciler) getCurrentMachineDeploymentState(ctx context.Context, blueprintMachineDeployments map[string]*scope.MachineDeploymentBlueprint, cluster *clusterv1.Cluster) (map[string]*scope.MachineDeploymentState, error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.getCurrentMachineDeploymentState") + defer span.End() + state := make(scope.MachineDeploymentsStateMap) // List all the machine deployments in the current cluster and in a managed topology. diff --git a/internal/controllers/topology/cluster/desired_state.go b/internal/controllers/topology/cluster/desired_state.go index ce222170c28a..64e59c7165be 100644 --- a/internal/controllers/topology/cluster/desired_state.go +++ b/internal/controllers/topology/cluster/desired_state.go @@ -38,6 +38,7 @@ import ( "sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/scope" "sigs.k8s.io/cluster-api/internal/hooks" tlog "sigs.k8s.io/cluster-api/internal/log" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" ) @@ -46,6 +47,9 @@ import ( // the entire compute operation will fail. 
This might be improved in the future if support for reconciling // subset of a topology will be implemented. func (r *Reconciler) computeDesiredState(ctx context.Context, s *scope.Scope) (*scope.ClusterState, error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.computeDesiredState") + defer span.End() + var err error desiredState := &scope.ClusterState{ ControlPlane: &scope.ControlPlaneState{}, diff --git a/internal/controllers/topology/cluster/reconcile_state.go b/internal/controllers/topology/cluster/reconcile_state.go index ce90b6048352..6220472cd7c4 100644 --- a/internal/controllers/topology/cluster/reconcile_state.go +++ b/internal/controllers/topology/cluster/reconcile_state.go @@ -44,6 +44,7 @@ import ( "sigs.k8s.io/cluster-api/internal/hooks" tlog "sigs.k8s.io/cluster-api/internal/log" "sigs.k8s.io/cluster-api/internal/topology/check" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) const ( @@ -57,6 +58,9 @@ const ( // the entire reconcile operation will fail. This might be improved in the future if support for reconciling // subset of a topology will be implemented. func (r *Reconciler) reconcileState(ctx context.Context, s *scope.Scope) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileState") + defer span.End() + log := tlog.LoggerFrom(ctx) log.Infof("Reconciling state for topology owned objects") @@ -95,6 +99,9 @@ func (r *Reconciler) reconcileState(ctx context.Context, s *scope.Scope) error { // Reconcile the Cluster shim, a temporary object used a mean to collect objects/templates // that might be orphaned in case of errors during the remaining part of the reconcile process. func (r *Reconciler) reconcileClusterShim(ctx context.Context, s *scope.Scope) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileClusterShim") + defer span.End() + shim := clusterShim(s.Current.Cluster) // If we are going to create the InfrastructureCluster or the ControlPlane object, then @@ -274,6 +281,9 @@ func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope // reconcileInfrastructureCluster reconciles the desired state of the InfrastructureCluster object. func (r *Reconciler) reconcileInfrastructureCluster(ctx context.Context, s *scope.Scope) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileInfrastructureCluster") + defer span.End() + ctx, _ = tlog.LoggerFrom(ctx).WithObject(s.Desired.InfrastructureCluster).Into(ctx) ignorePaths, err := contract.InfrastructureCluster().IgnorePaths(s.Desired.InfrastructureCluster) @@ -292,6 +302,9 @@ func (r *Reconciler) reconcileInfrastructureCluster(ctx context.Context, s *scop // reconcileControlPlane works to bring the current state of a managed topology in line with the desired state. This involves // updating the cluster where needed. func (r *Reconciler) reconcileControlPlane(ctx context.Context, s *scope.Scope) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileControlPlane") + defer span.End() + // If the ControlPlane has defined a current or desired MachineHealthCheck attempt to reconcile it. // MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation // even if the Control Plane is pending an upgrade. 
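The hunks above repeat the same two-line instrumentation for every reconcile step. As a rough sketch only (assuming the traceutil package this series adds under internal/util/trace; reconcileStep and doReconcileWork are placeholders, not functions from this patch), the pattern looks like this in isolation:

package example

import (
	"context"

	traceutil "sigs.k8s.io/cluster-api/internal/util/trace"
)

// reconcileStep shows the per-step tracing pattern used throughout this patch.
func reconcileStep(ctx context.Context) error {
	// Start a child span from the tracer that the wrapping trace reconciler stored
	// in the context; if no tracer is present this falls back to a no-op span.
	ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileStep")
	defer span.End()

	// Pass the returned ctx down so nested helpers become child spans of this one.
	return doReconcileWork(ctx)
}

func doReconcileWork(_ context.Context) error { return nil }

Because the span is derived from the tracer carried in the context (put there by the wrapping reconciler built with traceutil.Reconciler), shared util packages such as ssa, patch, kubeconfig and secret can be instrumented the same way without plumbing a TracerProvider through every call site.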
@@ -424,6 +437,9 @@ func (r *Reconciler) reconcileMachineHealthCheck(ctx context.Context, current, d // most specifically, after a Cluster is created it is assumed that the reference to the InfrastructureCluster / // ControlPlane objects should never change (only the content of the objects can change). func (r *Reconciler) reconcileCluster(ctx context.Context, s *scope.Scope) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileCluster") + defer span.End() + ctx, log := tlog.LoggerFrom(ctx).WithObject(s.Desired.Cluster).Into(ctx) // Check differences between current and desired state, and eventually patch the current object. @@ -448,6 +464,7 @@ func (r *Reconciler) reconcileCluster(ctx context.Context, s *scope.Scope) error // Note: It is good enough to check that the resource version changed. Other controllers might have updated the // Cluster as well, but the combination of the patch call above without a conflict and a changed resource // version here guarantees that we see the changes of our own update. + span.AddEvent("WaitCacheStart") err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { key := client.ObjectKey{Namespace: s.Current.Cluster.GetNamespace(), Name: s.Current.Cluster.GetName()} cachedCluster := &clusterv1.Cluster{} @@ -456,6 +473,7 @@ func (r *Reconciler) reconcileCluster(ctx context.Context, s *scope.Scope) error } return s.Current.Cluster.GetResourceVersion() != cachedCluster.GetResourceVersion(), nil }) + span.AddEvent("WaitCacheEnd") if err != nil { return errors.Wrapf(err, "failed waiting for Cluster %s to be updated in the cache after patch", tlog.KObj{Obj: s.Current.Cluster}) } @@ -464,6 +482,9 @@ func (r *Reconciler) reconcileCluster(ctx context.Context, s *scope.Scope) error // reconcileMachineDeployments reconciles the desired state of the MachineDeployment objects. func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.Scope) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.reconcileMachineDeployments") + defer span.End() + diff := calculateMachineDeploymentDiff(s.Current.MachineDeployments, s.Desired.MachineDeployments) // Create MachineDeployments. @@ -513,6 +534,9 @@ func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.S // getCurrentMachineDeployments gets the current list of MachineDeployments via the APIReader. func (r *Reconciler) getCurrentMachineDeployments(ctx context.Context, s *scope.Scope) (sets.Set[string], error) { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.getCurrentMachineDeployments") + defer span.End() + // TODO: We should consider using PartialObjectMetadataList here. Currently this doesn't work as our // implementation for topology dryrun doesn't support PartialObjectMetadataList. mdList := &clusterv1.MachineDeploymentList{} @@ -539,6 +563,9 @@ func (r *Reconciler) getCurrentMachineDeployments(ctx context.Context, s *scope. // createMachineDeployment creates a MachineDeployment and the corresponding Templates. func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope, md *scope.MachineDeploymentState) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.createMachineDeployment") + defer span.End() + // Do not create the MachineDeployment if it is marked as pending create. // This will also block MHC creation because creating the MHC without the corresponding // MachineDeployment is unnecessary. 
@@ -585,6 +612,7 @@ func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope // Wait until MachineDeployment is visible in the cache. // Note: We have to do this because otherwise using a cached client in current state could // miss a newly created MachineDeployment (because the cache might be stale). + span.AddEvent("WaitCacheStart") err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { key := client.ObjectKey{Namespace: md.Object.Namespace, Name: md.Object.Name} if err := r.Client.Get(ctx, key, &clusterv1.MachineDeployment{}); err != nil { @@ -595,6 +623,7 @@ func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope } return true, nil }) + span.AddEvent("WaitCacheEnd") if err != nil { return errors.Wrapf(err, "failed waiting for MachineDeployment %s to be visible in the cache after create", md.Object.Kind) } @@ -610,6 +639,9 @@ func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope // updateMachineDeployment updates a MachineDeployment. Also rotates the corresponding Templates if necessary. func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope, mdTopologyName string, currentMD, desiredMD *scope.MachineDeploymentState) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.updateMachineDeployment") + defer span.End() + log := tlog.LoggerFrom(ctx).WithMachineDeployment(desiredMD.Object) // Patch MachineHealthCheck for the MachineDeployment. @@ -676,6 +708,7 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope // Note: It is good enough to check that the resource version changed. Other controllers might have updated the // MachineDeployment as well, but the combination of the patch call above without a conflict and a changed resource // version here guarantees that we see the changes of our own update. + span.AddEvent("WaitCacheStart") err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { key := client.ObjectKey{Namespace: currentMD.Object.GetNamespace(), Name: currentMD.Object.GetName()} cachedMD := &clusterv1.MachineDeployment{} @@ -684,6 +717,7 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope } return currentMD.Object.GetResourceVersion() != cachedMD.GetResourceVersion(), nil }) + span.AddEvent("WaitCacheEnd") if err != nil { return errors.Wrapf(err, "failed waiting for MachineDeployment %s to be updated in the cache after patch", tlog.KObj{Obj: currentMD.Object}) } @@ -705,6 +739,9 @@ func logMachineDeploymentVersionChange(current, desired *clusterv1.MachineDeploy // deleteMachineDeployment deletes a MachineDeployment. func (r *Reconciler) deleteMachineDeployment(ctx context.Context, cluster *clusterv1.Cluster, md *scope.MachineDeploymentState) error { + ctx, span := traceutil.Start(ctx, "topology/cluster.Reconciler.deleteMachineDeployment") + defer span.End() + log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object).WithObject(md.Object) // delete MachineHealthCheck for the MachineDeployment. 
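The WaitCacheStart/WaitCacheEnd events added above bracket the existing poll loops that wait until the cached client observes the object that was just created or patched, so the time spent waiting shows up on the trace. A minimal sketch of that bracketing, assuming a generic cached controller-runtime client and reusing the same 5ms/5s polling as the hunks above (waitForCachedUpdate and the ConfigMap stand-in are illustrative, not part of this patch):

package example

import (
	"context"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"

	traceutil "sigs.k8s.io/cluster-api/internal/util/trace"
)

// waitForCachedUpdate polls the cached client until it reflects our own update,
// recording the wait as a pair of span events.
func waitForCachedUpdate(ctx context.Context, c client.Client, key client.ObjectKey, oldResourceVersion string) error {
	ctx, span := traceutil.Start(ctx, "example.waitForCachedUpdate")
	defer span.End()

	span.AddEvent("WaitCacheStart")
	err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
		cached := &corev1.ConfigMap{}
		if err := c.Get(ctx, key, cached); err != nil {
			return false, err
		}
		// A changed resource version means the cache has caught up with the update
		// we just wrote, per the note in reconcileCluster above.
		return cached.GetResourceVersion() != oldResourceVersion, nil
	})
	span.AddEvent("WaitCacheEnd")
	return err
}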
diff --git a/internal/util/ssa/managedfields.go b/internal/util/ssa/managedfields.go index c701b6e36c68..d2da87a1bc81 100644 --- a/internal/util/ssa/managedfields.go +++ b/internal/util/ssa/managedfields.go @@ -28,6 +28,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/cluster-api/internal/contract" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) const classicManager = "manager" @@ -44,6 +45,9 @@ const classicManager = "manager" // as we assume that if other controllers are still writing fields on the object they will just do it again and thus // gain ownership again. func DropManagedFields(ctx context.Context, c client.Client, obj client.Object, ssaManager string, paths []contract.Path) error { + ctx, span := traceutil.Start(ctx, "ssa.DropManagedFields") + defer span.End() + // Return if `ssaManager` already owns any fields. if hasFieldsManagedBy(obj, ssaManager) { return nil @@ -110,6 +114,9 @@ func DropManagedFields(ctx context.Context, c client.Client, obj client.Object, // Dropping all existing "manager" entries (which could also be from other controllers) is safe, as we assume that if // other controllers are still writing fields on the object they will just do it again and thus gain ownership again. func CleanUpManagedFieldsForSSAAdoption(ctx context.Context, c client.Client, obj client.Object, ssaManager string) error { + ctx, span := traceutil.Start(ctx, "ssa.CleanUpManagedFieldsForSSAAdoption") + defer span.End() + // Return if `ssaManager` already owns any fields. if hasFieldsManagedBy(obj, ssaManager) { return nil diff --git a/internal/util/ssa/patch.go b/internal/util/ssa/patch.go index 4f90d5e043b2..c9c1e4723f9b 100644 --- a/internal/util/ssa/patch.go +++ b/internal/util/ssa/patch.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/cluster-api/internal/contract" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" ) // Option is the interface for configuration that modifies Options for a patch request. @@ -62,6 +63,9 @@ type Options struct { // If WithCachingProxy is set and the request didn't change the object // we will cache this result, so subsequent calls don't have to run SSA again. func Patch(ctx context.Context, c client.Client, fieldManager string, modified client.Object, opts ...Option) error { + ctx, span := traceutil.Start(ctx, "ssa.Patch") + defer span.End() + // Calculate the options. options := &Options{} for _, opt := range opts { diff --git a/internal/util/trace/trace.go b/internal/util/trace/trace.go new file mode 100644 index 000000000000..85bbb1a2b7c7 --- /dev/null +++ b/internal/util/trace/trace.go @@ -0,0 +1,157 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package trace implements utilities for tracing. 
+package trace
+
+import (
+	"context"
+	"fmt"
+
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
+	otelsdkresource "go.opentelemetry.io/otel/sdk/resource"
+	semconv "go.opentelemetry.io/otel/semconv/v1.12.0"
+	oteltrace "go.opentelemetry.io/otel/trace"
+	tracing "k8s.io/component-base/tracing"
+	tracingapi "k8s.io/component-base/tracing/api/v1"
+	"k8s.io/klog/v2"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+)
+
+// traceReconciler wraps a reconcile.Reconciler and adds tracing to every Reconcile call.
+type traceReconciler struct {
+	Reconciler     reconcile.Reconciler
+	Tracer         oteltrace.Tracer
+	controllerName string
+	kind           string
+}
+
+// Reconciler creates a reconciler which wraps the given reconciler and adds logs & traces.
+func Reconciler(reconciler reconcile.Reconciler, tracerProvider oteltrace.TracerProvider, reconcilerName, kind string) reconcile.Reconciler {
+	return &traceReconciler{
+		Reconciler:     reconciler,
+		Tracer:         tracerProvider.Tracer(reconcilerName),
+		controllerName: reconcilerName,
+		kind:           kind,
+	}
+}
+
+// Reconcile starts a trace span, stores the tracer in the context and adds the trace ID to the logger before delegating to the wrapped Reconciler.
+// FIXME: Open issue: we should really make sure the log.Error in CR gets the trace id too, by either:
+// * creating this span in CR.
+// * disabling the error log in CR and doing it ourselves.
+func (r *traceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
+	ctx, span := r.Tracer.Start(ctx, fmt.Sprintf("%s.Reconciler.Reconcile", r.controllerName),
+		oteltrace.WithAttributes(
+			attribute.String(r.kind, klog.KRef(req.Namespace, req.Name).String()),
+			attribute.String("controller", r.controllerName),
+			attribute.String("reconcileID", string(controller.ReconcileIDFromContext(ctx))),
+		),
+	)
+	defer span.End()
+
+	// Add tracer to context.
+	ctx = addTracerToContext(ctx, r.Tracer)
+
+	// Add traceID to logger.
+	log := ctrl.LoggerFrom(ctx)
+	log = log.WithValues("traceID", span.SpanContext().TraceID().String())
+	ctx = ctrl.LoggerInto(ctx, log)
+
+	res, err := r.Reconciler.Reconcile(ctx, req)
+	if err != nil {
+		span.SetAttributes(attribute.String("error", "true"))
+	}
+
+	return res, err
+}
+
+// NewProvider creates a TracerProvider for the given TracingConfiguration; it returns a no-op provider if no endpoint is configured.
+func NewProvider(tracingConfiguration *tracingapi.TracingConfiguration, service, appName string) (oteltrace.TracerProvider, error) {
+	if tracingConfiguration == nil || tracingConfiguration.Endpoint == nil || *tracingConfiguration.Endpoint == "" {
+		return oteltrace.NewNoopTracerProvider(), nil
+	}
+
+	resourceOpts := []otelsdkresource.Option{
+		// FIXME: reevaluate
+		otelsdkresource.WithProcess(),
+		otelsdkresource.WithAttributes(
+			// ~ aligned to kube-apiserver
+			attribute.Key("app").String(appName),
+			// semconv.HostNameKey.String(hostname),
+			semconv.ServiceNameKey.String(service),
+			// This should probably be the pod name
+			// semconv.ServiceInstanceIDKey.String(service+uuid.New().String()),
+		),
+	}
+	tp, err := tracing.NewProvider(context.Background(), tracingConfiguration, []otlptracegrpc.Option{}, resourceOpts)
+	if err != nil {
+		return nil, fmt.Errorf("could not configure tracer provider: %w", err)
+	}
+	return tp, nil
+}
+
+// tracerFromContext gets an oteltrace.Tracer from the current context.
+func tracerFromContext(ctx context.Context) oteltrace.Tracer {
+	r, ok := ctx.Value(tracerKey{}).(oteltrace.Tracer)
+	if !ok {
+		return oteltrace.NewNoopTracerProvider().Tracer("noop")
+	}
+
+	return r
+}
+
+// tracerKey is a context.Context Value key. Its associated value should
+// be an oteltrace.Tracer.
+type tracerKey struct{}
+
+func addTracerToContext(ctx context.Context, tracer oteltrace.Tracer) context.Context {
+	return context.WithValue(ctx, tracerKey{}, tracer)
+}
+
+// Span is a thin wrapper around an oteltrace.Span.
+type Span struct {
+	span oteltrace.Span
+	// FIXME(sbueringer) skipping go tracing for now
+	// task *trace.Task //nolint:gocritic
+}
+
+// Start starts a span using the tracer stored in the given context.
+func Start(ctx context.Context, name string) (context.Context, *Span) {
+	tracer := tracerFromContext(ctx)
+
+	ctx, span := tracer.Start(ctx, name)
+
+	// ctx, task := trace.NewTask(ctx, name) //nolint:gocritic
+	return ctx, &Span{
+		span: span,
+		//task: task,
+	}
+}
+
+// End ends a span.
+func (s *Span) End() {
+	s.span.End()
+	// s.task.End() //nolint:gocritic
+}
+
+// AddEvent adds an event with the provided name and options.
+func (s *Span) AddEvent(name string, options ...oteltrace.EventOption) {
+	s.span.AddEvent(name, options...)
+}
diff --git a/main.go b/main.go
index bf46399b8804..760f67f1ffca 100644
--- a/main.go
+++ b/main.go
@@ -28,6 +28,7 @@ import (
 
 	// +kubebuilder:scaffold:imports
 	"github.com/spf13/pflag"
+	oteltrace "go.opentelemetry.io/otel/trace"
 	corev1 "k8s.io/api/core/v1"
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -39,7 +40,10 @@ import (
 	"k8s.io/component-base/logs"
 	logsv1 "k8s.io/component-base/logs/api/v1"
 	_ "k8s.io/component-base/logs/json/register"
+	"k8s.io/component-base/tracing"
+	tracingapi "k8s.io/component-base/tracing/api/v1"
 	"k8s.io/klog/v2"
+	"k8s.io/utils/pointer"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -69,6 +73,7 @@ import (
 	"sigs.k8s.io/cluster-api/feature"
 	runtimeclient "sigs.k8s.io/cluster-api/internal/runtime/client"
 	runtimeregistry "sigs.k8s.io/cluster-api/internal/runtime/registry"
+	traceutil "sigs.k8s.io/cluster-api/internal/util/trace"
 	runtimewebhooks "sigs.k8s.io/cluster-api/internal/webhooks/runtime"
 	"sigs.k8s.io/cluster-api/util/flags"
 	"sigs.k8s.io/cluster-api/version"
@@ -80,6 +85,7 @@ var (
 	scheme         = runtime.NewScheme()
 	setupLog       = ctrl.Log.WithName("setup")
 	controllerName = "cluster-api-controller-manager"
+	appName        = "capi-controller-manager"
 
 	// flags.
 	metricsBindAddr string
@@ -110,6 +116,9 @@ var (
 	healthAddr string
 	tlsOptions = flags.TLSOptions{}
 	logOptions = logs.NewOptions()
+
+	tracingEndpoint               string
+	tracingSamplingRatePerMillion int32
 )
 
 func init() {
@@ -219,6 +228,13 @@ func InitFlags(fs *pflag.FlagSet) {
 	fs.StringVar(&healthAddr, "health-addr", ":9440",
 		"The address the health endpoint binds to.")
 
+	// FIXME: re-work flags
+	fs.StringVar(&tracingEndpoint, "tracing-endpoint", "",
+		"The endpoint traces are sent to.")
+
+	fs.Int32Var(&tracingSamplingRatePerMillion, "tracing-sampling-rate", 0,
+		"The number of trace samples to collect per million spans.")
+
 	flags.AddTLSOptions(fs, &tlsOptions)
 
 	feature.MutableGates.AddFlag(fs)
@@ -248,6 +264,16 @@ func main() {
 		os.Exit(1)
 	}
 
+	tp, err := traceutil.NewProvider(&tracingapi.TracingConfiguration{
+		Endpoint:               pointer.String(tracingEndpoint),
+		SamplingRatePerMillion: pointer.Int32(tracingSamplingRatePerMillion),
+	}, controllerName, appName)
+	if err != nil {
+		setupLog.Error(err, "unable to create tracing provider")
+		os.Exit(1)
+	}
+	restConfig.Wrap(tracing.WrapperFor(tp))
+
 	minVer := version.MinimumKubernetesVersion
 	if feature.Gates.Enabled(feature.ClusterTopology) {
 		minVer = version.MinimumKubernetesVersionClusterTopology
@@ -327,7 +353,7 @@ func main() {
 
 	setupChecks(mgr)
 	setupIndexes(ctx, mgr)
-	setupReconcilers(ctx, mgr)
+	setupReconcilers(ctx, mgr, tp)
 	setupWebhooks(mgr)
 
 	// +kubebuilder:scaffold:builder
@@ -357,7 +383,7 @@ func setupIndexes(ctx context.Context, mgr ctrl.Manager) {
 	}
 }
 
-func setupReconcilers(ctx context.Context, mgr ctrl.Manager) {
+func setupReconcilers(ctx context.Context, mgr ctrl.Manager, tp oteltrace.TracerProvider) {
 	secretCachingClient, err := client.New(mgr.GetConfig(), client.Options{
 		HTTPClient: mgr.GetHTTPClient(),
 		Cache: &client.CacheOptions{
@@ -379,6 +405,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) {
 			ControllerName: controllerName,
 			Log:            &log,
 			Indexes:        []remote.Index{remote.NodeProviderIDIndex},
+			TraceProvider:  tp,
 		},
 	)
 	if err != nil {
@@ -433,6 +460,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) {
 		APIReader:                 mgr.GetAPIReader(),
 		RuntimeClient:             runtimeClient,
 		UnstructuredCachingClient: unstructuredCachingClient,
+		TraceProvider:             tp,
 		WatchFilterValue:          watchFilterValue,
 	}).SetupWithManager(ctx, mgr, concurrency(clusterTopologyConcurrency)); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "ClusterTopology")
@@ -484,6 +512,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) {
 		UnstructuredCachingClient: unstructuredCachingClient,
 		APIReader:                 mgr.GetAPIReader(),
 		Tracker:                   tracker,
+		TraceProvider:             tp,
 		WatchFilterValue:          watchFilterValue,
 		NodeDrainClientTimeout:    nodeDrainClientTimeout,
 	}).SetupWithManager(ctx, mgr, concurrency(machineConcurrency)); err != nil {
diff --git a/util/conversion/conversion.go b/util/conversion/conversion.go
index af1dff3c4a34..3f246526f81c 100644
--- a/util/conversion/conversion.go
+++ b/util/conversion/conversion.go
@@ -41,6 +41,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/conversion"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	traceutil "sigs.k8s.io/cluster-api/internal/util/trace"
 	"sigs.k8s.io/cluster-api/util"
 )
 
@@ -60,6 +61,9 @@ var (
 // The object passed as input is modified in place if an updated compatible version is found.
 // NOTE: This version depends on CRDs being named correctly as defined by contract.CalculateCRDName.
func UpdateReferenceAPIContract(ctx context.Context, c client.Client, ref *corev1.ObjectReference) error { + ctx, span := traceutil.Start(ctx, "UpdateReferenceAPIContract") + defer span.End() + gvk := ref.GroupVersionKind() metadata, err := util.GetGVKMetadata(ctx, c, gvk) diff --git a/util/kubeconfig/kubeconfig.go b/util/kubeconfig/kubeconfig.go index 3c5338f683d9..985788e8846a 100644 --- a/util/kubeconfig/kubeconfig.go +++ b/util/kubeconfig/kubeconfig.go @@ -33,6 +33,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/certs" "sigs.k8s.io/cluster-api/util/secret" @@ -109,6 +110,9 @@ func CreateSecret(ctx context.Context, c client.Client, cluster *clusterv1.Clust // CreateSecretWithOwner creates the Kubeconfig secret for the given cluster name, namespace, endpoint, and owner reference. func CreateSecretWithOwner(ctx context.Context, c client.Client, clusterName client.ObjectKey, endpoint string, owner metav1.OwnerReference) error { + ctx, span := traceutil.Start(ctx, "CreateSecretWithOwner") + defer span.End() + server := fmt.Sprintf("https://%s", endpoint) out, err := generateKubeconfig(ctx, c, clusterName, server) if err != nil { @@ -178,6 +182,9 @@ func NeedsClientCertRotation(configSecret *corev1.Secret, threshold time.Duratio // RegenerateSecret creates and stores a new Kubeconfig in the given secret. func RegenerateSecret(ctx context.Context, c client.Client, configSecret *corev1.Secret) error { + ctx, span := traceutil.Start(ctx, "RegenerateSecret") + defer span.End() + clusterName, _, err := secret.ParseSecretName(configSecret.Name) if err != nil { return errors.Wrap(err, "failed to parse secret name") diff --git a/util/patch/patch.go b/util/patch/patch.go index a6559f6cd4d1..5a8014aa6928 100644 --- a/util/patch/patch.go +++ b/util/patch/patch.go @@ -32,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/apiutil" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" ) @@ -82,6 +83,9 @@ func NewHelper(obj client.Object, crClient client.Client) (*Helper, error) { // Patch will attempt to patch the given object, including its status. func (h *Helper) Patch(ctx context.Context, obj client.Object, opts ...Option) error { + ctx, span := traceutil.Start(ctx, "Helper.Patch") + defer span.End() + // Return early if the object is nil. if util.IsNil(obj) { return errors.New("Patch could not be completed: object is nil") diff --git a/util/secret/certificates.go b/util/secret/certificates.go index bf438d647ced..87f1e961624d 100644 --- a/util/secret/certificates.go +++ b/util/secret/certificates.go @@ -39,6 +39,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" + traceutil "sigs.k8s.io/cluster-api/internal/util/trace" "sigs.k8s.io/cluster-api/util/certs" ) @@ -295,6 +296,8 @@ func (c Certificates) SaveGenerated(ctx context.Context, ctrlclient client.Clien // LookupOrGenerate is a convenience function that wraps cluster bootstrap certificate behavior. 
func (c Certificates) LookupOrGenerate(ctx context.Context, ctrlclient client.Client, clusterName client.ObjectKey, owner metav1.OwnerReference) error { + ctx, span := traceutil.Start(ctx, "Certificates.LookupOrGenerate") + defer span.End() return c.LookupOrGenerateCached(ctx, nil, ctrlclient, clusterName, owner) } @@ -302,6 +305,8 @@ func (c Certificates) LookupOrGenerate(ctx context.Context, ctrlclient client.Cl // During lookup we first try to lookup the certificate secret via the secretCachingClient. If we get a NotFound error // we fall back to the regular uncached client. func (c Certificates) LookupOrGenerateCached(ctx context.Context, secretCachingClient, ctrlclient client.Client, clusterName client.ObjectKey, owner metav1.OwnerReference) error { + ctx, span := traceutil.Start(ctx, "Certificates.LookupOrGenerateCached") + defer span.End() // Find the certificates that exist if err := c.LookupCached(ctx, secretCachingClient, ctrlclient, clusterName); err != nil { return err