diff --git a/.gitignore b/.gitignore index b976538e8..2d3fec369 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ # Language-Specific Build Artifacts & Dependencies # ============================================================================ +### Version Managers ### +mise.toml + ### Go ### # Binaries for programs and plugins *.exe @@ -451,4 +454,4 @@ labeler/labeler metadata-collector/metadata-collector node-drainer/node-drainer platform-connectors/platform-connectors -event-exporter/event-exporter \ No newline at end of file +event-exporter/event-exporter diff --git a/health-monitors/csp-health-monitor/cmd/csp-health-monitor/main.go b/health-monitors/csp-health-monitor/cmd/csp-health-monitor/main.go index 4583c8881..715cfb558 100644 --- a/health-monitors/csp-health-monitor/cmd/csp-health-monitor/main.go +++ b/health-monitors/csp-health-monitor/cmd/csp-health-monitor/main.go @@ -34,6 +34,7 @@ import ( "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/config" "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp" awsclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/aws" + azureclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/azure" gcpclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/gcp" "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/datastore" eventpkg "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/event" @@ -256,7 +257,24 @@ func initActiveMonitor( return awsMonitor } - slog.Info("No CSP is explicitly enabled in the configuration (GCP or AWS).") + if cfg.Azure.Enabled { + slog.Info("Azure configuration is enabled.") + + azureMonitor, err := azureclient.NewClient(ctx, cfg.Azure, cfg.ClusterName, kubeconfigPath) + if err != nil { + metrics.CSPMonitorErrors.WithLabelValues(string(model.CSPAzure), "init_error").Inc() + slog.Error("Failed to initialize Azure monitor. Azure will not be monitored.", "error", err) + + return nil + } + + slog.Info("Azure monitor initialized", + "subscriptionID", cfg.Azure.SubscriptionID) + + return azureMonitor + } + + slog.Info("No CSP is explicitly enabled in the configuration (GCP, AWS, or Azure).") return nil } diff --git a/health-monitors/csp-health-monitor/go.mod b/health-monitors/csp-health-monitor/go.mod index f44b1b760..bdaf2acc3 100644 --- a/health-monitors/csp-health-monitor/go.mod +++ b/health-monitors/csp-health-monitor/go.mod @@ -7,6 +7,9 @@ toolchain go1.25.3 require ( cloud.google.com/go/compute v1.38.0 cloud.google.com/go/logging v1.13.1 + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0 github.com/BurntSushi/toml v1.5.0 github.com/aws/aws-sdk-go-v2 v1.40.0 github.com/aws/aws-sdk-go-v2/config v1.32.1 @@ -31,13 +34,14 @@ require ( ) require ( - cel.dev/expr v0.25.1 // indirect cloud.google.com/go v0.121.6 // indirect cloud.google.com/go/auth v0.17.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect cloud.google.com/go/iam v1.5.2 // indirect cloud.google.com/go/longrunning v0.6.7 // indirect + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.19.1 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.14 // indirect @@ -75,6 +79,7 @@ require ( github.com/go-openapi/swag/typeutils v0.25.1 // indirect github.com/go-openapi/swag/yamlutils v0.25.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect github.com/golang/snappy v1.0.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect @@ -85,11 +90,13 @@ require ( github.com/hashicorp/errwrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.0 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/lib/pq v1.10.9 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/montanaflynn/stats v0.7.1 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.2 // indirect @@ -118,7 +125,6 @@ require ( golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.14.0 // indirect - gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/health-monitors/csp-health-monitor/go.sum b/health-monitors/csp-health-monitor/go.sum index f938b7e0c..1b96261aa 100644 --- a/health-monitors/csp-health-monitor/go.sum +++ b/health-monitors/csp-health-monitor/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= -cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= +cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= +cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= cloud.google.com/go v0.121.6 h1:waZiuajrI28iAf40cWgycWNgaXPO06dupuS+sgibK6c= cloud.google.com/go v0.121.6/go.mod h1:coChdst4Ea5vUpiALcYKXEpR1S9ZgXbhEzzMcMR66vI= cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4= @@ -20,6 +20,20 @@ cloud.google.com/go/monitoring v1.24.2 h1:5OTsoJ1dXYIiMiuL+sYscLc9BumrL3CarVLL7d cloud.google.com/go/monitoring v1.24.2/go.mod h1:x7yzPWcgDRnPEv3sI+jJGBkwl5qINf+6qY4eq0I9B4U= cloud.google.com/go/storage v1.56.0 h1:iixmq2Fse2tqxMbWhLWC9HfBj1qdxqAmiK8/eqtsLxI= cloud.google.com/go/storage v1.56.0/go.mod h1:Tpuj6t4NweCLzlNbw9Z9iwxEkrSem20AetIeH/shgVU= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0 h1:KpMC6LFL7mqpExyMC9jVOYRiVhLmamjeZfRsUpB7l4s= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0 h1:rx/pIYQIlCjb+n7TzMyFUzIJYb+d0Gi7Vh+ozA0fSJA= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0/go.mod h1:o8YD+BbSeK8ANH4SpxQFCiz5OIFKgHxV1uwF2FrQJYY= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= @@ -132,6 +146,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -160,6 +176,8 @@ github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+l github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= +github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= @@ -186,6 +204,8 @@ github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -294,6 +314,7 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -319,8 +340,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= -gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/api v0.256.0 h1:u6Khm8+F9sxbCTYNoBHg6/Hwv0N/i+V94MvkOSor6oI= diff --git a/health-monitors/csp-health-monitor/pkg/config/config.go b/health-monitors/csp-health-monitor/pkg/config/config.go index 65680ca2c..985aca67f 100644 --- a/health-monitors/csp-health-monitor/pkg/config/config.go +++ b/health-monitors/csp-health-monitor/pkg/config/config.go @@ -37,13 +37,14 @@ const ( ) type Config struct { - MaintenanceEventPollIntervalSeconds int `toml:"maintenanceEventPollIntervalSeconds"` - TriggerQuarantineWorkflowTimeLimitMinutes int `toml:"triggerQuarantineWorkflowTimeLimitMinutes"` - PostMaintenanceHealthyDelayMinutes int `toml:"postMaintenanceHealthyDelayMinutes"` - NodeReadinessTimeoutMinutes int `toml:"nodeReadinessTimeoutMinutes"` - ClusterName string `toml:"clusterName"` - GCP GCPConfig `toml:"gcp"` - AWS AWSConfig `toml:"aws"` + MaintenanceEventPollIntervalSeconds int `toml:"maintenanceEventPollIntervalSeconds"` + TriggerQuarantineWorkflowTimeLimitMinutes int `toml:"triggerQuarantineWorkflowTimeLimitMinutes"` + PostMaintenanceHealthyDelayMinutes int `toml:"postMaintenanceHealthyDelayMinutes"` + NodeReadinessTimeoutMinutes int `toml:"nodeReadinessTimeoutMinutes"` + ClusterName string `toml:"clusterName"` + GCP GCPConfig `toml:"gcp"` + AWS AWSConfig `toml:"aws"` + Azure AzureConfig `toml:"azure"` } // GCPConfig holds GCP specific configuration. @@ -62,6 +63,13 @@ type AWSConfig struct { Region string `toml:"region"` } +// AzureConfig holds Azure specific configuration. +type AzureConfig struct { + Enabled bool `toml:"enabled"` + SubscriptionID string `toml:"subscriptionId"` + PollingIntervalSeconds int `toml:"pollingIntervalSeconds"` +} + // LoadConfig reads the configuration from a TOML file. func LoadConfig(filePath string) (*Config, error) { var cfg Config @@ -174,7 +182,7 @@ func validateGeneralConfig(cfg *Config) error { return nil } -// validateCSPConfig checks GCP/AWS polling intervals and ensures only one CSP is enabled. +// validateCSPConfig checks GCP/AWS/Azure polling intervals and ensures only one CSP is enabled. func validateCSPConfig(cfg *Config) error { // Validate GCP polling interval if cfg.GCP.Enabled && cfg.GCP.APIPollingIntervalSeconds < minCSPSpecificPollingIntervalSeconds { @@ -194,9 +202,28 @@ func validateCSPConfig(cfg *Config) error { ) } + // Validate Azure polling interval + if cfg.Azure.Enabled && cfg.Azure.PollingIntervalSeconds < minCSPSpecificPollingIntervalSeconds { + return fmt.Errorf( + "azure.pollingIntervalSeconds must be at least %d seconds (got %d)", + minCSPSpecificPollingIntervalSeconds, + cfg.Azure.PollingIntervalSeconds, + ) + } + // Ensure only one CSP is enabled - if cfg.GCP.Enabled && cfg.AWS.Enabled { - return fmt.Errorf("multiple CSPs enabled: only one of GCP or AWS can be enabled at a time in the configuration") + count := 0 + + for _, csp := range []bool{cfg.GCP.Enabled, cfg.AWS.Enabled, cfg.Azure.Enabled} { + if csp { + count++ + } + } + + if count > 1 { + return fmt.Errorf( + "multiple CSPs enabled: only one of GCP, AWS, or Azure can be enabled at a time in the configuration", + ) } return nil diff --git a/health-monitors/csp-health-monitor/pkg/csp/azure/azure.go b/health-monitors/csp-health-monitor/pkg/csp/azure/azure.go new file mode 100644 index 000000000..3faa7bcfa --- /dev/null +++ b/health-monitors/csp-health-monitor/pkg/csp/azure/azure.go @@ -0,0 +1,383 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package azure + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "net/http" + "os" + "strings" + "sync" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/config" + eventpkg "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/event" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/metrics" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/model" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" + v1informer "k8s.io/client-go/informers/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/clientcmd" +) + +// Client encapsulates all state required to poll Azure for +// maintenance events and forward them to the main pipeline. +type Client struct { + config config.AzureConfig + updatesClient *armmaintenance.UpdatesClient + normalizer eventpkg.Normalizer + clusterName string + kubeconfigPath string + subscriptionID string + nodeInformer v1informer.NodeInformer +} + +// NewClient builds and initialises a new Azure monitoring Client. +func NewClient( + ctx context.Context, + cfg config.AzureConfig, + clusterName string, + kubeconfigPath string, +) (*Client, error) { + subscriptionID, err := getSubscriptionID(cfg) + if err != nil { + return nil, err + } + + cred, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, fmt.Errorf("failed to create Azure credential: %w", err) + } + + updatesClient, err := armmaintenance.NewUpdatesClient(subscriptionID, cred, nil) + if err != nil { + return nil, fmt.Errorf("failed to create Azure maintenance client: %w", err) + } + + slog.Info("Successfully initialized Azure VM and maintenance clients", "subscriptionID", subscriptionID) + + var ( + k8sClient kubernetes.Interface + k8sRestConfig *rest.Config + ) + + slog.Info("Azure Client: Using kubeconfig from path", "path", kubeconfigPath) + + k8sRestConfig, err = clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + metrics.CSPMonitorErrors.WithLabelValues(string(model.CSPAzure), "k8s_config_error").Inc() + return nil, fmt.Errorf("failed to load Kubernetes config: %w", err) + } + + k8sClient, err = kubernetes.NewForConfig(k8sRestConfig) + if err != nil { + metrics.CSPMonitorErrors.WithLabelValues(string(model.CSPAzure), "k8s_client_error").Inc() + return nil, fmt.Errorf("failed to create Kubernetes client: %w", err) + } + + slog.Info("Azure Client: Successfully initialized Kubernetes client") + + nodeInformer, _ := newNodeInformer(k8sClient) + + normalizer, err := eventpkg.GetNormalizer(model.CSPAzure) + if err != nil { + return nil, fmt.Errorf("failed to get Azure normalizer: %w", err) + } + + return &Client{ + config: cfg, + updatesClient: updatesClient, + normalizer: normalizer, + clusterName: clusterName, + kubeconfigPath: kubeconfigPath, + subscriptionID: subscriptionID, + nodeInformer: nodeInformer, + }, nil +} + +func newNodeInformer(k8sClient kubernetes.Interface) (v1informer.NodeInformer, chan struct{}) { + factory := informers.NewSharedInformerFactory(k8sClient, 0*time.Second) + + nodeInformer := factory.Core().V1().Nodes() + stopCh := make(chan struct{}) + + _, err := nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) {}, + DeleteFunc: func(obj interface{}) {}, + UpdateFunc: func(oldObj, newObj interface{}) {}, + }) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to add event handler: %v", err) + return nil, nil + } + + factory.Start(stopCh) + + synced := factory.WaitForCacheSync(stopCh) + for v, ok := range synced { + if !ok { + fmt.Fprintf(os.Stderr, "caches failed to sync: %v", v) + return nil, nil + } + } + + return nodeInformer, stopCh +} + +func (c *Client) GetName() model.CSP { + return model.CSPAzure +} + +func (c *Client) StartMonitoring(ctx context.Context, eventChan chan<- model.MaintenanceEvent) error { + slog.Info("Starting Azure VM maintenance monitoring", + "intervalSeconds", c.config.PollingIntervalSeconds) + + ticker := time.NewTicker(time.Duration(c.config.PollingIntervalSeconds) * time.Second) + defer ticker.Stop() + + for { + if ctx.Err() != nil { + slog.Info("Azure monitoring stopping due to context cancellation.") + return ctx.Err() + } + + c.pollForMaintenanceEvents(ctx, eventChan) + + select { + case <-ctx.Done(): + slog.Info("Azure monitoring stopping due to context cancellation.") + return ctx.Err() + case <-ticker.C: + } + } +} + +// pollForMaintenanceEvents checks all cluster nodes for Azure maintenance events in parallel. +func (c *Client) pollForMaintenanceEvents(ctx context.Context, eventChan chan<- model.MaintenanceEvent) { + pollStart := time.Now() + + defer func() { + metrics.CSPPollingDuration.WithLabelValues(string(model.CSPAzure)).Observe(time.Since(pollStart).Seconds()) + }() + + slog.Debug("Polling Azure for VM maintenance events") + + nodeList, err := c.nodeInformer.Lister().List(labels.Everything()) + if err != nil { + metrics.CSPAPIErrors.WithLabelValues(string(model.CSPAzure), "list_nodes_error").Inc() + slog.Error("Failed to list Kubernetes nodes", "error", err) + + return + } + + slog.Debug("Found nodes to check for maintenance events", "count", len(nodeList)) + + var wg sync.WaitGroup + for _, node := range nodeList { + wg.Add(1) + + go func(node *v1.Node) { + defer wg.Done() + + c.processNodeMaintenanceEvents(ctx, node, eventChan) + }(node) + } + + wg.Wait() + + slog.Debug("Completed Azure maintenance event poll") +} + +// processNodeMaintenanceEvents processes maintenance events for a single node +func (c *Client) processNodeMaintenanceEvents( + ctx context.Context, + node *v1.Node, + eventChan chan<- model.MaintenanceEvent) { + resourceGroup, vmName, err := parseAzureProviderID(node.Spec.ProviderID) + if err != nil { + slog.Warn("Failed to parse Azure provider ID", + "node", node.Name, + "providerID", node.Spec.ProviderID, + "error", err) + + return + } + + // Rebuild the resource ID needed for event normalization + resourceID := fmt.Sprintf("/subscriptions/%s/resourcegroups/%s/providers/Microsoft.Compute/virtualMachines/%s", + c.subscriptionID, resourceGroup, vmName) + + pager := c.updatesClient.NewListPager( + resourceGroup, + "Microsoft.Compute", + "virtualMachines", + vmName, + nil, + ) + + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + metrics.CSPAPIErrors.WithLabelValues(string(model.CSPAzure), "updates_list_error").Inc() + slog.Error("Failed to get Azure maintenance updates", + "node", node.Name, + "resourceGroup", resourceGroup, + "vmName", vmName, + "error", err) + + return + } + + for _, update := range page.Value { + if c.shouldReportUpdate(update) { + c.normalizeAndSendEvent(ctx, node, update, resourceGroup, vmName, resourceID, eventChan) + } + } + } +} + +// shouldReportUpdate determines if a maintenance update should be reported +func (c *Client) shouldReportUpdate(update *armmaintenance.Update) bool { + // These are the two fields we need to determine if a maintenance event needs to be reported + if update.Status == nil || update.ImpactType == nil { + return false + } + + // Only report updates that are not completed and have an actual impact + return *update.Status != armmaintenance.UpdateStatusCompleted && *update.ImpactType != armmaintenance.ImpactTypeNone +} + +// normalizeAndSendEvent normalizes a maintenance update and sends it to the event channel +func (c *Client) normalizeAndSendEvent( + ctx context.Context, + node *v1.Node, + update *armmaintenance.Update, + resourceGroup string, + vmName string, + resourceID string, + eventChan chan<- model.MaintenanceEvent, +) { + metrics.CSPEventsReceived.WithLabelValues(string(model.CSPAzure)).Inc() + + slog.Info("Detected Azure maintenance event", + "node", node.Name, + "resourceGroup", resourceGroup, + "vmName", vmName, + "status", *update.Status, + "impactType", *update.ImpactType, + ) + + nodeInfo := map[string]interface{}{ + "nodeName": node.Name, + "providerID": node.Spec.ProviderID, + "clusterName": c.clusterName, + "resourceGroup": resourceGroup, + "vmName": vmName, + "resourceID": resourceID, + "update": update, + } + + event, err := c.normalizer.Normalize(update, nodeInfo) + if err != nil { + slog.Error("Failed to normalize Azure maintenance event", + "node", node.Name, + "error", err) + + return + } + + select { + case eventChan <- *event: + slog.Debug("Sent maintenance event to channel", + "eventID", event.EventID, + "node", event.NodeName) + case <-ctx.Done(): + slog.Info("Context cancelled while sending event") + } +} + +func getSubscriptionID(cfg config.AzureConfig) (string, error) { + if cfg.SubscriptionID != "" { + return cfg.SubscriptionID, nil + } + + // pulled from https://github.com/Microsoft/azureimds/blob/master/imdssample.go + var PTransport = &http.Transport{Proxy: nil} + + client := http.Client{Transport: PTransport} + + req, _ := http.NewRequestWithContext(context.Background(), "GET", "http://169.254.169.254/metadata/instance", nil) + req.Header.Add("Metadata", "True") + + q := req.URL.Query() + q.Add("format", "json") + q.Add("api-version", "2021-02-01") + req.URL.RawQuery = q.Encode() + + resp, err := client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + // now that we have the response get the subscription ID from it + var result struct { + Compute struct { + SubscriptionID string `json:"subscriptionId"` + } `json:"compute"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", fmt.Errorf("failed to decode IMDS response: %w", err) + } + + return result.Compute.SubscriptionID, nil +} + +// parseAzureProviderID parses the provider ID to extract the resource group and VM name. +// Example provider ID: +// azure:///subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/ +func parseAzureProviderID(providerID string) (string, string, error) { + parts := strings.Split(providerID, "/") + if len(parts) < 9 { + return "", "", fmt.Errorf("invalid provider ID format: %s", providerID) + } + + var resourceGroup, vmName string + + for i, part := range parts { + if part == "resourceGroups" && i+1 < len(parts) { + resourceGroup = parts[i+1] + } + + if part == "virtualMachines" && i+1 < len(parts) { + vmName = parts[i+1] + } + } + + if resourceGroup == "" || vmName == "" { + return "", "", fmt.Errorf("could not extract resource group or VM name from provider ID: %s", providerID) + } + + return resourceGroup, vmName, nil +} diff --git a/health-monitors/csp-health-monitor/pkg/csp/azure/azure_test.go b/health-monitors/csp-health-monitor/pkg/csp/azure/azure_test.go new file mode 100644 index 000000000..d5e4ef24b --- /dev/null +++ b/health-monitors/csp-health-monitor/pkg/csp/azure/azure_test.go @@ -0,0 +1,239 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package azure + +import ( + "context" + "fmt" + "net/http" + "os" + "testing" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/fake" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance" + fakearm "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance/fake" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/config" + eventpkg "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/event" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/model" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/envtest" +) + +var ( + testK8sClient kubernetes.Interface +) + +func TestMain(m *testing.M) { + testEnv := &envtest.Environment{ + ErrorIfCRDPathMissing: false, + } + + testK8sConfig, err := testEnv.Start() + if err != nil { + panic(fmt.Sprintf("Failed to start test environment: %v", err)) + } + + testK8sClient, err = kubernetes.NewForConfig(testK8sConfig) + if err != nil { + panic(fmt.Sprintf("Failed to create Kubernetes client: %v", err)) + } + + code := m.Run() + + err = testEnv.Stop() + if err != nil { + panic(fmt.Sprintf("Failed to stop test environment: %v", err)) + } + + os.Exit(code) +} + +// TestPollForMaintenanceEvents_NoMaintenanceEvents tests the basic happy path +// where we have one node but it returns no maintenance updates +func TestPollForMaintenanceEvents_NoMaintenanceEvents(t *testing.T) { + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-1", + }, + Spec: v1.NodeSpec{ + ProviderID: "azure:///subscriptions/test-sub-id/resourceGroups/test-rg/providers/Microsoft.Compute/virtualMachines/test-vm", + }, + } + + _, err := testK8sClient.CoreV1().Nodes().Create(context.Background(), node, metav1.CreateOptions{}) + require.NoError(t, err, "Failed to create test node") + + defer func() { + err := testK8sClient.CoreV1().Nodes().Delete(context.Background(), node.Name, metav1.DeleteOptions{}) + require.NoError(t, err, "Failed to delete test node") + }() + + // Create a fake Azure Updates server that returns no updates + fakeUpdatesServer := fakearm.UpdatesServer{ + NewListPager: func(resourceGroupName, providerName, resourceType, resourceName string, options *armmaintenance.UpdatesClientListOptions) (resp fake.PagerResponder[armmaintenance.UpdatesClientListResponse]) { + // Return an empty list of updates (no maintenance events) + resp.AddPage(http.StatusOK, armmaintenance.UpdatesClientListResponse{ + ListUpdatesResult: armmaintenance.ListUpdatesResult{ + Value: []*armmaintenance.Update{}, + }, + }, nil) + return + }, + } + + updatesClientOptions := &arm.ClientOptions{ + ClientOptions: azcore.ClientOptions{ + Transport: fakearm.NewUpdatesServerTransport(&fakeUpdatesServer), + }, + } + updatesClient, err := armmaintenance.NewUpdatesClient("test-sub-id", &fake.TokenCredential{}, updatesClientOptions) + require.NoError(t, err, "Failed to create updates client") + + nodeInformer, stopCh := newNodeInformer(testK8sClient) + t.Cleanup(func() { + close(stopCh) + }) + + client := &Client{ + config: config.AzureConfig{ + PollingIntervalSeconds: 60, + }, + updatesClient: updatesClient, + normalizer: &eventpkg.AzureNormalizer{}, + clusterName: "test-cluster", + subscriptionID: "test-sub-id", + nodeInformer: nodeInformer, + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + eventChan := make(chan model.MaintenanceEvent, 10) + + client.pollForMaintenanceEvents(ctx, eventChan) + + // Verify no events were sent (since we have no maintenance updates) + select { + case event := <-eventChan: + assert.Fail(t, "Expected no events", "Received unexpected event: %+v", event) + default: + // Expected: no events in channel + } +} + +// TestPollForMaintenanceEvents_OneMaintenanceEvent tests that a pending maintenance +// event is detected and sent to the event channel +func TestPollForMaintenanceEvents_OneMaintenanceEvent(t *testing.T) { + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node-1", + }, + Spec: v1.NodeSpec{ + ProviderID: "azure:///subscriptions/test-sub-id/resourceGroups/test-rg/providers/Microsoft.Compute/virtualMachines/test-vm", + }, + } + + _, err := testK8sClient.CoreV1().Nodes().Create(context.Background(), node, metav1.CreateOptions{}) + require.NoError(t, err, "Failed to create test node") + + defer func() { + err := testK8sClient.CoreV1().Nodes().Delete(context.Background(), node.Name, metav1.DeleteOptions{}) + require.NoError(t, err, "Failed to delete test node") + }() + + // Create a pending maintenance update that should trigger an event + pendingStatus := armmaintenance.UpdateStatusPending + restartImpact := armmaintenance.ImpactTypeRestart + notBefore := time.Now().Add(1 * time.Hour) + + maintenanceUpdate := &armmaintenance.Update{ + Status: &pendingStatus, + ImpactType: &restartImpact, + NotBefore: ¬Before, + } + + // Create a fake Azure Updates server that returns one pending update + fakeUpdatesServer := fakearm.UpdatesServer{ + NewListPager: func(resourceGroupName, providerName, resourceType, resourceName string, options *armmaintenance.UpdatesClientListOptions) (resp fake.PagerResponder[armmaintenance.UpdatesClientListResponse]) { + // Return one pending maintenance update + resp.AddPage(http.StatusOK, armmaintenance.UpdatesClientListResponse{ + ListUpdatesResult: armmaintenance.ListUpdatesResult{ + Value: []*armmaintenance.Update{maintenanceUpdate}, + }, + }, nil) + return + }, + } + + updatesClientOptions := &arm.ClientOptions{ + ClientOptions: azcore.ClientOptions{ + Transport: fakearm.NewUpdatesServerTransport(&fakeUpdatesServer), + }, + } + updatesClient, err := armmaintenance.NewUpdatesClient("test-sub-id", &fake.TokenCredential{}, updatesClientOptions) + require.NoError(t, err, "Failed to create updates client") + + nodeInformer, stopCh := newNodeInformer(testK8sClient) + t.Cleanup(func() { + close(stopCh) + }) + + client := &Client{ + config: config.AzureConfig{ + PollingIntervalSeconds: 60, + }, + updatesClient: updatesClient, + normalizer: &eventpkg.AzureNormalizer{}, + clusterName: "test-cluster", + subscriptionID: "test-sub-id", + nodeInformer: nodeInformer, + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + eventChan := make(chan model.MaintenanceEvent, 10) + + client.pollForMaintenanceEvents(ctx, eventChan) + + // Verify exactly one event was sent + select { + case event := <-eventChan: + // Verify basic properties of the event + assert.Equal(t, model.CSPAzure, event.CSP, "Event should be from Azure CSP") + assert.Equal(t, "test-cluster", event.ClusterName, "Event should have correct cluster name") + assert.Equal(t, "test-node-1", event.NodeName, "Event should have correct node name") + assert.Equal(t, model.StatusDetected, event.Status, "Event status should be DETECTED") + assert.Equal(t, model.TypeScheduled, event.MaintenanceType, "Event should be SCHEDULED maintenance type") + assert.NotEmpty(t, event.EventID, "Event should have a non-empty EventID") + case <-time.After(2 * time.Second): + assert.Fail(t, "Expected to receive an event, but timeout occurred") + } + + // Verify no additional events were sent + select { + case event := <-eventChan: + assert.Fail(t, "Expected only one event", "Received additional event: %+v", event) + default: + // Expected: no more events in channel + } +} diff --git a/health-monitors/csp-health-monitor/pkg/event/azure_normalizer.go b/health-monitors/csp-health-monitor/pkg/event/azure_normalizer.go new file mode 100644 index 000000000..6fc373ca2 --- /dev/null +++ b/health-monitors/csp-health-monitor/pkg/event/azure_normalizer.go @@ -0,0 +1,143 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package event + +import ( + "fmt" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance" + pb "github.com/nvidia/nvsentinel/data-models/pkg/protos" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/model" +) + +type AzureNormalizer struct{} + +func (n *AzureNormalizer) Normalize( + rawEvent interface{}, + additionalInfo ...interface{}) (*model.MaintenanceEvent, error) { + // We need the normalizer interface but expect a struct + // from the azure sdk + update, ok := rawEvent.(*armmaintenance.Update) + if !ok { + return nil, fmt.Errorf("expected *armmaintenance.Update, got %T", rawEvent) + } + + if len(additionalInfo) == 0 { + return nil, fmt.Errorf("missing node information in additionalInfo") + } + + nodeInfo, ok := additionalInfo[0].(map[string]interface{}) + if !ok { + return nil, fmt.Errorf("expected map[string]interface{} for nodeInfo, got %T", additionalInfo[0]) + } + + nodeName, _ := nodeInfo["nodeName"].(string) + providerID, _ := nodeInfo["providerID"].(string) + clusterName, _ := nodeInfo["clusterName"].(string) + resourceGroup, _ := nodeInfo["resourceGroup"].(string) + vmName, _ := nodeInfo["vmName"].(string) + resourceID, _ := nodeInfo["resourceID"].(string) + + now := time.Now().UTC() + + eventID := fmt.Sprintf("azure-%s-%s-%d", resourceGroup, vmName, now.Unix()) + + metadata := map[string]string{ + "resourceGroup": resourceGroup, + "vmName": vmName, + "providerID": providerID, + "resourceID": resourceID, + } + + if update.MaintenanceScope != nil { + metadata["maintenanceScope"] = string(*update.MaintenanceScope) + } + + if update.ImpactType != nil { + metadata["impactType"] = string(*update.ImpactType) + } + + if update.ImpactDurationInSec != nil { + metadata["impactDurationInSec"] = fmt.Sprintf("%d", *update.ImpactDurationInSec) + } + + if update.Status != nil { + metadata["updateStatus"] = string(*update.Status) + } + + status := mapAzureStatusToInternal(update.Status) + cspStatus := mapAzureStatusToCSPStatus(update.Status) + + event := &model.MaintenanceEvent{ + EventID: eventID, + CSP: model.CSPAzure, + ClusterName: clusterName, + ResourceType: "VirtualMachine", + ResourceID: resourceID, + MaintenanceType: model.TypeScheduled, + Status: status, + CSPStatus: cspStatus, + ScheduledStartTime: update.NotBefore, + ScheduledEndTime: nil, // Azure doesn't provide end time in Updates API + EventReceivedTimestamp: now, + LastUpdatedTimestamp: now, + RecommendedAction: pb.RecommendedAction_RESTART_VM.String(), + Metadata: metadata, + NodeName: nodeName, + } + + return event, nil +} + +// mapAzureStatusToInternal maps status from the Azure SDK to the internal status +func mapAzureStatusToInternal(status *armmaintenance.UpdateStatus) model.InternalStatus { + if status == nil { + return model.StatusDetected + } + + switch *status { + case armmaintenance.UpdateStatusPending: + return model.StatusDetected + case armmaintenance.UpdateStatusInProgress: + return model.StatusMaintenanceOngoing + case armmaintenance.UpdateStatusCompleted: + return model.StatusMaintenanceComplete + case armmaintenance.UpdateStatusRetryNow, armmaintenance.UpdateStatusRetryLater: + return model.StatusError + default: + return model.StatusDetected + } +} + +// mapAzureStatusToCSPStatus maps status from the Azure SDK to the CSP provider status +func mapAzureStatusToCSPStatus(status *armmaintenance.UpdateStatus) model.ProviderStatus { + if status == nil { + return model.CSPStatusUnknown + } + + switch *status { + case armmaintenance.UpdateStatusPending: + return model.CSPStatusPending + case armmaintenance.UpdateStatusInProgress: + return model.CSPStatusOngoing + case armmaintenance.UpdateStatusCompleted: + return model.CSPStatusCompleted + case armmaintenance.UpdateStatusRetryNow, armmaintenance.UpdateStatusRetryLater: + return model.CSPStatusPending + default: + return model.CSPStatusUnknown + } +} diff --git a/health-monitors/csp-health-monitor/pkg/event/azure_normalizer_test.go b/health-monitors/csp-health-monitor/pkg/event/azure_normalizer_test.go new file mode 100644 index 000000000..54ca27efa --- /dev/null +++ b/health-monitors/csp-health-monitor/pkg/event/azure_normalizer_test.go @@ -0,0 +1,173 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package event + +import ( + "testing" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance" + pb "github.com/nvidia/nvsentinel/data-models/pkg/protos" + "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/model" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestAzureNormalizer_Normalize_PendingUpdate tests normalizing a pending maintenance update +func TestAzureNormalizer_Normalize_PendingUpdate(t *testing.T) { + normalizer := &AzureNormalizer{} + + // 1. Define inputs + pendingStatus := armmaintenance.UpdateStatusPending + restartImpact := armmaintenance.ImpactTypeRestart + notBefore := time.Date(2025, 12, 1, 10, 0, 0, 0, time.UTC) + impactDuration := int32(300) + + update := &armmaintenance.Update{ + Status: &pendingStatus, + ImpactType: &restartImpact, + NotBefore: ¬Before, + ImpactDurationInSec: &impactDuration, + } + + nodeInfo := map[string]interface{}{ + "nodeName": "test-node", + "providerID": "azure:///subscriptions/sub-123/resourceGroups/rg-test/providers/Microsoft.Compute/virtualMachines/vm-test", + "clusterName": "test-cluster", + "resourceGroup": "rg-test", + "vmName": "vm-test", + "resourceID": "/subscriptions/sub-123/resourcegroups/rg-test/providers/Microsoft.Compute/virtualMachines/vm-test", + } + + // 2. Define expected output (excluding time-based fields that are set during normalization) + expectedEvent := &model.MaintenanceEvent{ + CSP: model.CSPAzure, + ClusterName: "test-cluster", + ResourceType: "VirtualMachine", + ResourceID: "/subscriptions/sub-123/resourcegroups/rg-test/providers/Microsoft.Compute/virtualMachines/vm-test", + MaintenanceType: model.TypeScheduled, + Status: model.StatusDetected, + CSPStatus: model.CSPStatusPending, + ScheduledStartTime: ¬Before, + ScheduledEndTime: nil, + RecommendedAction: pb.RecommendedAction_RESTART_VM.String(), + NodeName: "test-node", + Metadata: map[string]string{ + "resourceGroup": "rg-test", + "vmName": "vm-test", + "providerID": "azure:///subscriptions/sub-123/resourceGroups/rg-test/providers/Microsoft.Compute/virtualMachines/vm-test", + "resourceID": "/subscriptions/sub-123/resourcegroups/rg-test/providers/Microsoft.Compute/virtualMachines/vm-test", + "impactType": "Restart", + "impactDurationInSec": "300", + "updateStatus": "Pending", + }, + } + + // 3. Call Normalize + event, err := normalizer.Normalize(update, nodeInfo) + require.NoError(t, err, "Normalize should not return an error") + require.NotNil(t, event, "Event should not be nil") + + // 4. Deep comparison (excluding time-based fields) + assert.Equal(t, expectedEvent.CSP, event.CSP) + assert.Equal(t, expectedEvent.ClusterName, event.ClusterName) + assert.Equal(t, expectedEvent.ResourceType, event.ResourceType) + assert.Equal(t, expectedEvent.ResourceID, event.ResourceID) + assert.Equal(t, expectedEvent.MaintenanceType, event.MaintenanceType) + assert.Equal(t, expectedEvent.Status, event.Status) + assert.Equal(t, expectedEvent.CSPStatus, event.CSPStatus) + assert.Equal(t, expectedEvent.ScheduledStartTime, event.ScheduledStartTime) + assert.Equal(t, expectedEvent.ScheduledEndTime, event.ScheduledEndTime) + assert.Equal(t, expectedEvent.RecommendedAction, event.RecommendedAction) + assert.Equal(t, expectedEvent.NodeName, event.NodeName) + assert.Equal(t, expectedEvent.Metadata, event.Metadata) + + // Verify time-based fields are set and reasonable + assert.NotEmpty(t, event.EventID, "EventID should be generated") + assert.Contains(t, event.EventID, "azure-rg-test-vm-test", "EventID should contain expected format") + assert.NotZero(t, event.EventReceivedTimestamp, "EventReceivedTimestamp should be set") + assert.NotZero(t, event.LastUpdatedTimestamp, "LastUpdatedTimestamp should be set") +} + +// TestAzureNormalizer_Normalize_InProgressUpdate tests normalizing an in-progress maintenance update +func TestAzureNormalizer_Normalize_InProgressUpdate(t *testing.T) { + normalizer := &AzureNormalizer{} + + // 1. Define inputs + inProgressStatus := armmaintenance.UpdateStatusInProgress + redeployImpact := armmaintenance.ImpactTypeRedeploy + + update := &armmaintenance.Update{ + Status: &inProgressStatus, + ImpactType: &redeployImpact, + } + + nodeInfo := map[string]interface{}{ + "nodeName": "node-2", + "providerID": "azure:///subscriptions/sub-456/resourceGroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-prod", + "clusterName": "prod-cluster", + "resourceGroup": "rg-prod", + "vmName": "vm-prod", + "resourceID": "/subscriptions/sub-456/resourcegroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-prod", + } + + // 2. Define expected output (excluding time-based fields that are set during normalization) + expectedEvent := &model.MaintenanceEvent{ + CSP: model.CSPAzure, + ClusterName: "prod-cluster", + ResourceType: "VirtualMachine", + ResourceID: "/subscriptions/sub-456/resourcegroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-prod", + MaintenanceType: model.TypeScheduled, + Status: model.StatusMaintenanceOngoing, + CSPStatus: model.CSPStatusOngoing, + ScheduledStartTime: nil, + ScheduledEndTime: nil, + RecommendedAction: pb.RecommendedAction_RESTART_VM.String(), + NodeName: "node-2", + Metadata: map[string]string{ + "resourceGroup": "rg-prod", + "vmName": "vm-prod", + "providerID": "azure:///subscriptions/sub-456/resourceGroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-prod", + "resourceID": "/subscriptions/sub-456/resourcegroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-prod", + "impactType": "Redeploy", + "updateStatus": "InProgress", + }, + } + + // 3. Call Normalize + event, err := normalizer.Normalize(update, nodeInfo) + require.NoError(t, err, "Normalize should not return an error") + require.NotNil(t, event, "Event should not be nil") + + // 4. Deep comparison (excluding time-based fields) + assert.Equal(t, expectedEvent.CSP, event.CSP) + assert.Equal(t, expectedEvent.ClusterName, event.ClusterName) + assert.Equal(t, expectedEvent.ResourceType, event.ResourceType) + assert.Equal(t, expectedEvent.ResourceID, event.ResourceID) + assert.Equal(t, expectedEvent.MaintenanceType, event.MaintenanceType) + assert.Equal(t, expectedEvent.Status, event.Status) + assert.Equal(t, expectedEvent.CSPStatus, event.CSPStatus) + assert.Equal(t, expectedEvent.ScheduledStartTime, event.ScheduledStartTime) + assert.Equal(t, expectedEvent.ScheduledEndTime, event.ScheduledEndTime) + assert.Equal(t, expectedEvent.RecommendedAction, event.RecommendedAction) + assert.Equal(t, expectedEvent.NodeName, event.NodeName) + assert.Equal(t, expectedEvent.Metadata, event.Metadata) + + // Verify time-based fields are set and reasonable + assert.NotEmpty(t, event.EventID, "EventID should be generated") + assert.Contains(t, event.EventID, "azure-rg-prod-vm-prod", "EventID should contain expected format") + assert.NotZero(t, event.EventReceivedTimestamp, "EventReceivedTimestamp should be set") + assert.NotZero(t, event.LastUpdatedTimestamp, "LastUpdatedTimestamp should be set") +} diff --git a/health-monitors/csp-health-monitor/pkg/event/normalizer.go b/health-monitors/csp-health-monitor/pkg/event/normalizer.go index bbef294f7..63f4466fd 100644 --- a/health-monitors/csp-health-monitor/pkg/event/normalizer.go +++ b/health-monitors/csp-health-monitor/pkg/event/normalizer.go @@ -37,6 +37,8 @@ func GetNormalizer(csp model.CSP) (Normalizer, error) { return &GCPNormalizer{}, nil // GCPNormalizer is defined in gcp_normalizer.go case model.CSPAWS: return &AWSNormalizer{}, nil // AWSNormalizer is defined in aws_normalizer.go + case model.CSPAzure: + return &AzureNormalizer{}, nil // AzureNormalizer is defined in azure_normalizer.go default: return nil, fmt.Errorf("no normalizer available for CSP: %s", csp) } diff --git a/health-monitors/csp-health-monitor/pkg/model/maintenance_event.go b/health-monitors/csp-health-monitor/pkg/model/maintenance_event.go index cc76209f2..85d25bb50 100644 --- a/health-monitors/csp-health-monitor/pkg/model/maintenance_event.go +++ b/health-monitors/csp-health-monitor/pkg/model/maintenance_event.go @@ -53,8 +53,9 @@ type ProviderStatus string // Constants for CSP types const ( - CSPGCP CSP = "gcp" - CSPAWS CSP = "aws" + CSPAzure CSP = "azure" + CSPGCP CSP = "gcp" + CSPAWS CSP = "aws" ) // Constants for maintenance types