diff --git a/.gitignore b/.gitignore
index 8930d91fd..89a1489b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,6 +137,7 @@ dist/
 # IDE & Editor Configurations
 # ============================================================================
 
+
 ### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
 ### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
 # User-specific stuff
diff --git a/.versions.yaml b/.versions.yaml
index 785247e92..a3b3e71f1 100644
--- a/.versions.yaml
+++ b/.versions.yaml
@@ -46,6 +46,7 @@ go_tools:
   setup_envtest: 'latest'
   goimports: 'v0.30.0'
   crane: 'v0.20.2'
+  controller_gen: 'v0.20.0'
 
 # Protocol Buffers / gRPC
 protobuf:
diff --git a/data-models/Makefile b/data-models/Makefile
index fad25d777..a93eb034e 100644
--- a/data-models/Makefile
+++ b/data-models/Makefile
@@ -37,10 +37,34 @@ DOCKER_EXTRA_ARGS :=
 include ../make/common.mk
 include ../make/go.mk
 
+# API and CRD paths
+API_DIR := api/v1alpha1
+CRD_OUTPUT_DIR := $(REPO_ROOT)/distros/kubernetes/nvsentinel/crds
+
+
 # =============================================================================
 # MODULE-SPECIFIC TARGETS
 # =============================================================================
 
+# generate: Generate deepcopy code and CRD manifests for the API types.
+# Needs only the Go toolchain: `go run pkg@version` fetches the pinned
+# controller-gen on demand, so no separate install step is required.
+# NOTE(review): CONTROLLER_GEN_VERSION is assumed to be provided by the
+# included make files (pinned via .versions.yaml) - confirm.
+.PHONY: generate
+generate: ## Generate deepcopy code and CRDs into the Helm chart directory
+	@echo "Generating deepcopy code and CRDs for $(API_DIR)..."
+	@mkdir -p $(CRD_OUTPUT_DIR)
+	@# Generate deepcopy files for API types
+	go run sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) object paths=./$(API_DIR)
+	@# Generate CRDs directly into the Helm chart directory (no intermediate move)
+	go run sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) \
+		crd paths=./$(API_DIR) output:crd:dir=$(CRD_OUTPUT_DIR)
+	@echo "CRDs generated in $(CRD_OUTPUT_DIR):"
+	@# Fail the target if nothing was generated instead of masking it
+	@ls -1 $(CRD_OUTPUT_DIR)/*.yaml
+
+
 # Generate Go protobuf files for data-models (shared across all Go modules)
 .PHONY: protos-generate
 protos-generate: protos-clean
diff --git a/data-models/api/v1alpha1/groupversion_info.go b/data-models/api/v1alpha1/groupversion_info.go
new file mode 100644
index 000000000..a504061bf
--- /dev/null
+++ b/data-models/api/v1alpha1/groupversion_info.go
@@ -0,0 +1,34 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package v1alpha1 contains API Schema definitions for the healthevents v1alpha1 API group
+// +kubebuilder:object:generate=true
+// +groupName=healthevents.dgxc.nvidia.com
+package v1alpha1
+
+import (
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"sigs.k8s.io/controller-runtime/pkg/scheme"
+)
+
+var (
+	// GroupVersion is group version used to register these objects
+	GroupVersion = schema.GroupVersion{Group: "healthevents.dgxc.nvidia.com", Version: "v1alpha1"}
+
+	// SchemeBuilder is used to add go types to the GroupVersionKind scheme
+	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
+
+	// AddToScheme adds the types in this group-version to the given scheme.
+	AddToScheme = SchemeBuilder.AddToScheme
+)
diff --git a/data-models/api/v1alpha1/healthevent_types.go b/data-models/api/v1alpha1/healthevent_types.go
new file mode 100644
index 000000000..c2bbab7c1
--- /dev/null
+++ b/data-models/api/v1alpha1/healthevent_types.go
@@ -0,0 +1,122 @@
+package v1alpha1
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// Status is the state value recorded for node quarantine and for
+// sub-operations such as user pod eviction.
+// +kubebuilder:validation:Enum=NotStarted;InProgress;Failed;Succeeded;AlreadyDrained;UnQuarantined;Quarantined;AlreadyQuarantined;Cancelled
+type Status string
+
+// Status values; these mirror the kubebuilder Enum marker on Status.
+const (
+	StatusNotStarted     Status = "NotStarted"
+	StatusInProgress     Status = "InProgress"
+	StatusFailed         Status = "Failed"
+	StatusSucceeded      Status = "Succeeded"
+	StatusAlreadyDrained Status = "AlreadyDrained"
+
+	StatusUnQuarantined      Status = "UnQuarantined"
+	StatusQuarantined        Status = "Quarantined"
+	StatusAlreadyQuarantined Status = "AlreadyQuarantined"
+	StatusCancelled          Status = "Cancelled"
+)
+
+// RecommendedAction names the remediation action recommended for a health event.
+// +kubebuilder:validation:Enum=NONE;COMPONENT_RESET;CONTACT_SUPPORT;RUN_FIELDDIAG;RESTART_VM;RESTART_BM;REPLACE_VM;RUN_DCGMEUD;UNKNOWN
+type RecommendedAction string
+
+// RecommendedAction values; these mirror the kubebuilder Enum marker on RecommendedAction.
+const (
+	RecommendedActionNone           RecommendedAction = "NONE"
+	RecommendedActionComponentReset RecommendedAction = "COMPONENT_RESET"
+	RecommendedActionContactSupport RecommendedAction = "CONTACT_SUPPORT"
+	RecommendedActionRunFieldDiag   RecommendedAction = "RUN_FIELDDIAG"
+	RecommendedActionRestartVM      RecommendedAction = "RESTART_VM"
+	RecommendedActionRestartBM      RecommendedAction = "RESTART_BM"
+	RecommendedActionReplaceVM      RecommendedAction = "REPLACE_VM"
+	RecommendedActionRunDCGMEUD     RecommendedAction = "RUN_DCGMEUD"
+	RecommendedActionUnknown        RecommendedAction = "UNKNOWN"
+)
+
+// NOTE(review): Status below carries the kubebuilder Required marker while its
+// JSON tag has omitempty, so a zero-valued OperationStatus marshals to {}.
+// HealthEventStatus.UserPodsEvictionStatus has no omitempty and is always
+// emitted; presumably such an object fails CRD validation. Confirm intent.
+
+// OperationStatus represents the status of a sub-operation
+type OperationStatus struct {
+	// Status is the state of the sub-operation.
+	// +kubebuilder:validation:Required
+	Status Status `json:"status,omitempty"`
+
+	// Message is free-form text accompanying Status.
+	Message string `json:"message,omitempty"`
+}
+
+//
+// =======================
+// HealthEvent Status
+// =======================
+//
+
+// HealthEventStatus defines the observed state of HealthEvent
+type HealthEventStatus struct {
+	// NodeQuarantined is the quarantine state of the node; omitted when nil
+	NodeQuarantined *Status `json:"nodeQuarantined,omitempty"`
+
+	// Status of user pod eviction
+	UserPodsEvictionStatus OperationStatus `json:"userPodsEvictionStatus"`
+
+	// Whether the fault has been remediated
+	FaultRemediated *bool `json:"faultRemediated,omitempty"`
+
+	// Timestamp of the last remediation attempt
+	LastRemediationTimestamp *metav1.Time `json:"lastRemediationTimestamp,omitempty"`
+}
+
+//
+// =======================
+// HealthEvent Spec
+// =======================
+//
+
+// HealthEventSpec defines the desired state of HealthEvent
+type HealthEventSpec struct {
+	// NodeName is the name of the target node
+	NodeName string `json:"nodeName,omitempty"`
+
+	// Recommended remediation action
+	RecommendedAction RecommendedAction `json:"recommendedAction,omitempty"`
+
+	// Whether the fault is fatal
+	IsFatal *bool `json:"isFatal,omitempty"`
+
+	// Whether the system is healthy
+	IsHealthy *bool `json:"isHealthy,omitempty"`
+}
+
+// HealthEvent is the Schema for the healthevents API; it pairs a HealthEventSpec with a HealthEventStatus.
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+type HealthEvent struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Spec   HealthEventSpec   `json:"spec,omitempty"`
+	Status HealthEventStatus `json:"status,omitempty"`
+}
+
+// HealthEventList contains a list of HealthEvent objects.
+// +kubebuilder:object:root=true
+type HealthEventList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []HealthEvent `json:"items"`
+}
+
+// init registers HealthEvent and HealthEventList with SchemeBuilder.
+func init() {
+	SchemeBuilder.Register(&HealthEvent{}, &HealthEventList{})
+}
diff --git a/data-models/api/v1alpha1/zz_generated.deepcopy.go b/data-models/api/v1alpha1/zz_generated.deepcopy.go
new file mode 100644
index 000000000..10d853647
--- /dev/null
+++ b/data-models/api/v1alpha1/zz_generated.deepcopy.go
@@ -0,0 +1,138 @@
+//go:build !ignore_autogenerated
+
+// Code generated by controller-gen. DO NOT EDIT.
+
+package v1alpha1
+
+import (
+	runtime "k8s.io/apimachinery/pkg/runtime"
+)
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *HealthEvent) DeepCopyInto(out *HealthEvent) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+	in.Status.DeepCopyInto(&out.Status)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HealthEvent.
+func (in *HealthEvent) DeepCopy() *HealthEvent {
+	if in == nil {
+		return nil
+	}
+	out := new(HealthEvent)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *HealthEvent) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *HealthEventList) DeepCopyInto(out *HealthEventList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]HealthEvent, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HealthEventList. +func (in *HealthEventList) DeepCopy() *HealthEventList { + if in == nil { + return nil + } + out := new(HealthEventList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *HealthEventList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HealthEventSpec) DeepCopyInto(out *HealthEventSpec) { + *out = *in + if in.IsFatal != nil { + in, out := &in.IsFatal, &out.IsFatal + *out = new(bool) + **out = **in + } + if in.IsHealthy != nil { + in, out := &in.IsHealthy, &out.IsHealthy + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HealthEventSpec. +func (in *HealthEventSpec) DeepCopy() *HealthEventSpec { + if in == nil { + return nil + } + out := new(HealthEventSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *HealthEventStatus) DeepCopyInto(out *HealthEventStatus) { + *out = *in + if in.NodeQuarantined != nil { + in, out := &in.NodeQuarantined, &out.NodeQuarantined + *out = new(Status) + **out = **in + } + out.UserPodsEvictionStatus = in.UserPodsEvictionStatus + if in.FaultRemediated != nil { + in, out := &in.FaultRemediated, &out.FaultRemediated + *out = new(bool) + **out = **in + } + if in.LastRemediationTimestamp != nil { + in, out := &in.LastRemediationTimestamp, &out.LastRemediationTimestamp + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HealthEventStatus. +func (in *HealthEventStatus) DeepCopy() *HealthEventStatus { + if in == nil { + return nil + } + out := new(HealthEventStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OperationStatus) DeepCopyInto(out *OperationStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OperationStatus. 
+func (in *OperationStatus) DeepCopy() *OperationStatus { + if in == nil { + return nil + } + out := new(OperationStatus) + in.DeepCopyInto(out) + return out +} diff --git a/data-models/go.mod b/data-models/go.mod index 48bc63234..f4d9ccbb6 100644 --- a/data-models/go.mod +++ b/data-models/go.mod @@ -1,19 +1,41 @@ module github.com/nvidia/nvsentinel/data-models -go 1.25 +go 1.25.0 toolchain go1.25.3 require ( google.golang.org/grpc v1.77.0 google.golang.org/protobuf v1.36.11 + k8s.io/apimachinery v0.35.0 + sigs.k8s.io/controller-runtime v0.22.4 ) require ( - golang.org/x/net v0.47.0 // indirect - golang.org/x/sys v0.38.0 // indirect - golang.org/x/text v0.31.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/onsi/gomega v1.38.3 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + golang.org/x/net v0.48.0 // indirect + golang.org/x/sys v0.39.0 // indirect + golang.org/x/text v0.32.0 // indirect + golang.org/x/tools v0.40.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/api v0.35.0 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect ) // Local replacements for internal modules diff --git a/data-models/go.sum 
b/data-models/go.sum index 2098b0c6b..d2d456822 100644 --- a/data-models/go.sum +++ b/data-models/go.sum @@ -1,13 +1,57 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= 
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.3 h1:eTX+W6dobAYfFeGC2PV6RwXRu/MyT+cQguijutvkpSM= +github.com/onsi/gomega v1.38.3/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= +github.com/pmezard/go-difflib v1.0.0 
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= @@ -20,12 +64,22 @@ go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6 go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= -golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= -golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= -golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= -golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/text v0.31.0 
h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= -golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= +golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg= +golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= +golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= +golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= +golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= +golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk= @@ -34,3 +88,30 @@ google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod 
h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 
h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/data-models/pkg/conversion/healthevent_conversion_test.go b/data-models/pkg/conversion/healthevent_conversion_test.go new file mode 100644 index 000000000..effc81fea --- /dev/null +++ b/data-models/pkg/conversion/healthevent_conversion_test.go @@ -0,0 +1,234 @@ +package conversion + +import ( + "encoding/json" + "reflect" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + v1alpha1 "github.com/nvidia/nvsentinel/data-models/api/v1alpha1" + "github.com/nvidia/nvsentinel/data-models/pkg/model" + "github.com/nvidia/nvsentinel/data-models/pkg/protos" + "k8s.io/utils/ptr" +) + +// TestHealthEventStatus_RoundTrip ensures that converting a HealthEventStatus +// model to a CRD and back to the model preserves all field values. 
+func TestHealthEventStatus_RoundTrip(t *testing.T) {
+	now := time.Now().UTC()
+
+	orig := &model.HealthEventStatus{
+		NodeQuarantined: func() *model.Status {
+			s := model.Status("test")
+			return &s
+		}(),
+		UserPodsEvictionStatus: model.OperationStatus{
+			Status:  model.Status("running"),
+			Message: "evicting user pods",
+		},
+		FaultRemediated:          ptr.To(true),
+		LastRemediationTimestamp: &now,
+	}
+
+	crd := ModelHealthEventStatusToCRD(orig)
+	if crd == nil {
+		t.Fatalf("ModelHealthEventStatusToCRD returned nil")
+	}
+
+	back := CRDHealthEventStatusToModel(crd)
+	if back == nil {
+		t.Fatalf("CRDHealthEventStatusToModel returned nil")
+	}
+
+	if !reflect.DeepEqual(orig, back) {
+		t.Fatalf(
+			"health event status round-trip mismatch\noriginal: %+v\nconverted: %+v",
+			orig,
+			back,
+		)
+	}
+}
+
+// TestHealthEventStatus_JSONParity ensures the model and the CRD converted from it marshal to identical JSON.
+func TestHealthEventStatus_JSONParity(t *testing.T) {
+	now := time.Now()
+	trueBool := true
+	statusInProgress := model.StatusInProgress
+
+	// Fully populate the model object
+	m := &model.HealthEventStatus{
+		NodeQuarantined:        &statusInProgress,
+		UserPodsEvictionStatus: model.OperationStatus{Status: model.StatusFailed, Message: "Eviction failed"},
+		FaultRemediated:        &trueBool,
+		LastRemediationTimestamp: func() *time.Time {
+			u := now.UTC().Truncate(time.Second) // normalize UTC & truncate nanoseconds
+			return &u
+		}(),
+	}
+
+	// Marshal model to JSON
+	modelJSON, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("failed to marshal model to JSON: %v", err)
+	}
+
+	// Convert to CRD
+	crd := ModelHealthEventStatusToCRD(m)
+
+	// Normalize CRD timestamp (the local is named ts so it does not shadow t *testing.T)
+	if crd.LastRemediationTimestamp != nil {
+		ts := crd.LastRemediationTimestamp.Time.UTC().Truncate(time.Second)
+		crd.LastRemediationTimestamp = &metav1.Time{Time: ts}
+	}
+
+	// Marshal CRD to JSON
+	crdJSON, err := json.Marshal(crd)
+	if err != nil {
+		t.Fatalf("failed to marshal CRD to JSON: %v", err)
+	}
+
+	// Compare JSON
+	if string(modelJSON) != string(crdJSON) {
+		t.Errorf("JSON mismatch between model and CRD.\nModel JSON: %s\nCRD JSON: %s",
+			string(modelJSON), string(crdJSON))
+	}
+}
+
+// TestHealthEventStatus_JSONRoundTrip performs a full round-trip at the JSON level:
+// Model → CRD → Model → JSON. Ensures that no information is lost in CRD conversion.
+func TestHealthEventStatus_JSONRoundTrip(t *testing.T) {
+	now := time.Now()
+	falseBool := false
+	statusQuarantined := model.Quarantined
+
+	// Original model
+	original := &model.HealthEventStatus{
+		NodeQuarantined:          &statusQuarantined,
+		UserPodsEvictionStatus:   model.OperationStatus{Status: model.StatusSucceeded, Message: "Eviction done"},
+		FaultRemediated:          &falseBool,
+		LastRemediationTimestamp: &now,
+	}
+
+	// Convert to CRD
+	crd := ModelHealthEventStatusToCRD(original)
+
+	// Convert back to model
+	backToModel := CRDHealthEventStatusToModel(crd)
+
+	// Marshal both to JSON
+	originalJSON, err := json.Marshal(original)
+	if err != nil {
+		t.Fatalf("failed to marshal original model: %v", err)
+	}
+	backJSON, err := json.Marshal(backToModel)
+	if err != nil {
+		t.Fatalf("failed to marshal round-trip model: %v", err)
+	}
+
+	if string(originalJSON) != string(backJSON) {
+		t.Errorf("Round-trip JSON mismatch.\nOriginal: %s\nBack: %s",
+			string(originalJSON), string(backJSON))
+	}
+}
+
+// TestHealthEventSpec_JSONTagsUnified validates that the JSON tags on the CRD
+// and Proto fields match the expected mapping. This prevents accidental tag
+// mismatches when adding or changing fields.
+func TestHealthEventSpec_JSONTagsUnified(t *testing.T) {
+	specType := reflect.TypeOf(v1alpha1.HealthEventSpec{})
+	protoType := reflect.TypeOf(protos.HealthEvent{})
+
+	// checkTag reports a missing field, or a JSON tag that is neither expectedJSON nor expectedJSON+",omitempty".
+	checkTag := func(kind string, typ reflect.Type, fieldName, expectedJSON string) {
+		field, ok := typ.FieldByName(fieldName)
+		if !ok {
+			t.Errorf("%s missing field %s", kind, fieldName)
+			return
+		}
+		tag := field.Tag.Get("json")
+		if tag != expectedJSON && tag != expectedJSON+",omitempty" {
+			t.Errorf("%s field %s JSON tag mismatch: got %q, want %q", kind, fieldName, tag, expectedJSON)
+		}
+	}
+
+	// NOTE(review): only fields listed in ProtoFieldsAffectingSpec are checked; unlisted proto fields go unnoticed.
+	for fieldName, expectedJSON := range ProtoFieldsAffectingSpec {
+		checkTag("CRD", specType, fieldName, expectedJSON)
+		checkTag("Proto", protoType, fieldName, expectedJSON)
+	}
+}
+
+// TestHealthEventSpec_RoundTrip verifies that converting a Proto HealthEvent
+// to a CRD spec and back yields a message with identical JSON.
+func TestHealthEventSpec_RoundTrip(t *testing.T) {
+	p := &protos.HealthEvent{
+		NodeName:          "node-1",
+		IsFatal:           true,
+		IsHealthy:         false,
+		RecommendedAction: protos.RecommendedAction_RESTART_VM,
+	}
+	back := CRDSpecToProtoHealthEvent(ProtoHealthEventToCRDSpec(p))
+
+	origJSON, err := json.Marshal(p)
+	if err != nil {
+		t.Fatalf("failed to marshal original proto: %v", err)
+	}
+	backJSON, err := json.Marshal(back)
+	if err != nil {
+		t.Fatalf("failed to marshal round-trip proto: %v", err)
+	}
+	if string(origJSON) != string(backJSON) {
+		t.Fatalf("Proto ↔ CRD round-trip mismatch\nOriginal: %s\nBack: %s", origJSON, backJSON)
+	}
+}
+
+// TestHealthEventSpec_RecommendedActionEnum validates that all enum values
+// of RecommendedAction round-trip correctly between Proto and CRD.
+func TestHealthEventSpec_RecommendedActionEnum(t *testing.T) { + for i, name := range protos.RecommendedAction_name { + p := protos.RecommendedAction(i) + crd := ProtoRecommendedActionToCRD(p) + back := CRDRecommendedActionToProto(crd) + if back != p { + t.Errorf("enum round-trip failed for %s: got %v, want %v", name, back, p) + } + } +} + +// TestHealthEventSpec_RoundTripValues ensures Proto ↔ CRD conversions preserve values +func TestHealthEventSpec_RoundTripValues(t *testing.T) { + p := &protos.HealthEvent{ + NodeName: "node-1", + IsFatal: true, + IsHealthy: false, + RecommendedAction: protos.RecommendedAction_RESTART_VM, + } + + // Proto → CRD → Proto + spec := ProtoHealthEventToCRDSpec(p) + back := CRDSpecToProtoHealthEvent(spec) + + if !reflect.DeepEqual(p, back) { + origJSON, _ := json.Marshal(p) + backJSON, _ := json.Marshal(back) + t.Fatalf("Proto ↔ CRD round-trip mismatch\nOriginal: %s\nBack: %s", + origJSON, backJSON) + } + + // CRD → Proto → CRD + crdSpec := v1alpha1.HealthEventSpec{ + NodeName: "node-2", + IsFatal: ptr.To(false), + IsHealthy: ptr.To(true), + RecommendedAction: v1alpha1.RecommendedActionRestartVM, + } + backCRD := ProtoHealthEventToCRDSpec(CRDSpecToProtoHealthEvent(crdSpec)) + if !reflect.DeepEqual(crdSpec, backCRD) { + origJSON, _ := json.Marshal(crdSpec) + backJSON, _ := json.Marshal(backCRD) + t.Fatalf("CRD ↔ Proto round-trip mismatch\nOriginal: %s\nBack: %s", + origJSON, backJSON) + } +} diff --git a/data-models/pkg/conversion/healthevent_coversion.go b/data-models/pkg/conversion/healthevent_coversion.go new file mode 100644 index 000000000..47342926f --- /dev/null +++ b/data-models/pkg/conversion/healthevent_coversion.go @@ -0,0 +1,150 @@ +package conversion + +import ( + v1alpha1 "github.com/nvidia/nvsentinel/data-models/api/v1alpha1" + "github.com/nvidia/nvsentinel/data-models/pkg/model" + "github.com/nvidia/nvsentinel/data-models/pkg/protos" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +// ======================= 
+// Enum conversions
+// =======================
+//
+
+// ModelStatusToCRD converts a model.Status into the equivalent
+// v1alpha1.Status using a plain type conversion.
+//
+// NOTE: this assumes model.Status and v1alpha1.Status have identical
+// underlying values; the value itself is not validated.
+func ModelStatusToCRD(s model.Status) v1alpha1.Status {
+	return v1alpha1.Status(s)
+}
+
+// CRDStatusToModel converts a v1alpha1.Status into the equivalent
+// model.Status using a plain type conversion.
+//
+// NOTE: this assumes model.Status and v1alpha1.Status have identical
+// underlying values; the value itself is not validated.
+func CRDStatusToModel(s v1alpha1.Status) model.Status {
+	return model.Status(s)
+}
+
+//
+// =======================
+// OperationStatus conversions
+// =======================
+//
+
+// ModelOpStatusToCRD converts a model.OperationStatus into a
+// v1alpha1.OperationStatus, carrying over Status and Message.
+func ModelOpStatusToCRD(m model.OperationStatus) v1alpha1.OperationStatus {
+	return v1alpha1.OperationStatus{
+		Status:  ModelStatusToCRD(m.Status),
+		Message: m.Message,
+	}
+}
+
+// CRDOpStatusToModel converts a v1alpha1.OperationStatus into a
+// model.OperationStatus, carrying over Status and Message.
+func CRDOpStatusToModel(c v1alpha1.OperationStatus) model.OperationStatus {
+	return model.OperationStatus{
+		Status:  CRDStatusToModel(c.Status),
+		Message: c.Message,
+	}
+}
+
+//
+// =======================
+// HealthEventStatus conversions
+// =======================
+//
+
+// ModelHealthEventStatusToCRD converts a model.HealthEventStatus into a newly
+// allocated v1alpha1.HealthEventStatus. A nil input yields nil.
+//
+// Optional (pointer) fields are copied rather than shared, so later changes
+// made through the result do not affect m; nil optional fields stay nil.
+func ModelHealthEventStatusToCRD(m *model.HealthEventStatus) *v1alpha1.HealthEventStatus {
+	if m == nil {
+		return nil
+	}
+
+	out := &v1alpha1.HealthEventStatus{
+		UserPodsEvictionStatus: ModelOpStatusToCRD(m.UserPodsEvictionStatus),
+	}
+
+	if m.NodeQuarantined != nil {
+		quarantined := ModelStatusToCRD(*m.NodeQuarantined)
+		out.NodeQuarantined = &quarantined
+	}
+
+	if m.FaultRemediated != nil {
+		// Copy the flag so the CRD object does not alias the model's bool.
+		remediated := *m.FaultRemediated
+		out.FaultRemediated = &remediated
+	}
+
+	if m.LastRemediationTimestamp != nil {
+		t := metav1.NewTime(*m.LastRemediationTimestamp)
+		out.LastRemediationTimestamp = &t
+	}
+
+	return out
+}
+
+// CRDHealthEventStatusToModel converts a v1alpha1.HealthEventStatus into a
+// newly allocated model.HealthEventStatus. A nil input yields nil.
+//
+// Optional (pointer) fields are copied rather than shared, so later changes
+// made through the result do not affect c; nil optional fields stay nil.
+func CRDHealthEventStatusToModel(c *v1alpha1.HealthEventStatus) *model.HealthEventStatus {
+	if c == nil {
+		return nil
+	}
+
+	out := &model.HealthEventStatus{
+		UserPodsEvictionStatus: CRDOpStatusToModel(c.UserPodsEvictionStatus),
+	}
+
+	if c.NodeQuarantined != nil {
+		quarantined := CRDStatusToModel(*c.NodeQuarantined)
+		out.NodeQuarantined = &quarantined
+	}
+
+	if c.FaultRemediated != nil {
+		// Copy the flag so the model object does not alias the CRD's bool.
+		remediated := *c.FaultRemediated
+		out.FaultRemediated = &remediated
+	}
+
+	if c.LastRemediationTimestamp != nil {
+		t := c.LastRemediationTimestamp.Time
+		out.LastRemediationTimestamp = &t
+	}
+
+	return out
+}
+
+// ProtoHealthEventToCRDSpec builds a v1alpha1.HealthEventSpec from the
+// NodeName, RecommendedAction, IsFatal and IsHealthy fields of p.
+// A nil p yields the zero HealthEventSpec.
+func ProtoHealthEventToCRDSpec(p *protos.HealthEvent) v1alpha1.HealthEventSpec {
+	if p == nil {
+		return v1alpha1.HealthEventSpec{}
+	}
+
+	// bool → *bool. Point at local copies rather than at p's own fields:
+	// taking &p.IsFatal would make the spec alias the proto message, so a
+	// write through either one would silently change the other.
+	isFatal, isHealthy := p.IsFatal, p.IsHealthy
+
+	return v1alpha1.HealthEventSpec{
+		NodeName:          p.NodeName,
+		RecommendedAction: ProtoRecommendedActionToCRD(p.RecommendedAction),
+		IsFatal:           &isFatal,
+		IsHealthy:         &isHealthy,
+	}
+}
+
+// CRDSpecToProtoHealthEvent builds a new protos.HealthEvent from the
+// NodeName, RecommendedAction, IsFatal and IsHealthy fields of c.
+// All other proto fields are left at their zero value.
+func CRDSpecToProtoHealthEvent(c v1alpha1.HealthEventSpec) *protos.HealthEvent {
+	p := &protos.HealthEvent{
+		NodeName:          c.NodeName,
+		RecommendedAction: CRDRecommendedActionToProto(c.RecommendedAction),
+	}
+
+	// *bool → bool (default false if nil)
+	if c.IsFatal != nil {
+		p.IsFatal = *c.IsFatal
+	}
+
+	if c.IsHealthy != nil {
+		p.IsHealthy = *c.IsHealthy
+	}
+
+	return p
+}
+
+// ProtoRecommendedActionToCRD maps a proto RecommendedAction to the CRD value
+// carrying the proto enum name.
+//
+// A value with no entry in protos.RecommendedAction_name maps to the empty
+// string.
+func ProtoRecommendedActionToCRD(p protos.RecommendedAction) v1alpha1.RecommendedAction {
+	return v1alpha1.RecommendedAction(
+		protos.RecommendedAction_name[int32(p)],
+	)
+}
+
+// CRDRecommendedActionToProto maps a CRD RecommendedAction back to the proto
+// enum by looking its string up in protos.RecommendedAction_value.
+//
+// A string with no entry in that table maps to the zero enum value; callers
+// that need to reject unknown actions must validate before converting.
+func CRDRecommendedActionToProto(c v1alpha1.RecommendedAction) protos.RecommendedAction {
+	return protos.RecommendedAction(
+		protos.RecommendedAction_value[string(c)],
+	)
+}
diff --git a/data-models/pkg/conversion/healthevent_spec_contract.go b/data-models/pkg/conversion/healthevent_spec_contract.go
new file mode 100644
index 000000000..648c4d1ee
--- /dev/null
+++ b/data-models/pkg/conversion/healthevent_spec_contract.go
@@ -0,0 +1,31 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package conversion
+
+// ProtoFieldsAffectingSpec defines the proto.HealthEvent fields
+// that MUST be reflected in the CRD Spec. Keys are Go field names,
+// values are the JSON names both representations must use.
+//
+// If a field is added to proto.HealthEvent and is not listed here
+// or explicitly ignored, tests MUST fail.
+var ProtoFieldsAffectingSpec = map[string]string{
+	"NodeName":          "nodeName",
+	"RecommendedAction": "recommendedAction",
+	"IsFatal":           "isFatal",
+	"IsHealthy":         "isHealthy",
+}
+
+// ProtoObservationOnlyFields are proto fields that are explicitly
+// NOT part of the CRD Spec contract.
+//
+// These are informational / telemetry / status-only fields.
+var ProtoObservationOnlyFields = map[string]struct{}{
+	"Version":             {},
+	"Agent":               {},
+	"ComponentClass":      {},
+	"CheckName":           {},
+	"Message":             {},
+	"ErrorCode":           {},
+	"EntitiesImpacted":    {},
+	"Metadata":            {},
+	"GeneratedTimestamp":  {},
+	"QuarantineOverrides": {},
+	"DrainOverrides":      {},
+}
diff --git a/data-models/pkg/conversion/healthevent_spec_contract_test.go b/data-models/pkg/conversion/healthevent_spec_contract_test.go
new file mode 100644
index 000000000..a8d9ad182
--- /dev/null
+++ b/data-models/pkg/conversion/healthevent_spec_contract_test.go
@@ -0,0 +1,63 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package conversion
+
+import (
+	"reflect"
+	"testing"
+
+	v1alpha1 "github.com/nvidia/nvsentinel/data-models/api/v1alpha1"
+	"github.com/nvidia/nvsentinel/data-models/pkg/protos"
+)
+
+// TestProtoHealthEvent_SpecCoverage ensures that all proto.HealthEvent
+// fields are either:
+//  1. explicitly included in the CRD Spec contract, or
+//  2. explicitly classified as observation-only
+//
+// Any new proto field MUST be categorized, or this test fails.
+func TestProtoHealthEvent_SpecCoverage(t *testing.T) {
+	protoType := reflect.TypeOf(protos.HealthEvent{})
+
+	for i := 0; i < protoType.NumField(); i++ {
+		field := protoType.Field(i)
+
+		// Skip unexported / internal proto fields
+		if !field.IsExported() {
+			continue
+		}
+
+		name := field.Name
+		_, inSpec := ProtoFieldsAffectingSpec[name]
+		_, observationOnly := ProtoObservationOnlyFields[name]
+
+		// Errorf rather than Fatalf: report every offending field in one run.
+		switch {
+		case inSpec && observationOnly:
+			t.Errorf(
+				"proto.HealthEvent field %q is classified twice: "+
+					"it is listed both as a CRD Spec field and as observation-only",
+				name,
+			)
+		case !inSpec && !observationOnly:
+			t.Errorf(
+				"proto.HealthEvent field %q is not classified: "+
+					"decide whether it belongs in CRD Spec or is observation-only",
+				name,
+			)
+		}
+	}
+
+	// Catch stale entries left behind when a proto field is renamed or removed.
+	for name := range ProtoObservationOnlyFields {
+		if _, ok := protoType.FieldByName(name); !ok {
+			t.Errorf("ProtoObservationOnlyFields lists %q, which is not a field of proto.HealthEvent", name)
+		}
+	}
+}
+
+// TestCRDFieldsMappedToProto ensures every exported field of
+// v1alpha1.HealthEventSpec is listed in ProtoFieldsAffectingSpec.
+func TestCRDFieldsMappedToProto(t *testing.T) {
+	crdType := reflect.TypeOf(v1alpha1.HealthEventSpec{})
+
+	for i := 0; i < crdType.NumField(); i++ {
+		field := crdType.Field(i)
+
+		// Skip unexported/internal fields
+		if !field.IsExported() {
+			continue
+		}
+
+		// Check that CRD field exists in ProtoFieldsAffectingSpec
+		if _, ok := ProtoFieldsAffectingSpec[field.Name]; !ok {
+			t.Errorf("CRD field %q is not mapped to any proto field in ProtoFieldsAffectingSpec", field.Name)
+		}
+	}
+}
diff --git a/data-models/pkg/conversion/parity_test.go b/data-models/pkg/conversion/parity_test.go
new file mode 100644
index 000000000..026251dbe
--- /dev/null
+++ b/data-models/pkg/conversion/parity_test.go
@@ -0,0 +1,136 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package conversion
+
+import (
+	"reflect"
+	"testing"
+
+	v1alpha1 "github.com/nvidia/nvsentinel/data-models/api/v1alpha1"
+	"github.com/nvidia/nvsentinel/data-models/pkg/model"
+)
+
+// TestHealthEventStatus_RecursiveFieldParity ensures that every exported field
+// in the model struct exists in the corresponding CRD struct, recursively.
+// This catches additions in nested structs too.
+func TestHealthEventStatus_RecursiveFieldParity(t *testing.T) {
+	assertSameFieldSet(
+		t,
+		"model.HealthEventStatus",
+		exportedFieldNamesRecursive(reflect.TypeOf(model.HealthEventStatus{})),
+		"v1alpha1.HealthEventStatus",
+		exportedFieldNamesRecursive(reflect.TypeOf(v1alpha1.HealthEventStatus{})),
+	)
+}
+
+// exportedFieldNamesRecursive returns a map of all exported fields in a struct,
+// recursively including nested structs as "Parent.Child" keys.
+//
+// Pointer-to-struct fields are followed as well. Timestamp types (time.Time
+// and metav1.Time) are treated as leaves, because the model and the CRD use
+// different timestamp types for the same field.
+//
+// NOTE(review): there is no cycle detection; this assumes the inspected types
+// are not self-referential.
+func exportedFieldNamesRecursive(t reflect.Type) map[string]struct{} {
+	fields := make(map[string]struct{})
+
+	for i := 0; i < t.NumField(); i++ {
+		f := t.Field(i)
+		if !f.IsExported() {
+			continue
+		}
+
+		// Add top-level field
+		fields[f.Name] = struct{}{}
+
+		// Look through one level of pointer so optional nested structs are
+		// compared too.
+		nested := f.Type
+		if nested.Kind() == reflect.Ptr {
+			nested = nested.Elem()
+		}
+
+		if nested.Kind() != reflect.Struct || isTimestampType(nested) {
+			continue
+		}
+
+		for sub := range exportedFieldNamesRecursive(nested) {
+			fields[f.Name+"."+sub] = struct{}{}
+		}
+	}
+
+	return fields
+}
+
+// isTimestampType reports whether t is time.Time or metav1.Time.
+//
+// The check uses PkgPath and Name because reflect.Type.String() qualifies a
+// type with its package *name* ("v1.Time"), never with the full import path.
+func isTimestampType(t reflect.Type) bool {
+	if t.Name() != "Time" {
+		return false
+	}
+
+	switch t.PkgPath() {
+	case "time", "k8s.io/apimachinery/pkg/apis/meta/v1":
+		return true
+	default:
+		return false
+	}
+}
+
+// TestHealthEventStatus_FieldParity checks that model.HealthEventStatus and
+// v1alpha1.HealthEventStatus declare the same top-level exported fields.
+func TestHealthEventStatus_FieldParity(t *testing.T) {
+	modelType := reflect.TypeOf(model.HealthEventStatus{})
+	crdType := reflect.TypeOf(v1alpha1.HealthEventStatus{})
+
+	modelFields := exportedFieldNames(modelType)
+	crdFields := exportedFieldNames(crdType)
+
+	assertSameFieldSet(
+		t,
+		"model.HealthEventStatus",
+		modelFields,
+		"v1alpha1.HealthEventStatus",
+		crdFields,
+	)
+}
+
+// TestStatusEnum_Parity checks that the listed model.Status constants and
+// their v1alpha1.Status counterparts carry the same underlying value.
+//
+// NOTE(review): the generated HealthEvent CRD schema enumerates nine Status
+// values, while only five pairs are checked here; confirm whether the
+// remaining values need a model counterpart in this list.
+func TestStatusEnum_Parity(t *testing.T) {
+	modelEnums := []model.Status{
+		model.StatusNotStarted,
+		model.StatusInProgress,
+		model.StatusFailed,
+		model.StatusSucceeded,
+		model.AlreadyDrained,
+	}
+
+	crdEnums := []v1alpha1.Status{
+		v1alpha1.StatusNotStarted,
+		v1alpha1.StatusInProgress,
+		v1alpha1.StatusFailed,
+		v1alpha1.StatusSucceeded,
+		v1alpha1.StatusAlreadyDrained,
+	}
+
+	// Must stop here: the loop below indexes both slices in lockstep.
+	if len(modelEnums) != len(crdEnums) {
+		t.Fatalf("model and CRD Status enums differ in length")
+	}
+
+	for i := range modelEnums {
+		if v1alpha1.Status(modelEnums[i]) != crdEnums[i] {
+			t.Errorf("enum mismatch: %v vs %v", modelEnums[i], crdEnums[i])
+		}
+	}
+}
+
+// exportedFieldNames returns the set of exported top-level field names of the
+// struct type t.
+func exportedFieldNames(t reflect.Type) map[string]struct{} {
+	fields := make(map[string]struct{})
+
+	for i := 0; i < t.NumField(); i++ {
+		// Only exported fields
+		if f := t.Field(i); f.IsExported() {
+			fields[f.Name] = struct{}{}
+		}
+	}
+
+	return fields
+}
+
+// assertSameFieldSet reports every field name that is present in only one of
+// the two sets. leftName and rightName are used in the failure messages.
+func assertSameFieldSet(
+	t *testing.T,
+	leftName string,
+	left map[string]struct{},
+	rightName string,
+	right map[string]struct{},
+) {
+	t.Helper()
+
+	// Errorf rather than Fatalf: map iteration order is random, so stopping
+	// at the first difference would report a different field on every run.
+	for f := range left {
+		if _, ok := right[f]; !ok {
+			t.Errorf("field %q exists in %s but not in %s", f, leftName, rightName)
+		}
+	}
+
+	for f := range right {
+		if _, ok := left[f]; !ok {
+			t.Errorf("field %q exists in %s but not in %s", f, rightName, leftName)
+		}
+	}
+}
diff --git a/data-models/pkg/model/health_event_extentions.go b/data-models/pkg/model/health_event_extentions.go
index c91a10220..f3022735a 100644
--- a/data-models/pkg/model/health_event_extentions.go
+++ b/data-models/pkg/model/health_event_extentions.go
@@ -38,16 +38,19 @@ const (
 )
 
 type OperationStatus struct {
-	Status  Status `bson:"status"`
-	Message string `bson:"message,omitempty"`
+	Status  Status `bson:"status" json:"status"`
+	Message string `bson:"message,omitempty" json:"message,omitempty"`
 }
 
+// NOTE(review): the bson keys below are intentionally left unchanged
+// (lowercase); renaming them would change the stored field names. Only the
+// json names move to camelCase to match the CRD schema.
 type HealthEventStatus struct {
-	NodeQuarantined        *Status         `bson:"nodequarantined" json:"nodequarantined,omitempty"`
-	UserPodsEvictionStatus OperationStatus `bson:"userpodsevictionstatus" json:"userpodsevictionstatus"`
-	FaultRemediated        *bool           `bson:"faultremediated" json:"faultremediated,omitempty"`
+	NodeQuarantined        *Status         `bson:"nodequarantined" json:"nodeQuarantined,omitempty"`
+	UserPodsEvictionStatus OperationStatus `bson:"userpodsevictionstatus" json:"userPodsEvictionStatus"`
+	FaultRemediated        *bool           `bson:"faultremediated" json:"faultRemediated,omitempty"`
 	//nolint:lll // Long line due to struct tags for both bson and json serialization
-	LastRemediationTimestamp *time.Time `bson:"lastremediationtimestamp,omitempty" json:"lastremediationtimestamp,omitempty"`
+	LastRemediationTimestamp *time.Time `bson:"lastremediationtimestamp,omitempty" json:"lastRemediationTimestamp,omitempty"`
 }
 
 type HealthEventWithStatus struct {
diff --git a/data-models/pkg/protos/health_event.pb.go b/data-models/pkg/protos/health_event.pb.go
index 9dec31f00..2e7f8c440 100644
--- a/data-models/pkg/protos/health_event.pb.go
+++ b/data-models/pkg/protos/health_event.pb.go
@@ -21,13 +21,14 @@
 package protos
 
 import (
+	reflect "reflect"
+	sync "sync"
+	unsafe "unsafe"
+
 	protoreflect "google.golang.org/protobuf/reflect/protoreflect"
 	protoimpl "google.golang.org/protobuf/runtime/protoimpl"
 	emptypb "google.golang.org/protobuf/types/known/emptypb"
 	timestamppb "google.golang.org/protobuf/types/known/timestamppb"
-	reflect "reflect"
-	sync "sync"
-	unsafe "unsafe"
 )
 
 const (
diff --git a/distros/kubernetes/nvsentinel/crds/healthevents.dgxc.nvidia.com_healthevents.yaml b/distros/kubernetes/nvsentinel/crds/healthevents.dgxc.nvidia.com_healthevents.yaml
new file mode 100644
index 000000000..f67634188
--- /dev/null
+++ b/distros/kubernetes/nvsentinel/crds/healthevents.dgxc.nvidia.com_healthevents.yaml
@@ -0,0 +1,114 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.20.0
+  name:
healthevents.healthevents.dgxc.nvidia.com +spec: + group: healthevents.dgxc.nvidia.com + names: + kind: HealthEvent + listKind: HealthEventList + plural: healthevents + singular: healthevent + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: HealthEventSpec defines the desired state of HealthEvent + properties: + isFatal: + description: Whether the fault is fatal + type: boolean + isHealthy: + description: Whether the system is healthy + type: boolean + nodeName: + description: NodeName is the name of the target node + type: string + recommendedAction: + description: Recommended remediation action + enum: + - NONE + - COMPONENT_RESET + - CONTACT_SUPPORT + - RUN_FIELDDIAG + - RESTART_VM + - RESTART_BM + - REPLACE_VM + - RUN_DCGMEUD + - UNKNOWN + type: string + type: object + status: + description: HealthEventStatus defines the observed state of HealthEvent + properties: + faultRemediated: + description: Whether the fault has been remediated + type: boolean + lastRemediationTimestamp: + description: Timestamp of the last remediation attempt + format: date-time + type: string + nodeQuarantined: + description: Whether the node has been quarantined + enum: + - NotStarted + - InProgress + - Failed + - 
Succeeded + - AlreadyDrained + - UnQuarantined + - Quarantined + - AlreadyQuarantined + - Cancelled + type: string + userPodsEvictionStatus: + description: Status of user pod eviction + properties: + message: + type: string + status: + enum: + - NotStarted + - InProgress + - Failed + - Succeeded + - AlreadyDrained + - UnQuarantined + - Quarantined + - AlreadyQuarantined + - Cancelled + type: string + required: + - status + type: object + required: + - userPodsEvictionStatus + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/distros/kubernetes/nvsentinel/crds/healthevents.dgxc.nvidia.com_healthstatuses.yaml b/distros/kubernetes/nvsentinel/crds/healthevents.dgxc.nvidia.com_healthstatuses.yaml new file mode 100644 index 000000000..de00ff16b --- /dev/null +++ b/distros/kubernetes/nvsentinel/crds/healthevents.dgxc.nvidia.com_healthstatuses.yaml @@ -0,0 +1,181 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.0 + name: healthstatuses.healthevents.dgxc.nvidia.com +spec: + group: healthevents.dgxc.nvidia.com + names: + kind: HealthStatus + listKind: HealthStatusList + plural: healthstatuses + singular: healthstatus + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + agent: + type: string + checkName: + type: string + componentClass: + type: string + drainOverrides: + properties: + force: + type: boolean + skip: + type: boolean + type: object + entitiesImpacted: + items: + properties: + entityType: + type: string + entityValue: + type: string + type: object + type: array + errorCode: + items: + type: string + type: array + generatedTimestamp: + description: "A Timestamp represents a point in time independent of + any time zone or local\ncalendar, encoded as a count of seconds + and fractions of seconds at\nnanosecond resolution. The count is + relative to an epoch at UTC midnight on\nJanuary 1, 1970, in the + proleptic Gregorian calendar which extends the\nGregorian calendar + backwards to year one.\n\nAll minutes are 60 seconds long. Leap + seconds are \"smeared\" so that no leap\nsecond table is needed + for interpretation, using a [24-hour linear\nsmear](https://developers.google.com/time/smear).\n\nThe + range is from 0001-01-01T00:00:00Z to 9999-12-31T23:59:59.999999999Z. + By\nrestricting to that range, we ensure that we can convert to + and from [RFC\n3339](https://www.ietf.org/rfc/rfc3339.txt) date + strings.\n\n# Examples\n\nExample 1: Compute Timestamp from POSIX + `time()`.\n\n\tTimestamp timestamp;\n\ttimestamp.set_seconds(time(NULL));\n\ttimestamp.set_nanos(0);\n\nExample + 2: Compute Timestamp from POSIX `gettimeofday()`.\n\n\tstruct timeval + tv;\n\tgettimeofday(&tv, NULL);\n\n\tTimestamp timestamp;\n\ttimestamp.set_seconds(tv.tv_sec);\n\ttimestamp.set_nanos(tv.tv_usec + * 1000);\n\nExample 3: Compute Timestamp from Win32 `GetSystemTimeAsFileTime()`.\n\n\tFILETIME + ft;\n\tGetSystemTimeAsFileTime(&ft);\n\tUINT64 ticks = (((UINT64)ft.dwHighDateTime) + << 32) | ft.dwLowDateTime;\n\n\t// A Windows tick is 100 nanoseconds. 
+ Windows epoch 1601-01-01T00:00:00Z\n\t// is 11644473600 seconds + before Unix epoch 1970-01-01T00:00:00Z.\n\tTimestamp timestamp;\n\ttimestamp.set_seconds((INT64) + ((ticks / 10000000) - 11644473600LL));\n\ttimestamp.set_nanos((INT32) + ((ticks % 10000000) * 100));\n\nExample 4: Compute Timestamp from + Java `System.currentTimeMillis()`.\n\n\tlong millis = System.currentTimeMillis();\n\n\tTimestamp + timestamp = Timestamp.newBuilder().setSeconds(millis / 1000)\n\t + \ .setNanos((int) ((millis % 1000) * 1000000)).build();\n\nExample + 5: Compute Timestamp from Java `Instant.now()`.\n\n\tInstant now + = Instant.now();\n\n\tTimestamp timestamp =\n\t Timestamp.newBuilder().setSeconds(now.getEpochSecond())\n\t + \ .setNanos(now.getNano()).build();\n\nExample 6: Compute + Timestamp from current time in Python.\n\n\ttimestamp = Timestamp()\n\ttimestamp.GetCurrentTime()\n\n# + JSON Mapping\n\nIn JSON format, the Timestamp type is encoded as + a string in the\n[RFC 3339](https://www.ietf.org/rfc/rfc3339.txt) + format. That is, the\nformat is \"{year}-{month}-{day}T{hour}:{min}:{sec}[.{frac_sec}]Z\"\nwhere + {year} is always expressed using four digits while {month}, {day},\n{hour}, + {min}, and {sec} are zero-padded to two digits each. The fractional\nseconds, + which can go up to 9 digits (i.e. up to 1 nanosecond resolution),\nare + optional. The \"Z\" suffix indicates the timezone (\"UTC\"); the + timezone\nis required. A proto3 JSON serializer should always use + UTC (as indicated by\n\"Z\") when printing the Timestamp type and + a proto3 JSON parser should be\nable to accept both UTC and other + timezones (as indicated by an offset).\n\nFor example, \"2017-01-15T01:30:15.01Z\" + encodes 15.01 seconds past\n01:30 UTC on January 15, 2017.\n\nIn + JavaScript, one can convert a Date object to this format using the\nstandard\n[toISOString()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Date/toISOString)\nmethod. 
+ In Python, a standard `datetime.datetime` object can be converted\nto + this format using\n[`strftime`](https://docs.python.org/2/library/time.html#time.strftime) + with\nthe time format spec '%Y-%m-%dT%H:%M:%S.%fZ'. Likewise, in + Java, one can use\nthe Joda Time's [`ISODateTimeFormat.dateTime()`](\nhttp://joda-time.sourceforge.net/apidocs/org/joda/time/format/ISODateTimeFormat.html#dateTime()\n) + to obtain a formatter capable of generating timestamps in this format." + properties: + nanos: + description: |- + Non-negative fractions of a second at nanosecond resolution. This field is + the nanosecond portion of the duration, not an alternative to seconds. + Negative second values with fractions must still have non-negative nanos + values that count forward in time. Must be between 0 and 999,999,999 + inclusive. + format: int32 + type: integer + seconds: + description: |- + Represents seconds of UTC time since Unix epoch 1970-01-01T00:00:00Z. Must + be between -315576000000 and 315576000000 inclusive (which corresponds to + 0001-01-01T00:00:00Z to 9999-12-31T23:59:59Z). 
+ format: int64 + type: integer + type: object + isFatal: + type: boolean + isHealthy: + type: boolean + message: + type: string + metadata: + additionalProperties: + type: string + type: object + nodeName: + type: string + quarantineOverrides: + properties: + force: + type: boolean + skip: + type: boolean + type: object + recommendedAction: + format: int32 + type: integer + version: + format: int32 + type: integer + type: object + status: + properties: + faultremediated: + type: boolean + lastremediationtimestamp: + format: date-time + type: string + nodequarantined: + type: string + userpodsevictionstatus: + properties: + message: + type: string + status: + type: string + type: object + required: + - userpodsevictionstatus + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/janitor/Makefile b/janitor/Makefile index 95176d773..4a608becd 100644 --- a/janitor/Makefile +++ b/janitor/Makefile @@ -58,7 +58,7 @@ all: lint-test ## Run lint-test (default target) generate: ## Generate CRDs and move them to Helm chart directory @echo "Generating CRDs for janitor..." @# Install controller-gen if not present - @which controller-gen > /dev/null || (echo "Installing controller-gen..." && go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest) + @which controller-gen > /dev/null || (echo "Installing controller-gen..." 
&& go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION)) @# Generate CRDs into api/v1alpha1 directory @# Note: Generated CRD YAML files do not include license headers (this is expected) @# YAML files are excluded from license header checks via main Makefile: -ignore '**/*.yaml' diff --git a/make/common.mk b/make/common.mk index 135891e1e..e8ec73b59 100644 --- a/make/common.mk +++ b/make/common.mk @@ -64,6 +64,9 @@ get-version = $(shell yq '.$(1)' $(VERSIONS_FILE) 2>/dev/null || echo "") # Setup-envtest version (used in modules with kubebuilder tests) SETUP_ENVTEST_VERSION := $(call get-version,go_tools.setup_envtest) +# Controller-gen version +CONTROLLER_GEN_VERSION := $(call get-version,go_tools.controller_gen) + # Go binary and tools (standardized versions) GO := go GOLANGCI_LINT := golangci-lint