-
Notifications
You must be signed in to change notification settings - Fork 33
k8s API for healtheventwithstatus model #640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| // Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| // Package v1alpha1 contains API Schema definitions for the healthevents v1alpha1 API group | ||
| // +kubebuilder:object:generate=true | ||
| // +groupName=healthevents.dgxc.nvidia.com | ||
| package v1alpha1 | ||
|
|
||
| import ( | ||
| "k8s.io/apimachinery/pkg/runtime/schema" | ||
| "sigs.k8s.io/controller-runtime/pkg/scheme" | ||
| ) | ||
|
|
||
| var ( | ||
| // GroupVersion is the group version used to register these objects. | ||
| GroupVersion = schema.GroupVersion{Group: "healthevents.dgxc.nvidia.com", Version: "v1alpha1"} | ||
|
|
||
| // SchemeBuilder is used to add Go types to the GroupVersionKind scheme. | ||
| SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} | ||
|
|
||
| // AddToScheme adds the types in this group-version to the given scheme. | ||
| AddToScheme = SchemeBuilder.AddToScheme | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| // Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| package v1alpha1 | ||
|
|
||
| import ( | ||
| "github.com/nvidia/nvsentinel/data-models/pkg/model" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| ) | ||
|
|
||
| // +kubebuilder:validation:Enum=NotStarted;InProgress;Failed;Succeeded;AlreadyDrained;UnQuarantined;Quarantined;AlreadyQuarantined;Cancelled | ||
| type Status = model.Status | ||
|
|
||
| type OperationStatus = model.OperationStatus | ||
|
|
||
| // HealthEventSnapshot represents a read-only snapshot of a reported health event. | ||
| // This data is observational and originates outside Kubernetes. | ||
| type HealthEventSnapshot struct { | ||
| // Version of the reported health event schema | ||
| Version uint32 `json:"version,omitempty"` | ||
|
|
||
| // Reporting agent that generated the event | ||
| Agent string `json:"agent,omitempty"` | ||
|
|
||
| // Component class that raised the event | ||
| ComponentClass string `json:"componentClass,omitempty"` | ||
|
|
||
| // Specific check or rule that triggered the event | ||
| CheckName string `json:"checkName,omitempty"` | ||
|
|
||
| // Indicates whether the event is fatal | ||
| IsFatal bool `json:"isFatal,omitempty"` | ||
|
|
||
| // Indicates whether the system was reported healthy | ||
| IsHealthy bool `json:"isHealthy,omitempty"` | ||
|
|
||
| // Human-readable event message | ||
| Message string `json:"message,omitempty"` | ||
|
|
||
| // Recommended action provided by the reporting system | ||
| RecommendedAction string `json:"recommendedAction,omitempty"` | ||
|
|
||
| // Error codes associated with this health event | ||
| ErrorCode []string `json:"errorCode,omitempty"` | ||
|
|
||
| // Additional key-value metadata provided by the agent | ||
| Metadata map[string]string `json:"metadata,omitempty"` | ||
|
|
||
| // Time at which the event was generated by the source | ||
| GeneratedTimestamp *metav1.Time `json:"generatedTimestamp,omitempty"` | ||
| } | ||
|
|
||
| // RemediationStatus captures the observed state of remediation workflows | ||
| type RemediationStatus struct { | ||
| // Quarantine status of the node (a Status enum value, not a boolean) | ||
| NodeQuarantined *Status `json:"nodeQuarantined,omitempty"` | ||
|
|
||
| // Status of the user pod eviction process | ||
| UserPodsEvictionStatus *OperationStatus `json:"userPodsEvictionStatus,omitempty"` | ||
|
|
||
| // Whether the fault has been remediated | ||
| FaultRemediated *bool `json:"faultRemediated,omitempty"` | ||
|
|
||
| // Timestamp of the last remediation attempt | ||
| LastRemediationTimestamp *metav1.Time `json:"lastRemediationTimestamp,omitempty"` | ||
| } | ||
|
|
||
| // HealthEventStatus defines the observed state of HealthStatus | ||
| type HealthEventStatus struct { | ||
| // Snapshot of the reported health event | ||
| Event *HealthEventSnapshot `json:"event,omitempty"` | ||
|
|
||
| // Observed remediation state | ||
| Remediation *RemediationStatus `json:"remediation,omitempty"` | ||
| } | ||
|
|
||
| // HealthEventSpec defines the desired state of HealthStatus | ||
| type HealthEventSpec struct { | ||
| // Unique identifier for the health event | ||
| // +kubebuilder:validation:Required | ||
| EventID string `json:"eventID"` | ||
|
|
||
| // Node associated with this health event | ||
| // +kubebuilder:validation:Required | ||
| NodeName string `json:"nodeName"` | ||
| } | ||
|
|
||
| // +kubebuilder:object:root=true | ||
| // +kubebuilder:subresource:status | ||
| type HealthStatus struct { | ||
|
||
| metav1.TypeMeta `json:",inline"` | ||
| metav1.ObjectMeta `json:"metadata,omitempty"` | ||
|
|
||
| Spec HealthEventSpec `json:"spec,omitempty"` | ||
| Status HealthEventStatus `json:"status,omitempty"` | ||
| } | ||
|
|
||
| // +kubebuilder:object:root=true | ||
| type HealthStatusList struct { | ||
| metav1.TypeMeta `json:",inline"` | ||
| metav1.ListMeta `json:"metadata,omitempty"` | ||
| Items []HealthStatus `json:"items"` | ||
| } | ||
|
|
||
| func init() { | ||
| SchemeBuilder.Register(&HealthStatus{}, &HealthStatusList{}) | ||
| } | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This still looks like we're redefining the types. Ideally, we don't want to maintain two copies of the same types and keep them in sync.