Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ dist/
# IDE & Editor Configurations
# ============================================================================


### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
# User-specific stuff
Expand Down
1 change: 1 addition & 0 deletions .versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ go_tools:
setup_envtest: 'latest'
goimports: 'v0.30.0'
crane: 'v0.20.2'
controller_gen: 'v0.20.0'

# Protocol Buffers / gRPC
protobuf:
Expand Down
26 changes: 26 additions & 0 deletions data-models/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,36 @@ DOCKER_EXTRA_ARGS :=
include ../make/common.mk
include ../make/go.mk

# API and CRD paths
API_DIR := api/v1alpha1
CRD_OUTPUT_DIR := $(REPO_ROOT)/distros/kubernetes/nvsentinel/crds


# =============================================================================
# MODULE-SPECIFIC TARGETS
# =============================================================================

# generate: Regenerate the Kubernetes boilerplate for the API types in $(API_DIR):
#   - DeepCopy implementations, written next to the types;
#   - CRD manifests, written straight into $(CRD_OUTPUT_DIR).
# controller-gen is always executed through "go run ...@$(CONTROLLER_GEN_VERSION)", so the
# pinned version is used and no pre-installed controller-gen binary is required.
.PHONY: generate
generate: ## Generate deepcopy code and CRDs into the Helm chart directory
	@# An empty version would expand to "controller-gen@" and fail with an obscure error.
	@test -n "$(CONTROLLER_GEN_VERSION)" || { echo "CONTROLLER_GEN_VERSION is not set" >&2; exit 1; }
	@echo "Generating CRDs for $(API_DIR)..."
	@# Generate deepcopy files for API types
	go run sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) object paths=./$(API_DIR)
	@# Generate CRDs directly into the Helm chart directory, so there is no
	@# intermediate copy in API_DIR whose move could fail unnoticed.
	@mkdir -p $(CRD_OUTPUT_DIR)
	go run sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) \
		crd paths=./$(API_DIR) output:crd:dir=$(CRD_OUTPUT_DIR)
	@echo "CRDs generated in $(CRD_OUTPUT_DIR)"
	@# Listing is informational only; an empty result is reported, not treated as an error.
	@ls -1 $(CRD_OUTPUT_DIR)/*.yaml || echo "No CRD YAMLs generated"


# Generate Go protobuf files for data-models (shared across all Go modules)
.PHONY: protos-generate
protos-generate: protos-clean
Expand Down
34 changes: 34 additions & 0 deletions data-models/api/v1alpha1/groupversion_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package v1alpha1 contains API Schema definitions for the healthevents v1alpha1 API group
// +kubebuilder:object:generate=true
// +groupName=healthevents.dgxc.nvidia.com
package v1alpha1

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is the API group ("healthevents.dgxc.nvidia.com") and version
// ("v1alpha1") under which the types in this package are registered.
GroupVersion = schema.GroupVersion{Group: "healthevents.dgxc.nvidia.com", Version: "v1alpha1"}

// SchemeBuilder collects the Go types of this package so that they can be
// added to a scheme under GroupVersion.
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)
118 changes: 118 additions & 0 deletions data-models/api/v1alpha1/healthstatus_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

import (
"github.com/nvidia/nvsentinel/data-models/pkg/model"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// NOTE(review): Status is declared as a type alias; confirm in the generated CRD
// that controller-gen actually applies the Enum marker below to an alias.

// Status aliases model.Status so that this API package reuses the status type
// from the data-models model package instead of declaring its own.
// +kubebuilder:validation:Enum=NotStarted;InProgress;Failed;Succeeded;AlreadyDrained;UnQuarantined;Quarantined;AlreadyQuarantined;Cancelled
type Status = model.Status

// OperationStatus aliases model.OperationStatus from the data-models model package.
type OperationStatus = model.OperationStatus

// HealthEventSnapshot represents a read-only snapshot of a reported health event.
// This data is observational and originates outside Kubernetes.
// Every field is optional in the serialized form (all JSON tags use omitempty).
type HealthEventSnapshot struct {
// Version of the reported health event schema.
Version uint32 `json:"version,omitempty"`

// Reporting agent that generated the event.
Agent string `json:"agent,omitempty"`

// Component class that raised the event.
ComponentClass string `json:"componentClass,omitempty"`

// Specific check or rule that triggered the event.
CheckName string `json:"checkName,omitempty"`

// NOTE(review): the two booleans below use omitempty, so a false value is
// dropped from the serialized JSON and cannot be told apart from an unset
// field. Confirm that this is acceptable, in particular for isHealthy.

// Indicates whether the event is fatal.
IsFatal bool `json:"isFatal,omitempty"`

// Indicates whether the system was reported healthy.
IsHealthy bool `json:"isHealthy,omitempty"`

// Human-readable event message.
Message string `json:"message,omitempty"`

// Recommended action provided by the reporting system.
RecommendedAction string `json:"recommendedAction,omitempty"`

// Error codes associated with this health event.
ErrorCode []string `json:"errorCode,omitempty"`

// Additional key-value metadata provided by the agent.
Metadata map[string]string `json:"metadata,omitempty"`

// Time at which the event was generated by the source.
GeneratedTimestamp *metav1.Time `json:"generatedTimestamp,omitempty"`
}

// RemediationStatus captures the observed state of remediation workflows.
// All fields are pointers, so an unset field (nil) is omitted from the
// serialized form and stays distinguishable from a zero value.
type RemediationStatus struct {
// Quarantine status of the node.
NodeQuarantined *Status `json:"nodeQuarantined,omitempty"`

// Status of the user pod eviction process.
UserPodsEvictionStatus *OperationStatus `json:"userPodsEvictionStatus,omitempty"`

// Whether the fault has been remediated.
FaultRemediated *bool `json:"faultRemediated,omitempty"`

// Timestamp of the last remediation attempt.
LastRemediationTimestamp *metav1.Time `json:"lastRemediationTimestamp,omitempty"`
}

// HealthEventStatus defines the observed state of a HealthStatus resource:
// the reported event itself and the remediation state observed for it.
type HealthEventStatus struct {
// Snapshot of the reported health event.
Event *HealthEventSnapshot `json:"event,omitempty"`

// Observed remediation state.
Remediation *RemediationStatus `json:"remediation,omitempty"`
}

// HealthEventSpec defines the desired state of HealthStatus
type HealthEventSpec struct {
// Unique identifier for the health event
// +kubebuilder:validation:Required
EventID string `json:"eventID"`

// Node associated with this health event
// +kubebuilder:validation:Required
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still looks like we're redefining the types; ideally, we don't want to maintain two copies of the same types and keep them in sync.

NodeName string `json:"nodeName"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
type HealthStatus struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so my thought was this struct would look like this:

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
type HealthStatus struct {
  metav1.TypeMeta   `json:",inline"`
  metav1.ObjectMeta `json:"metadata,omitempty"`

  Spec   model.HealthEvent   `json:"spec,omitempty"`
  Status model.HealthEventStatus `json:"status,omitempty"`
}

So you would not define your own spec and status objects; we would just use the existing ones.

There are some implications for API versioning going forward, if we want to adhere to best practices, that we should discuss with the NVIDIA folks if we do it this way, but it seems like a clean way to share the object that the other data sources use.

metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec HealthEventSpec `json:"spec,omitempty"`
Status HealthEventStatus `json:"status,omitempty"`
}

// HealthStatusList contains a list of HealthStatus resources.
// +kubebuilder:object:root=true
type HealthStatusList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the HealthStatus objects in the list.
Items []HealthStatus `json:"items"`
}

// init registers HealthStatus and HealthStatusList with SchemeBuilder so that
// AddToScheme includes them.
func init() {
SchemeBuilder.Register(&HealthStatus{}, &HealthStatusList{})
}
173 changes: 173 additions & 0 deletions data-models/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading