Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 98 additions & 30 deletions data-models/pkg/protos/health_event.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions data-models/protobufs/health_event.proto
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ message HealthEvents {
repeated HealthEvent events = 2;
}

// ProcessingStrategy defines how downstream modules should handle the event.
// UNSPECIFIED: proto3 zero value; this is what an event carries when the producer never set the field.
//   NOTE(review): this file does not define how consumers should treat UNSPECIFIED — confirm whether
//   it is meant to behave like EXECUTE_REMEDIATION for events from producers that predate this field.
// EXECUTE_REMEDIATION: normal behavior; downstream modules may update cluster state.
// STORE_ONLY: observability-only behavior; event should be persisted/exported but should not modify cluster resources.
enum ProcessingStrategy {
UNSPECIFIED = 0;
EXECUTE_REMEDIATION = 1;
STORE_ONLY = 2;
}

enum RecommendedAction {
NONE = 0;
COMPONENT_RESET = 2;
Expand Down Expand Up @@ -66,6 +75,7 @@ message HealthEvent {
string nodeName = 13;
BehaviourOverrides quarantineOverrides = 14;
BehaviourOverrides drainOverrides = 15;
ProcessingStrategy processingStrategy = 16;
}

message BehaviourOverrides {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ spec:
- "{{ join "," $root.Values.enabledChecks }}"
- "--metadata-path"
- "{{ $root.Values.global.metadataPath }}"
- "--processing-strategy"
# Quoted like the other args so the rendered value is always a YAML string, even if the chart value is empty.
- "{{ $root.Values.processingStrategy }}"
resources:
{{- toYaml $root.Values.resources | nindent 12 }}
ports:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,10 @@ tolerations: []
journalHostPath: /var/log

logLevel: info

# Processing strategy for health events
# valid values: EXECUTE_REMEDIATION, STORE_ONLY
# default: EXECUTE_REMEDIATION
# EXECUTE_REMEDIATION: normal behavior; downstream modules may update cluster state.
# STORE_ONLY: observability-only behavior; event should be persisted/exported but should not modify cluster resources (i.e., no node conditions, no quarantine, no drain, no remediation).
processingStrategy: EXECUTE_REMEDIATION
1 change: 1 addition & 0 deletions event-exporter/pkg/transformer/cloudevents.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func ToCloudEvent(event *pb.HealthEvent, metadata map[string]string) (*CloudEven
"entitiesImpacted": entities,
"generatedTimestamp": timestamp,
"nodeName": event.NodeName,
"processingStrategy": event.ProcessingStrategy.String(),
}

if len(event.Metadata) > 0 {
Expand Down
4 changes: 4 additions & 0 deletions event-exporter/pkg/transformer/cloudevents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ func TestToCloudEvent(t *testing.T) {
Force: false,
Skip: true,
},
ProcessingStrategy: pb.ProcessingStrategy_STORE_ONLY,
},
metadata: map[string]string{
"cluster": "prod-cluster-1",
Expand Down Expand Up @@ -102,6 +103,9 @@ func TestToCloudEvent(t *testing.T) {
if healthEvent["recommendedAction"] != "RESTART_VM" {
t.Errorf("recommendedAction = %v, want %v", healthEvent["recommendedAction"], "RESTART_VM")
}
if healthEvent["processingStrategy"] != "STORE_ONLY" {
t.Errorf("processingStrategy = %v, want STORE_ONLY", healthEvent["processingStrategy"])
}

entities := healthEvent["entitiesImpacted"].([]map[string]any)
if len(entities) != 2 {
Expand Down
1 change: 1 addition & 0 deletions fault-quarantine/pkg/evaluator/rule_evaluator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ func TestRoundTrip(t *testing.T) {
"nanos": float64(eventTime.GetNanos()),
},
"nodeName": "test-node",
"processingStrategy": float64(0),
"quarantineOverrides": nil,
"drainOverrides": nil,
}
Expand Down
2 changes: 1 addition & 1 deletion fault-quarantine/pkg/initializer/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func InitializeAll(ctx context.Context, params InitializationParams) (*Component
}

builder := client.GetPipelineBuilder()
pipeline := builder.BuildAllHealthEventInsertsPipeline()
pipeline := builder.BuildProcessableHealthEventInsertsPipeline()

var tomlCfg config.TomlConfig
if err := configmanager.LoadTOMLConfig(params.TomlConfigPath, &tomlCfg); err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,9 @@ func (e *Engine) mapMaintenanceEventToHealthEvent(
Metadata: event.Metadata, // Pass along metadata
NodeName: event.NodeName, // K8s node name
GeneratedTimestamp: timestamppb.New(time.Now()),
// TODO: Remove hardcoded processing strategy and make it configurable via the config file.
// PR: https://github.com/NVIDIA/NVSentinel/pull/641
ProcessingStrategy: pb.ProcessingStrategy_EXECUTE_REMEDIATION,
}

return healthEvent, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ func TestMapMaintenanceEventToHealthEvent(t *testing.T) {
EntitiesImpacted: []*pb.Entity{
{EntityType: "gce_instance", EntityValue: "instance-123"},
},
Metadata: map[string]string{"key": "value"},
NodeName: "node-a",
Metadata: map[string]string{"key": "value"},
NodeName: "node-a",
ProcessingStrategy: pb.ProcessingStrategy_EXECUTE_REMEDIATION,
},
},
{
Expand All @@ -268,7 +269,8 @@ func TestMapMaintenanceEventToHealthEvent(t *testing.T) {
EntitiesImpacted: []*pb.Entity{
{EntityType: "EC2", EntityValue: "i-abcdef"},
},
NodeName: "node-b",
NodeName: "node-b",
ProcessingStrategy: pb.ProcessingStrategy_EXECUTE_REMEDIATION,
},
},
{
Expand Down Expand Up @@ -331,7 +333,8 @@ func TestMapMaintenanceEventToHealthEvent(t *testing.T) {
EntitiesImpacted: []*pb.Entity{
{EntityType: "gce_instance", EntityValue: "instance-789"},
},
NodeName: "node-e",
NodeName: "node-e",
ProcessingStrategy: pb.ProcessingStrategy_EXECUTE_REMEDIATION,
},
},
}
Expand Down
Loading
Loading