Skip to content

Commit 56608f2

Browse files
committed
MachineHealthCheck supports checking Machine conditions
MachineHealthCheck currently only allows checking Node conditions to validate if a machine is healthy. However, machine conditions capture conditions that do not exist on nodes, for example, control plane node conditions such as EtcdPodHealthy and SchedulerPodHealthy, which can indicate whether a control plane machine has been created correctly. Adding support for Machine conditions enables us to perform remediation during control plane upgrades. This PR introduces a new field as part of the MachineHealthCheckSpec: `UnhealthyMachineConditions`. This will mirror the behavior of `UnhealthyNodeConditions`, but the MachineHealthCheck controller will instead check the machine conditions.
1 parent d44a8b0 commit 56608f2

18 files changed

+701
-3
lines changed

api/core/v1beta1/machinehealthcheck_types.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,14 @@ type MachineHealthCheckSpec struct {
6969
// +kubebuilder:validation:MaxItems=100
7070
UnhealthyConditions []UnhealthyCondition `json:"unhealthyConditions,omitempty"`
7171

72+
// unhealthyMachineConditions contains a list of the machine conditions that determine
73+
// whether a machine is considered unhealthy. The conditions are combined in a
74+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
75+
//
76+
// +optional
77+
// +kubebuilder:validation:MaxItems=100
78+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
79+
7280
// maxUnhealthy specifies the maximum number of unhealthy machines allowed.
7381
// Any further remediation is only allowed if at most "maxUnhealthy" machines selected by
7482
// "selector" are not healthy.
@@ -148,6 +156,34 @@ type UnhealthyCondition struct {
148156

149157
// ANCHOR_END: UnhealthyCondition
150158

159+
// ANCHOR: UnhealthyMachineCondition
160+
161+
// UnhealthyMachineCondition represents a Machine condition type and value with a timeout
162+
// specified as a duration. When the named condition has been in the given
163+
// status for at least the timeout value, a machine is considered unhealthy.
164+
type UnhealthyMachineCondition struct {
165+
// type of Machine condition
166+
// +kubebuilder:validation:Type=string
167+
// +kubebuilder:validation:MinLength=1
168+
// +required
169+
Type string `json:"type"`
170+
171+
// status of the condition, one of True, False, Unknown.
172+
// +kubebuilder:validation:Type=string
173+
// +kubebuilder:validation:MinLength=1
174+
// +required
175+
Status metav1.ConditionStatus `json:"status"`
176+
177+
// timeout is the duration that a machine condition must be in a given status for,
178+
// after which the machine is considered unhealthy.
179+
// For example, with a value of "1h", the machine condition must match the status
180+
// for at least 1 hour before the machine is considered unhealthy.
181+
// +required
182+
Timeout metav1.Duration `json:"timeout"`
183+
}
184+
185+
// ANCHOR_END: UnhealthyMachineCondition
186+
151187
// ANCHOR: MachineHealthCheckStatus
152188

153189
// MachineHealthCheckStatus defines the observed state of MachineHealthCheck.

api/core/v1beta1/zz_generated.conversion.go

Lines changed: 36 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/core/v1beta1/zz_generated.deepcopy.go

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/core/v1beta1/zz_generated.openapi.go

Lines changed: 54 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/core/v1beta2/machine_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,10 @@ const (
276276
// defined by a MachineHealthCheck object.
277277
MachineHealthCheckUnhealthyNodeReason = "UnhealthyNode"
278278

279+
// MachineHealthCheckUnhealthyMachineReason surfaces when the machine does not pass the health checks
280+
// defined by a MachineHealthCheck object.
281+
MachineHealthCheckUnhealthyMachineReason = "UnhealthyMachine"
282+
279283
// MachineHealthCheckNodeStartupTimeoutReason surfaces when the node hosted on the machine does not appear within
280284
// the timeout defined by a MachineHealthCheck object.
281285
MachineHealthCheckNodeStartupTimeoutReason = "NodeStartupTimeout"

api/core/v1beta2/machinehealthcheck_types.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,14 @@ type MachineHealthCheckSpec struct {
6969
// +kubebuilder:validation:MaxItems=100
7070
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
7171

72+
// unhealthyMachineConditions contains a list of the machine conditions that determine
73+
// whether a machine is considered unhealthy. The conditions are combined in a
74+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
75+
//
76+
// +optional
77+
// +kubebuilder:validation:MaxItems=100
78+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
79+
7280
// maxUnhealthy specifies the maximum number of unhealthy machines allowed.
7381
// Any further remediation is only allowed if at most "maxUnhealthy" machines selected by
7482
// "selector" are not healthy.
@@ -148,6 +156,34 @@ type UnhealthyNodeCondition struct {
148156

149157
// ANCHOR_END: UnhealthyNodeCondition
150158

159+
// ANCHOR: UnhealthyMachineCondition
160+
161+
// UnhealthyMachineCondition represents a Machine condition type and value with a timeout
162+
// specified as a duration. When the named condition has been in the given
163+
// status for at least the timeout value, a machine is considered unhealthy.
164+
type UnhealthyMachineCondition struct {
165+
// type of Machine condition
166+
// +kubebuilder:validation:Type=string
167+
// +kubebuilder:validation:MinLength=1
168+
// +required
169+
Type string `json:"type"`
170+
171+
// status of the condition, one of True, False, Unknown.
172+
// +kubebuilder:validation:Type=string
173+
// +kubebuilder:validation:MinLength=1
174+
// +required
175+
Status metav1.ConditionStatus `json:"status"`
176+
177+
// timeout is the duration that a machine condition must be in a given status for,
178+
// after which the machine is considered unhealthy.
179+
// For example, with a value of "1h", the machine condition must match the status
180+
// for at least 1 hour before the machine is considered unhealthy.
181+
// +required
182+
Timeout metav1.Duration `json:"timeout"`
183+
}
184+
185+
// ANCHOR_END: UnhealthyMachineCondition
186+
151187
// ANCHOR: MachineHealthCheckStatus
152188

153189
// MachineHealthCheckStatus defines the observed state of MachineHealthCheck.

api/core/v1beta2/v1beta1_condition_consts.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ const (
161161

162162
// UnhealthyNodeConditionV1Beta1Reason is the reason used when a machine's node has one of the MachineHealthCheck's unhealthy conditions.
163163
UnhealthyNodeConditionV1Beta1Reason = "UnhealthyNode"
164+
165+
// UnhealthyMachineConditionV1Beta1Reason is the reason used when a machine has one of the MachineHealthCheck's unhealthy conditions.
166+
UnhealthyMachineConditionV1Beta1Reason = "UnhealthyMachine"
164167
)
165168

166169
const (

api/core/v1beta2/zz_generated.deepcopy.go

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)