Skip to content

Commit 5e298fa

Browse files
committed
fix: ensure DaemonSet mount pods tolerate node termination longer than application pods
- Add an explicit tolerationSeconds of 43200 (12 hours) for the node.kubernetes.io/not-ready and node.kubernetes.io/unreachable NoExecute taints
- This ensures mount pods remain available during graceful node shutdown
- Application pods are evicted first (default 300s toleration) while mount pods persist
- Prevents storage access issues during node maintenance and termination
1 parent de2aa78 commit 5e298fa

File tree

1 file changed

+44
-0
lines changed

1 file changed

+44
-0
lines changed

pkg/juicefs/mount/builder/daemonset.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,50 @@ func (d *DaemonSetBuilder) NewMountDaemonSet(dsName string) (*appsv1.DaemonSet,
134134
ds.Spec.Template.Spec.Affinity.NodeAffinity = d.jfsSetting.StorageClassNodeAffinity
135135
}
136136

137+
// Fix tolerations for DaemonSet pods to ensure they don't terminate before application pods
138+
// During node shutdown, we need the mount pods to stay alive longer than application pods
139+
tolerations := ds.Spec.Template.Spec.Tolerations
140+
hasNotReadyToleration := false
141+
hasUnreachableToleration := false
142+
143+
// Check existing tolerations and update if needed
144+
for i := range tolerations {
145+
if tolerations[i].Key == "node.kubernetes.io/not-ready" && tolerations[i].Effect == corev1.TaintEffectNoExecute {
146+
// Set a high toleration time (12 hours) to ensure mount pod survives during node shutdown
147+
tolerationSeconds := int64(43200)
148+
tolerations[i].TolerationSeconds = &tolerationSeconds
149+
hasNotReadyToleration = true
150+
}
151+
if tolerations[i].Key == "node.kubernetes.io/unreachable" && tolerations[i].Effect == corev1.TaintEffectNoExecute {
152+
// Set a high toleration time (12 hours) to ensure mount pod survives during node shutdown
153+
tolerationSeconds := int64(43200)
154+
tolerations[i].TolerationSeconds = &tolerationSeconds
155+
hasUnreachableToleration = true
156+
}
157+
}
158+
159+
// Add tolerations if they don't exist
160+
if !hasNotReadyToleration {
161+
tolerationSeconds := int64(43200) // 12 hours
162+
tolerations = append(tolerations, corev1.Toleration{
163+
Key: "node.kubernetes.io/not-ready",
164+
Operator: corev1.TolerationOpExists,
165+
Effect: corev1.TaintEffectNoExecute,
166+
TolerationSeconds: &tolerationSeconds,
167+
})
168+
}
169+
if !hasUnreachableToleration {
170+
tolerationSeconds := int64(43200) // 12 hours
171+
tolerations = append(tolerations, corev1.Toleration{
172+
Key: "node.kubernetes.io/unreachable",
173+
Operator: corev1.TolerationOpExists,
174+
Effect: corev1.TaintEffectNoExecute,
175+
TolerationSeconds: &tolerationSeconds,
176+
})
177+
}
178+
179+
ds.Spec.Template.Spec.Tolerations = tolerations
180+
137181
return ds, nil
138182
}
139183

0 commit comments

Comments
 (0)