Skip to content

Commit 2314413

Browse files
authored
Merge pull request kubernetes#109396 from SergeyKanzhelev/testDurationCheck
check for the test duration to make NodeProblemDetector test reliable
2 parents 10486a7 + 4eb2c57 commit 2314413

File tree

1 file changed

+63
-10
lines changed

1 file changed

+63
-10
lines changed

test/e2e/node/node_problem_detector.go

+63-10
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
6060
})
6161

6262
ginkgo.It("should run without error", func() {
63-
e2eskipper.SkipUnlessSSHKeyPresent()
64-
6563
ginkgo.By("Getting all nodes and their SSH-able IP addresses")
6664
readyNodes, err := e2enode.GetReadySchedulableNodes(f.ClientSet)
6765
framework.ExpectNoError(err)
@@ -98,6 +96,13 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
9896
rssStats := make(map[string][]float64)
9997
workingSetStats := make(map[string][]float64)
10098

99+
// Some test suites run for days.
100+
// This test is not marked as Disruptive or Serial so we do not want to
101+
// restart the kubelet during the test to check for KubeletStart event
102+
// detection. We use a heuristic here to check if we need to validate for the
103+
// KubeletStart event since there is no easy way to check when the test has actually started.
104+
checkForKubeletStart := false
105+
101106
for _, host := range hosts {
102107
cpuUsageStats[host] = []float64{}
103108
uptimeStats[host] = []float64{}
@@ -119,12 +124,56 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
119124
gomega.Expect(result.Stdout).To(gomega.ContainSubstring("node-problem-detector"))
120125

121126
ginkgo.By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host))
122-
journalctlCmd := "sudo journalctl -u node-problem-detector"
127+
journalctlCmd := "sudo journalctl -r -u node-problem-detector"
123128
result, err = e2essh.SSH(journalctlCmd, host, framework.TestContext.Provider)
124129
framework.ExpectNoError(err)
125130
framework.ExpectEqual(result.Code, 0)
126131
gomega.Expect(result.Stdout).NotTo(gomega.ContainSubstring("node-problem-detector.service: Failed"))
127132

133+
// Let's assume that node problem detector started at the same time as the kubelet.
134+
// We will only check for the KubeletStart event if parsing of the date here succeeds.
135+
// This is an optimization to not ssh one more time to get kubelet's start time.
136+
// We also assume a specific datetime format to simplify the logic.
137+
output := result.Stdout
138+
139+
// Search for the line "Apr 14 04:47:42 gke-cluster-1-default-pool-b1565719-eqht systemd[1]: Started Kubernetes node problem detector."; if it is not found, fall back to the "-- Logs begin at ..." line handled in the else branch.
140+
idx := strings.Index(output, "Started Kubernetes node problem detector")
141+
if idx != -1 {
142+
output = output[:idx]
143+
idx = strings.LastIndex(output, "\n")
144+
145+
if idx != -1 {
146+
output = output[0:15]
147+
}
148+
149+
st, err := time.Parse("Jan 02 15:04:05", output)
150+
st = st.AddDate(time.Now().Year(), 0, 0)
151+
152+
if err == nil {
153+
checkForKubeletStart = time.Since(st) < time.Hour
154+
}
155+
} else {
156+
// Fall back to searching for the line:
157+
// -- Logs begin at Thu 2022-04-28 17:32:39 UTC, end at Thu 2022-04-28 17:40:05 UTC. --
158+
idx := strings.Index(output, ", end at ")
159+
160+
output = output[:idx]
161+
idx = strings.LastIndex(output, "-- Logs begin at ")
162+
if idx != -1 {
163+
output = output[17:]
164+
}
165+
166+
st, err := time.Parse("Mon 2006-01-02 15:04:05 MST", output)
167+
168+
if err == nil {
169+
checkForKubeletStart = time.Since(st) < time.Hour
170+
}
171+
}
172+
173+
if !checkForKubeletStart {
174+
ginkgo.By("KubeletStart event will NOT be checked")
175+
}
176+
128177
if isStandaloneMode[host] {
129178
cpuUsage, uptime := getCPUStat(f, host)
130179
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
@@ -152,13 +201,17 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
152201
return verifyEvents(f, eventListOptions, 1, "DockerHung", node.Name)
153202
}, pollTimeout, pollInterval).Should(gomega.Succeed())
154203

155-
// Node problem detector reports kubelet start events automatically starting from NPD v0.7.0+.
156-
// Since Kubelet may be restarted for a few times after node is booted. We just check the event
157-
// is detected, but do not check how many times Kubelet is started.
158-
ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name))
159-
gomega.Eventually(func() error {
160-
return verifyEventExists(f, eventListOptions, "KubeletStart", node.Name)
161-
}, pollTimeout, pollInterval).Should(gomega.Succeed())
204+
if checkForKubeletStart {
205+
// Node problem detector reports kubelet start events automatically starting from NPD v0.7.0+.
206+
// Since the kubelet may be restarted a few times after the node is booted, we just check that the event
207+
// is detected, but do not check how many times the kubelet is started.
208+
//
209+
// Some test suites run for hours and KubeletStart event will already be cleaned up
210+
ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name))
211+
gomega.Eventually(func() error {
212+
return verifyEventExists(f, eventListOptions, "KubeletStart", node.Name)
213+
}, pollTimeout, pollInterval).Should(gomega.Succeed())
214+
}
162215
}
163216

164217
ginkgo.By("Gather node-problem-detector cpu and memory stats")

0 commit comments

Comments
 (0)