Skip to content

Commit 970ac8c

Browse files
committed
test: add e2e tests for cluster-health-analyzer operand
Validate that the Monitoring UIPlugin with ClusterHealthAnalyzer enabled deploys the health-analyzer operand and correctly processes alerts into incident metrics. The test creates a crashing pod, waits for the corresponding PrometheusRule alert to fire, and then verifies that the cluster_health_components_map metric is exposed with the expected labels. Also extends the framework's AssertPromQLResult with configurable timeout and poll-interval options (AssertPromQLResultWithOptions).

Made-with: Cursor
1 parent 3a30aca commit 970ac8c

File tree

3 files changed

+359
-3
lines changed

3 files changed

+359
-3
lines changed

test/e2e/framework/assertions.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,12 +422,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
422422
// It returns an error if the request fails. Otherwise the result is passed to
423423
// the callback function for additional checks.
424424
func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
	// Delegate to the options-aware variant with no option functions, so the
	// default poll interval and timeout defined there apply.
	return f.AssertPromQLResultWithOptions(t, expr, callback)
}
427+
428+
// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
429+
// WithTimeout and WithPollInterval options to override the default polling
430+
// parameters.
431+
func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
425432
t.Helper()
433+
option := AssertOption{
434+
PollInterval: 20 * time.Second,
435+
WaitTimeout: 3 * DefaultTestTimeout,
436+
}
437+
for _, fn := range fns {
438+
fn(&option)
439+
}
426440
var (
427441
pollErr error
428442
v model.Value
429443
)
430-
if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
444+
if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
431445
v, pollErr = f.getPromQLResult(context.Background(), expr)
432446
if pollErr != nil {
433447
t.Logf("error from getPromQLResult(): %s", pollErr)
Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
package e2e
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strconv"
7+
"testing"
8+
"time"
9+
10+
monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
11+
"github.com/prometheus/common/model"
12+
"gotest.tools/v3/assert"
13+
appsv1 "k8s.io/api/apps/v1"
14+
corev1 "k8s.io/api/core/v1"
15+
"k8s.io/apimachinery/pkg/api/errors"
16+
"k8s.io/apimachinery/pkg/api/resource"
17+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18+
"k8s.io/apimachinery/pkg/util/intstr"
19+
"k8s.io/apimachinery/pkg/util/wait"
20+
"k8s.io/utils/ptr"
21+
"sigs.k8s.io/controller-runtime/pkg/client"
22+
23+
uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
24+
"github.com/rhobs/observability-operator/test/e2e/framework"
25+
)
26+
27+
const (
	// healthAnalyzerDeploymentName is the name of the operand deployment the
	// test waits on after enabling the Monitoring UIPlugin.
	healthAnalyzerDeploymentName = "health-analyzer"
	// prometheusRuleNamespace is the namespace the test PrometheusRule is
	// created in (openshift-monitoring, per the rule's labels below).
	prometheusRuleNamespace = "openshift-monitoring"
)
31+
32+
func clusterHealthAnalyzer(t *testing.T) {
33+
monv1.AddToScheme(f.K8sClient.Scheme())
34+
35+
plugin := newMonitoringUIPlugin(t)
36+
err := f.K8sClient.Create(context.Background(), plugin)
37+
assert.NilError(t, err, "failed to create monitoring UIPlugin")
38+
39+
t.Cleanup(func() {
40+
if t.Failed() {
41+
dumpClusterHealthAnalyzerDebug(t, plugin.Name)
42+
}
43+
})
44+
45+
t.Log("Waiting for health-analyzer deployment to become ready...")
46+
haDeployment := appsv1.Deployment{}
47+
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
48+
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
49+
50+
suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
51+
ruleName := "e2e-crashloop-" + suffix
52+
alertName := "E2ECrashLoop" + suffix
53+
deployName := "e2e-crasher-" + suffix
54+
55+
rule := newCrashLoopRule(t, ruleName, alertName, deployName)
56+
err = f.K8sClient.Create(context.Background(), rule)
57+
assert.NilError(t, err, "failed to create PrometheusRule")
58+
59+
dep := newCrashingDeployment(t, deployName)
60+
err = f.K8sClient.Create(context.Background(), dep)
61+
assert.NilError(t, err, "failed to create crashing deployment")
62+
63+
t.Log("Waiting for pod to enter CrashLoopBackOff...")
64+
assertPodCrashLooping(t, deployName, e2eTestNamespace, 10*time.Second, 3*time.Minute)
65+
66+
t.Log("Waiting for alert to fire in Prometheus...")
67+
alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
68+
err = f.AssertPromQLResultWithOptions(t, alertQuery,
69+
func(v model.Value) error {
70+
vec, ok := v.(model.Vector)
71+
if !ok || len(vec) == 0 {
72+
return fmt.Errorf("expected firing alert, got: %v", v)
73+
}
74+
return nil
75+
},
76+
framework.WithPollInterval(30*time.Second),
77+
framework.WithTimeout(10*time.Minute),
78+
)
79+
assert.NilError(t, err, "alert %s never fired", alertName)
80+
81+
t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
82+
incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s"}`, alertName)
83+
err = f.AssertPromQLResultWithOptions(t, incidentQuery,
84+
func(v model.Value) error {
85+
vec, ok := v.(model.Vector)
86+
if !ok || len(vec) == 0 {
87+
return fmt.Errorf("expected incident metric, got: %v", v)
88+
}
89+
for _, sample := range vec {
90+
if string(sample.Metric["src_alertname"]) != alertName {
91+
return fmt.Errorf("expected src_alertname=%s, got %s", alertName, sample.Metric["src_alertname"])
92+
}
93+
if string(sample.Metric["src_severity"]) != "warning" {
94+
return fmt.Errorf("expected src_severity=warning, got %s", sample.Metric["src_severity"])
95+
}
96+
}
97+
return nil
98+
},
99+
framework.WithPollInterval(30*time.Second),
100+
framework.WithTimeout(15*time.Minute),
101+
)
102+
assert.NilError(t, err, "incident metric for %s never appeared", alertName)
103+
}
104+
105+
func newMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
106+
plugin := &uiv1.UIPlugin{
107+
ObjectMeta: metav1.ObjectMeta{
108+
Name: "monitoring",
109+
},
110+
Spec: uiv1.UIPluginSpec{
111+
Type: uiv1.TypeMonitoring,
112+
Monitoring: &uiv1.MonitoringConfig{
113+
// TODO: switch to ClusterHealthAnalyzer once the controller supports it on main
114+
// ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
115+
Incidents: &uiv1.IncidentsReference{
116+
Enabled: true,
117+
},
118+
},
119+
},
120+
}
121+
122+
existing := &uiv1.UIPlugin{}
123+
err := f.K8sClient.Get(context.Background(), client.ObjectKey{Name: plugin.Name}, existing)
124+
if err == nil {
125+
t.Log("UIPlugin 'monitoring' already exists, deleting before recreation...")
126+
f.K8sClient.Delete(context.Background(), existing)
127+
waitForUIPluginDeletion(existing)
128+
} else if !errors.IsNotFound(err) {
129+
t.Fatalf("failed to check for existing UIPlugin: %v", err)
130+
}
131+
132+
f.CleanUp(t, func() {
133+
f.K8sClient.Delete(context.Background(), plugin)
134+
waitForUIPluginDeletion(plugin)
135+
})
136+
return plugin
137+
}
138+
139+
func newCrashLoopRule(t *testing.T, ruleName, alertName, podPrefix string) *monv1.PrometheusRule {
140+
rule := &monv1.PrometheusRule{
141+
ObjectMeta: metav1.ObjectMeta{
142+
Name: ruleName,
143+
Namespace: prometheusRuleNamespace,
144+
Labels: map[string]string{
145+
"app.kubernetes.io/name": "kube-prometheus",
146+
"app.kubernetes.io/part-of": "openshift-monitoring",
147+
"prometheus": "k8s",
148+
"role": "alert-rules",
149+
},
150+
},
151+
Spec: monv1.PrometheusRuleSpec{
152+
Groups: []monv1.RuleGroup{{
153+
Name: "crashloop-test-" + ruleName,
154+
Rules: []monv1.Rule{{
155+
Alert: alertName,
156+
Expr: intstr.FromString(fmt.Sprintf(
157+
`max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace="%s", pod=~"%s.*", job="kube-state-metrics"}[5m]) >= 1`,
158+
e2eTestNamespace, podPrefix)),
159+
For: ptr.To(monv1.Duration("1m")),
160+
Labels: map[string]string{"severity": "warning"},
161+
Annotations: map[string]string{
162+
"summary": "Pod is crash looping.",
163+
},
164+
}},
165+
}},
166+
},
167+
}
168+
f.CleanUp(t, func() {
169+
f.K8sClient.Delete(context.Background(), rule)
170+
})
171+
return rule
172+
}
173+
174+
func newCrashingDeployment(t *testing.T, name string) *appsv1.Deployment {
175+
dep := &appsv1.Deployment{
176+
ObjectMeta: metav1.ObjectMeta{
177+
Name: name,
178+
Namespace: e2eTestNamespace,
179+
Labels: map[string]string{"app": name},
180+
},
181+
Spec: appsv1.DeploymentSpec{
182+
Replicas: ptr.To(int32(1)),
183+
Selector: &metav1.LabelSelector{
184+
MatchLabels: map[string]string{"app": name},
185+
},
186+
Template: corev1.PodTemplateSpec{
187+
ObjectMeta: metav1.ObjectMeta{
188+
Labels: map[string]string{"app": name},
189+
},
190+
Spec: corev1.PodSpec{
191+
Containers: []corev1.Container{{
192+
Name: "crasher",
193+
Image: "registry.access.redhat.com/ubi9-minimal:latest",
194+
Command: []string{"sh", "-c", "exit 1"},
195+
Resources: corev1.ResourceRequirements{
196+
Requests: corev1.ResourceList{
197+
corev1.ResourceCPU: resource.MustParse("1m"),
198+
corev1.ResourceMemory: resource.MustParse("4Mi"),
199+
},
200+
Limits: corev1.ResourceList{
201+
corev1.ResourceCPU: resource.MustParse("10m"),
202+
corev1.ResourceMemory: resource.MustParse("16Mi"),
203+
},
204+
},
205+
}},
206+
},
207+
},
208+
},
209+
}
210+
f.CleanUp(t, func() {
211+
f.K8sClient.Delete(context.Background(), dep)
212+
})
213+
return dep
214+
}
215+
216+
func assertPodCrashLooping(t *testing.T, deploymentName, namespace string, pollInterval, timeout time.Duration) {
217+
t.Helper()
218+
err := wait.PollUntilContextTimeout(context.Background(), pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
219+
var pods corev1.PodList
220+
if err := f.K8sClient.List(ctx, &pods,
221+
client.InNamespace(namespace),
222+
client.MatchingLabels{"app": deploymentName},
223+
); err != nil {
224+
return false, nil
225+
}
226+
for i := range pods.Items {
227+
for _, cs := range pods.Items[i].Status.ContainerStatuses {
228+
if cs.State.Waiting != nil && cs.State.Waiting.Reason == "CrashLoopBackOff" {
229+
return true, nil
230+
}
231+
}
232+
}
233+
return false, nil
234+
})
235+
if err != nil {
236+
t.Fatalf("pod with label app=%s in %s never entered CrashLoopBackOff: %v", deploymentName, namespace, err)
237+
}
238+
}
239+
240+
func dumpClusterHealthAnalyzerDebug(t *testing.T, pluginName string) {
241+
t.Helper()
242+
ctx := context.Background()
243+
244+
t.Log("=== BEGIN DEBUG DUMP ===")
245+
246+
// Dump UIPlugin status
247+
var plugin uiv1.UIPlugin
248+
if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil {
249+
t.Logf("Failed to get UIPlugin %q: %v", pluginName, err)
250+
} else {
251+
t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion)
252+
t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type)
253+
if plugin.Spec.Monitoring != nil {
254+
if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil {
255+
t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled)
256+
}
257+
if plugin.Spec.Monitoring.Incidents != nil {
258+
t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled)
259+
}
260+
}
261+
if len(plugin.Status.Conditions) == 0 {
262+
t.Log("UIPlugin has no status conditions")
263+
}
264+
for _, c := range plugin.Status.Conditions {
265+
t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message)
266+
}
267+
}
268+
269+
// List all UIPlugins
270+
var plugins uiv1.UIPluginList
271+
if err := f.K8sClient.List(ctx, &plugins); err != nil {
272+
t.Logf("Failed to list UIPlugins: %v", err)
273+
} else {
274+
t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items))
275+
for _, p := range plugins.Items {
276+
t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions))
277+
}
278+
}
279+
280+
// List all deployments in the operator namespace
281+
var deployments appsv1.DeploymentList
282+
if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(uiPluginInstallNS)); err != nil {
283+
t.Logf("Failed to list deployments in %s: %v", uiPluginInstallNS, err)
284+
} else {
285+
t.Logf("Deployments in namespace %s: %d", uiPluginInstallNS, len(deployments.Items))
286+
for _, d := range deployments.Items {
287+
t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d",
288+
d.Name, ptrInt32(d.Spec.Replicas), d.Status.ReadyReplicas, d.Status.AvailableReplicas)
289+
for _, c := range d.Status.Conditions {
290+
t.Logf(" condition: type=%s status=%s reason=%s message=%s",
291+
c.Type, c.Status, c.Reason, c.Message)
292+
}
293+
}
294+
}
295+
296+
// List all pods in the operator namespace
297+
var pods corev1.PodList
298+
if err := f.K8sClient.List(ctx, &pods, client.InNamespace(uiPluginInstallNS)); err != nil {
299+
t.Logf("Failed to list pods in %s: %v", uiPluginInstallNS, err)
300+
} else {
301+
t.Logf("Pods in namespace %s: %d", uiPluginInstallNS, len(pods.Items))
302+
for _, p := range pods.Items {
303+
t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase)
304+
for _, cs := range p.Status.ContainerStatuses {
305+
if cs.State.Running != nil {
306+
t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount)
307+
} else if cs.State.Waiting != nil {
308+
t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s",
309+
cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message)
310+
} else if cs.State.Terminated != nil {
311+
t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d",
312+
cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
313+
}
314+
}
315+
}
316+
}
317+
318+
// List events in the operator namespace
319+
var events corev1.EventList
320+
if err := f.K8sClient.List(ctx, &events, client.InNamespace(uiPluginInstallNS)); err != nil {
321+
t.Logf("Failed to list events in %s: %v", uiPluginInstallNS, err)
322+
} else {
323+
t.Logf("Events in namespace %s: %d", uiPluginInstallNS, len(events.Items))
324+
for _, e := range events.Items {
325+
t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d",
326+
e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count)
327+
}
328+
}
329+
330+
t.Log("=== END DEBUG DUMP ===")
331+
}
332+
333+
// ptrInt32 dereferences p, returning 0 when p is nil.
func ptrInt32(p *int32) int32 {
	if p != nil {
		return *p
	}
	return 0
}

test/e2e/uiplugin_test.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ func TestUIPlugin(t *testing.T) {
3434
name: "Create dashboards UIPlugin",
3535
scenario: dashboardsUIPlugin,
3636
},
37+
{
38+
name: "Cluster health analyzer",
39+
scenario: clusterHealthAnalyzer,
40+
},
3741
}
3842

3943
for _, tc := range ts {
@@ -63,13 +67,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin {
6367
}
6468
f.CleanUp(t, func() {
6569
f.K8sClient.Delete(context.Background(), db)
66-
waitForDBUIPluginDeletion(db)
70+
waitForUIPluginDeletion(db)
6771
})
6872

6973
return db
7074
}
7175

72-
func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error {
76+
func waitForUIPluginDeletion(db *uiv1.UIPlugin) error {
7377
return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
7478
err = f.K8sClient.Get(context.Background(),
7579
client.ObjectKey{Name: db.Name},

0 commit comments

Comments
 (0)