fix(e2e): resolve instability issues in end-to-end tests
1. Avoid treating a pod as ready when it is still being deleted
   or is in a 0/1 (containers not ready) state.

2. Avoid treating the TektonConfig as ready while it is still
   being reconciled.

3. Verify PVC deletion in the undeploy method.
   Avoid creating a new hub while the old PVC is still being
   deleted, which causes the new hub to fail to start.
l-qing committed Jan 17, 2025
1 parent 30442eb commit 3ba08f3
Showing 4 changed files with 69 additions and 4 deletions.
14 changes: 14 additions & 0 deletions test/e2e/common/05_tektonhubdeployment_test.go
@@ -55,6 +55,7 @@ type TektonHubTestSuite struct {
resourceNames utils.ResourceNames
clients *utils.Clients
deployments []string
pvcs []string
dbMigrationJobName string
interval time.Duration
timeout time.Duration
@@ -79,6 +80,9 @@ func NewTektonHubTestSuite(t *testing.T) *TektonHubTestSuite {
"tekton-hub-api",
"tekton-hub-ui",
},
pvcs: []string{
"tekton-hub-api",
},
dbMigrationJobName: "tekton-hub-db-migration",
interval: 5 * time.Second,
timeout: 5 * time.Minute,
@@ -302,6 +306,16 @@ func (s *TektonHubTestSuite) undeploy(databaseNamespace string) {
err := resources.WaitForDeploymentDeletion(s.clients.KubeClient, deploymentName, namespace, pollInterval, timeout)
require.NoError(t, err)
}
// verify pvcs are removed
for _, pvcName := range s.pvcs {
namespace := s.resourceNames.TargetNamespace
if databaseNamespace != "" {
// no need to verify external database removal
continue
}
err := resources.WaitForPVCDeletion(s.clients.KubeClient, pvcName, namespace, pollInterval, timeout)
require.NoError(t, err)
}
// verify migration job is removed
err := resources.WaitForJobDeletion(s.clients.KubeClient, s.dbMigrationJobName, s.resourceNames.TargetNamespace, pollInterval, timeout)
require.NoError(t, err)
16 changes: 16 additions & 0 deletions test/resources/deployment.go
@@ -126,3 +126,19 @@ func WaitForDeploymentDeletion(kubeClient kubernetes.Interface, name, namespace

return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, verifyFunc)
}

// WaitForPVCDeletion waits for the PVC to be deleted.
func WaitForPVCDeletion(kubeClient kubernetes.Interface, name, namespace string, interval, timeout time.Duration) error {
verifyFunc := func(ctx context.Context) (bool, error) {
_, err := kubeClient.CoreV1().PersistentVolumeClaims(namespace).Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
if apierrs.IsNotFound(err) {
return true, nil
}
return false, err
}
return false, nil
}

return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, verifyFunc)
}
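For reference, the new helper can be exercised on its own with client-go's fake clientset. The sketch below is hypothetical and not part of this commit; it assumes the test sits in the same resources package, and the PVC name and namespace are illustrative.

package resources

import (
	"testing"
	"time"

	"k8s.io/client-go/kubernetes/fake"
)

// Hypothetical test, not part of this commit: with no PVC registered in the
// fake clientset, the Get call returns NotFound and WaitForPVCDeletion
// should succeed on its first poll.
func TestWaitForPVCDeletionReturnsWhenAbsent(t *testing.T) {
	kubeClient := fake.NewSimpleClientset()
	if err := WaitForPVCDeletion(kubeClient, "tekton-hub-api", "tekton-pipelines", time.Second, 5*time.Second); err != nil {
		t.Fatalf("expected no error for an already-absent PVC, got: %v", err)
	}
}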
25 changes: 22 additions & 3 deletions test/resources/pod.go
@@ -21,6 +21,7 @@ import (
"time"

core "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
@@ -37,6 +38,18 @@ func DeletePodByLabelSelector(kubeClient kubernetes.Interface, labelSelector, na
if err != nil {
return err
}
// wait for the pod to be deleted
err = wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 3*time.Minute, true,
func(ctx context.Context) (bool, error) {
_, err := kubeClient.CoreV1().Pods(namespace).Get(context.TODO(), pod.GetName(), metav1.GetOptions{})
if err != nil && apierrors.IsNotFound(err) {
return true, nil
}
return false, nil
})
if err != nil {
return err
}
}
return nil
}
@@ -49,11 +62,17 @@ func WaitForPodByLabelSelector(kubeClient kubernetes.Interface, labelSelector, n
}

for _, pod := range pods.Items {
- if pod.Status.Phase == core.PodRunning {
- return true, nil
if pod.Status.Phase != core.PodRunning {
return false, nil
}
// Only when the Ready status of the Pod is True is the Pod considered Ready
for _, c := range pod.Status.Conditions {
if c.Type == core.PodReady && c.Status != core.ConditionTrue {
return false, nil
}
}
}
- return false, nil
return true, nil
}

return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, verifyFunc)
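Taken together, the two pod helpers support a restart-and-wait pattern. The sketch below is hypothetical and not part of this commit; the helper name, label selector, namespace, and import path are assumptions, and the signatures are inferred from the hunks above.

package e2e

import (
	"testing"
	"time"

	"github.com/tektoncd/operator/test/resources"
	"k8s.io/client-go/kubernetes"
)

// Hypothetical helper, not part of this commit: delete the pods matching a
// label selector, then block until the replacements are Running and Ready
// under the stricter readiness check introduced above.
func restartAndWaitReady(t *testing.T, kubeClient kubernetes.Interface, labelSelector, namespace string) {
	t.Helper()
	if err := resources.DeletePodByLabelSelector(kubeClient, labelSelector, namespace); err != nil {
		t.Fatal(err)
	}
	if err := resources.WaitForPodByLabelSelector(kubeClient, labelSelector, namespace, 5*time.Second, 5*time.Minute); err != nil {
		t.Fatal(err)
	}
}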
18 changes: 17 additions & 1 deletion test/resources/tektonconfigs.go
@@ -181,6 +181,7 @@ func verifyNoTektonConfigCR(clients *utils.Clients) error {
}

func WaitForTektonConfigReady(client operatorV1alpha1.TektonConfigInterface, name string, interval, timeout time.Duration) error {
readyCount := 0
isReady := func(ctx context.Context) (bool, error) {
configCR, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
@@ -189,7 +190,22 @@ func WaitForTektonConfigReady(client operatorV1alpha1.TektonConfigInterface, nam
}
return false, err
}
- return configCR.Status.IsReady(), nil
if configCR.Status.IsReady() {
readyCount++
} else {
readyCount = 0
}
// This needs to be confirmed multiple times because:
// 1. The previous Pod may have just restarted, and the new Pod has not
// yet entered the Reconcile logic to modify the resource status.
// 2. Even if the Pod has started properly, it may not have entered the
// Reconcile logic yet, which could also lead to instability.
// For example:
// In the testing logic of TektonHub, it immediately deletes the
// CR of TektonHub and then polls whether the CR has been cleaned up.
// At this point, the tekton-operator enters the Reconcile logic and
// automatically creates the CR of TektonHub again, causing the test to fail.
return readyCount >= 3, nil
}

return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, isReady)
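The readyCount bookkeeping above is effectively a debounce: readiness has to be observed on several consecutive polls before the wait returns. A minimal, hypothetical generalization of that idea (not part of this commit; the helper name and parameters are illustrative) could look like this:

package resources

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// waitForStableCondition is a hypothetical helper: it succeeds only after
// check has returned true on `required` consecutive polls, resetting the
// counter whenever a poll reports false, mirroring the readyCount logic in
// WaitForTektonConfigReady above.
func waitForStableCondition(interval, timeout time.Duration, required int, check func(context.Context) (bool, error)) error {
	stable := 0
	return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, func(ctx context.Context) (bool, error) {
		ok, err := check(ctx)
		if err != nil {
			return false, err
		}
		if ok {
			stable++
		} else {
			stable = 0
		}
		return stable >= required, nil
	})
}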
