From 22c9563a2694e8b4416e5bbfc16d4867c6e75df0 Mon Sep 17 00:00:00 2001
From: Yang Chiu
Date: Mon, 20 Jan 2025 12:24:55 +0800
Subject: [PATCH] test(robot): add test case Check If Nodes Are Under Memory Pressure After Cluster Restart

Signed-off-by: Yang Chiu
---
 e2e/keywords/metrics.resource            | 12 ++++++
 e2e/libs/keywords/metrics_keywords.py    | 49 ++++++++++++++++++++++++
 e2e/libs/metrics/metrics.py              | 22 +++++++++++
 e2e/libs/node/node.py                    | 12 ++++++
 e2e/tests/negative/cluster_restart.robot | 45 ++++++++++++++++++++++
 5 files changed, 140 insertions(+)
 create mode 100644 e2e/keywords/metrics.resource
 create mode 100644 e2e/libs/keywords/metrics_keywords.py
 create mode 100644 e2e/libs/metrics/metrics.py

diff --git a/e2e/keywords/metrics.resource b/e2e/keywords/metrics.resource
new file mode 100644
index 000000000..7b2be3c62
--- /dev/null
+++ b/e2e/keywords/metrics.resource
@@ -0,0 +1,12 @@
+*** Settings ***
+Documentation    Metrics Keywords
+
+Library    ../libs/keywords/metrics_keywords.py
+
+*** Keywords ***
+Check if nodes are under memory pressure
+    ${worker_nodes} =    get_worker_nodes
+    FOR    ${worker_node}    IN    @{worker_nodes}
+        get_node_memory_usage_in_percentage    ${worker_node}
+        check_if_node_under_memory_pressure    ${worker_node}
+    END
diff --git a/e2e/libs/keywords/metrics_keywords.py b/e2e/libs/keywords/metrics_keywords.py
new file mode 100644
index 000000000..823abdaac
--- /dev/null
+++ b/e2e/libs/keywords/metrics_keywords.py
@@ -0,0 +1,49 @@
+import time
+
+from node import Node
+from metrics.metrics import get_node_metrics
+from utility.utility import get_retry_count_and_interval
+from utility.utility import logging
+
+
+class metrics_keywords:
+
+    def __init__(self):
+        self.node = Node()
+        self.retry_count, self.retry_interval = get_retry_count_and_interval()
+
+    def get_node_total_memory_in_mi(self, node_name):
+        total_memory = self.node.get_node_total_memory(node_name)
+        if "Ki" in total_memory:
+            total_memory = int(total_memory.replace("Ki", ""))
+            total_memory = total_memory / 1024
+        elif "Mi" in total_memory:
+            total_memory = int(total_memory.replace("Mi", ""))
+        logging(f'Got node {node_name} total memory: {total_memory} Mi')
+        return total_memory
+
+    def get_node_memory_usage_in_mi(self, node_name):
+        memory_usage = get_node_metrics(node_name, 'memory')
+        if "Ki" in memory_usage:
+            memory_usage = int(memory_usage.replace("Ki", ""))
+            memory_usage = memory_usage / 1024
+        elif "Mi" in memory_usage:
+            memory_usage = int(memory_usage.replace("Mi", ""))
+        logging(f'Got node {node_name} memory usage: {memory_usage} Mi')
+        return memory_usage
+
+    def get_node_memory_usage_in_percentage(self, node_name):
+        memory_usage_in_mi = self.get_node_memory_usage_in_mi(node_name)
+        total_memory_in_mi = self.get_node_total_memory_in_mi(node_name)
+        memory_usage_in_percentage = memory_usage_in_mi / total_memory_in_mi * 100
+        logging(f'Got node {node_name} memory usage: {memory_usage_in_percentage} %')
+        return memory_usage_in_percentage
+
+    def check_if_node_under_memory_pressure(self, node_name):
+        logging(f"Checking if node {node_name} is under memory pressure")
+        condition_status = self.node.get_node_condition(node_name, "MemoryPressure")
+        if condition_status == "True":
+            logging(f"Node {node_name} is under memory pressure")
+            time.sleep(self.retry_count)
+            assert False, f"Node {node_name} is under memory pressure"
+
diff --git a/e2e/libs/metrics/metrics.py b/e2e/libs/metrics/metrics.py
new file mode 100644
index 000000000..bb9dc81ec
--- /dev/null
+++ b/e2e/libs/metrics/metrics.py
@@ -0,0 +1,22 @@
+import time
+
+from kubernetes import client
+from kubernetes.client.rest import ApiException
+
+from utility.utility import get_retry_count_and_interval
+from utility.utility import logging
+
+def get_node_metrics(node_name, metrics_name):
+    retry_count, retry_interval = get_retry_count_and_interval()
+    for i in range(retry_count):
+        api = client.CustomObjectsApi()
+        try:
+            node_metrics = api.list_cluster_custom_object("metrics.k8s.io", "v1beta1", "nodes")
+            for node in node_metrics['items']:
+                if node_name == node['metadata']['name']:
+                    logging(f"Got node {node_name} metrics {metrics_name} = {node['usage'][metrics_name]}")
+                    return node['usage'][metrics_name]
+        except ApiException as e:
+            logging(f"Failed to get node {node_name} metrics {metrics_name}: {e}")
+        time.sleep(retry_interval)
+    assert False, f"Failed to get node {node_name} metrics {metrics_name}"
diff --git a/e2e/libs/node/node.py b/e2e/libs/node/node.py
index cf62a1a7c..a169db40d 100644
--- a/e2e/libs/node/node.py
+++ b/e2e/libs/node/node.py
@@ -120,6 +120,18 @@ def get_node_cpu_cores(self, node_name):
         node = self.get_node_by_name(node_name)
         return node.status.capacity['cpu']
 
+    def get_node_total_memory(self, node_name):
+        node = self.get_node_by_name(node_name)
+        return node.status.capacity['memory']
+
+    def get_node_condition(self, node_name, condition_type):
+        node = self.get_node_by_name(node_name)
+        for condition in node.status.conditions:
+            if condition.type == condition_type:
+                logging(f"Got node {node_name} condition {condition_type}: {condition}")
+                return condition.status
+        assert False, f"Failed to get node {node_name} condition {condition_type}: {node}"
+
     def list_node_names_by_volumes(self, volume_names):
         volume_keywords = BuiltIn().get_library_instance('volume_keywords')
         volume_nodes = {}
diff --git a/e2e/tests/negative/cluster_restart.robot b/e2e/tests/negative/cluster_restart.robot
index 02626de0c..c728ccc6f 100644
--- a/e2e/tests/negative/cluster_restart.robot
+++ b/e2e/tests/negative/cluster_restart.robot
@@ -12,7 +12,9 @@
 Resource    ../keywords/storageclass.resource
 Resource    ../keywords/persistentvolumeclaim.resource
 Resource    ../keywords/statefulset.resource
 Resource    ../keywords/workload.resource
+Resource    ../keywords/backup.resource
 Resource    ../keywords/setting.resource
+Resource    ../keywords/metrics.resource
 Test Setup    Set test environment
 Test Teardown    Cleanup test resources
@@ -77,3 +79,46 @@ Restart Cluster While Workload Heavy Writing
         And Check statefulset 4 works
         And Check statefulset 5 works
     END
+
+Check If Nodes Are Under Memory Pressure After Cluster Restart
+    [Tags]    cluster
+    Given Create storageclass longhorn-test with    dataEngine=${DATA_ENGINE}
+    And Create storageclass strict-local with    numberOfReplicas=1    dataLocality=strict-local    dataEngine=${DATA_ENGINE}
+    And Create storageclass nfs-4-2 with    nfsOptions=vers=4.2,noresvport,timeo=450,retrans=8    dataEngine=${DATA_ENGINE}
+    And Create storageclass nfs-hard-mount with    nfsOptions=hard,timeo=50,retrans=1    dataEngine=${DATA_ENGINE}
+    And Create storageclass nfs-soft-mount with    nfsOptions=soft,timeo=250,retrans=5    dataEngine=${DATA_ENGINE}
+    And Create statefulset 0 using RWO volume with longhorn-test storageclass
+    And Create statefulset 1 using RWX volume with longhorn-test storageclass
+    And Create statefulset 2 using RWO volume with strict-local storageclass
+    And Create statefulset 3 using RWX volume with nfs-4-2 storageclass
+    And Create statefulset 4 using RWX volume with nfs-hard-mount storageclass
+    And Create statefulset 5 using RWX volume with nfs-soft-mount storageclass
+    And Write 1024 MB data to file data.bin in statefulset 0
+    And Write 1024 MB data to file data.bin in statefulset 1
+    And Write 1024 MB data to file data.bin in statefulset 2
+    And Write 1024 MB data to file data.bin in statefulset 3
+    And Write 1024 MB data to file data.bin in statefulset 4
+    And Write 1024 MB data to file data.bin in statefulset 5
+
+    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
+
+        And Create backup ${i} for statefulset 0 volume
+        And Create backup ${i} for statefulset 1 volume
+        And Create backup ${i} for statefulset 2 volume
+        And Create backup ${i} for statefulset 3 volume
+        And Create backup ${i} for statefulset 4 volume
+        And Create backup ${i} for statefulset 5 volume
+
+        When Restart cluster
+        And Wait for longhorn ready
+        And Wait for workloads pods stable
+        ...    statefulset 0    statefulset 1    statefulset 2    statefulset 3    statefulset 4    statefulset 5
+
+        Then Check statefulset 0 works
+        And Check statefulset 1 works
+        And Check statefulset 2 works
+        And Check statefulset 3 works
+        And Check statefulset 4 works
+        And Check statefulset 5 works
+        And Check if nodes are under memory pressure
+    END
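
Note (not part of the patch): the new keywords rely on two signals, node memory usage from the metrics.k8s.io API and the MemoryPressure node condition. For manual spot-checking outside the Robot suite, both can be read with the official kubernetes Python client. The following is only a rough sketch, assuming a local kubeconfig with cluster access and a running metrics-server; the helper names (to_mi, main) are illustrative and do not exist in the repository.

# verify_memory_pressure.py -- illustrative companion script, not part of the patch
from kubernetes import client, config


def to_mi(quantity):
    # Simplified resource-quantity parser: handles Ki/Mi/Gi suffixes and plain bytes only.
    units = {"Ki": 1 / 1024, "Mi": 1, "Gi": 1024}
    for suffix, factor in units.items():
        if quantity.endswith(suffix):
            return int(quantity[:-len(suffix)]) * factor
    return int(quantity) / (1024 * 1024)


def main():
    config.load_kube_config()
    core = client.CoreV1Api()
    # Same metrics.k8s.io call the patch uses in e2e/libs/metrics/metrics.py.
    metrics = client.CustomObjectsApi().list_cluster_custom_object(
        "metrics.k8s.io", "v1beta1", "nodes")
    usage_by_node = {item["metadata"]["name"]: item["usage"]["memory"]
                     for item in metrics["items"]}

    for node in core.list_node().items:
        name = node.metadata.name
        total_mi = to_mi(node.status.capacity["memory"])
        used_mi = to_mi(usage_by_node.get(name, "0"))
        # MemoryPressure is the condition the new get_node_condition helper checks.
        pressure = next((c.status for c in node.status.conditions
                         if c.type == "MemoryPressure"), "Unknown")
        print(f"{name}: {used_mi / total_mi * 100:.1f}% memory used, "
              f"MemoryPressure={pressure}")


if __name__ == "__main__":
    main()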