
Commit 03f688d

Enabled long-running benchmarks (with watchdog enforcing deletion) via timeout_seconds property on benchmark definition (#313)

* Enabled long-running benchmarks (with watchdog enforcing deletion) via timeout_seconds property on benchmark definition
* Fixes per flake8 review
1 parent 5c11f2c commit 03f688d
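
This commit threads an optional per-benchmark timeout from the benchmark definition down to the EC2 watchdog, so long-running benchmarks are not terminated by the default CI limit. A minimal sketch of the new property as it looks once a definition has been parsed (the test name and values below are illustrative, not taken from this commit):

    # Hypothetical benchmark definition after yaml.safe_load(); only the keys
    # relevant to this commit are shown.
    benchmark_config = {
        "name": "example-long-running-benchmark",  # illustrative name
        "timeout_seconds": 1200,                   # new optional property (600s default)
        "remote": [
            {"setup": "oss-standalone"},           # remote setup id used for grouping
            {"type": "oss-standalone"},
        ],
    }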

11 files changed: 264 additions, 35 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "redisbench-admin"
-version = "0.7.11"
+version = "0.7.12"
 description = "Redis benchmark run helper. A wrapper around Redis and Redis Modules benchmark tools ( ftsb_redisearch, memtier_benchmark, redis-benchmark, aibench, etc... )."
 authors = ["filipecosta90 <[email protected]>","Redis Performance Group <[email protected]>"]
 readme = "README.md"

redisbench_admin/run_remote/remote_env.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ def remote_env_setup(
     tf_github_sha,
     tf_setup_name_sufix,
     tf_triggering_env,
+    tf_timeout_secs=7200,
 ):
     server_plaintext_port = 6379
     db_ssh_port = args.db_ssh_port
@@ -65,6 +66,7 @@ def remote_env_setup(
         tf_github_sha,
         tf_setup_name_sufix,
         tf_triggering_env,
+        tf_timeout_secs,
     )
     return (
         client_public_ip,

redisbench_admin/run_remote/run_remote.py

Lines changed: 18 additions & 0 deletions
@@ -51,6 +51,7 @@
 from redisbench_admin.utils.benchmark_config import (
     prepare_benchmark_definitions,
     get_metadata_tags,
+    process_benchmark_definitions_remote_timeouts,
 )
 from redisbench_admin.utils.redisgraph_benchmark_go import setup_remote_benchmark_agent
 from redisbench_admin.utils.remote import (
@@ -59,6 +60,7 @@
     check_ec2_env,
     get_project_ts_tags,
     push_data_to_redistimeseries,
+    fetch_remote_id_from_config,
 )

 from redisbench_admin.utils.utils import (
@@ -180,6 +182,17 @@ def run_remote_command_logic(args, project_name, project_version):
     )
     rts.ping()

+    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(
+        benchmark_definitions
+    )
+
+    for remote_id, termination_timeout_secs in remote_envs_timeout.items():
+        logging.info(
+            "Using a timeout of {} seconds for remote setup: {}".format(
+                termination_timeout_secs, remote_id
+            )
+        )
+
     # we have a map of test-type, dataset-name, topology, test-name
     benchmark_runs_plan = define_benchmark_plan(benchmark_definitions, default_specs)

@@ -250,6 +263,10 @@ def run_remote_command_logic(args, project_name, project_version):
                 )
             )
             if "remote" in benchmark_config:
+                remote_id = fetch_remote_id_from_config(
+                    benchmark_config["remote"]
+                )
+                tf_timeout_secs = remote_envs_timeout[remote_id]
                 client_artifacts = []
                 client_artifacts_map = {}
                 temporary_dir = get_tmp_folder_rnd()
@@ -274,6 +291,7 @@ def run_remote_command_logic(args, project_name, project_version):
                     tf_github_sha,
                     tf_setup_name_sufix,
                     tf_triggering_env,
+                    tf_timeout_secs,
                 )

                 # after we've created the env, even on error we should always teardown
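
Condensed view of how the timeout flows through run_remote_command_logic (the names match the diff above; control flow and error handling are simplified for illustration):

    from redisbench_admin.utils.benchmark_config import (
        process_benchmark_definitions_remote_timeouts,
    )
    from redisbench_admin.utils.remote import fetch_remote_id_from_config

    # Pre-compute one aggregate timeout per remote setup id.
    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(benchmark_definitions)

    for test_name, benchmark_config in benchmark_definitions.items():
        if "remote" in benchmark_config:
            remote_id = fetch_remote_id_from_config(benchmark_config["remote"])
            tf_timeout_secs = remote_envs_timeout[remote_id]
            # tf_timeout_secs is then forwarded through remote_env_setup() and
            # terraform_spin_or_reuse_env(), ending up as the "timeout_secs"
            # Terraform variable tagged onto the spawned EC2 instances.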

redisbench_admin/run_remote/terraform.py

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,7 @@ def terraform_spin_or_reuse_env(
     tf_github_sha,
     tf_setup_name_sufix,
     tf_triggering_env,
+    tf_timeout_secs=7200,
 ):
     (
         remote_setup,
@@ -62,6 +63,7 @@ def terraform_spin_or_reuse_env(
             tf_github_org,
             tf_github_repo,
             tf_triggering_env,
+            tf_timeout_secs,
         )
         remote_envs[remote_id] = tf
     else:

redisbench_admin/utils/benchmark_config.py

Lines changed: 26 additions & 1 deletion
@@ -12,7 +12,10 @@
 import yaml
 from jsonpath_ng import parse

-from redisbench_admin.utils.remote import validate_result_expectations
+from redisbench_admin.utils.remote import (
+    validate_result_expectations,
+    fetch_remote_id_from_config,
+)


 def parse_exporter_metrics_definition(
@@ -79,6 +82,21 @@ def prepare_benchmark_definitions(args):
     )


+def process_benchmark_definitions_remote_timeouts(benchmark_definitions):
+    remote_envs_timeout = {}
+    # prepare the timeout for each different remote type
+    for test_name, benchmark_config in benchmark_definitions.items():
+        if "remote" in benchmark_config:
+            remote_id = fetch_remote_id_from_config(benchmark_config["remote"])
+            termination_timeout_secs = get_termination_timeout_secs(benchmark_config)
+            if remote_id not in remote_envs_timeout:
+                remote_envs_timeout[remote_id] = 0
+            remote_envs_timeout[remote_id] = (
+                remote_envs_timeout[remote_id] + termination_timeout_secs
+            )
+    return remote_envs_timeout
+
+
 def get_defaults(defaults_filename):
     default_metrics = []
     exporter_timemetric_path = None
@@ -302,6 +320,13 @@ def get_metadata_tags(benchmark_config):
     return metadata_tags


+def get_termination_timeout_secs(benchmark_config):
+    timeout_seconds = 600
+    if "timeout_seconds" in benchmark_config:
+        timeout_seconds = int(benchmark_config["timeout_seconds"])
+    return timeout_seconds
+
+
 def extract_benchmark_type_from_config(
     benchmark_config,
     config_key="clientconfig",
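
The aggregation can be exercised with plain dicts in place of parsed YAML files. The two definitions below are illustrative and mirror the arithmetic asserted in the tests further down (the 600s default plus an explicit 1200s):

    from redisbench_admin.utils.benchmark_config import (
        process_benchmark_definitions_remote_timeouts,
    )

    # Two hypothetical benchmark definitions sharing the same remote setup.
    benchmark_definitions = {
        "test-default-timeout": {"remote": [{"setup": "oss-standalone"}]},
        "test-long-running": {
            "remote": [{"setup": "oss-standalone"}],
            "timeout_seconds": 1200,
        },
    }

    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(benchmark_definitions)
    # -> {"oss-standalone": 1800}: timeouts are summed per remote id, since the
    #    same environment may be reused by every benchmark that targets it.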

redisbench_admin/utils/remote.py

Lines changed: 12 additions & 0 deletions
@@ -254,6 +254,7 @@ def setup_remote_environment(
     tf_github_org,
     tf_github_repo,
     tf_triggering_env,
+    tf_timeout_secs=7200,
 ):
     # key = "benchmarks/infrastructure/tf-oss-redisgraph-standalone-r5.tfstate"
     _, _, _ = tf.init(
@@ -287,6 +288,7 @@ def setup_remote_environment(
             "github_org": tf_github_org,
             "github_repo": tf_github_repo,
             "triggering_env": tf_triggering_env,
+            "timeout_secs": tf_timeout_secs,
         },
     )
     return retrieve_tf_connection_vars(return_code, tf)
@@ -493,6 +495,16 @@ def get_run_full_filename(
     return benchmark_output_filename


+def fetch_remote_id_from_config(
+    remote_setup_config,
+):
+    setup = None
+    for remote_setup_property in remote_setup_config:
+        if "setup" in remote_setup_property:
+            setup = remote_setup_property["setup"]
+    return setup
+
+
 def fetch_remote_setup_from_config(
     remote_setup_config,
     repo="https://github.com/RedisLabsModules/testing-infrastructure.git",
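
fetch_remote_id_from_config() expects the list form of the remote section and returns the value of the last "setup" entry it finds. A small illustrative call (the setup name below is made up):

    from redisbench_admin.utils.remote import fetch_remote_id_from_config

    remote_setup_config = [
        {"type": "oss-standalone"},
        {"setup": "oss-standalone-r5"},  # hypothetical setup name
    ]
    assert fetch_remote_id_from_config(remote_setup_config) == "oss-standalone-r5"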

redisbench_admin/watchdog/watchdog.py

Lines changed: 68 additions & 33 deletions
@@ -42,6 +42,19 @@ def get_ci_ec2_instances_by_state(ec2_client, ci_machines_prefix, requested_stat
     return count, state_instances


+def get_vname_timeout_secs(instance):
+    vm_name = ""
+    timeout_secs = None
+    for tag_dict in instance["Tags"]:
+        key = tag_dict["Key"]
+        key_v = tag_dict["Value"]
+        if key == "Name":
+            vm_name = key_v
+        if key == "timeout_secs":
+            timeout_secs = int(key_v)
+    return vm_name, timeout_secs
+
+
 def watchdog_dangling_ec2_instances(
     ec2_client, terminate_after_secs, ci_machines_prefix, dry_run
 ):
@@ -55,42 +68,64 @@ def watchdog_dangling_ec2_instances(
             instance_id = instance["InstanceId"]
             state = instance["State"]["Name"]
             if state != "terminated":
-                for tag_dict in instance["Tags"]:
-                    key = tag_dict["Key"]
-                    key_v = tag_dict["Value"]
-                    if key == "Name":
-                        if ci_machines_prefix in key_v:
-                            total_instances = total_instances + 1
-                            elapsed = current_datetime - launch_time
-                            will_terminate = False
-                            if elapsed.total_seconds() > terminate_after_secs:
-                                will_terminate = True
-
-                            logging.info(
-                                "Temporary machine {} {}. terminate? {}".format(
-                                    key_v, elapsed, will_terminate
-                                )
-                            )
-                            if will_terminate:
-                                logging.warning(
-                                    "Requesting to terminate instance with id {} given it ".format(
-                                        instance_id
-                                    )
-                                    + "surpassed the maximum allowed ci duration"
-                                )
-                                response = ec2_client.terminate_instances(
-                                    InstanceIds=[
-                                        instance_id,
-                                    ]
-                                )
-                                logging.info(
-                                    "Request to terminate instance with id {} reply: {}".format(
-                                        instance_id, response
-                                    )
-                                )
+                vm_name, timeout_secs = get_vname_timeout_secs(instance)
+                if timeout_secs is None:
+                    timeout_secs = terminate_after_secs
+                total_instances = termination_check(
+                    ci_machines_prefix,
+                    current_datetime,
+                    ec2_client,
+                    instance_id,
+                    launch_time,
+                    timeout_secs,
+                    total_instances,
+                    vm_name,
+                )
     logging.info("Detected a total of {} ci.bechmark VMs".format(total_instances))


+def termination_check(
+    ci_machines_prefix,
+    current_datetime,
+    ec2_client,
+    instance_id,
+    launch_time,
+    terminate_after_secs,
+    total_instances,
+    vm_name,
+):
+    if ci_machines_prefix in vm_name:
+        total_instances = total_instances + 1
+        elapsed = current_datetime - launch_time
+        will_terminate = False
+        if elapsed.total_seconds() > terminate_after_secs:
+            will_terminate = True
+
+        logging.info(
+            "Temporary machine {} {}. terminate? {}".format(
+                vm_name, elapsed, will_terminate
+            )
+        )
+        if will_terminate:
+            logging.warning(
+                "Requesting to terminate instance with id {} given it ".format(
+                    instance_id
+                )
+                + "surpassed the maximum allowed ci duration"
+            )
+            response = ec2_client.terminate_instances(
+                InstanceIds=[
+                    instance_id,
+                ]
+            )
+            logging.info(
+                "Request to terminate instance with id {} reply: {}".format(
+                    instance_id, response
+                )
            )
+    return total_instances
+
+
 def watchdog_command_logic(args, project_name, project_version):
     logging.info(
         "Using: {project_name} {project_version}".format(

tests/test_benchmark_config.py

Lines changed: 79 additions & 0 deletions
@@ -1,13 +1,18 @@
+import argparse
 import json

 import yaml

+from redisbench_admin.run_remote.args import create_run_remote_arguments
 from redisbench_admin.utils.benchmark_config import (
     results_dict_kpi_check,
     check_required_modules,
     extract_redis_dbconfig_parameters,
     extract_benchmark_type_from_config,
     get_metadata_tags,
+    get_termination_timeout_secs,
+    prepare_benchmark_definitions,
+    process_benchmark_definitions_remote_timeouts,
 )


@@ -152,3 +157,77 @@ def test_get_metadata_tags():
         benchmark_config = yaml.safe_load(yml_file)
     metadata_tags = get_metadata_tags(benchmark_config)
     assert metadata_tags == {"includes_targets": "true", "test_type": "query"}
+
+
+def test_get_termination_timeout_secs():
+    with open("./tests/test_data/vecsim-memtier.yml", "r") as yml_file:
+        benchmark_config = yaml.safe_load(yml_file)
+        timeout_seconds = get_termination_timeout_secs(benchmark_config)
+        assert timeout_seconds == 600
+
+    with open("./tests/test_data/vecsim-memtier-timeout.yml", "r") as yml_file:
+        benchmark_config = yaml.safe_load(yml_file)
+        timeout_seconds = get_termination_timeout_secs(benchmark_config)
+        assert timeout_seconds == 1200
+
+
+def test_prepare_benchmark_definitions():
+    parser = argparse.ArgumentParser(
+        description="test",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser = create_run_remote_arguments(parser)
+    args = parser.parse_args(
+        args=[
+            "--github_actor",
+            "gh.user",
+            "--module_path",
+            "mymodule.so",
+            "--test-glob",
+            "./tests/test_data/benchmark_definitions/*.yml",
+        ]
+    )
+    (
+        result,
+        benchmark_definitions,
+        default_metrics,
+        exporter_timemetric_path,
+        default_specs,
+        clusterconfig,
+    ) = prepare_benchmark_definitions(args)
+    assert result == True
+    assert len(benchmark_definitions.keys()) == 2
+
+
+def test_process_benchmark_definitions_remote_timeouts():
+    parser = argparse.ArgumentParser(
+        description="test",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser = create_run_remote_arguments(parser)
+    args = parser.parse_args(
+        args=[
+            "--github_actor",
+            "gh.user",
+            "--module_path",
+            "mymodule.so",
+            "--test-glob",
+            "./tests/test_data/benchmark_definitions/*.yml",
+        ]
+    )
+    (
+        result,
+        benchmark_definitions,
+        default_metrics,
+        exporter_timemetric_path,
+        default_specs,
+        clusterconfig,
+    ) = prepare_benchmark_definitions(args)
+    assert result == True
+    assert len(benchmark_definitions.keys()) == 2
+    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(
+        benchmark_definitions
+    )
+    assert len(remote_envs_timeout.keys()) == 1
+    # we have the default timeout + the one specified
+    assert list(remote_envs_timeout.values())[0] == 600 + 1200
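
The fixture files referenced above are not part of the rendered diff. Judging from the assertions, vecsim-memtier-timeout.yml presumably carries a top-level timeout_seconds entry; a minimal stand-in for what the test checks (the fixture content is inferred, not copied from the repository):

    import yaml

    from redisbench_admin.utils.benchmark_config import get_termination_timeout_secs

    # Inferred minimal fixture: only the key the helper reads is shown.
    fixture = yaml.safe_load("timeout_seconds: 1200\n")
    assert get_termination_timeout_secs(fixture) == 1200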
