
Commit 03f688d

Enabled long-running benchmarks (with watchdog enforcing deletion) via timeout_seconds property on benchmark definition (#313)

* Enabled long-running benchmarks (with watchdog enforcing deletion) via timeout_seconds property on benchmark definition
* Fixes per flake8 review
1 parent 5c11f2c commit 03f688d
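
This commit threads an optional per-benchmark timeout from the benchmark definition down to the EC2 watchdog, so long-running benchmarks are not terminated by the default CI limit. A minimal sketch of the new property as it looks once a definition has been parsed (the test name and values below are illustrative, not taken from this commit):

    # Hypothetical benchmark definition after yaml.safe_load(); only the keys
    # relevant to this commit are shown.
    benchmark_config = {
        "name": "example-long-running-benchmark",  # illustrative name
        "timeout_seconds": 1200,                   # new optional property (600s default)
        "remote": [
            {"setup": "oss-standalone"},           # remote setup id used for grouping
            {"type": "oss-standalone"},
        ],
    }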

11 files changed: 264 additions, 35 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "redisbench-admin"
-version = "0.7.11"
+version = "0.7.12"
 description = "Redis benchmark run helper. A wrapper around Redis and Redis Modules benchmark tools ( ftsb_redisearch, memtier_benchmark, redis-benchmark, aibench, etc... )."
 authors = ["filipecosta90 <[email protected]>","Redis Performance Group <[email protected]>"]
 readme = "README.md"

redisbench_admin/run_remote/remote_env.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ def remote_env_setup(
     tf_github_sha,
     tf_setup_name_sufix,
     tf_triggering_env,
+    tf_timeout_secs=7200,
 ):
     server_plaintext_port = 6379
     db_ssh_port = args.db_ssh_port
@@ -65,6 +66,7 @@ def remote_env_setup(
         tf_github_sha,
         tf_setup_name_sufix,
         tf_triggering_env,
+        tf_timeout_secs,
     )
     return (
         client_public_ip,

redisbench_admin/run_remote/run_remote.py

Lines changed: 18 additions & 0 deletions
@@ -51,6 +51,7 @@
 from redisbench_admin.utils.benchmark_config import (
     prepare_benchmark_definitions,
     get_metadata_tags,
+    process_benchmark_definitions_remote_timeouts,
 )
 from redisbench_admin.utils.redisgraph_benchmark_go import setup_remote_benchmark_agent
 from redisbench_admin.utils.remote import (
@@ -59,6 +60,7 @@
     check_ec2_env,
     get_project_ts_tags,
     push_data_to_redistimeseries,
+    fetch_remote_id_from_config,
 )

 from redisbench_admin.utils.utils import (
@@ -180,6 +182,17 @@ def run_remote_command_logic(args, project_name, project_version):
     )
     rts.ping()

+    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(
+        benchmark_definitions
+    )
+
+    for remote_id, termination_timeout_secs in remote_envs_timeout.items():
+        logging.info(
+            "Using a timeout of {} seconds for remote setup: {}".format(
+                termination_timeout_secs, remote_id
+            )
+        )
+
     # we have a map of test-type, dataset-name, topology, test-name
     benchmark_runs_plan = define_benchmark_plan(benchmark_definitions, default_specs)

@@ -250,6 +263,10 @@ def run_remote_command_logic(args, project_name, project_version):
                 )
             )
             if "remote" in benchmark_config:
+                remote_id = fetch_remote_id_from_config(
+                    benchmark_config["remote"]
+                )
+                tf_timeout_secs = remote_envs_timeout[remote_id]
                 client_artifacts = []
                 client_artifacts_map = {}
                 temporary_dir = get_tmp_folder_rnd()
@@ -274,6 +291,7 @@ def run_remote_command_logic(args, project_name, project_version):
                     tf_github_sha,
                     tf_setup_name_sufix,
                     tf_triggering_env,
+                    tf_timeout_secs,
                 )

                 # after we've created the env, even on error we should always teardown
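
Condensed view of how the timeout flows through run_remote_command_logic (the names match the diff above; control flow and error handling are simplified for illustration):

    from redisbench_admin.utils.benchmark_config import (
        process_benchmark_definitions_remote_timeouts,
    )
    from redisbench_admin.utils.remote import fetch_remote_id_from_config

    # Pre-compute one aggregate timeout per remote setup id.
    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(benchmark_definitions)

    for test_name, benchmark_config in benchmark_definitions.items():
        if "remote" in benchmark_config:
            remote_id = fetch_remote_id_from_config(benchmark_config["remote"])
            tf_timeout_secs = remote_envs_timeout[remote_id]
            # tf_timeout_secs is then forwarded through remote_env_setup() and
            # terraform_spin_or_reuse_env(), ending up as the "timeout_secs"
            # Terraform variable tagged onto the spawned EC2 instances.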

redisbench_admin/run_remote/terraform.py

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,7 @@ def terraform_spin_or_reuse_env(
     tf_github_sha,
     tf_setup_name_sufix,
     tf_triggering_env,
+    tf_timeout_secs=7200,
 ):
     (
         remote_setup,
@@ -62,6 +63,7 @@ def terraform_spin_or_reuse_env(
             tf_github_org,
             tf_github_repo,
             tf_triggering_env,
+            tf_timeout_secs,
         )
         remote_envs[remote_id] = tf
     else:

redisbench_admin/utils/benchmark_config.py

Lines changed: 26 additions & 1 deletion
@@ -12,7 +12,10 @@
 import yaml
 from jsonpath_ng import parse

-from redisbench_admin.utils.remote import validate_result_expectations
+from redisbench_admin.utils.remote import (
+    validate_result_expectations,
+    fetch_remote_id_from_config,
+)


 def parse_exporter_metrics_definition(
@@ -79,6 +82,21 @@ def prepare_benchmark_definitions(args):
     )


+def process_benchmark_definitions_remote_timeouts(benchmark_definitions):
+    remote_envs_timeout = {}
+    # prepare the timeout for each different remote type
+    for test_name, benchmark_config in benchmark_definitions.items():
+        if "remote" in benchmark_config:
+            remote_id = fetch_remote_id_from_config(benchmark_config["remote"])
+            termination_timeout_secs = get_termination_timeout_secs(benchmark_config)
+            if remote_id not in remote_envs_timeout:
+                remote_envs_timeout[remote_id] = 0
+            remote_envs_timeout[remote_id] = (
+                remote_envs_timeout[remote_id] + termination_timeout_secs
+            )
+    return remote_envs_timeout
+
+
 def get_defaults(defaults_filename):
     default_metrics = []
     exporter_timemetric_path = None
@@ -302,6 +320,13 @@ def get_metadata_tags(benchmark_config):
     return metadata_tags


+def get_termination_timeout_secs(benchmark_config):
+    timeout_seconds = 600
+    if "timeout_seconds" in benchmark_config:
+        timeout_seconds = int(benchmark_config["timeout_seconds"])
+    return timeout_seconds
+
+
 def extract_benchmark_type_from_config(
     benchmark_config,
     config_key="clientconfig",
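
The aggregation can be exercised with plain dicts in place of parsed YAML files. The two definitions below are illustrative and mirror the arithmetic asserted in the tests further down (the 600s default plus an explicit 1200s):

    from redisbench_admin.utils.benchmark_config import (
        process_benchmark_definitions_remote_timeouts,
    )

    # Two hypothetical benchmark definitions sharing the same remote setup.
    benchmark_definitions = {
        "test-default-timeout": {"remote": [{"setup": "oss-standalone"}]},
        "test-long-running": {
            "remote": [{"setup": "oss-standalone"}],
            "timeout_seconds": 1200,
        },
    }

    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(benchmark_definitions)
    # -> {"oss-standalone": 1800}: timeouts are summed per remote id, since the
    #    same environment may be reused by every benchmark that targets it.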

redisbench_admin/utils/remote.py

Lines changed: 12 additions & 0 deletions
@@ -254,6 +254,7 @@ def setup_remote_environment(
     tf_github_org,
     tf_github_repo,
     tf_triggering_env,
+    tf_timeout_secs=7200,
 ):
     # key = "benchmarks/infrastructure/tf-oss-redisgraph-standalone-r5.tfstate"
     _, _, _ = tf.init(
@@ -287,6 +288,7 @@ def setup_remote_environment(
             "github_org": tf_github_org,
             "github_repo": tf_github_repo,
             "triggering_env": tf_triggering_env,
+            "timeout_secs": tf_timeout_secs,
         },
     )
     return retrieve_tf_connection_vars(return_code, tf)
@@ -493,6 +495,16 @@ def get_run_full_filename(
     return benchmark_output_filename


+def fetch_remote_id_from_config(
+    remote_setup_config,
+):
+    setup = None
+    for remote_setup_property in remote_setup_config:
+        if "setup" in remote_setup_property:
+            setup = remote_setup_property["setup"]
+    return setup
+
+
 def fetch_remote_setup_from_config(
     remote_setup_config,
     repo="https://github.com/RedisLabsModules/testing-infrastructure.git",
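
fetch_remote_id_from_config() expects the list form of the remote section and returns the value of the last "setup" entry it finds. A small illustrative call (the setup name below is made up):

    from redisbench_admin.utils.remote import fetch_remote_id_from_config

    remote_setup_config = [
        {"type": "oss-standalone"},
        {"setup": "oss-standalone-r5"},  # hypothetical setup name
    ]
    assert fetch_remote_id_from_config(remote_setup_config) == "oss-standalone-r5"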

redisbench_admin/watchdog/watchdog.py

Lines changed: 68 additions & 33 deletions
@@ -42,6 +42,19 @@ def get_ci_ec2_instances_by_state(ec2_client, ci_machines_prefix, requested_stat
     return count, state_instances


+def get_vname_timeout_secs(instance):
+    vm_name = ""
+    timeout_secs = None
+    for tag_dict in instance["Tags"]:
+        key = tag_dict["Key"]
+        key_v = tag_dict["Value"]
+        if key == "Name":
+            vm_name = key_v
+        if key == "timeout_secs":
+            timeout_secs = int(key_v)
+    return vm_name, timeout_secs
+
+
 def watchdog_dangling_ec2_instances(
     ec2_client, terminate_after_secs, ci_machines_prefix, dry_run
 ):
@@ -55,42 +68,64 @@ def watchdog_dangling_ec2_instances(
             instance_id = instance["InstanceId"]
             state = instance["State"]["Name"]
             if state != "terminated":
-                for tag_dict in instance["Tags"]:
-                    key = tag_dict["Key"]
-                    key_v = tag_dict["Value"]
-                    if key == "Name":
-                        if ci_machines_prefix in key_v:
-                            total_instances = total_instances + 1
-                            elapsed = current_datetime - launch_time
-                            will_terminate = False
-                            if elapsed.total_seconds() > terminate_after_secs:
-                                will_terminate = True
-
-                            logging.info(
-                                "Temporary machine {} {}. terminate? {}".format(
-                                    key_v, elapsed, will_terminate
-                                )
-                            )
-                            if will_terminate:
-                                logging.warning(
-                                    "Requesting to terminate instance with id {} given it ".format(
-                                        instance_id
-                                    )
-                                    + "surpassed the maximum allowed ci duration"
-                                )
-                                response = ec2_client.terminate_instances(
-                                    InstanceIds=[
-                                        instance_id,
-                                    ]
-                                )
-                                logging.info(
-                                    "Request to terminate instance with id {} reply: {}".format(
-                                        instance_id, response
-                                    )
-                                )
+                vm_name, timeout_secs = get_vname_timeout_secs(instance)
+                if timeout_secs is None:
+                    timeout_secs = terminate_after_secs
+                total_instances = termination_check(
+                    ci_machines_prefix,
+                    current_datetime,
+                    ec2_client,
+                    instance_id,
+                    launch_time,
+                    timeout_secs,
+                    total_instances,
+                    vm_name,
+                )
     logging.info("Detected a total of {} ci.bechmark VMs".format(total_instances))


+def termination_check(
+    ci_machines_prefix,
+    current_datetime,
+    ec2_client,
+    instance_id,
+    launch_time,
+    terminate_after_secs,
+    total_instances,
+    vm_name,
+):
+    if ci_machines_prefix in vm_name:
+        total_instances = total_instances + 1
+        elapsed = current_datetime - launch_time
+        will_terminate = False
+        if elapsed.total_seconds() > terminate_after_secs:
+            will_terminate = True
+
+        logging.info(
+            "Temporary machine {} {}. terminate? {}".format(
+                vm_name, elapsed, will_terminate
+            )
+        )
+        if will_terminate:
+            logging.warning(
+                "Requesting to terminate instance with id {} given it ".format(
+                    instance_id
+                )
+                + "surpassed the maximum allowed ci duration"
+            )
+            response = ec2_client.terminate_instances(
+                InstanceIds=[
+                    instance_id,
+                ]
+            )
+            logging.info(
+                "Request to terminate instance with id {} reply: {}".format(
+                    instance_id, response
+                )
            )
+    return total_instances
+
+
 def watchdog_command_logic(args, project_name, project_version):
     logging.info(
         "Using: {project_name} {project_version}".format(

tests/test_benchmark_config.py

Lines changed: 79 additions & 0 deletions
@@ -1,13 +1,18 @@
+import argparse
 import json

 import yaml

+from redisbench_admin.run_remote.args import create_run_remote_arguments
 from redisbench_admin.utils.benchmark_config import (
     results_dict_kpi_check,
     check_required_modules,
     extract_redis_dbconfig_parameters,
     extract_benchmark_type_from_config,
     get_metadata_tags,
+    get_termination_timeout_secs,
+    prepare_benchmark_definitions,
+    process_benchmark_definitions_remote_timeouts,
 )


@@ -152,3 +157,77 @@ def test_get_metadata_tags():
         benchmark_config = yaml.safe_load(yml_file)
     metadata_tags = get_metadata_tags(benchmark_config)
     assert metadata_tags == {"includes_targets": "true", "test_type": "query"}
+
+
+def test_get_termination_timeout_secs():
+    with open("./tests/test_data/vecsim-memtier.yml", "r") as yml_file:
+        benchmark_config = yaml.safe_load(yml_file)
+        timeout_seconds = get_termination_timeout_secs(benchmark_config)
+        assert timeout_seconds == 600
+
+    with open("./tests/test_data/vecsim-memtier-timeout.yml", "r") as yml_file:
+        benchmark_config = yaml.safe_load(yml_file)
+        timeout_seconds = get_termination_timeout_secs(benchmark_config)
+        assert timeout_seconds == 1200
+
+
+def test_prepare_benchmark_definitions():
+    parser = argparse.ArgumentParser(
+        description="test",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser = create_run_remote_arguments(parser)
+    args = parser.parse_args(
+        args=[
+            "--github_actor",
+            "gh.user",
+            "--module_path",
+            "mymodule.so",
+            "--test-glob",
+            "./tests/test_data/benchmark_definitions/*.yml",
+        ]
+    )
+    (
+        result,
+        benchmark_definitions,
+        default_metrics,
+        exporter_timemetric_path,
+        default_specs,
+        clusterconfig,
+    ) = prepare_benchmark_definitions(args)
+    assert result == True
+    assert len(benchmark_definitions.keys()) == 2
+
+
+def test_process_benchmark_definitions_remote_timeouts():
+    parser = argparse.ArgumentParser(
+        description="test",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser = create_run_remote_arguments(parser)
+    args = parser.parse_args(
+        args=[
+            "--github_actor",
+            "gh.user",
+            "--module_path",
+            "mymodule.so",
+            "--test-glob",
+            "./tests/test_data/benchmark_definitions/*.yml",
+        ]
+    )
+    (
+        result,
+        benchmark_definitions,
+        default_metrics,
+        exporter_timemetric_path,
+        default_specs,
+        clusterconfig,
+    ) = prepare_benchmark_definitions(args)
+    assert result == True
+    assert len(benchmark_definitions.keys()) == 2
+    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(
+        benchmark_definitions
+    )
+    assert len(remote_envs_timeout.keys()) == 1
+    # we have the default timeout + the one specified
+    assert list(remote_envs_timeout.values())[0] == 600 + 1200
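
The fixture files referenced above are not part of the rendered diff. Judging from the assertions, vecsim-memtier-timeout.yml presumably carries a top-level timeout_seconds entry; a minimal stand-in for what the test checks (the fixture content is inferred, not copied from the repository):

    import yaml

    from redisbench_admin.utils.benchmark_config import get_termination_timeout_secs

    # Inferred minimal fixture: only the key the helper reads is shown.
    fixture = yaml.safe_load("timeout_seconds: 1200\n")
    assert get_termination_timeout_secs(fixture) == 1200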
