From d125187ea2d9a0fa71afcffa073b13e97509daa8 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Wed, 17 Apr 2024 18:16:04 -0400 Subject: [PATCH] Add changing SLO experiment scripts and definitions (#494) --- .../15-e2e-scenarios-v2/slo_change/COND | 31 ++++ .../slo_change/run_workload.sh | 48 +++++ .../slo_change/run_workload_debug.sh | 49 +++++ .../slo_change/set_up_starting_blueprint.sh | 20 +++ .../slo_change/slo_change_config.yml | 168 ++++++++++++++++++ src/brad/config/system_event.py | 4 + src/brad/daemon/daemon.py | 8 +- .../set_up_starting_blueprint.py | 155 ++++++++++++++++ 8 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 experiments/15-e2e-scenarios-v2/slo_change/COND create mode 100755 experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh create mode 100755 experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh create mode 100755 experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh create mode 100644 experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml create mode 100644 workloads/IMDB_extended/set_up_starting_blueprint.py diff --git a/experiments/15-e2e-scenarios-v2/slo_change/COND b/experiments/15-e2e-scenarios-v2/slo_change/COND new file mode 100644 index 00000000..64c440ea --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/slo_change/COND @@ -0,0 +1,31 @@ +include("../common.cond") + +QUERIES = [99, 56, 32, 92, 91, 49, 30, 83, 94, 38, 87, 86, 76, 37, 31, 46, 58, 61, 62, 64, 69, 73, 74, 51, 57, 60] + +COMMON_CONFIGS = { + "physical-config-file": "config/physical_config_100gb.yml", + "schema-name": "imdb_extended_100g", + "ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK, + "txn-scale-factor": IMDB_100GB_SF, + "num-front-ends": 24, + "dataset-type": "100gb", + "ra-query-indexes": ",".join(map(str, QUERIES)) +} + +run_experiment( + name="brad_100g", + run="./run_workload.sh", + options={ + "system-config-file": "slo_change_config.yml", + **COMMON_CONFIGS, + }, +) + +run_command( + name="brad_100g_debug", + run="./run_workload_debug.sh", + options={ + "system-config-file": "slo_change_config.yml", + **COMMON_CONFIGS, + }, +) diff --git a/experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh b/experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh new file mode 100755 index 00000000..dedf788d --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh @@ -0,0 +1,48 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +# Arguments: +# --system-config-file +# --physical-config-file +# --query-indexes +extract_named_arguments $@ + +start_brad $system_config_file $physical_config_file +log_workload_point "brad_start_initiated" +sleep 30 + +log_workload_point "clients_starting" +# 8 clients, offset 16 (for the transactional clients) +start_repeating_olap_runner 8 15 5 $ra_query_indexes "ra_8" 16 +rana_pid=$runner_pid + +start_txn_runner_serial 16 # Implicit: --dataset-type +txn_pid=$runner_pid + +log_workload_point "clients_started" + +function inner_cancel_experiment() { + cancel_experiment $rana_pid $txn_pid +} + +trap "inner_cancel_experiment" INT +trap "inner_cancel_experiment" TERM + +# Sleep for 10 minutes and then change the SLOs. +sleep $(( 10 * 60 )) + +log_workload_point "changing_slo" +brad cli --command "BRAD_CHANGE_SLO 30.0 0.030" +log_workload_point "changed_slo" + +# Wait another hour before stopping. +sleep $(( 60 * 60 )) + +# Shut down everything now. +log_workload_point "experiment_workload_done" +>&2 echo "Experiment done. Shutting down runners..." +graceful_shutdown $rana_pid $txn_pid +log_workload_point "shutdown_complete" diff --git a/experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh b/experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh new file mode 100755 index 00000000..47982fbc --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh @@ -0,0 +1,49 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +# Arguments: +# --system-config-file +# --physical-config-file +# --query-indexes +extract_named_arguments $@ + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad_debug $system_config_file $physical_config_file +log_workload_point "brad_start_initiated" +sleep 30 + +log_workload_point "clients_starting" +# 8 clients, offset 16 (for the transactional clients) +start_repeating_olap_runner 8 15 5 $ra_query_indexes "ra_8" 16 +rana_pid=$runner_pid + +start_txn_runner_serial 16 # Implicit: --dataset-type +txn_pid=$runner_pid + +log_workload_point "clients_started" + +function inner_cancel_experiment() { + cancel_experiment $rana_pid $txn_pid +} + +trap "inner_cancel_experiment" INT +trap "inner_cancel_experiment" TERM + +# Sleep for 2 minutes and then change the SLOs. +sleep $(( 2 * 60 )) + +log_workload_point "changing_slo" +brad cli --command "BRAD_CHANGE_SLO 30.0 0.030" +log_workload_point "changed_slo" + +# Wait another 10 mins before stopping. +sleep $(( 10 * 60 )) + +# Shut down everything now. +log_workload_point "experiment_workload_done" +>&2 echo "Experiment done. Shutting down runners..." +graceful_shutdown $rana_pid $txn_pid +log_workload_point "shutdown_complete" diff --git a/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh b/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh new file mode 100755 index 00000000..6a21a231 --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh @@ -0,0 +1,20 @@ +#! /bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name imdb_extended_100g \ + --query-bank-file ../../../workloads/IMDB_100GB/regular_test/queries.sql \ + --aurora-queries "99,56,32,92,91" \ + --redshift-queries "49,30,83,94,38,87,86,76,37,31,46,58,61,62,64,69,73,74,51,57,60" \ + --redshift-provisioning "dc2.large:2" \ + --aurora-provisioning "db.t4g.medium:2" \ + --system-config-file slo_change_config.yml \ + --physical-config-file $1 diff --git a/experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml b/experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml new file mode 100644 index 00000000..0dbee531 --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml @@ -0,0 +1,168 @@ +# This file contains configurations that are used by BRAD. These are default +# values and should be customized for specific situations. + +# BRAD's front end servers will listen for client connections on this interface +# and port. If `num_front_ends` is greater than one, subsequent front ends will +# listen on successive ports (e.g., 6584, 6585, etc.). +front_end_interface: "0.0.0.0" +front_end_port: 6583 +num_front_ends: 24 + +# Logging paths. If the value is in ALL_CAPS (with underscores), it is +# interpreted as an environment variable (BRAD will log to the path stored in +# the environment variable). + +# Where BRAD's daemon process will write its logs. +daemon_log_file: COND_OUT + +# Where BRAD's front end processes will write their logs. +front_end_log_path: COND_OUT + +# Where BRAD's blueprint planner will write debug logs. +planner_log_path: COND_OUT + +# Where BRAD's metrics loggers will write their logs. +metrics_log_path: COND_OUT + +# Probability that each transactional query will be logged. +txn_log_prob: 0.01 + +# Set to a non-zero value enable automatic data syncing. When this is set to 0, +# automatic syncing is disabled. +data_sync_period_seconds: 0 + +# BRAD's front end servers will report their metrics at regular intervals. +front_end_metrics_reporting_period_seconds: 30 +front_end_query_latency_buffer_size: 100 + +# `default` means to use the policy encoded in the blueprint. Other values will +# override the blueprint. +routing_policy: default + +# Whether to disable table movement for benchmark purposes (i.e., keep all +# tables on all engines.) +disable_table_movement: true + +# Epoch length for metrics and forecasting. This is the granularity at which +# metrics/forecasting will be performed. +epoch_length: + weeks: 0 + days: 0 + hours: 0 + minutes: 1 + +# Blueprint planning strategy. +strategy: fp_query_based_beam + +# Used to specify the period of time over which to use data for planning. +# Currrently, this is a "look behind" window for the workload. +planning_window: + weeks: 0 + days: 0 + hours: 1 + minutes: 0 + +# Used to aggregate metrics collected in the planning window. +metrics_agg: + method: ewm # 'mean' is another option + alpha: 0.86466472 # 1 - 1 / e^2 + +# Used during planning. +reinterpret_second_as: 1 + +# The query distribution must change by at least this much for a new blueprint +# to be accepted. +query_dist_change_frac: 0.1 + +# The search bound for the provisioning. +max_provisioning_multiplier: 2.5 + +# Flag options for blueprint planning. +use_io_optimized_aurora: true +use_recorded_routing_if_available: true +ensure_tables_together_on_one_engine: true + +# Loads used to prime the system when no information is available. +aurora_initialize_load_fraction: 0.25 +redshift_initialize_load_fraction: 0.25 + +# BRAD will not reduce predicted load lower than these values. Raise these +# values to be more conservative against mispredictions. +aurora_min_load_removal_fraction: 0.8 +redshift_min_load_removal_fraction: 0.9 + +# Blueprint planning performance ceilings. +# These will change to 30 s and 30 ms during the experiment. +query_latency_p90_ceiling_s: 60.0 +txn_latency_p90_ceiling_s: 0.060 + +# If set to true, BRAD will attempt to use the specified preset Redshift +# clusters instead of resizing the main Redshift cluster. +use_preset_redshift_clusters: true + +# Used for ordering blueprints during planning. +comparator: + type: benefit_perf_ceiling # or `perf_ceiling` + + benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator + weeks: 0 + days: 0 + hours: 3 + minutes: 0 + + penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator + penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator + +# Used for precomputed predictions. +std_datasets: + - name: regular + path: workloads/IMDB_100GB/regular_test/ + - name: adhoc + path: workloads/IMDB_100GB/adhoc_test/ + +aurora_max_query_factor: 4.0 +aurora_max_query_factor_replace: 10000.0 + +redshift_peak_load_threshold: 95.0 +redshift_peak_load_multiplier: 2.0 + +planner_max_workers: 16 +aurora_provisioning_search_distance: 900.0 +redshift_provisioning_search_distance: 900.0 + +# Blueprint planning trigger configs. + +triggers: + enabled: true + check_period_s: 90 # Triggers are checked every X seconds. + check_period_offset_s: 600 # Wait 10 mins before starting. + observe_new_blueprint_mins: 10 + + elapsed_time: + disabled: true + multiplier: 60 # Multiplier over `planning_window`. + + redshift_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + aurora_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + variable_costs: + disabled: true + threshold: 1.0 + + query_latency_ceiling: + ceiling_s: 60.0 + sustained_epochs: 3 + + txn_latency_ceiling: + ceiling_s: 0.060 + sustained_epochs: 3 + + recent_change: + delay_epochs: 5 diff --git a/src/brad/config/system_event.py b/src/brad/config/system_event.py index 21c80c7e..68c125f5 100644 --- a/src/brad/config/system_event.py +++ b/src/brad/config/system_event.py @@ -33,3 +33,7 @@ class SystemEvent(enum.Enum): # Use this for long running experiments. ReachedExpectedState = "reached_expected_state" + + # Used when a service level objective is changed while BRAD is running (used + # for experiments). + ChangedSlos = "changed_slos" diff --git a/src/brad/daemon/daemon.py b/src/brad/daemon/daemon.py index 360fda2f..5f22a2c3 100644 --- a/src/brad/daemon/daemon.py +++ b/src/brad/daemon/daemon.py @@ -733,7 +733,7 @@ async def _handle_internal_command(self, command: str) -> RowList: parts = command.split(" ") if self._temp_config is None: return [("Cannot change SLOs because TempConfig is missing.",)] - if len(parts) <= 3: + if len(parts) < 3: return [("Need to specify query and txn p90 SLOs",)] query_p90_s = float(parts[1]) @@ -757,6 +757,12 @@ async def _handle_internal_command(self, command: str) -> RowList: elif isinstance(t, TransactionLatencyCeiling): t.set_latency_ceiling(txn_p90_s) + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.ChangedSlos, + f"query_p90_s={query_p90_s}; txn_p90_s={txn_p90_s}", + ) + return [ ( f"p90 SLOs changed to (query {query_p90_s:.3f} s), (txn {txn_p90_s:.3f} s)", diff --git a/workloads/IMDB_extended/set_up_starting_blueprint.py b/workloads/IMDB_extended/set_up_starting_blueprint.py new file mode 100644 index 00000000..dc36cf2a --- /dev/null +++ b/workloads/IMDB_extended/set_up_starting_blueprint.py @@ -0,0 +1,155 @@ +import asyncio +import argparse +import logging + +from brad.asset_manager import AssetManager +from brad.blueprint import Blueprint +from brad.blueprint.manager import BlueprintManager +from brad.blueprint.provisioning import Provisioning +from brad.config.engine import Engine +from brad.config.file import ConfigFile +from brad.daemon.transition_orchestrator import TransitionOrchestrator +from brad.planner.enumeration.blueprint import EnumeratedBlueprint +from brad.query_rep import QueryRep +from brad.routing.abstract_policy import FullRoutingPolicy +from brad.routing.cached import CachedLocationPolicy +from brad.routing.policy import RoutingPolicy +from brad.routing.tree_based.forest_policy import ForestPolicy +from brad.utils import set_up_logging + +logger = logging.getLogger(__name__) + + +async def run_transition( + config: ConfigFile, + blueprint_mgr: BlueprintManager, + next_blueprint: Blueprint, +) -> None: + logger.info("Starting the transition...") + assert next_blueprint is not None + await blueprint_mgr.start_transition(next_blueprint, new_score=None) + orchestrator = TransitionOrchestrator(config, blueprint_mgr) + logger.info("Running the transition...") + await orchestrator.run_prepare_then_transition() + logger.info("Running the post-transition clean up...") + await orchestrator.run_clean_up_after_transition() + logger.info("Done!") + + +def parse_provisioning(raw: str) -> Provisioning: + parts = raw.split(":") + if len(parts) != 2: + raise RuntimeError(f"Invalid provisioning: {raw}") + return Provisioning(parts[0], int(parts[1])) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--system-config-file", + type=str, + required=True, + help="Path to BRAD's system configuration file.", + ) + parser.add_argument( + "--physical-config-file", + type=str, + required=True, + help="Path to BRAD's physical configuration file.", + ) + parser.add_argument( + "--schema-name", + type=str, + required=True, + help="The name of the schema to drop.", + ) + parser.add_argument( + "--query-bank-file", + type=str, + help="The query bank that the query indices refer to.", + ) + parser.add_argument( + "--athena-queries", type=str, help="Comma separated list of indices." + ) + parser.add_argument( + "--aurora-queries", type=str, help="Comma separated list of indices." + ) + parser.add_argument( + "--redshift-queries", type=str, help="Comma separated list of indices." + ) + parser.add_argument( + "--redshift-provisioning", type=str, help="Format: :" + ) + parser.add_argument( + "--aurora-provisioning", type=str, help="Format: :" + ) + args = parser.parse_args() + set_up_logging(debug_mode=True) + + # 1. Load the config. + config = ConfigFile.load_from_new_configs( + phys_config=args.physical_config_file, system_config=args.system_config_file + ) + + # 2. Load the existing blueprint. + assets = AssetManager(config) + blueprint_mgr = BlueprintManager(config, assets, args.schema_name) + blueprint_mgr.load_sync() + blueprint = blueprint_mgr.get_blueprint() + + # 3. Load the query bank. + queries = [] + with open(args.query_bank_file, "r", encoding="UTF-8") as file: + for line in file: + clean = line.strip() + if clean.endswith(";"): + clean = clean[:-1] + queries.append(QueryRep(clean)) + + # 4. Create the fixed routing policy. + query_map = {} + if args.athena_queries is not None: + for qidx_str in args.athena_queries.split(","): + qidx = int(qidx_str.strip()) + query_map[queries[qidx]] = Engine.Athena + if args.redshift_queries is not None: + for qidx_str in args.redshift_queries.split(","): + qidx = int(qidx_str.strip()) + query_map[queries[qidx]] = Engine.Redshift + if args.aurora_queries is not None: + for qidx_str in args.aurora_queries.split(","): + qidx = int(qidx_str.strip()) + query_map[queries[qidx]] = Engine.Aurora + clp = CachedLocationPolicy(query_map) + + # 5. Replace the policy. + enum_blueprint = EnumeratedBlueprint(blueprint) + definite_policy = asyncio.run( + ForestPolicy.from_assets( + args.schema_name, RoutingPolicy.ForestTableCardinality, assets + ) + ) + replaced_policy = FullRoutingPolicy( + indefinite_policies=[clp], definite_policy=definite_policy + ) + enum_blueprint.set_routing_policy(replaced_policy) + + # Ensure the provisioning is as expected. + enum_blueprint.set_aurora_provisioning(parse_provisioning(args.aurora_provisioning)) + enum_blueprint.set_redshift_provisioning( + parse_provisioning(args.redshift_provisioning) + ) + + # 6. Adjust the placement. + new_placement = {} + for table in blueprint.tables(): + new_placement[table.name] = [Engine.Aurora, Engine.Athena] + enum_blueprint.set_table_locations(new_placement) + + # 6. Transition to the new blueprint. + modified_blueprint = enum_blueprint.to_blueprint() + asyncio.run(run_transition(config, blueprint_mgr, modified_blueprint)) + + +if __name__ == "__main__": + main()