From d125187ea2d9a0fa71afcffa073b13e97509daa8 Mon Sep 17 00:00:00 2001
From: Geoffrey Yu <geoffxy@mit.edu>
Date: Wed, 17 Apr 2024 18:16:04 -0400
Subject: [PATCH] Add changing SLO experiment scripts and definitions (#494)

---
 .../15-e2e-scenarios-v2/slo_change/COND       |  31 ++++
 .../slo_change/run_workload.sh                |  48 +++++
 .../slo_change/run_workload_debug.sh          |  49 +++++
 .../slo_change/set_up_starting_blueprint.sh   |  20 +++
 .../slo_change/slo_change_config.yml          | 168 ++++++++++++++++++
 src/brad/config/system_event.py               |   4 +
 src/brad/daemon/daemon.py                     |   8 +-
 .../set_up_starting_blueprint.py              | 155 ++++++++++++++++
 8 files changed, 482 insertions(+), 1 deletion(-)
 create mode 100644 experiments/15-e2e-scenarios-v2/slo_change/COND
 create mode 100755 experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh
 create mode 100755 experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh
 create mode 100755 experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh
 create mode 100644 experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml
 create mode 100644 workloads/IMDB_extended/set_up_starting_blueprint.py

diff --git a/experiments/15-e2e-scenarios-v2/slo_change/COND b/experiments/15-e2e-scenarios-v2/slo_change/COND
new file mode 100644
index 00000000..64c440ea
--- /dev/null
+++ b/experiments/15-e2e-scenarios-v2/slo_change/COND
@@ -0,0 +1,31 @@
+include("../common.cond")
+
+QUERIES = [99, 56, 32, 92, 91, 49, 30, 83, 94, 38, 87, 86, 76, 37, 31, 46, 58, 61, 62, 64, 69, 73, 74, 51, 57, 60]
+
+COMMON_CONFIGS = {
+  "physical-config-file": "config/physical_config_100gb.yml",
+  "schema-name": "imdb_extended_100g",
+  "ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK,
+  "txn-scale-factor": IMDB_100GB_SF,
+  "num-front-ends": 24,
+  "dataset-type": "100gb",
+  "ra-query-indexes": ",".join(map(str, QUERIES))
+}
+
+run_experiment(
+  name="brad_100g",
+  run="./run_workload.sh",
+  options={
+    "system-config-file": "slo_change_config.yml",
+    **COMMON_CONFIGS,
+  },
+)
+
+run_command(
+  name="brad_100g_debug",
+  run="./run_workload_debug.sh",
+  options={
+    "system-config-file": "slo_change_config.yml",
+    **COMMON_CONFIGS,
+  },
+)
diff --git a/experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh b/experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh
new file mode 100755
index 00000000..dedf788d
--- /dev/null
+++ b/experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh
@@ -0,0 +1,48 @@
+#! /bin/bash
+
+script_loc=$(cd $(dirname $0) && pwd -P)
+cd $script_loc
+source ../common.sh
+
+# Arguments:
+# --system-config-file
+# --physical-config-file
+# --ra-query-indexes
+extract_named_arguments $@
+
+start_brad $system_config_file $physical_config_file
+log_workload_point "brad_start_initiated"
+sleep 30
+
+log_workload_point "clients_starting"
+# 8 clients, offset 16 (for the transactional clients)
+start_repeating_olap_runner 8 15 5 $ra_query_indexes "ra_8" 16
+rana_pid=$runner_pid
+
+start_txn_runner_serial 16  # Implicit: --dataset-type
+txn_pid=$runner_pid
+
+log_workload_point "clients_started"
+
+function inner_cancel_experiment() {
+  cancel_experiment $rana_pid $txn_pid
+}
+
+trap "inner_cancel_experiment" INT
+trap "inner_cancel_experiment" TERM
+
+# Sleep for 10 minutes and then change the SLOs.
+sleep $(( 10 * 60 ))
+
+log_workload_point "changing_slo"
+brad cli --command "BRAD_CHANGE_SLO 30.0 0.030"
+log_workload_point "changed_slo"
+
+# Wait another hour before stopping.
+sleep $(( 60 * 60 ))
+
+# Shut down everything now.
+log_workload_point "experiment_workload_done"
+>&2 echo "Experiment done. Shutting down runners..."
+graceful_shutdown $rana_pid $txn_pid
+log_workload_point "shutdown_complete"
diff --git a/experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh b/experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh
new file mode 100755
index 00000000..47982fbc
--- /dev/null
+++ b/experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh
@@ -0,0 +1,49 @@
+#! /bin/bash
+
+script_loc=$(cd $(dirname $0) && pwd -P)
+cd $script_loc
+source ../common.sh
+
+# Arguments:
+# --system-config-file
+# --physical-config-file
+# --ra-query-indexes
+extract_named_arguments $@
+
+export BRAD_IGNORE_BLUEPRINT=1
+start_brad_debug $system_config_file $physical_config_file
+log_workload_point "brad_start_initiated"
+sleep 30
+
+log_workload_point "clients_starting"
+# 8 clients, offset 16 (for the transactional clients)
+start_repeating_olap_runner 8 15 5 $ra_query_indexes "ra_8" 16
+rana_pid=$runner_pid
+
+start_txn_runner_serial 16  # Implicit: --dataset-type
+txn_pid=$runner_pid
+
+log_workload_point "clients_started"
+
+function inner_cancel_experiment() {
+  cancel_experiment $rana_pid $txn_pid
+}
+
+trap "inner_cancel_experiment" INT
+trap "inner_cancel_experiment" TERM
+
+# Sleep for 2 minutes and then change the SLOs.
+sleep $(( 2 * 60 ))
+
+log_workload_point "changing_slo"
+brad cli --command "BRAD_CHANGE_SLO 30.0 0.030"
+log_workload_point "changed_slo"
+
+# Wait another 10 mins before stopping.
+sleep $(( 10 * 60 ))
+
+# Shut down everything now.
+log_workload_point "experiment_workload_done"
+>&2 echo "Experiment done. Shutting down runners..."
+graceful_shutdown $rana_pid $txn_pid
+log_workload_point "shutdown_complete"
diff --git a/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh b/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh
new file mode 100755
index 00000000..6a21a231
--- /dev/null
+++ b/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh
@@ -0,0 +1,20 @@
+#! /bin/bash
+# Sets up this experiment's starting blueprint: fixed routing and provisioning.
+if [ -z "$1" ]; then
+  >&2 echo "Usage: $0 path/to/physical/config.yml"
+  exit 1
+fi
+# NOTE: we `cd` below, so a relative config path resolves from this script's directory.
+script_loc=$(cd "$(dirname "$0")" && pwd -P)
+cd "$script_loc" || exit 1
+source ../common.sh
+
+python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \
+  --schema-name imdb_extended_100g \
+  --query-bank-file ../../../workloads/IMDB_100GB/regular_test/queries.sql \
+  --aurora-queries "99,56,32,92,91" \
+  --redshift-queries "49,30,83,94,38,87,86,76,37,31,46,58,61,62,64,69,73,74,51,57,60" \
+  --redshift-provisioning "dc2.large:2" \
+  --aurora-provisioning "db.t4g.medium:2" \
+  --system-config-file slo_change_config.yml \
+  --physical-config-file "$1"
diff --git a/experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml b/experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml
new file mode 100644
index 00000000..0dbee531
--- /dev/null
+++ b/experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml
@@ -0,0 +1,168 @@
+# This file contains configurations that are used by BRAD. These are default
+# values and should be customized for specific situations.
+
+# BRAD's front end servers will listen for client connections on this interface
+# and port. If `num_front_ends` is greater than one, subsequent front ends will
+# listen on successive ports (e.g., 6584, 6585, etc.).
+front_end_interface: "0.0.0.0"
+front_end_port: 6583
+num_front_ends: 24
+
+# Logging paths. If the value is in ALL_CAPS (with underscores), it is
+# interpreted as an environment variable (BRAD will log to the path stored in
+# the environment variable).
+
+# Where BRAD's daemon process will write its logs.
+daemon_log_file: COND_OUT
+
+# Where BRAD's front end processes will write their logs.
+front_end_log_path: COND_OUT
+
+# Where BRAD's blueprint planner will write debug logs.
+planner_log_path: COND_OUT
+
+# Where BRAD's metrics loggers will write their logs.
+metrics_log_path: COND_OUT
+
+# Probability that each transactional query will be logged.
+txn_log_prob: 0.01
+
+# Set to a non-zero value to enable automatic data syncing. When this is set to 0,
+# automatic syncing is disabled.
+data_sync_period_seconds: 0
+
+# BRAD's front end servers will report their metrics at regular intervals.
+front_end_metrics_reporting_period_seconds: 30
+front_end_query_latency_buffer_size: 100
+
+# `default` means to use the policy encoded in the blueprint. Other values will
+# override the blueprint.
+routing_policy: default
+
+# Whether to disable table movement for benchmark purposes (i.e., keep all
+# tables on all engines.)
+disable_table_movement: true
+
+# Epoch length for metrics and forecasting. This is the granularity at which
+# metrics/forecasting will be performed.
+epoch_length:
+  weeks: 0
+  days: 0
+  hours: 0
+  minutes: 1
+
+# Blueprint planning strategy.
+strategy: fp_query_based_beam
+
+# Used to specify the period of time over which to use data for planning.
+# Currently, this is a "look behind" window for the workload.
+planning_window:
+  weeks: 0
+  days: 0
+  hours: 1
+  minutes: 0
+
+# Used to aggregate metrics collected in the planning window.
+metrics_agg:
+  method: ewm         # 'mean' is another option
+  alpha: 0.86466472   # 1 - 1 / e^2
+
+# Used during planning.
+reinterpret_second_as: 1
+
+# The query distribution must change by at least this much for a new blueprint
+# to be accepted.
+query_dist_change_frac: 0.1
+
+# The search bound for the provisioning.
+max_provisioning_multiplier: 2.5
+
+# Flag options for blueprint planning.
+use_io_optimized_aurora: true
+use_recorded_routing_if_available: true
+ensure_tables_together_on_one_engine: true
+
+# Loads used to prime the system when no information is available.
+aurora_initialize_load_fraction: 0.25
+redshift_initialize_load_fraction: 0.25
+
+# BRAD will not reduce predicted load lower than these values. Raise these
+# values to be more conservative against mispredictions.
+aurora_min_load_removal_fraction: 0.8
+redshift_min_load_removal_fraction: 0.9
+
+# Blueprint planning performance ceilings.
+# These will change to 30 s and 30 ms during the experiment.
+query_latency_p90_ceiling_s: 60.0
+txn_latency_p90_ceiling_s: 0.060
+
+# If set to true, BRAD will attempt to use the specified preset Redshift
+# clusters instead of resizing the main Redshift cluster.
+use_preset_redshift_clusters: true
+
+# Used for ordering blueprints during planning.
+comparator:
+  type: benefit_perf_ceiling  # or `perf_ceiling`
+
+  benefit_horizon:  # Only used by the `benefit_perf_ceiling` comparator
+    weeks: 0
+    days: 0
+    hours: 3
+    minutes: 0
+
+  penalty_threshold: 0.8  # Only used by the `benefit_perf_ceiling` comparator
+  penalty_power: 8  # Only used by the `benefit_perf_ceiling` comparator
+
+# Used for precomputed predictions.
+std_datasets:
+  - name: regular
+    path: workloads/IMDB_100GB/regular_test/
+  - name: adhoc
+    path: workloads/IMDB_100GB/adhoc_test/
+
+aurora_max_query_factor: 4.0
+aurora_max_query_factor_replace: 10000.0
+
+redshift_peak_load_threshold: 95.0
+redshift_peak_load_multiplier: 2.0
+
+planner_max_workers: 16
+aurora_provisioning_search_distance: 900.0
+redshift_provisioning_search_distance: 900.0
+
+# Blueprint planning trigger configs.
+
+triggers:
+  enabled: true
+  check_period_s: 90  # Triggers are checked every X seconds.
+  check_period_offset_s: 600  # Wait 10 mins before starting.
+  observe_new_blueprint_mins: 10
+
+  elapsed_time:
+    disabled: true
+    multiplier: 60  # Multiplier over `planning_window`.
+
+  redshift_cpu:
+    lo: 15
+    hi: 85
+    sustained_epochs: 3
+
+  aurora_cpu:
+    lo: 15
+    hi: 85
+    sustained_epochs: 3
+
+  variable_costs:
+    disabled: true
+    threshold: 1.0
+
+  query_latency_ceiling:
+    ceiling_s: 60.0
+    sustained_epochs: 3
+
+  txn_latency_ceiling:
+    ceiling_s: 0.060
+    sustained_epochs: 3
+
+  recent_change:
+    delay_epochs: 5
diff --git a/src/brad/config/system_event.py b/src/brad/config/system_event.py
index 21c80c7e..68c125f5 100644
--- a/src/brad/config/system_event.py
+++ b/src/brad/config/system_event.py
@@ -33,3 +33,7 @@ class SystemEvent(enum.Enum):
 
     # Use this for long running experiments.
     ReachedExpectedState = "reached_expected_state"
+
+    # Used when a service level objective is changed while BRAD is running (used
+    # for experiments).
+    ChangedSlos = "changed_slos"
diff --git a/src/brad/daemon/daemon.py b/src/brad/daemon/daemon.py
index 360fda2f..5f22a2c3 100644
--- a/src/brad/daemon/daemon.py
+++ b/src/brad/daemon/daemon.py
@@ -733,7 +733,7 @@ async def _handle_internal_command(self, command: str) -> RowList:
             parts = command.split(" ")
             if self._temp_config is None:
                 return [("Cannot change SLOs because TempConfig is missing.",)]
-            if len(parts) <= 3:
+            if len(parts) < 3:
                 return [("Need to specify query and txn p90 SLOs",)]
 
             query_p90_s = float(parts[1])
@@ -757,6 +757,12 @@ async def _handle_internal_command(self, command: str) -> RowList:
                 elif isinstance(t, TransactionLatencyCeiling):
                     t.set_latency_ceiling(txn_p90_s)
 
+            if self._system_event_logger is not None:
+                self._system_event_logger.log(
+                    SystemEvent.ChangedSlos,
+                    f"query_p90_s={query_p90_s}; txn_p90_s={txn_p90_s}",
+                )
+
             return [
                 (
                     f"p90 SLOs changed to (query {query_p90_s:.3f} s), (txn {txn_p90_s:.3f} s)",
diff --git a/workloads/IMDB_extended/set_up_starting_blueprint.py b/workloads/IMDB_extended/set_up_starting_blueprint.py
new file mode 100644
index 00000000..dc36cf2a
--- /dev/null
+++ b/workloads/IMDB_extended/set_up_starting_blueprint.py
@@ -0,0 +1,155 @@
+import asyncio
+import argparse
+import logging
+
+from brad.asset_manager import AssetManager
+from brad.blueprint import Blueprint
+from brad.blueprint.manager import BlueprintManager
+from brad.blueprint.provisioning import Provisioning
+from brad.config.engine import Engine
+from brad.config.file import ConfigFile
+from brad.daemon.transition_orchestrator import TransitionOrchestrator
+from brad.planner.enumeration.blueprint import EnumeratedBlueprint
+from brad.query_rep import QueryRep
+from brad.routing.abstract_policy import FullRoutingPolicy
+from brad.routing.cached import CachedLocationPolicy
+from brad.routing.policy import RoutingPolicy
+from brad.routing.tree_based.forest_policy import ForestPolicy
+from brad.utils import set_up_logging
+
+logger = logging.getLogger(__name__)
+
+
+async def run_transition(
+    config: ConfigFile,
+    blueprint_mgr: BlueprintManager,
+    next_blueprint: Blueprint,
+) -> None:
+    logger.info("Starting the transition...")
+    assert next_blueprint is not None
+    await blueprint_mgr.start_transition(next_blueprint, new_score=None)
+    orchestrator = TransitionOrchestrator(config, blueprint_mgr)
+    logger.info("Running the transition...")
+    await orchestrator.run_prepare_then_transition()
+    logger.info("Running the post-transition clean up...")
+    await orchestrator.run_clean_up_after_transition()
+    logger.info("Done!")
+
+
+def parse_provisioning(raw: str) -> Provisioning:
+    parts = raw.split(":")
+    if len(parts) != 2:
+        raise RuntimeError(f"Invalid provisioning: {raw}")
+    return Provisioning(parts[0], int(parts[1]))
+
+
+def main():
+    """Load a schema's blueprint, then pin its routing, provisioning, and placement."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--system-config-file",
+        type=str,
+        required=True,
+        help="Path to BRAD's system configuration file.",
+    )
+    parser.add_argument(
+        "--physical-config-file",
+        type=str,
+        required=True,
+        help="Path to BRAD's physical configuration file.",
+    )
+    parser.add_argument(
+        "--schema-name",
+        type=str,
+        required=True,
+        help="The name of the schema whose blueprint will be modified.",
+    )
+    parser.add_argument(
+        "--query-bank-file",
+        type=str,
+        required=True,
+        help="The query bank that the query indices refer to.",
+    )
+    parser.add_argument(
+        "--athena-queries", type=str, help="Comma separated list of indices."
+    )
+    parser.add_argument(
+        "--aurora-queries", type=str, help="Comma separated list of indices."
+    )
+    parser.add_argument(
+        "--redshift-queries", type=str, help="Comma separated list of indices."
+    )
+    parser.add_argument(
+        "--redshift-provisioning", type=str, help="Format: <instance type>:<num. nodes>"
+    )
+    parser.add_argument(
+        "--aurora-provisioning", type=str, help="Format: <instance type>:<num. nodes>"
+    )
+    args = parser.parse_args()
+    if args.aurora_provisioning is None or args.redshift_provisioning is None:
+        parser.error("--aurora-provisioning and --redshift-provisioning are required.")
+    set_up_logging(debug_mode=True)
+
+    # 1. Load the config.
+    config = ConfigFile.load_from_new_configs(
+        phys_config=args.physical_config_file, system_config=args.system_config_file
+    )
+
+    # 2. Load the existing blueprint.
+    assets = AssetManager(config)
+    blueprint_mgr = BlueprintManager(config, assets, args.schema_name)
+    blueprint_mgr.load_sync()
+    blueprint = blueprint_mgr.get_blueprint()
+
+    # 3. Load the query bank (one query per line; a trailing ";" is dropped).
+    queries = []
+    with open(args.query_bank_file, "r", encoding="UTF-8") as file:
+        for line in file:
+            clean = line.strip()
+            if clean.endswith(";"):
+                clean = clean[:-1]
+            queries.append(QueryRep(clean))
+
+    # 4. Create the fixed routing policy (later engines win if an index repeats).
+    query_map = {}
+    for raw_indices, engine in (
+        (args.athena_queries, Engine.Athena),
+        (args.redshift_queries, Engine.Redshift),
+        (args.aurora_queries, Engine.Aurora),
+    ):
+        if raw_indices is None:
+            continue
+        for qidx_str in raw_indices.split(","):
+            query_map[queries[int(qidx_str.strip())]] = engine
+    clp = CachedLocationPolicy(query_map)
+
+    # 5. Replace the policy: cached locations first, forest policy as the fallback.
+    enum_blueprint = EnumeratedBlueprint(blueprint)
+    definite_policy = asyncio.run(
+        ForestPolicy.from_assets(
+            args.schema_name, RoutingPolicy.ForestTableCardinality, assets
+        )
+    )
+    replaced_policy = FullRoutingPolicy(
+        indefinite_policies=[clp], definite_policy=definite_policy
+    )
+    enum_blueprint.set_routing_policy(replaced_policy)
+
+    # 6. Ensure the provisioning is as expected.
+    enum_blueprint.set_aurora_provisioning(parse_provisioning(args.aurora_provisioning))
+    enum_blueprint.set_redshift_provisioning(
+        parse_provisioning(args.redshift_provisioning)
+    )
+
+    # 7. Adjust the placement. NOTE(review): Redshift is omitted here - confirm.
+    enum_blueprint.set_table_locations(
+        {table.name: [Engine.Aurora, Engine.Athena] for table in blueprint.tables()}
+    )
+
+    # 8. Transition to the new blueprint.
+    modified_blueprint = enum_blueprint.to_blueprint()
+    asyncio.run(run_transition(config, blueprint_mgr, modified_blueprint))
+
+
+if __name__ == "__main__":
+    main()