Skip to content

Commit

Permalink
Add changing SLO experiment scripts and definitions (#494)
Browse files Browse the repository at this point in the history
  • Loading branch information
geoffxy authored Apr 17, 2024
1 parent b3627da commit d125187
Show file tree
Hide file tree
Showing 8 changed files with 482 additions and 1 deletion.
31 changes: 31 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/COND
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Cond experiment definitions for the changing-SLO scenario.
# Shared constants (e.g., IMDB_100GB_REGULAR_QUERY_BANK) come from the include.
include("../common.cond")

# Indexes into the regular IMDB 100 GB query bank for the analytical queries
# issued by the repeating OLAP clients.
QUERIES = [99, 56, 32, 92, 91, 49, 30, 83, 94, 38, 87, 86, 76, 37, 31, 46, 58, 61, 62, 64, 69, 73, 74, 51, 57, 60]

# Options shared by the regular and debug variants of the experiment.
COMMON_CONFIGS = {
    "physical-config-file": "config/physical_config_100gb.yml",
    "schema-name": "imdb_extended_100g",
    "ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK,
    "txn-scale-factor": IMDB_100GB_SF,
    # Matches num_front_ends in slo_change_config.yml (16 txn + 8 OLAP clients).
    "num-front-ends": 24,
    "dataset-type": "100gb",
    # The runner scripts expect a comma-separated list of query indexes.
    "ra-query-indexes": ",".join(map(str, QUERIES))
}

# Full experiment: run_workload.sh changes the SLOs after 10 minutes and then
# runs for another hour.
run_experiment(
    name="brad_100g",
    run="./run_workload.sh",
    options={
        "system-config-file": "slo_change_config.yml",
        **COMMON_CONFIGS,
    },
)

# Shorter debug variant: run_workload_debug.sh changes the SLOs after
# 2 minutes and runs for another 10 minutes.
run_command(
    name="brad_100g_debug",
    run="./run_workload_debug.sh",
    options={
        "system-config-file": "slo_change_config.yml",
        **COMMON_CONFIGS,
    },
)
48 changes: 48 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#! /bin/bash

# Changing-SLO experiment driver.
# Starts BRAD, launches 8 repeating OLAP clients and 16 serial transactional
# clients, changes the performance SLOs after 10 minutes, then runs for one
# more hour before shutting everything down gracefully.

script_loc=$(cd "$(dirname "$0")" && pwd -P)
cd "$script_loc"
source ../common.sh

# Arguments:
# --system-config-file
# --physical-config-file
# --query-indexes
# Quote "$@" so arguments containing spaces survive word splitting intact.
extract_named_arguments "$@"

start_brad "$system_config_file" "$physical_config_file"
log_workload_point "brad_start_initiated"
# Give BRAD's front ends time to come up before connecting clients.
sleep 30

log_workload_point "clients_starting"
# 8 clients, offset 16 (for the transactional clients)
start_repeating_olap_runner 8 15 5 "$ra_query_indexes" "ra_8" 16
rana_pid=$runner_pid

start_txn_runner_serial 16  # Implicit: --dataset-type
txn_pid=$runner_pid

log_workload_point "clients_started"

# Tears down both client runners; installed on INT/TERM so an interrupted
# experiment still cleans up after itself.
function inner_cancel_experiment() {
  cancel_experiment $rana_pid $txn_pid
}

trap "inner_cancel_experiment" INT
trap "inner_cancel_experiment" TERM

# Sleep for 10 minutes and then change the SLOs.
sleep $(( 10 * 60 ))

log_workload_point "changing_slo"
# New p90 ceilings: 30 s analytical, 30 ms transactional.
brad cli --command "BRAD_CHANGE_SLO 30.0 0.030"
log_workload_point "changed_slo"

# Wait another hour before stopping.
sleep $(( 60 * 60 ))

# Shut down everything now.
log_workload_point "experiment_workload_done"
>&2 echo "Experiment done. Shutting down runners..."
graceful_shutdown $rana_pid $txn_pid
log_workload_point "shutdown_complete"
49 changes: 49 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#! /bin/bash

# Debug variant of the changing-SLO experiment: same client setup as
# run_workload.sh, but uses the debug BRAD launcher, ignores the persisted
# blueprint, changes the SLOs after only 2 minutes, and finishes 10 minutes
# later.

script_loc=$(cd "$(dirname "$0")" && pwd -P)
cd "$script_loc"
source ../common.sh

# Arguments:
# --system-config-file
# --physical-config-file
# --query-indexes
# Quote "$@" so arguments containing spaces survive word splitting intact.
extract_named_arguments "$@"

# Skip loading the persisted blueprint so debug runs start from a clean slate.
export BRAD_IGNORE_BLUEPRINT=1
start_brad_debug "$system_config_file" "$physical_config_file"
log_workload_point "brad_start_initiated"
# Give BRAD's front ends time to come up before connecting clients.
sleep 30

log_workload_point "clients_starting"
# 8 clients, offset 16 (for the transactional clients)
start_repeating_olap_runner 8 15 5 "$ra_query_indexes" "ra_8" 16
rana_pid=$runner_pid

start_txn_runner_serial 16  # Implicit: --dataset-type
txn_pid=$runner_pid

log_workload_point "clients_started"

# Tears down both client runners; installed on INT/TERM so an interrupted
# experiment still cleans up after itself.
function inner_cancel_experiment() {
  cancel_experiment $rana_pid $txn_pid
}

trap "inner_cancel_experiment" INT
trap "inner_cancel_experiment" TERM

# Sleep for 2 minutes and then change the SLOs.
sleep $(( 2 * 60 ))

log_workload_point "changing_slo"
# New p90 ceilings: 30 s analytical, 30 ms transactional.
brad cli --command "BRAD_CHANGE_SLO 30.0 0.030"
log_workload_point "changed_slo"

# Wait another 10 mins before stopping.
sleep $(( 10 * 60 ))

# Shut down everything now.
log_workload_point "experiment_workload_done"
>&2 echo "Experiment done. Shutting down runners..."
graceful_shutdown $rana_pid $txn_pid
log_workload_point "shutdown_complete"
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#! /bin/bash

# Installs the fixed starting blueprint for the changing-SLO experiment.
# Usage: $0 path/to/physical/config.yml

# Quote "$1" — unquoted, an empty argument makes `[ -z ]` a one-argument
# test and an argument with spaces breaks the test entirely.
if [ -z "$1" ]; then
  >&2 echo "Usage: $0 path/to/physical/config.yml"
  exit 1
fi

script_loc=$(cd "$(dirname "$0")" && pwd -P)
cd "$script_loc"
source ../common.sh

# Pin queries 99,56,32,92,91 to Aurora and the remainder to Redshift, with
# small fixed provisionings, as the experiment's starting point.
python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \
  --schema-name imdb_extended_100g \
  --query-bank-file ../../../workloads/IMDB_100GB/regular_test/queries.sql \
  --aurora-queries "99,56,32,92,91" \
  --redshift-queries "49,30,83,94,38,87,86,76,37,31,46,58,61,62,64,69,73,74,51,57,60" \
  --redshift-provisioning "dc2.large:2" \
  --aurora-provisioning "db.t4g.medium:2" \
  --system-config-file slo_change_config.yml \
  --physical-config-file "$1"
168 changes: 168 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# This file contains configurations that are used by BRAD. These are default
# values and should be customized for specific situations.

# BRAD's front end servers will listen for client connections on this interface
# and port. If `num_front_ends` is greater than one, subsequent front ends will
# listen on successive ports (e.g., 6584, 6585, etc.).
front_end_interface: "0.0.0.0"
front_end_port: 6583
num_front_ends: 24

# Logging paths. If the value is in ALL_CAPS (with underscores), it is
# interpreted as an environment variable (BRAD will log to the path stored in
# the environment variable).

# Where BRAD's daemon process will write its logs.
daemon_log_file: COND_OUT

# Where BRAD's front end processes will write their logs.
front_end_log_path: COND_OUT

# Where BRAD's blueprint planner will write debug logs.
planner_log_path: COND_OUT

# Where BRAD's metrics loggers will write their logs.
metrics_log_path: COND_OUT

# Probability that each transactional query will be logged.
txn_log_prob: 0.01

# Set to a non-zero value enable automatic data syncing. When this is set to 0,
# automatic syncing is disabled.
data_sync_period_seconds: 0

# BRAD's front end servers will report their metrics at regular intervals.
front_end_metrics_reporting_period_seconds: 30
front_end_query_latency_buffer_size: 100

# `default` means to use the policy encoded in the blueprint. Other values will
# override the blueprint.
routing_policy: default

# Whether to disable table movement for benchmark purposes (i.e., keep all
# tables on all engines.)
disable_table_movement: true

# Epoch length for metrics and forecasting. This is the granularity at which
# metrics/forecasting will be performed.
epoch_length:
weeks: 0
days: 0
hours: 0
minutes: 1

# Blueprint planning strategy.
strategy: fp_query_based_beam

# Used to specify the period of time over which to use data for planning.
# Currently, this is a "look behind" window for the workload.
planning_window:
weeks: 0
days: 0
hours: 1
minutes: 0

# Used to aggregate metrics collected in the planning window.
metrics_agg:
method: ewm # 'mean' is another option
alpha: 0.86466472 # 1 - 1 / e^2

# Used during planning.
reinterpret_second_as: 1

# The query distribution must change by at least this much for a new blueprint
# to be accepted.
query_dist_change_frac: 0.1

# The search bound for the provisioning.
max_provisioning_multiplier: 2.5

# Flag options for blueprint planning.
use_io_optimized_aurora: true
use_recorded_routing_if_available: true
ensure_tables_together_on_one_engine: true

# Loads used to prime the system when no information is available.
aurora_initialize_load_fraction: 0.25
redshift_initialize_load_fraction: 0.25

# BRAD will not reduce predicted load lower than these values. Raise these
# values to be more conservative against mispredictions.
aurora_min_load_removal_fraction: 0.8
redshift_min_load_removal_fraction: 0.9

# Blueprint planning performance ceilings.
# These will change to 30 s and 30 ms during the experiment.
query_latency_p90_ceiling_s: 60.0
txn_latency_p90_ceiling_s: 0.060

# If set to true, BRAD will attempt to use the specified preset Redshift
# clusters instead of resizing the main Redshift cluster.
use_preset_redshift_clusters: true

# Used for ordering blueprints during planning.
comparator:
type: benefit_perf_ceiling # or `perf_ceiling`

benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator
weeks: 0
days: 0
hours: 3
minutes: 0

penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator
penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator

# Used for precomputed predictions.
std_datasets:
- name: regular
path: workloads/IMDB_100GB/regular_test/
- name: adhoc
path: workloads/IMDB_100GB/adhoc_test/

aurora_max_query_factor: 4.0
aurora_max_query_factor_replace: 10000.0

redshift_peak_load_threshold: 95.0
redshift_peak_load_multiplier: 2.0

planner_max_workers: 16
aurora_provisioning_search_distance: 900.0
redshift_provisioning_search_distance: 900.0

# Blueprint planning trigger configs.

triggers:
enabled: true
check_period_s: 90 # Triggers are checked every X seconds.
check_period_offset_s: 600 # Wait 10 mins before starting.
observe_new_blueprint_mins: 10

elapsed_time:
disabled: true
multiplier: 60 # Multiplier over `planning_window`.

redshift_cpu:
lo: 15
hi: 85
sustained_epochs: 3

aurora_cpu:
lo: 15
hi: 85
sustained_epochs: 3

variable_costs:
disabled: true
threshold: 1.0

query_latency_ceiling:
ceiling_s: 60.0
sustained_epochs: 3

txn_latency_ceiling:
ceiling_s: 0.060
sustained_epochs: 3

recent_change:
delay_epochs: 5
4 changes: 4 additions & 0 deletions src/brad/config/system_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@ class SystemEvent(enum.Enum):

# Use this for long running experiments.
ReachedExpectedState = "reached_expected_state"

# Used when a service level objective is changed while BRAD is running (used
# for experiments).
ChangedSlos = "changed_slos"
8 changes: 7 additions & 1 deletion src/brad/daemon/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ async def _handle_internal_command(self, command: str) -> RowList:
parts = command.split(" ")
if self._temp_config is None:
return [("Cannot change SLOs because TempConfig is missing.",)]
if len(parts) <= 3:
if len(parts) < 3:
return [("Need to specify query and txn p90 SLOs",)]

query_p90_s = float(parts[1])
Expand All @@ -757,6 +757,12 @@ async def _handle_internal_command(self, command: str) -> RowList:
elif isinstance(t, TransactionLatencyCeiling):
t.set_latency_ceiling(txn_p90_s)

if self._system_event_logger is not None:
self._system_event_logger.log(
SystemEvent.ChangedSlos,
f"query_p90_s={query_p90_s}; txn_p90_s={txn_p90_s}",
)

return [
(
f"p90 SLOs changed to (query {query_p90_s:.3f} s), (txn {txn_p90_s:.3f} s)",
Expand Down
Loading

0 comments on commit d125187

Please sign in to comment.