
Add table movement enabled scale down experiment and log movement progress (#507)

Part of #487.
  • Loading branch information
geoffxy authored May 4, 2024
1 parent 0fac521 commit 68954e7
Showing 5 changed files with 216 additions and 1 deletion.
9 changes: 9 additions & 0 deletions experiments/15-e2e-scenarios-v2/scale_down/COND
@@ -58,6 +58,15 @@ run_experiment(
    },
)

run_experiment(
    name="brad_100g_tm",
    run="./run_workload.sh",
    options={
        "system-config-file": "scale_down_config_tm.yml",
        **COMMON_100G_CONFIGS,
    },
)

run_command(
    name="brad_100g_debug",
    run="./run_workload_debug.sh",
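The new `brad_100g_tm` target reuses the same `run_workload.sh` driver as the existing `brad_100g` target and differs only in the system config file it points at. A minimal sketch of how the `options` dictionary is assembled follows; the keys inside `COMMON_100G_CONFIGS` are invented placeholders, only the merge semantics are the point:

```python
# Minimal sketch of the options merge used above. The contents of
# COMMON_100G_CONFIGS are hypothetical placeholders, not taken from the repo.
COMMON_100G_CONFIGS = {
    "schema-name": "imdb_extended_100g",  # assumed key/value
    "num-clients": 8,                     # assumed key/value
}

options = {
    "system-config-file": "scale_down_config_tm.yml",
    **COMMON_100G_CONFIGS,
}

# Later entries win on key collisions, so as long as COMMON_100G_CONFIGS does
# not itself define "system-config-file", this target is the baseline scale
# down experiment with the table-movement-enabled config swapped in.
print(options)
```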
164 changes: 164 additions & 0 deletions experiments/15-e2e-scenarios-v2/scale_down/scale_down_config_tm.yml
@@ -0,0 +1,164 @@
# This file contains configurations that are used by BRAD. These are default
# values and should be customized for specific situations.

# BRAD's front end servers will listen for client connections on this interface
# and port. If `num_front_ends` is greater than one, subsequent front ends will
# listen on successive ports (e.g., 6584, 6585, etc.).
front_end_interface: "0.0.0.0"
front_end_port: 6583
num_front_ends: 8

# Logging paths. If the value is in ALL_CAPS (with underscores), it is
# interpreted as an environment variable (BRAD will log to the path stored in
# the environment variable).

# Where BRAD's daemon process will write its logs.
daemon_log_file: COND_OUT

# Where BRAD's front end processes will write their logs.
front_end_log_path: COND_OUT

# Where BRAD's blueprint planner will write debug logs.
planner_log_path: COND_OUT

# Where BRAD's metrics loggers will write their logs.
metrics_log_path: COND_OUT

# Probability that each transactional query will be logged.
txn_log_prob: 0.01

# Set to a non-zero value to enable automatic data syncing. When this is set to 0,
# automatic syncing is disabled.
data_sync_period_seconds: 0

# BRAD's front end servers will report their metrics at regular intervals.
front_end_metrics_reporting_period_seconds: 30
front_end_query_latency_buffer_size: 100

# `default` means to use the policy encoded in the blueprint. Other values will
# override the blueprint.
routing_policy: default

# Whether to disable table movement for benchmark purposes (i.e., keep all
# tables on all engines).
disable_table_movement: false
skip_sync_before_table_movement: true

# Epoch length for metrics and forecasting. This is the granularity at which
# metrics/forecasting will be performed.
epoch_length:
  weeks: 0
  days: 0
  hours: 0
  minutes: 1

# Blueprint planning strategy.
strategy: fp_query_based_beam

# Used to specify the period of time over which to use data for planning.
# Currently, this is a "look behind" window for the workload.
planning_window:
  weeks: 0
  days: 0
  hours: 1
  minutes: 0

# Used to aggregate metrics collected in the planning window.
metrics_agg:
  method: ewm # 'mean' is another option
  alpha: 0.86466472 # 1 - 1 / e^2

# Used during planning.
reinterpret_second_as: 1

# The query distribution must change by at least this much for a new blueprint
# to be accepted.
query_dist_change_frac: 0.1

# The search bound for the provisioning.
max_provisioning_multiplier: 2.5

# Flag options for blueprint planning.
use_io_optimized_aurora: true
use_recorded_routing_if_available: true
ensure_tables_together_on_one_engine: true

# Loads used to prime the system when no information is available.
aurora_initialize_load_fraction: 0.25
redshift_initialize_load_fraction: 0.25

# BRAD will not reduce predicted load lower than these values. Raise these
# values to be more conservative against mispredictions.
aurora_min_load_removal_fraction: 0.8
redshift_min_load_removal_fraction: 0.8

# Blueprint planning performance ceilings.
query_latency_p90_ceiling_s: 30.0
txn_latency_p90_ceiling_s: 0.030

aurora_provisioning_search_distance: 900.0
redshift_provisioning_search_distance: 900.0

# Used for ordering blueprints during planning.
comparator:
  type: benefit_perf_ceiling # or `perf_ceiling`

  benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator
    weeks: 0
    days: 0
    hours: 24
    minutes: 0

  penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator
  penalty_power: 2 # Only used by the `benefit_perf_ceiling` comparator

aurora_max_query_factor: 4.0
aurora_max_query_factor_replace: 10000.0
redshift_peak_load_threshold: 99.0
redshift_peak_load_multiplier: 1.5

planner_max_workers: 16

# Used for precomputed predictions.
std_datasets:
  - name: regular
    path: workloads/IMDB_100GB/regular_test/
  - name: adhoc
    path: workloads/IMDB_100GB/adhoc_test/

# Blueprint planning trigger configs.

triggers:
  enabled: true
  check_period_s: 90 # Triggers are checked every X seconds.
  check_period_offset_s: 360 # Wait 6 mins before starting.
  observe_new_blueprint_mins: 3

  elapsed_time:
    disabled: true
    multiplier: 60 # Multiplier over `planning_window`.

  redshift_cpu:
    lo: 15
    hi: 85
    sustained_epochs: 3

  aurora_cpu:
    lo: 15
    hi: 85
    sustained_epochs: 3

  variable_costs:
    disabled: true
    threshold: 1.0

  query_latency_ceiling:
    ceiling_s: 30.0
    sustained_epochs: 3

  txn_latency_ceiling:
    ceiling_s: 0.030
    sustained_epochs: 3

  recent_change:
    delay_epochs: 5
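To make the `epoch_length` and `metrics_agg` settings above concrete: metrics are collected per 1-minute epoch, aggregated over the 1-hour planning window with an exponentially weighted mean, and the alpha value in the config is exactly 1 - 1/e^2. The snippet below is an illustration of that aggregation, not BRAD's implementation; the sample CPU values are invented:

```python
import math

# alpha chosen as 1 - 1/e^2, matching the 0.86466472 value in the config above.
alpha = 1.0 - 1.0 / math.e**2


def ewm(values: list[float], alpha: float) -> float:
    """Exponentially weighted mean over per-epoch samples, newest last."""
    avg = values[0]
    for v in values[1:]:
        avg = alpha * v + (1.0 - alpha) * avg
    return avg


# With a 1-minute epoch and a 1-hour planning window there are ~60 samples.
cpu_per_epoch = [40.0] * 55 + [70.0, 72.0, 75.0, 74.0, 76.0]  # made-up CPU %
print(round(alpha, 8), round(ewm(cpu_per_epoch, alpha), 2))
```

With an alpha this large, the aggregate tracks the most recent epochs closely, which is what lets the planner react to a recent load change inside the look-behind window.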
@@ -71,6 +71,7 @@ def main():
        help="Comma separated list of indices.",
        default="99,56,32,92,91,49,30,83,94,38,87,86,76,37,31,46",
    )
    parser.add_argument("--place-tables-both", action="store_true")
    args = parser.parse_args()
    set_up_logging(debug_mode=True)

@@ -130,7 +131,7 @@ def main():
    new_placement = {}
    aurora_txn = ["theatres", "showings", "ticket_orders", "movie_info", "aka_title"]
    for table in blueprint.tables():
        if table.name in aurora_txn:
        if args.place_tables_both or table.name in aurora_txn:
            new_placement[table.name] = [Engine.Aurora, Engine.Redshift]
        else:
            new_placement[table.name] = [Engine.Redshift]
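The effect of the new `--place-tables-both` flag, shown as a self-contained sketch (the transactional table names are the ones from the snippet above; the `Engine` enum and example tables here are stand-ins, not BRAD's classes):

```python
from enum import Enum


class Engine(Enum):
    # Stand-in for brad's Engine enum so this sketch runs on its own.
    Aurora = "aurora"
    Redshift = "redshift"


aurora_txn = ["theatres", "showings", "ticket_orders", "movie_info", "aka_title"]


def placement(tables, place_tables_both: bool):
    # Mirrors the placement rule above: txn tables (or, with the flag, every
    # table) go on both engines; everything else stays on Redshift only.
    return {
        t: [Engine.Aurora, Engine.Redshift]
        if place_tables_both or t in aurora_txn
        else [Engine.Redshift]
        for t in tables
    }


tables = ["theatres", "title", "cast_info"]  # example table names
print(placement(tables, place_tables_both=False))  # only txn tables on Aurora
print(placement(tables, place_tables_both=True))   # everything on both engines
```

Starting the experiment with every table on both engines is what gives the scale-down run actual table movement work to do once the planner adopts a leaner placement.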
6 changes: 6 additions & 0 deletions src/brad/config/system_event.py
@@ -37,3 +37,9 @@ class SystemEvent(enum.Enum):
    # Used when a service level objective is changed while BRAD is running (used
    # for experiments).
    ChangedSlos = "changed_slos"

    # Used to mark table movement progress.
    PreTableMovementStarted = "pre_table_movement_started"
    PreTableMovementCompleted = "pre_table_movement_completed"
    PostTableMovementStarted = "post_table_movement_started"
    PostTableMovementCompleted = "post_table_movement_completed"
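These enum members are the "log movement progress" part of the commit: the transition orchestrator below emits a start/completion pair around each movement phase, tagged with the engine name where relevant. A minimal sketch of how such a log could be post-processed to measure movement time; the CSV layout (timestamp, event name, extra string) and the file name are assumptions for illustration, not BRAD's actual on-disk format:

```python
import csv
from datetime import datetime


def movement_durations(path: str) -> dict[str, float]:
    """Pair *_started / *_completed events and return durations in seconds.

    Assumed columns per row: ISO 8601 timestamp, event name, extra string.
    """
    starts: dict[str, datetime] = {}
    durations: dict[str, float] = {}
    with open(path, newline="") as f:
        for ts, event, extra in csv.reader(f):
            # e.g. "post_table_movement:aurora" or "pre_table_movement:"
            key = event.replace("_started", "").replace("_completed", "") + ":" + extra
            when = datetime.fromisoformat(ts)
            if event.endswith("_started"):
                starts[key] = when
            elif event.endswith("_completed") and key in starts:
                durations[key] = (when - starts[key]).total_seconds()
    return durations


# Hypothetical file name; prints something like
# {"pre_table_movement:": 412.0, "post_table_movement:aurora": 35.2, ...}
print(movement_durations("brad_daemon_events.csv"))
```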
35 changes: 35 additions & 0 deletions src/brad/daemon/transition_orchestrator.py
@@ -143,6 +143,9 @@ async def run_prepare_then_transition(
        else:
            logger.info("Not running table sync before movement.")

        if self._system_event_logger is not None:
            self._system_event_logger.log(SystemEvent.PreTableMovementStarted)

        # 3. Create tables in new locations as needed
        directory = self._blueprint_mgr.get_directory()

@@ -192,6 +195,8 @@ async def run_prepare_then_transition(
        await asyncio.gather(*table_awaitables)

        logger.info("Table movement complete.")
        if self._system_event_logger is not None:
            self._system_event_logger.log(SystemEvent.PreTableMovementCompleted)

        # Close connections
        await self._cxns.close()
@@ -480,6 +485,11 @@ async def _run_aurora_post_transition(
            and len(table_diffs) > 0
            and self._config.disable_table_movement is False
        ):
            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.PostTableMovementStarted, "aurora"
                )

            views_to_drop = []
            triggers_to_drop = []
            tables_to_drop = []
@@ -511,6 +521,11 @@ async def _run_aurora_post_transition(
            assert self._cxns is not None
            self._cxns.get_connection(Engine.Aurora).cursor_sync().commit_sync()

            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.PostTableMovementCompleted, "aurora"
                )

        # Change the provisioning.
        if diff is not None:
            if new.num_nodes() == 0:
@@ -611,6 +626,11 @@ async def _run_redshift_post_transition(
    ) -> None:
        # Drop removed tables
        if table_diffs is not None and self._config.disable_table_movement is False:
            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.PostTableMovementStarted, "redshift"
                )

            to_drop = []
            for table_diff in table_diffs:
                if Engine.Redshift in table_diff.removed_locations():
@@ -623,6 +643,11 @@ async def _run_redshift_post_transition(
            assert self._cxns is not None
            self._cxns.get_connection(Engine.Redshift).cursor_sync().commit_sync()

            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.PostTableMovementCompleted, "redshift"
                )

        # Pause the cluster if we are transitioning to 0 nodes.
        if diff is not None:
            if diff.new_num_nodes() == 0:
@@ -642,6 +667,11 @@ async def _run_athena_post_transition(
            and self._config.disable_table_movement is False
            and self._config.skip_athena_table_deletion is False
        ):
            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.PostTableMovementStarted, "athena"
                )

            for table_diff in table_diffs:
                if Engine.Athena in table_diff.removed_locations():
                    to_drop.append(table_diff.table_name())
@@ -650,6 +680,11 @@ async def _run_athena_post_transition(
            ctx = self._new_execution_context()
            await d.execute(ctx)

            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.PostTableMovementCompleted, "athena"
                )

    async def _enforce_table_diff_additions(self, diff: TableDiff) -> None:
        # Unload table to S3
        table_name = diff.table_name()
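The guard-and-log pattern added throughout this file has the following shape, shown as a standalone sketch; `SystemEventLogger` and `Orchestrator` here are stand-ins to make the snippet runnable, not the classes from `brad.daemon`:

```python
from enum import Enum
from typing import Optional


class SystemEvent(Enum):
    PostTableMovementStarted = "post_table_movement_started"
    PostTableMovementCompleted = "post_table_movement_completed"


class SystemEventLogger:
    """Stand-in logger used only to make this sketch self-contained."""

    def log(self, event: SystemEvent, extra: str = "") -> None:
        print(event.value, extra)


class Orchestrator:
    def __init__(self, event_logger: Optional[SystemEventLogger]) -> None:
        # The logger is optional; some runs proceed without event logging.
        self._system_event_logger = event_logger

    def _log_event(self, event: SystemEvent, extra: str = "") -> None:
        # Single place for the "is a logger configured?" check.
        if self._system_event_logger is not None:
            self._system_event_logger.log(event, extra)

    def move_tables(self, engine: str) -> None:
        self._log_event(SystemEvent.PostTableMovementStarted, engine)
        # ... perform the table movement for `engine` ...
        self._log_event(SystemEvent.PostTableMovementCompleted, engine)


Orchestrator(SystemEventLogger()).move_tables("aurora")
Orchestrator(None).move_tables("redshift")  # logging is a no-op here
```

Bracketing each engine's post-transition work with a started/completed pair is what lets the experiment scripts later reconstruct how long table movement took on each engine.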
