Skip to content

Commit

Permalink
Scenario and experiment improvements (#334)
Browse files Browse the repository at this point in the history
* Improvements for e2e scenarios

* Updates

* Experiment fixes

* Adjust workload, print fewer warnings

* Adjustments

* Check in config changes
  • Loading branch information
geoffxy authored Oct 30, 2023
1 parent 2eefd45 commit fe547ba
Show file tree
Hide file tree
Showing 13 changed files with 125 additions and 62 deletions.
9 changes: 5 additions & 4 deletions config/planner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ query_dist_change_frac: 0.1

triggers:
enabled: true
check_period_s: 60 # Triggers are checked every X seconds.
check_period_offset_s: 20
check_period_s: 90 # Triggers are checked every X seconds.
check_period_offset_s: 360 # Wait 6 mins before starting.

elapsed_time:
disabled: true
Expand All @@ -41,7 +41,8 @@ triggers:
sustained_epochs: 3

variable_costs:
threshold: 0.5
disabled: true
threshold: 1.0

query_latency_ceiling:
ceiling_s: 30.0
Expand All @@ -52,7 +53,7 @@ triggers:
###

beam_size: 100
max_provisioning_multiplier: 3
max_provisioning_multiplier: 2.5
max_feasible_cpu: 85


Expand Down
1 change: 1 addition & 0 deletions experiments/15-e2e-scenarios-v2/common.cond
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Relative to individual experiment scenario directories.
IMDB_20GB_REGULAR_QUERY_BANK = "../../../workloads/IMDB_20GB/regular_test/queries.sql"
IMDB_100GB_REGULAR_QUERY_BANK = "../../../workloads/IMDB_100GB/regular_test/queries.sql"
IMDB_100GB_REGULAR_QUERY_FREQS = "../../../workloads/IMDB_100GB/regular_test/query_frequency.npy"
30 changes: 20 additions & 10 deletions experiments/15-e2e-scenarios-v2/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ function start_brad() {
--config-file $config_file \
--schema-name $schema_name \
--planner-config-file $planner_config_file \
--temp-config-file config/temp_config_sample.yml \
--temp-config-file config/temp_config.yml \
&
brad_pid=$!
popd
Expand All @@ -20,7 +20,7 @@ function start_brad_debug() {
--config-file $config_file \
--schema-name $schema_name \
--planner-config-file $planner_config_file \
--temp-config-file config/temp_config_sample.yml \
--temp-config-file config/temp_config.yml \
--debug \
&
brad_pid=$!
Expand Down Expand Up @@ -89,19 +89,25 @@ function start_repeating_olap_runner() {
local ra_gap_s=$2
local ra_gap_std_s=$3

local args=(
--num-clients $ra_clients
--num-front-ends $num_front_ends
--query-indexes $ra_query_indexes
--query-bank-file $ra_query_bank_file
--avg-gap-s $ra_gap_s
--avg-gap-std-s $ra_gap_std_s
)

if [[ ! -z $ra_query_frequency_path ]]; then
args+=(--query-frequency-path $ra_query_frequency_path)
fi

>&2 echo "[Repeating Analytics] Running with $ra_clients..."
results_dir=$COND_OUT/ra_${ra_clients}
mkdir -p $results_dir

log_workload_point "rana_${ra_clients}"
COND_OUT=$results_dir python3 ../../../workloads/IMDB_extended/run_repeating_analytics.py \
--num-clients $ra_clients \
--avg-gap-s $ra_gap_s \
--avg-gap-std-s $ra_gap_std_s \
--num-front-ends $num_front_ends \
--query-indexes $ra_query_indexes \
--query-bank-file $ra_query_bank_file \
&
COND_OUT=$results_dir python3 ../../../workloads/IMDB_extended/run_repeating_analytics.py "${args[@]}" &
rana_pid=$!
}

Expand Down Expand Up @@ -173,6 +179,10 @@ function extract_named_arguments() {
ra_gap_std_s=${phys_arg:15}
fi

if [[ $phys_arg =~ --ra-query-frequency-path=.+ ]]; then
ra_query_frequency_path=${phys_arg:26}
fi

if [[ $phys_arg =~ --num-front-ends=.+ ]]; then
num_front_ends=${phys_arg:17}
fi
Expand Down
42 changes: 29 additions & 13 deletions experiments/15-e2e-scenarios-v2/scale_down/COND
Original file line number Diff line number Diff line change
@@ -1,18 +1,5 @@
include("../common.cond")

run_experiment(
name="brad_100g",
run="./run_workload.sh",
options={
# TODO: Ideally, configurations are shared. Only keep AWS secrets separate.
"config-file": "config/config_100g.yml",
"planner-config-file": "config/planner.yml",
# TODO: Select regular query indexes
"ra-query-indexes": "0,1,2",
"ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK,
},
)

run_experiment(
name="brad_20g",
run="./run_workload.sh",
Expand Down Expand Up @@ -42,3 +29,32 @@ run_command(
"num-front-ends": 4,
},
)

run_experiment(
name="brad_100g",
run="./run_workload.sh",
options={
# TODO: Ideally, configurations are shared. Only keep AWS secrets separate.
"config-file": "config/config_large_100.yml",
"planner-config-file": "config/planner.yml",
"schema-name": "imdb_extended_100g",
# TODO: Select regular query indexes
"ra-query-indexes": ",".join(map(str, list(range(25, 35)) + list(range(75, 80)))),
"ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK,
"num-front-ends": 8,
},
)

run_command(
name="brad_100g_debug",
run="./run_workload_debug.sh",
options={
# TODO: Ideally, configurations are shared. Only keep AWS secrets separate.
"config-file": "config/config_large_100.yml",
"planner-config-file": "config/planner.yml",
"schema-name": "imdb_extended_100g",
"ra-query-indexes": ",".join(map(str, list(range(25, 35)) + list(range(75, 80)))),
"ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK,
"num-front-ends": 8,
},
)
8 changes: 4 additions & 4 deletions experiments/15-e2e-scenarios-v2/scale_down/run_workload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ log_workload_point "brad_start_initiated"
sleep 30

log_workload_point "clients_starting"
start_repeating_olap_runner 1 30 5 # Implicit: --query-indexes
start_txn_runner 2
start_repeating_olap_runner 4 15 5 # Implicit: --query-indexes
start_txn_runner 4
log_workload_point "clients_started"

# Wait until a re-plan and transition completes.
Expand All @@ -31,10 +31,10 @@ log_workload_point "clients_started"
# - Turn off Redshift
# Detection time is ~5 minutes
# Transition time is ~7 minutes
total_second_phase_time_s="$((20 * 60))"
total_second_phase_time_s="$((30 * 60))"
wait_start="$(date -u +%s)"

poll_file_for_event $COND_OUT/brad_daemon_events.csv "post_transition_completed" 15
poll_file_for_event $COND_OUT/brad_daemon_events.csv "post_transition_completed" 30
log_workload_point "after_replan"

wait_end="$(date -u +%s)"
Expand Down
10 changes: 6 additions & 4 deletions experiments/15-e2e-scenarios-v2/scale_down/run_workload_debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ extract_named_arguments $@
trap "cancel_experiment" INT
trap "cancel_experiment" TERM

start_brad $config_file $planner_config_file
sleep 30
# Useful for testing out blueprint planning without executing the transition.
export BRAD_IGNORE_BLUEPRINT=1
start_brad_debug $config_file $planner_config_file
sleep 10

start_repeating_olap_runner 1 30 5 # Implicit: --query-indexes
start_txn_runner 2
start_repeating_olap_runner 8 15 5 # Implicit: --query-indexes
start_txn_runner 8

echo "READY -- Sleeping for 1 hour. Hit Ctrl-C to stop."
sleep $((60 * 60))
6 changes: 6 additions & 0 deletions src/brad/daemon/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@

# Temporarily used.
PERSIST_BLUEPRINT_VAR = "BRAD_PERSIST_BLUEPRINT"
IGNORE_ALL_BLUEPRINTS_VAR = "BRAD_IGNORE_BLUEPRINT"


class BradDaemon:
Expand Down Expand Up @@ -342,6 +343,11 @@ async def _handle_new_blueprint(self, blueprint: Blueprint, score: Score) -> Non
"""
Informs the server about a new blueprint.
"""

if IGNORE_ALL_BLUEPRINTS_VAR in os.environ:
logger.info("Skipping all blueprints. Chosen blueprint: %s", blueprint)
return

if self._system_event_logger is not None:
self._system_event_logger.log(SystemEvent.NewBlueprintProposed)

Expand Down
3 changes: 1 addition & 2 deletions src/brad/planner/beam/query_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,7 @@ async def _run_replan_impl(self, window_multiplier: int = 1) -> None:

# 5. Run beam search to formulate the table placements.
for j, query_idx in enumerate(query_indices[1:]):
if j % 100 == 0:
logger.debug("Processing index %d of %d", j, len(query_indices[1:]))
logger.debug("Processing index %d of %d", j, len(query_indices[1:]))

next_top_k: List[BlueprintCandidate] = []
query = analytical_queries[query_idx]
Expand Down
22 changes: 20 additions & 2 deletions src/brad/planner/compare/cost.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,10 @@ def is_better_than(left: ComparableBlueprint, right: ComparableBlueprint) -> boo
return True

# Query latency ceilings.
left_lat = _get_or_compute_p99_latency(left)
right_lat = _get_or_compute_p99_latency(right)
# left_lat = _get_or_compute_p99_latency(left)
# right_lat = _get_or_compute_p99_latency(right)
left_lat = _get_or_compute_nth_largest_latency(left, 2)
right_lat = _get_or_compute_nth_largest_latency(right, 2)

if left_lat > max_query_latency_s and right_lat > max_query_latency_s:
# Both are above the latency ceiling.
Expand Down Expand Up @@ -165,6 +167,22 @@ def _get_or_compute_max_latency(bp: ComparableBlueprint) -> float:
return max_lat


def _get_or_compute_nth_largest_latency(bp: ComparableBlueprint, n: int) -> float:
pred_lats = bp.get_predicted_analytical_latencies()
if len(pred_lats) < n:
actual_n = len(pred_lats)
else:
actual_n = n

stored = bp.get_memoized_value(f"{actual_n}_largest")
if stored is not None:
return stored

nth_largest = np.partition(pred_lats, -actual_n)[-actual_n]
bp.set_memoized_value(f"{actual_n}_largest", nth_largest)
return nth_largest


def _get_or_compute_p99_latency(bp: ComparableBlueprint) -> float:
stored = bp.get_memoized_value("p99_latency")
if stored is not None:
Expand Down
2 changes: 2 additions & 0 deletions src/brad/planner/scoring/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def __init__(
# across blueprints.
self.engine_latency_norm_factor: Dict[Engine, float] = {}

self.already_logged_txn_interference_warning = False

async def simulate_current_workload_routing(self, router: Router) -> None:
self.current_query_locations[Engine.Aurora].clear()
self.current_query_locations[Engine.Redshift].clear()
Expand Down
48 changes: 28 additions & 20 deletions src/brad/planner/scoring/performance/unified_aurora.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,23 +97,28 @@ def compute(
# TODO: Possible edge cases here due to cumulative prediction error (e.g.,
# pred_txn_load > overall_writer_load violates our model's assumptions).
# We need a robust way to handle these potential errors.
if pred_txn_load > overall_writer_load:
logger.warning(
"Predicted transactional load higher than the overall writer load. "
"Overall load: %.2f, Client txn thpt: %.2f, Predicted txn load: %.2f",
overall_writer_load,
client_txns_per_s,
pred_txn_load,
)
if pred_txn_cpu_denorm > overall_writer_cpu_util_denorm:
logger.warning(
"Predicted transactional CPU denormalized utilization higher than the overall "
"CPU use. Overall use: %.2f, Client txn thpt: %.2f, Predicted CPU "
"use: %.2f",
overall_writer_cpu_util_denorm,
client_txns_per_s,
pred_txn_cpu_denorm,
)
if not ctx.already_logged_txn_interference_warning:
if pred_txn_load > overall_writer_load:
logger.warning(
"Predicted transactional load higher than the overall "
"writer load. Overall load: %.2f, Client txn thpt: %.2f, "
"Predicted txn load: %.2f",
overall_writer_load,
client_txns_per_s,
pred_txn_load,
)
ctx.already_logged_txn_interference_warning = True

if pred_txn_cpu_denorm > overall_writer_cpu_util_denorm:
logger.warning(
"Predicted transactional CPU denormalized utilization "
"higher than the overall CPU use. Overall use: %.2f, "
"Client txn thpt: %.2f, Predicted CPU use: %.2f",
overall_writer_cpu_util_denorm,
client_txns_per_s,
pred_txn_cpu_denorm,
)
ctx.already_logged_txn_interference_warning = True

# 2. Adjust the analytical portion of the system load for query movement
# (compute `query_factor``).
Expand Down Expand Up @@ -142,9 +147,12 @@ def compute(
)

else:
total_analytics_load = max(0, overall_writer_load - pred_txn_load)
# Analytics load should never be zero - so we impute a small value
# to work around mispredictions.
eps = 1e-3 if len(base_query_run_times) > 0 else 0.0
total_analytics_load = max(eps, overall_writer_load - pred_txn_load)
total_analytics_cpu_denorm = max(
0, overall_writer_cpu_util_denorm - pred_txn_cpu_denorm
eps, overall_writer_cpu_util_denorm - pred_txn_cpu_denorm
)

total_analytics_load *= query_factor
Expand Down Expand Up @@ -298,4 +306,4 @@ def add_debug_values(self, dest: Dict[str, int | float | str]) -> None:
dest.update(self.debug_values)


_AURORA_BASE_RESOURCE_VALUE = aurora_num_cpus(Provisioning("db.r6g.large", 1))
_AURORA_BASE_RESOURCE_VALUE = aurora_num_cpus(Provisioning("db.r6g.xlarge", 1))
2 changes: 1 addition & 1 deletion src/brad/planner/scoring/performance/unified_redshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,4 @@ def add_debug_values(self, dest: Dict[str, int | float | str]) -> None:
dest.update(self.debug_values)


_REDSHIFT_BASE_RESOURCE_VALUE = redshift_num_cpus(Provisioning("dc2.large", 1))
_REDSHIFT_BASE_RESOURCE_VALUE = redshift_num_cpus(Provisioning("dc2.large", 2))
4 changes: 2 additions & 2 deletions workloads/IMDB_extended/run_transactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ def noop_handler(_signal, _frame):
if rand_backoff is None:
rand_backoff = RandomizedExponentialBackoff(
max_retries=100,
base_delay_s=2.0,
max_delay_s=timedelta(minutes=10).total_seconds(),
base_delay_s=0.1,
max_delay_s=timedelta(minutes=1).total_seconds(),
)

# Delay retrying in the case of a transient error (this
Expand Down

0 comments on commit fe547ba

Please sign in to comment.