Commit

Adjust the Redshift load model to account for skew across nodes (#420)

* Implement updated redshift load model

* Update tests

* Add old instances back, add additional test

* Fix execution bugs

* Adjust load models further

* Fix test
geoffxy authored Dec 27, 2023
1 parent a94bab8 commit d063534
Showing 9 changed files with 226 additions and 71 deletions.

experiments/15-e2e-scenarios-v2/scale_up/scale_up_config.yml (11 additions & 5 deletions)

@@ -89,7 +89,7 @@ redshift_initialize_load_fraction: 0.25
 # BRAD will not reduce predicted load lower than these values. Raise these
 # values to be more conservative against mispredictions.
 aurora_min_load_removal_fraction: 0.8
-redshift_min_load_removal_fraction: 0.8
+redshift_min_load_removal_fraction: 0.9
 
 # Blueprint planning performance ceilings.
 query_latency_p90_ceiling_s: 30.0
@@ -101,16 +101,16 @@ use_preset_redshift_clusters: true
 
 # Used for ordering blueprints during planning.
 comparator:
-  type: perf_ceiling # or `perf_ceiling`
+  type: benefit_perf_ceiling # or `perf_ceiling`
 
 benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator
   weeks: 0
-  days: 0
-  hours: 2
+  days: 1
+  hours: 0
   minutes: 0
 
 penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator
-penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator
+penalty_power: 2 # Only used by the `benefit_perf_ceiling` comparator
 
 # Used for precomputed predictions.
 std_datasets:
@@ -119,6 +119,12 @@ std_datasets:
   - name: adhoc
     path: workloads/IMDB_100GB/adhoc_test/
 
+aurora_max_query_factor: 4.0
+aurora_max_query_factor_replace: 10000.0
+
+redshift_peak_load_threshold: 95.0
+redshift_peak_load_multiplier: 2.0
+
 # Blueprint planning trigger configs.
 
 triggers:
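
Two of the comparator changes above deserve a gloss: the benefit horizon grows from 2 hours to 1 day, and `penalty_power` drops from 8 to 2, presumably softening the penalty applied as a blueprint approaches its performance ceiling. Below is a minimal sketch of how a `penalty_threshold`/`penalty_power` pair typically shapes such a curve, assuming a polynomial ramp; it illustrates the general technique and is not BRAD's actual comparator code.

    def penalty(fraction_of_ceiling: float, threshold: float = 0.8, power: float = 2.0) -> float:
        """Zero penalty up to the threshold, then a polynomial ramp to 1.0."""
        if fraction_of_ceiling <= threshold:
            return 0.0
        return ((fraction_of_ceiling - threshold) / (1.0 - threshold)) ** power

    # With power=8, penalty(0.9) ~= 0.0039: violations barely register until
    # the ceiling is nearly reached. With power=2, penalty(0.9) == 0.25: the
    # comparator pushes back much earlier.
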

src/brad/config/planner.py (20 additions & 1 deletion)

@@ -1,10 +1,11 @@
+import math
 import yaml
 import logging
 import numpy as np
 import numpy.typing as npt
 import importlib.resources as pkg_resources
 from datetime import timedelta
-from typing import Dict, Optional, Any
+from typing import Dict, Optional, Any, Tuple
 from brad.planner.strategy import PlanningStrategy
 import brad.planner as brad_planner
@@ -311,3 +312,21 @@ def redshift_min_load_removal_fraction(self) -> float:
         except KeyError:
             logger.warning("Using default Redshift min load removal fraction: 0.75")
             return 0.75
+
+    def aurora_max_query_factor(self) -> Tuple[float, float]:
+        try:
+            return (
+                self._raw["aurora_max_query_factor"],
+                self._raw["aurora_max_query_factor_replace"],
+            )
+        except KeyError:
+            return math.inf, math.inf
+
+    def redshift_peak_load_multiplier(self) -> Tuple[float, float]:
+        try:
+            return (
+                self._raw["redshift_peak_load_threshold"],
+                self._raw["redshift_peak_load_multiplier"],
+            )
+        except KeyError:
+            return 110.0, 1.0
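
Note the KeyError fallbacks: `(math.inf, math.inf)` means the Aurora query-factor clamp can never trigger, and `(110.0, 1.0)` pairs a CPU percentage that real utilization cannot reach with an identity multiplier, so both features are no-ops unless explicitly configured. The consumer of the Redshift pair is not visible in this diff, but the accessor's shape, those defaults, and the commit's goal (account for skew across nodes) suggest the pattern sketched below: when peak per-node load crosses the threshold, cluster-averaged metrics understate the hot node, so the predicted load is scaled up. This is a hypothetical illustration, not BRAD's actual scoring code.

    def adjust_for_peak_load(
        predicted_load: float,
        peak_node_cpu_percent: float,  # max CPU utilization across nodes
        threshold: float = 95.0,       # redshift_peak_load_threshold
        multiplier: float = 2.0,       # redshift_peak_load_multiplier
    ) -> float:
        # Hypothetical: a nearly saturated node means averaged load metrics
        # hide skew, so inflate the prediction to compensate.
        if peak_node_cpu_percent >= threshold:
            return predicted_load * multiplier
        return predicted_load
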

src/brad/planner/beam/query_based.py (2 additions & 1 deletion)

@@ -357,7 +357,8 @@ async def _run_replan_impl(
             "Selected blueprint details: %s", json.dumps(debug_values, indent=2)
         )
         logger.info(
-            "Metrics used during planning: %s", json.dumps(metrics._asdict(), indent=2)
+            "Metrics used during planning: %s",
+            json.dumps(metrics._asdict(), indent=2, default=str),
         )
 
         return best_blueprint, best_blueprint_score
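
The planner's metrics named tuple evidently holds values the default JSON encoder rejects (timestamps are the usual culprit); `default=str` tells `json.dumps` to fall back to `str()` for any object it cannot serialize instead of raising `TypeError`. A self-contained demonstration of the failure mode being fixed, with illustrative dictionary contents:

    import json
    from datetime import datetime, timedelta

    record = {"window_start": datetime(2023, 12, 27), "period": timedelta(hours=1)}

    # json.dumps(record, indent=2) raises:
    #   TypeError: Object of type datetime is not JSON serializable
    print(json.dumps(record, indent=2, default=str))
    # {
    #   "window_start": "2023-12-27 00:00:00",
    #   "period": "1:00:00"
    # }
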

src/brad/planner/beam/table_based.py (2 additions & 1 deletion)

@@ -370,7 +370,8 @@ async def _run_replan_impl(
             "Selected blueprint details: %s", json.dumps(debug_values, indent=2)
         )
         logger.info(
-            "Metrics used during planning: %s", json.dumps(metrics._asdict(), indent=2)
+            "Metrics used during planning: %s",
+            json.dumps(metrics._asdict(), indent=2, default=str),
        )
 
         return best_blueprint, best_blueprint_score

src/brad/planner/metrics.py (10 additions & 1 deletion)

@@ -12,7 +12,10 @@
 from brad.config.metrics import FrontEndMetric
 from brad.config.planner import PlannerConfig
 from brad.daemon.monitor import Monitor
-from brad.daemon.redshift_metrics import relevant_redshift_node_dimensions
+from brad.daemon.redshift_metrics import (
+    relevant_redshift_node_dimensions,
+    MAX_REDSHIFT_NODES,
+)
 from brad.utils.time_periods import elapsed_time, universal_now
 
 logger = logging.getLogger(__name__)
@@ -677,6 +680,12 @@ def _extract_metrics_from_monitor(
         redshift = redshift_source.read_k_most_recent(
             k=epochs_to_extract, metric_ids=redshift_metric_ids
         )
+        if blueprint.redshift_provisioning().num_nodes() > MAX_REDSHIFT_NODES:
+            logger.warning(
+                "Running with %d Redshift nodes. Only capturing metrics on %d.",
+                blueprint.redshift_provisioning().num_nodes(),
+                MAX_REDSHIFT_NODES,
+            )
     else:
         redshift_metric_ids = _REDSHIFT_METRICS.copy()
         redshift = pd.DataFrame([], columns=_REDSHIFT_METRICS)

src/brad/planner/scoring/data/redshift_instances.json (27 additions & 0 deletions)

@@ -16,5 +16,32 @@
     "min_nodes": 2,
     "max_nodes": 128,
     "usd_per_hour": 4.80
+  },
+  {
+    "instance_type": "ra3.xlplus",
+    "vcpus": 4,
+    "memory_mib": 32768,
+    "io_gb_s": 0.65,
+    "min_nodes": 1,
+    "max_nodes": 16,
+    "usd_per_hour": 1.086
+  },
+  {
+    "instance_type": "ra3.4xlarge",
+    "vcpus": 12,
+    "memory_mib": 98304,
+    "io_gb_s": 2,
+    "min_nodes": 2,
+    "max_nodes": 32,
+    "usd_per_hour": 3.26
+  },
+  {
+    "instance_type": "ra3.16xlarge",
+    "vcpus": 48,
+    "memory_mib": 393216,
+    "io_gb_s": 8,
+    "min_nodes": 2,
+    "max_nodes": 128,
+    "usd_per_hour": 13.04
   }
 ]
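
These are presumably the "old instances" the commit message adds back, and they feed the planner's Redshift cost and performance model. A quick sanity check on the pricing, assuming hourly cost scales linearly with node count (the helper is illustrative, not BRAD's scoring API):

    # A 4-node ra3.4xlarge cluster costs 4 * 3.26 = 13.04 USD/hour, the same
    # rate as a single ra3.16xlarge node, which packs 4x the vCPUs and memory.
    def cluster_cost_per_hour(instance: dict, num_nodes: int) -> float:
        assert instance["min_nodes"] <= num_nodes <= instance["max_nodes"]
        return instance["usd_per_hour"] * num_nodes

    ra3_4xlarge = {"usd_per_hour": 3.26, "min_nodes": 2, "max_nodes": 32}
    print(cluster_cost_per_hour(ra3_4xlarge, 4))  # 13.04
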

src/brad/planner/scoring/performance/unified_aurora.py (4 additions & 1 deletion)

@@ -42,6 +42,9 @@ def compute(
         query_factor = cls.query_movement_factor(
             base_query_run_times, query_arrival_counts, ctx
         )
+        max_factor, max_factor_replace = ctx.planner_config.aurora_max_query_factor()
+        if query_factor is not None and query_factor > max_factor:
+            query_factor = max_factor_replace
         has_queries = base_query_run_times.shape[0] > 0
         txn_cpu_denorm, ana_node_cpu_denorm = cls.predict_loads(
             has_queries, curr_prov, next_prov, query_factor, ctx, debug_dict
@@ -285,7 +288,7 @@ def predict_query_latency_load_resources(
         )
         # Predicted running time is the query's execution time alone plus the
         # expected wait time (due to system load)
-        return mean_service_time + wait_time
+        return prov_predicted_latency + wait_time
 
     @staticmethod
     def predict_query_latency_resources(
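
Two distinct fixes land in this file. The first hunk clamps the predicted query movement factor using the new config accessor; restated as a standalone function:

    from typing import Optional

    # With the scale-up config above (aurora_max_query_factor: 4.0,
    # aurora_max_query_factor_replace: 10000.0), any movement factor above 4.0
    # is replaced by a sentinel so large that the candidate blueprint is
    # effectively priced out rather than mildly penalized.
    def clamp_query_factor(
        query_factor: Optional[float],
        max_factor: float = 4.0,
        max_factor_replace: float = 10000.0,
    ) -> Optional[float]:
        if query_factor is not None and query_factor > max_factor:
            return max_factor_replace
        return query_factor

The second hunk corrects the return value of `predict_query_latency_load_resources`: the old code returned `mean_service_time + wait_time`, apparently discarding the provisioning-adjusted execution time computed just above, while `prov_predicted_latency + wait_time` matches the comment's description (execution time under the target provisioning plus load-induced wait).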