From ccd45c438ae61c15d7bbe5e5c641d07300ef2a13 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Thu, 9 May 2024 23:43:14 -0400 Subject: [PATCH] Add experiment configs for CH-BenCHmark scenario (#509) Part of #487. --- experiments/17-chbenchmark/common.sh | 58 +++++- experiments/17-chbenchmark/debug/COND | 25 +++ .../17-chbenchmark/debug/debug_config.yml | 12 +- experiments/17-chbenchmark/debug/run_full.sh | 25 +++ .../debug/set_up_starting_blueprint.sh | 20 +++ experiments/17-chbenchmark/scale_down/COND | 24 +++ .../17-chbenchmark/scale_down/brad.config | 6 + .../scale_down/ch_scale_down_config.yml | 167 ++++++++++++++++++ .../17-chbenchmark/scale_down/run_full.sh | 25 +++ .../scale_down/set_up_starting_blueprint.sh | 21 +++ src/brad/config/file.py | 6 + src/brad/daemon/daemon.py | 6 +- src/brad/front_end/front_end.py | 34 +++- src/brad/front_end/session.py | 5 +- .../set_up_starting_blueprint.py | 18 +- workloads/chbenchmark/queries.sql | 22 +++ 16 files changed, 457 insertions(+), 17 deletions(-) create mode 100755 experiments/17-chbenchmark/debug/run_full.sh create mode 100755 experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh create mode 100644 experiments/17-chbenchmark/scale_down/COND create mode 100644 experiments/17-chbenchmark/scale_down/brad.config create mode 100644 experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml create mode 100755 experiments/17-chbenchmark/scale_down/run_full.sh create mode 100755 experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh create mode 100644 workloads/chbenchmark/queries.sql diff --git a/experiments/17-chbenchmark/common.sh b/experiments/17-chbenchmark/common.sh index 2db49e0e..95ee520c 100644 --- a/experiments/17-chbenchmark/common.sh +++ b/experiments/17-chbenchmark/common.sh @@ -13,6 +13,7 @@ function start_brad() { } function run_tpcc() { + local results_name=$1 pushd ../../../workloads/chbenchmark/py-tpcc/ local args=( --no-load @@ -25,11 +26,66 @@ function run_tpcc() { if [[ ! -z $txn_zipfian_alpha ]]; then args+=(--zipfian-alpha $txn_zipfian_alpha) fi - RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc brad "${args[@]}" & + mkdir -p $COND_OUT/$results_name + RECORD_DETAILED_STATS=1 COND_OUT=$COND_OUT/$results_name python3 -m pytpcc.tpcc brad "${args[@]}" & tpcc_pid=$! popd } +function log_workload_point() { + msg=$1 + now=$(date --utc "+%Y-%m-%d %H:%M:%S") + echo "$now,$msg" >> $COND_OUT/points.log +} + +function start_repeating_olap_runner() { + local ra_clients=$1 + local ra_gap_s=$2 + local ra_gap_std_s=$3 + local query_indexes=$4 + local results_name=$5 + local client_offset=$6 + + local args=( + --num-clients $ra_clients + --num-front-ends $num_front_ends + --query-indexes $query_indexes + --query-bank-file $ra_query_bank_file + --avg-gap-s $ra_gap_s + --avg-gap-std-s $ra_gap_std_s + ) + + if [[ ! -z $ra_query_frequency_path ]]; then + args+=(--query-frequency-path $ra_query_frequency_path) + fi + + if [[ ! -z $client_offset ]]; then + args+=(--client-offset $client_offset) + fi + + >&2 echo "[Serial Repeating Analytics] Running with $ra_clients..." + results_dir=$COND_OUT/$results_name + mkdir -p $results_dir + + log_workload_point $results_name + COND_OUT=$results_dir python3.11 ../../../workloads/IMDB_extended/run_repeating_analytics_serial.py "${args[@]}" & + + # This is a special return value variable that we use. + runner_pid=$! 
+} + +function graceful_shutdown() { + for pid_var in "$@"; do + kill -INT $pid_var + done + for pid_var in "$@"; do + wait $pid_var + done + + kill -INT $brad_pid + wait $brad_pid +} + function extract_named_arguments() { # Evaluates any environment variables in this script's arguments. This script # should only be run on trusted input. diff --git a/experiments/17-chbenchmark/debug/COND b/experiments/17-chbenchmark/debug/COND index 7feaa352..7f403f67 100644 --- a/experiments/17-chbenchmark/debug/COND +++ b/experiments/17-chbenchmark/debug/COND @@ -81,3 +81,28 @@ run_experiment( "txn-zipfian-alpha": ZIPFIAN_ALPHA, }, ) + +# Query indices. +QUERIES = list(range(22)) +QUERIES.remove(4) +QUERIES.remove(13) +QUERIES_STR = ",".join([str(v) for v in QUERIES]) + +run_experiment( + name="run_full", + run="./run_full.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "debug_config.yml", # Relative to one level up. + "schema-name": "chbenchmark", + "txn-config-file": "brad.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "num-front-ends": 2, # TBD + "run-for-s": 60 * 60, # One hour + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) diff --git a/experiments/17-chbenchmark/debug/debug_config.yml b/experiments/17-chbenchmark/debug/debug_config.yml index c279878d..3b8a6015 100644 --- a/experiments/17-chbenchmark/debug/debug_config.yml +++ b/experiments/17-chbenchmark/debug/debug_config.yml @@ -6,7 +6,7 @@ # listen on successive ports (e.g., 6584, 6585, etc.). front_end_interface: "0.0.0.0" front_end_port: 6583 -num_front_ends: 1 +num_front_ends: 2 # If installed and enabled, BRAD will serve its UI from a webserver that listens # for connections on this network interface and port. @@ -42,7 +42,7 @@ front_end_query_latency_buffer_size: 100 # `default` means to use the policy encoded in the blueprint. Other values will # override the blueprint. -routing_policy: always_aurora +routing_policy: default # Whether to disable table movement for benchmark purposes (i.e., keep all # tables on all engines.) @@ -104,6 +104,8 @@ txn_latency_p90_ceiling_s: 0.030 # clusters instead of resizing the main Redshift cluster. use_preset_redshift_clusters: false +result_row_limit: 10 + # Used for ordering blueprints during planning. comparator: type: benefit_perf_ceiling # or `perf_ceiling` @@ -119,10 +121,8 @@ comparator: # Used for precomputed predictions. std_datasets: - - name: regular - path: workloads/IMDB_100GB/regular_test/ - - name: adhoc - path: workloads/IMDB_100GB/adhoc_test/ + - name: chbenchmark + path: workloads/chbenchmark/ # Blueprint planning trigger configs. diff --git a/experiments/17-chbenchmark/debug/run_full.sh b/experiments/17-chbenchmark/debug/run_full.sh new file mode 100755 index 00000000..f06a0504 --- /dev/null +++ b/experiments/17-chbenchmark/debug/run_full.sh @@ -0,0 +1,25 @@ +#! 
/bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_system_config_file=$(realpath $system_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad $abs_system_config_file $abs_physical_config_file + +sleep 30 + +run_tpcc "t_1" +start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. +graceful_shutdown $tpcc_pid $ra_pid diff --git a/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh new file mode 100755 index 00000000..2e7c9986 --- /dev/null +++ b/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh @@ -0,0 +1,20 @@ +#! /bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name chbenchmark \ + --query-bank-file ../../../workloads/chbenchmark/queries.sql \ + --redshift-queries "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ + --redshift-provisioning "dc2.large:16" \ + --aurora-provisioning "db.r6g.xlarge:1" \ + --system-config-file debug_config.yml \ + --physical-config-file $1 \ + --override-definite-routing redshift diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND new file mode 100644 index 00000000..f62230bc --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/COND @@ -0,0 +1,24 @@ +ZIPFIAN_ALPHA = 5.0 + +# Query indices. +QUERIES = list(range(22)) +QUERIES_STR = ",".join([str(v) for v in QUERIES]) + +run_experiment( + name="run_full", + run="./run_full.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "ch_scale_down_config.yml", # Relative to one level up. + "schema-name": "chbenchmark", + "txn-config-file": "brad.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "num-front-ends": 2, # TBD + "run-for-s": 2 * 60 * 60, # 2 hours + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) diff --git a/experiments/17-chbenchmark/scale_down/brad.config b/experiments/17-chbenchmark/scale_down/brad.config new file mode 100644 index 00000000..c71fe1e5 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/brad.config @@ -0,0 +1,6 @@ +# BradDriver Configuration File +[brad] +host = localhost +port = 6583 +isolation_level = REPEATABLE READ +use_worker_offset = true diff --git a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml new file mode 100644 index 00000000..3e40530d --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml @@ -0,0 +1,167 @@ +# This file contains configurations that are used by BRAD. These are default +# values and should be customized for specific situations. + +# BRAD's front end servers will listen for client connections on this interface +# and port. If `num_front_ends` is greater than one, subsequent front ends will +# listen on successive ports (e.g., 6584, 6585, etc.). 
+front_end_interface: "0.0.0.0"
+front_end_port: 6583
+num_front_ends: 2
+
+# If installed and enabled, BRAD will serve its UI from a webserver that listens
+# for connections on this network interface and port.
+ui_interface: "0.0.0.0"
+ui_port: 7583
+
+# Logging paths. If the value is in ALL_CAPS (with underscores), it is
+# interpreted as an environment variable (BRAD will log to the path stored in
+# the environment variable).
+
+# Where BRAD's daemon process will write its logs.
+daemon_log_file: COND_OUT
+
+# Where BRAD's front end processes will write their logs.
+front_end_log_path: COND_OUT
+
+# Where BRAD's blueprint planner will write debug logs.
+planner_log_path: COND_OUT
+
+# Where BRAD's metrics loggers will write their logs.
+metrics_log_path: COND_OUT
+
+# Probability that each transactional query will be logged.
+txn_log_prob: 0.10
+
+# Set to a non-zero value to enable automatic data syncing. When this is set to
+# 0, automatic syncing is disabled.
+data_sync_period_seconds: 0
+
+# BRAD's front end servers will report their metrics at regular intervals.
+front_end_metrics_reporting_period_seconds: 30
+front_end_query_latency_buffer_size: 100
+
+# `default` means to use the policy encoded in the blueprint. Other values will
+# override the blueprint.
+routing_policy: default
+
+# Whether to disable table movement for benchmark purposes (i.e., keep all
+# tables on all engines.)
+disable_table_movement: true
+
+# Epoch length for metrics and forecasting. This is the granularity at which
+# metrics/forecasting will be performed.
+epoch_length:
+  weeks: 0
+  days: 0
+  hours: 0
+  minutes: 1
+
+# Blueprint planning strategy.
+strategy: fp_query_based_beam
+
+# Used to specify the period of time over which to use data for planning.
+# Currently, this is a "look behind" window for the workload.
+planning_window:
+  weeks: 0
+  days: 0
+  hours: 1
+  minutes: 0
+
+# Used to aggregate metrics collected in the planning window.
+metrics_agg:
+  method: ewm # 'mean' is another option
+  alpha: 0.86466472 # 1 - 1 / e^2
+
+# Used during planning.
+reinterpret_second_as: 1
+
+# The query distribution must change by at least this much for a new blueprint
+# to be accepted.
+query_dist_change_frac: 0.1
+
+# The search bound for the provisioning.
+max_provisioning_multiplier: 2.5
+
+# Flag options for blueprint planning.
+use_io_optimized_aurora: true
+use_recorded_routing_if_available: true
+ensure_tables_together_on_one_engine: true
+
+# Loads used to prime the system when no information is available.
+aurora_initialize_load_fraction: 0.25
+redshift_initialize_load_fraction: 0.25
+
+# BRAD will not reduce predicted load lower than these values. Raise these
+# values to be more conservative against mispredictions.
+aurora_min_load_removal_fraction: 0.8
+redshift_min_load_removal_fraction: 0.8
+
+# Blueprint planning performance ceilings.
+query_latency_p90_ceiling_s: 360.0
+txn_latency_p90_ceiling_s: 0.080
+
+# If set to true, BRAD will attempt to use the specified preset Redshift
+# clusters instead of resizing the main Redshift cluster.
+use_preset_redshift_clusters: false
+
+result_row_limit: 10
+
+# Used for ordering blueprints during planning.
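+# A rough reading of the knobs below, inferred from the parameter names in
+# this file rather than from the planner source: `benefit_horizon` is the
+# window over which a candidate blueprint's expected benefit is accumulated,
+# and the penalty terms discourage blueprints whose predicted latencies
+# approach the ceilings configured above.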
+comparator: + type: benefit_perf_ceiling # or `perf_ceiling` + + benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator + weeks: 0 + days: 0 + hours: 24 + minutes: 0 + + penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator + penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator + +# Used for precomputed predictions. +std_datasets: + - name: chbenchmark + path: workloads/chbenchmark/ + +# Blueprint planning trigger configs. + +triggers: + enabled: false + check_period_s: 90 # Triggers are checked every X seconds. + check_period_offset_s: 360 # Wait 6 mins before starting. + + # Triggers will not fire for at least this many minutes after a new blueprint + # takes effect. Usually this should be greater than zero to give BRAD + # sufficient time to observe the effect of the blueprint on the workload. BRAD + # may wait longer to ensure metrics are also available for this many minutes. + observe_new_blueprint_mins: 5 + + elapsed_time: + disabled: true + multiplier: 60 # Multiplier over `planning_window`. + + redshift_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + aurora_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + variable_costs: + disabled: true + threshold: 1.0 + + query_latency_ceiling: + ceiling_s: 360.0 + sustained_epochs: 3 + + txn_latency_ceiling: + ceiling_s: 0.080 + sustained_epochs: 3 + + recent_change: + delay_epochs: 5 diff --git a/experiments/17-chbenchmark/scale_down/run_full.sh b/experiments/17-chbenchmark/scale_down/run_full.sh new file mode 100755 index 00000000..f06a0504 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/run_full.sh @@ -0,0 +1,25 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_system_config_file=$(realpath $system_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad $abs_system_config_file $abs_physical_config_file + +sleep 30 + +run_tpcc "t_1" +start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. +graceful_shutdown $tpcc_pid $ra_pid diff --git a/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh new file mode 100755 index 00000000..1735545e --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh @@ -0,0 +1,21 @@ +#! 
/bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name chbenchmark \ + --query-bank-file ../../../workloads/chbenchmark/queries.sql \ + --redshift-queries "0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ + --athena-queries "4" \ + --redshift-provisioning "dc2.large:16" \ + --aurora-provisioning "db.r6g.2xlarge:1" \ + --system-config-file ch_scale_down_config.yml \ + --physical-config-file $1 \ + --override-definite-routing redshift diff --git a/src/brad/config/file.py b/src/brad/config/file.py index c14facc2..fe781c23 100644 --- a/src/brad/config/file.py +++ b/src/brad/config/file.py @@ -288,6 +288,12 @@ def ui_port(self) -> int: else: return 7583 + def result_row_limit(self) -> Optional[int]: + try: + return self._raw["result_row_limit"] + except KeyError: + return None + def _extract_log_path(self, config_key: str) -> Optional[pathlib.Path]: if config_key not in self._raw: return None diff --git a/src/brad/daemon/daemon.py b/src/brad/daemon/daemon.py index f634eb05..56045c7b 100644 --- a/src/brad/daemon/daemon.py +++ b/src/brad/daemon/daemon.py @@ -66,6 +66,7 @@ from brad.planner.workload.builder import WorkloadBuilder from brad.planner.workload.provider import LoggedWorkloadProvider from brad.routing.policy import RoutingPolicy +from brad.routing.tree_based.forest_policy import ForestPolicy from brad.row_list import RowList from brad.utils.time_periods import period_start, universal_now from brad.ui.manager import UiManager @@ -328,7 +329,10 @@ async def _run_setup(self) -> None: or self._config.routing_policy == RoutingPolicy.Default ): logger.info("Setting up the cardinality estimator...") - if is_stub_mode: + blueprint = self._blueprint_mgr.get_blueprint() + policy = blueprint.get_routing_policy() + requires_estimator = isinstance(policy.definite_policy, ForestPolicy) + if is_stub_mode or not requires_estimator: estimator: Estimator = StubEstimator() else: estimator = await PostgresEstimator.connect( diff --git a/src/brad/front_end/front_end.py b/src/brad/front_end/front_end.py index f7f871fe..560ba211 100644 --- a/src/brad/front_end/front_end.py +++ b/src/brad/front_end/front_end.py @@ -453,8 +453,20 @@ async def _run_query_impl( else: connection = session.engines.get_reader_connection(engine_to_use) cursor = connection.cursor_sync() + # HACK: To work around dialect differences between + # Athena/Aurora/Redshift for now. This should be replaced by + # a more robust translation layer. + if ( + engine_to_use == Engine.Athena + and "ascii" in query_rep.raw_query + ): + translated_query = query_rep.raw_query.replace( + "ascii", "codepoint" + ) + else: + translated_query = query_rep.raw_query start = universal_now() - await cursor.execute(query_rep.raw_query) + await cursor.execute(translated_query) end = universal_now() except ( pyodbc.ProgrammingError, @@ -513,9 +525,23 @@ async def _run_query_impl( # Extract and return the results, if any. try: - # Using `fetchall_sync()` is lower overhead than the async interface. 
-                results = [tuple(row) for row in cursor.fetchall_sync()]
-                log_verbose(logger, "Responded with %d rows.", len(results))
+                result_row_limit = self._config.result_row_limit()
+                if result_row_limit is not None:
+                    results = []
+                    for _ in range(result_row_limit):
+                        row = cursor.fetchone_sync()
+                        if row is None:
+                            break
+                        results.append(tuple(row))
+                    log_verbose(
+                        logger,
+                        "Responded with %d rows (limited to %d rows).",
+                        len(results),
+                        result_row_limit,
+                    )
+                else:
+                    # Using `fetchall_sync()` has lower overhead than the async interface.
+                    results = [tuple(row) for row in cursor.fetchall_sync()]
+                    log_verbose(logger, "Responded with %d rows.", len(results))
                 return (
                     results,
                     (cursor.result_schema(results) if retrieve_schema else None),
diff --git a/src/brad/front_end/session.py b/src/brad/front_end/session.py
index 09ae5311..416e2515 100644
--- a/src/brad/front_end/session.py
+++ b/src/brad/front_end/session.py
@@ -11,6 +11,7 @@
 from brad.front_end.engine_connections import EngineConnections
 from brad.planner.estimator import Estimator
 from brad.routing.policy import RoutingPolicy
+from brad.routing.tree_based.forest_policy import ForestPolicy
 from brad.data_stats.postgres_estimator import PostgresEstimator
 from brad.data_stats.stub_estimator import StubEstimator
 from brad.utils.time_periods import universal_now
@@ -117,7 +118,9 @@ async def create_new_session(self) -> Tuple[SessionId, Session]:
             routing_policy_override == RoutingPolicy.ForestTableSelectivity
             or routing_policy_override == RoutingPolicy.Default
         ):
-            if self._config.stub_mode_path() is None:
+            policy = blueprint.get_routing_policy()
+            requires_estimator = isinstance(policy.definite_policy, ForestPolicy)
+            if self._config.stub_mode_path() is None and requires_estimator:
                 estimator: Optional[Estimator] = await PostgresEstimator.connect(
                     self._schema_name, self._config
                 )
diff --git a/workloads/IMDB_extended/set_up_starting_blueprint.py b/workloads/IMDB_extended/set_up_starting_blueprint.py
index be5bf2c1..62589379 100644
--- a/workloads/IMDB_extended/set_up_starting_blueprint.py
+++ b/workloads/IMDB_extended/set_up_starting_blueprint.py
@@ -15,6 +15,7 @@
 from brad.routing.cached import CachedLocationPolicy
 from brad.routing.policy import RoutingPolicy
 from brad.routing.tree_based.forest_policy import ForestPolicy
+from brad.routing.always_one import AlwaysOneRouter
 from brad.utils import set_up_logging
 
 logger = logging.getLogger(__name__)
@@ -83,6 +84,11 @@ def main():
     parser.add_argument(
         "--aurora-provisioning", type=str, help="Format: :"
     )
+    parser.add_argument(
+        "--override-definite-routing",
+        type=str,
+        help="An engine to which queries are always routed when the indefinite policies do not capture them.",
+    )
     args = parser.parse_args()
 
     set_up_logging(debug_mode=True)
@@ -124,11 +130,15 @@ def main():
     # 5. Replace the policy.
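+    # The cached location policy above pins the listed queries to specific
+    # engines; the "definite" policy chosen below handles every other query.
+    # With --override-definite-routing, that fallback becomes a fixed
+    # single-engine router (AlwaysOneRouter) instead of the learned
+    # ForestPolicy.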
enum_blueprint = EnumeratedBlueprint(blueprint) - definite_policy = asyncio.run( - ForestPolicy.from_assets( - args.schema_name, RoutingPolicy.ForestTableCardinality, assets + if args.override_definite_routing is not None: + routing_engine = Engine.from_str(args.override_definite_routing) + definite_policy = AlwaysOneRouter(routing_engine) + else: + definite_policy = asyncio.run( + ForestPolicy.from_assets( + args.schema_name, RoutingPolicy.ForestTableCardinality, assets + ) ) - ) replaced_policy = FullRoutingPolicy( indefinite_policies=[clp], definite_policy=definite_policy ) diff --git a/workloads/chbenchmark/queries.sql b/workloads/chbenchmark/queries.sql new file mode 100644 index 00000000..6ced3e67 --- /dev/null +++ b/workloads/chbenchmark/queries.sql @@ -0,0 +1,22 @@ +select ol_number, sum(ol_quantity) as sum_qty, sum(ol_amount) as sum_amount, avg(ol_quantity) as avg_qty, avg(ol_amount) as avg_amount, count(*) as count_order from order_line group by ol_number order by ol_number; +select su_suppkey, su_name, n_name, i_id, i_name, su_address, su_phone, su_comment from item, supplier, stock, nation, region, (select s_i_id as m_i_id, min(s_quantity) as m_s_quantity from stock, supplier, nation, region where mod((s_w_id*s_i_id),10000)=su_suppkey and su_nationkey=n_nationkey and n_regionkey=r_regionkey and r_name like 'Europ%' group by s_i_id) m where i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and i_data like '%b' and r_name like 'Europ%' and i_id=m_i_id and s_quantity = m_s_quantity order by n_name, su_name, i_id; +select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from customer, new_order, orders, order_line where c_state like 'A%' and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and no_w_id = o_w_id and no_d_id = o_d_id and no_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by ol_o_id, ol_w_id, ol_d_id, o_entry_d order by revenue desc, o_entry_d; +select o_ol_cnt, count(*) as order_count from orders where exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt; +select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(cast(substring(c_state,1,1) as varchar(1))) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc; +select sum(ol_amount) as revenue from order_line where ol_quantity between 1 and 100000; +WITH inner_query AS (select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ((n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany'))) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue 
FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; +select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year; +select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc; +select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc; +select s_i_id, sum(s_order_cnt) as ordercount from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_nationkey = n_nationkey and n_name = 'Germany' group by s_i_id having sum(s_order_cnt) > (select sum(s_order_cnt) * .005 from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_nationkey = n_nationkey and n_name = 'Germany') order by ordercount desc; +select o_ol_cnt, sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count from orders, order_line where ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d group by o_ol_cnt order by o_ol_cnt; +select c_count, count(*) as custdist from (select c_id, count(o_id) from customer left outer join orders on ( c_w_id = o_w_id and c_d_id = o_d_id and c_id = o_c_id and o_carrier_id > 8) group by c_id) as c_orders (c_id, c_count) group by c_count order by custdist desc, c_count desc; +select 100.00 * sum(case when i_data like 'PR%' then ol_amount else 0 end) / (1+sum(ol_amount)) as promo_revenue from order_line, item where ol_i_id = i_id; +with revenue (supplier_no, total_revenue) as (select mod((s_w_id * s_i_id),10000) as supplier_no, sum(ol_amount) as total_revenue from order_line, stock where ol_i_id = s_i_id and ol_supply_w_id = s_w_id group by mod((s_w_id * s_i_id),10000)) select su_suppkey, su_name, su_address, su_phone, total_revenue from supplier, revenue where su_suppkey = supplier_no and total_revenue = (select max(total_revenue) from revenue) order by su_suppkey; +select i_name, substring(i_data, 1, 3) as brand, i_price, count(distinct (mod((s_w_id * s_i_id),10000))) as supplier_cnt from stock, item where i_id = s_i_id and i_data not like 'zz%' and (mod((s_w_id * 
s_i_id),10000) not in (select su_suppkey from supplier where su_comment like '%bad%')) group by i_name, substring(i_data, 1, 3), i_price order by supplier_cnt desc; +select sum(ol_amount) / 2.0 as avg_yearly from order_line, (select i_id, avg(ol_quantity) as a from item, order_line where i_data like '%b' and ol_i_id = i_id group by i_id) t where ol_i_id = t.i_id and ol_quantity < t.a; +select c_last, c_id o_id, o_entry_d, o_ol_cnt, sum(ol_amount) from customer, orders, order_line where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by o_id, o_w_id, o_d_id, c_id, c_last, o_entry_d, o_ol_cnt having sum(ol_amount) > 200 order by sum(ol_amount) desc, o_entry_d; +select sum(ol_amount) as revenue from order_line, item where (ol_i_id = i_id and i_data like '%a' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,2,3)) or ( ol_i_id = i_id and i_data like '%b' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,2,4)) or ( ol_i_id = i_id and i_data like '%c' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,5,3)); +select su_name, su_address from supplier, nation where su_suppkey in (select mod(s_i_id * s_w_id, 10000) from stock, order_line where s_i_id in (select i_id from item where i_data like 'co%') and ol_i_id=s_i_id group by s_i_id, s_w_id, s_quantity having 2*s_quantity > sum(ol_quantity)) and su_nationkey = n_nationkey and n_name = 'Germany' order by su_name; +select su_name, count(*) as numwait from supplier, order_line l1, orders, stock, nation where ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and l1.ol_delivery_d > o_entry_d and not exists (select * from order_line l2 where l2.ol_o_id = l1.ol_o_id and l2.ol_w_id = l1.ol_w_id and l2.ol_d_id = l1.ol_d_id and l2.ol_delivery_d > l1.ol_delivery_d) and su_nationkey = n_nationkey and n_name = 'Germany' group by su_name order by numwait desc, su_name; +select substring(c_state,1,1) as country, count(*) as numcust, sum(c_balance) as totacctbal from customer where substring(c_phone,1,1) in ('1','2','3','4','5','6','7') and c_balance > (select avg(c_BALANCE) from customer where c_balance > 0.00 and substring(c_phone,1,1) in ('1','2','3','4','5','6','7')) and not exists (select * from orders where o_c_id = c_id and o_w_id = c_w_id and o_d_id = c_d_id) group by substring(c_state,1,1) order by substring(c_state,1,1);
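
For reference, the capped fetch path added to `src/brad/front_end/front_end.py`
above boils down to the following minimal sketch. It is illustrative only: it
uses generic DB-API cursor methods (`fetchone`/`fetchall`) in place of BRAD's
`fetchone_sync`/`fetchall_sync`, and the `fetch_limited` helper is hypothetical
(it does not exist in the codebase).

    from typing import Any, List, Optional, Tuple

    def fetch_limited(cursor: Any, limit: Optional[int]) -> List[Tuple]:
        if limit is None:
            # No row cap configured: fetch the full result set in one call.
            return [tuple(row) for row in cursor.fetchall()]
        # Cap configured (e.g., `result_row_limit: 10` in the configs above):
        # pull rows one at a time and stop at the cap or when the result set
        # is exhausted, whichever comes first.
        results: List[Tuple] = []
        for _ in range(limit):
            row = cursor.fetchone()
            if row is None:
                break
            results.append(tuple(row))
        return results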