From 48a491ff2f98f04774512ba308c9786fe0b52d38 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 26 Apr 2024 13:10:16 -0400 Subject: [PATCH 01/30] Check in table size stats for CH-BenCHmark --- src/brad/planner/constants.yml | 16 ++++++++++++++++ tools/calibration/table_sizes.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/brad/planner/constants.yml b/src/brad/planner/constants.yml index 51d9a428..4c393d51 100644 --- a/src/brad/planner/constants.yml +++ b/src/brad/planner/constants.yml @@ -181,6 +181,22 @@ table_extract_bytes_per_row: movie_info: 29.57191 person_info: 133.458044 + # TPC-C Warehouses: 1740 + # Around ~120 GB of uncompressed data. + chbenchmark: + warehouse: 92.40747126436781 + item: 75.62581 + stock: 308.868974 + district: 98.52431034482758 + customer: 570.148704 + history: 65.51127 + orders: 40.134002 + new_order: 9.937048 + order_line: 68.538322 + region: 216.8 + nation: 185.03225806451613 + supplier: 194.728 + ### ### Models used to account for hardware/system load. ### diff --git a/tools/calibration/table_sizes.py b/tools/calibration/table_sizes.py index ca1c3bc4..0fce773d 100644 --- a/tools/calibration/table_sizes.py +++ b/tools/calibration/table_sizes.py @@ -30,7 +30,7 @@ def delete_s3_object(client, bucket: str, key: str) -> None: async def main_impl(args) -> None: - config = ConfigFile.load(args.config_file) + config = ConfigFile.load_from_physical_config(args.physical_config_file) assets = AssetManager(config) mgr = BlueprintManager(config, assets, args.schema_name) await mgr.load() @@ -121,7 +121,7 @@ def main(): "Run this after bootstrapping a schema to measure table sizing " "constants used by the blueprint planner." ) - parser.add_argument("--config-file", type=str, required=True) + parser.add_argument("--physical-config-file", type=str, required=True) parser.add_argument("--schema-name", type=str, required=True) parser.add_argument("--debug", action="store_true") # Unloading is slow - we do not need to unload the entire table to get a From f5cb08488649f07ee3078aecec3dfdc3964fa8b9 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 26 Apr 2024 21:48:16 +0000 Subject: [PATCH 02/30] Adjust starting config for the SLO experiment --- .../15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh b/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh index 7c17816d..8834bd79 100755 --- a/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh +++ b/experiments/15-e2e-scenarios-v2/slo_change/set_up_starting_blueprint.sh @@ -15,6 +15,6 @@ python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ --aurora-queries "99,56,32,92,91" \ --redshift-queries "49,30,83,94,38,87,86,76,37,31,46,58,61,62,64,69,73,74,51,57,60" \ --redshift-provisioning "dc2.large:2" \ - --aurora-provisioning "db.r6g.xlarge:2" \ + --aurora-provisioning "db.r6g.xlarge:1" \ --system-config-file slo_change_config.yml \ --physical-config-file $1 From e8fd93e8c5528917f933f7d651e033a359c655cb Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 27 Apr 2024 20:15:51 -0400 Subject: [PATCH 03/30] Add data gathering scripts for CH-BenCHmark (load and instance types) (#501) Part of #487. 
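For reference, the Redshift reconfiguration flow that gather_redshift.sh implements with the AWS CLI (attempt an elastic resize, fall back to a classic resize, then poll until the cluster reports `available`) can be sketched with boto3. This sketch is an illustration only and is not part of the patch; the function name `resize_and_wait` is hypothetical, and the hard-coded `us-east-1` region and the 60-second settle delay are assumptions carried over from the shell script.

```python
import time

import boto3
from botocore.exceptions import ClientError

redshift = boto3.client("redshift", region_name="us-east-1")


def resize_and_wait(cluster_id: str, node_type: str, num_nodes: int) -> None:
    try:
        # Prefer an elastic resize, as the script does.
        redshift.resize_cluster(
            ClusterIdentifier=cluster_id,
            ClusterType="multi-node",
            NodeType=node_type,
            NumberOfNodes=num_nodes,
            Classic=False,
        )
    except ClientError:
        # Fall back to a classic resize when elastic resize is not possible.
        redshift.modify_cluster(
            ClusterIdentifier=cluster_id,
            NodeType=node_type,
            NumberOfNodes=num_nodes,
        )

    # Give the resize a moment to register before polling, mirroring the script.
    time.sleep(60)
    while True:
        status = redshift.describe_clusters(ClusterIdentifier=cluster_id)[
            "Clusters"
        ][0]["ClusterStatus"]
        if status == "available":
            break
        time.sleep(10)
```
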
--- .../calibration/load_chbench/cond_config.toml | 0 .../load_chbench/gather_redshift.sh | 86 +++++++++++++++++++ tools/calibration/load_chbench/redshift/COND | 49 +++++++++++ .../load_chbench/sample_full_queries.py | 29 +++++++ .../load_chbench/selected_queries.sql | 22 +++++ .../calibration/load_chbench/test_queries.py | 43 ++++++++++ 6 files changed, 229 insertions(+) create mode 100644 tools/calibration/load_chbench/cond_config.toml create mode 100755 tools/calibration/load_chbench/gather_redshift.sh create mode 100644 tools/calibration/load_chbench/redshift/COND create mode 100644 tools/calibration/load_chbench/sample_full_queries.py create mode 100644 tools/calibration/load_chbench/selected_queries.sql create mode 100644 tools/calibration/load_chbench/test_queries.py diff --git a/tools/calibration/load_chbench/cond_config.toml b/tools/calibration/load_chbench/cond_config.toml new file mode 100644 index 00000000..e69de29b diff --git a/tools/calibration/load_chbench/gather_redshift.sh b/tools/calibration/load_chbench/gather_redshift.sh new file mode 100755 index 00000000..37bd23aa --- /dev/null +++ b/tools/calibration/load_chbench/gather_redshift.sh @@ -0,0 +1,86 @@ +#! /bin/bash + +if [ -z $2 ]; then + >&2 echo "Usage: $0 " + >&2 echo "The config path should be relative to the redshift/ subdirectory." + exit 1 +fi + +export BRAD_CONFIG=$1 +cluster_identifier=$2 + +export BRAD_SCHEMA="chbenchmark" + +function run_warm_up() { + >&2 echo "Running warm up..." + pushd redshift + python3 -m brad.calibration.measure_load --run-warmup --engine redshift --query-file ../../../../tools/calibration/load_chbench/selected_queries.sql + popd +} + +function sync_redshift_resize() { + raw_instance=$1 + target_instance_type=${raw_instance//_/.} + target_node_count=$2 + + if [[ $target_node_count = "2" ]] && [[ $raw_instance = "dc2_large" ]]; then + >&2 echo "Skipping initial resize to $raw_instance $target_node_count (special case)" + return + fi + + # Try an elastic resize first. + >&2 echo "Resizing Redshift cluster to $target_instance_type with $target_node_count nodes (attempt elastic)" + aws redshift resize-cluster --cluster-identifier "$cluster_identifier" --cluster-type multi-node --node-type "$target_instance_type" --number-of-nodes "$target_node_count" --no-classic --region us-east-1 > /dev/null + result=$? + + # Resize Redshift cluster + if [ $result -ne 0 ]; then + >&2 echo "Classic resizing Redshift cluster to $target_instance_type with $target_node_count nodes" + aws redshift modify-cluster --cluster-identifier "$cluster_identifier" --node-type "$target_instance_type" --number-of-nodes "$target_node_count" > /dev/null + fi + + sleep 60 + + # Wait for resize to complete + while true; do + cluster_status=$(aws redshift describe-clusters --cluster-identifier "$cluster_identifier" --query 'Clusters[0].ClusterStatus' --output text) + if [[ $cluster_status == "available" ]]; then + break + fi + >&2 echo "Waiting for resize to complete..." + sleep 10 + done +} + +function run_cfg() { + instance_type=$1 + num_nodes=$2 + + >&2 echo "$instance_type $num_nodes" + sync_redshift_resize $instance_type $num_nodes + >&2 echo "Warming up..." + run_warm_up + >&2 echo "Running..." 
+ cond run "//redshift:${instance_type}-${num_nodes}" +} + +>&2 echo "Running $cluster_identifier" +>&2 echo "Config $BRAD_CONFIG" +>&2 echo "Cluster id $cluster_identifier" +sleep 10 + +run_cfg "dc2_large" 2 +run_cfg "dc2_large" 4 +run_cfg "dc2_large" 8 +run_cfg "dc2_large" 16 +run_cfg "ra3_xlplus" 2 +run_cfg "ra3_xlplus" 4 +run_cfg "ra3_xlplus" 8 +run_cfg "ra3_4xlarge" 8 +run_cfg "ra3_4xlarge" 4 +run_cfg "ra3_4xlarge" 2 + +sleep 60 + +>&2 echo "Done. Pausing $cluster_identifier..." +aws redshift pause-cluster --cluster-identifier "$cluster_identifier" diff --git a/tools/calibration/load_chbench/redshift/COND b/tools/calibration/load_chbench/redshift/COND new file mode 100644 index 00000000..7dfa96a0 --- /dev/null +++ b/tools/calibration/load_chbench/redshift/COND @@ -0,0 +1,49 @@ +from itertools import product + + +AVG_GAP_S = 3 +RUN_FOR_S = 5 * 60 # 5 minutes +NUM_CLIENTS = [1, 2, 4, 6] +WAIT_BEFORE_START = 10 +NUM_QUERIES = 22 + + +# Relative to experiment definition directories. +QUERY_BANK = "../selected_queries.sql" + + +CLUSTER_CONFIGS = [ + ("dc2_large", 2), + ("dc2_large", 4), + ("dc2_large", 8), + ("dc2_large", 16), + ("ra3_xlplus", 2), + ("ra3_xlplus", 4), + ("ra3_xlplus", 8), + ("ra3_4xlarge", 2), + ("ra3_4xlarge", 4), + ("ra3_4xlarge", 8), +] + + +for inst, nodes in CLUSTER_CONFIGS: + cfg_name = f"{inst}-{nodes}" + run_experiment_group( + name=cfg_name, + run="python3 -m brad.calibration.measure_load", + experiments=[ + ExperimentInstance( + name=f"{cfg_name}-{clients}-q{query_idx}", + options={ + "num-clients": clients, + "specific-query-idx": query_idx, + "run-for-s": RUN_FOR_S, + "avg-gap-s": AVG_GAP_S, + "wait-before-start": WAIT_BEFORE_START, + "query-file": QUERY_BANK, + "engine": "redshift", + }, + ) + for query_idx, clients in product(range(NUM_QUERIES), NUM_CLIENTS) + ], + ) diff --git a/tools/calibration/load_chbench/sample_full_queries.py b/tools/calibration/load_chbench/sample_full_queries.py new file mode 100644 index 00000000..02ee1385 --- /dev/null +++ b/tools/calibration/load_chbench/sample_full_queries.py @@ -0,0 +1,29 @@ +import argparse +import random + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--query-file", type=str, required=True) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num-query-blocks", type=int, default=22) + parser.add_argument("--queries-per-block", type=int, default=200) + args = parser.parse_args() + + prng = random.Random(args.seed) + + with open(args.query_file, "r", encoding="UTF-8") as file: + queries = [line.strip() for line in file] + + selected = [] + for qidx in range(args.num_query_blocks): + offset = prng.randint(0, args.queries_per_block - 1) + selected.append(queries[qidx * args.queries_per_block + offset]) + + with open("selected_queries.sql", "w", encoding="UTF-8") as file: + for q in selected: + print(q, file=file) + + +if __name__ == "__main__": + main() diff --git a/tools/calibration/load_chbench/selected_queries.sql b/tools/calibration/load_chbench/selected_queries.sql new file mode 100644 index 00000000..8700fcf9 --- /dev/null +++ b/tools/calibration/load_chbench/selected_queries.sql @@ -0,0 +1,22 @@ +select ol_number, sum(ol_quantity) as sum_qty, sum(ol_amount) as sum_amount, avg(ol_quantity) as avg_qty, avg(ol_amount) as avg_amount, count(*) as count_order from order_line where ol_amount <= 33.06648003661816 group by ol_number order by ol_number; +select su_suppkey, su_name, n_name, i_id, i_name, su_address, su_phone, su_comment from item, 
supplier, stock, nation, region, (select s_i_id as m_i_id, min(s_quantity) as m_s_quantity from stock, supplier, nation, region where mod((s_w_id*s_i_id),10000)=su_suppkey and s_quantity >= 11.777062936461403 and su_nationkey=n_nationkey and n_regionkey=r_regionkey and r_name like 'Europ%' group by s_i_id) m where i_id = s_i_id and su_suppkey >= 2941.508980163913 and i_id <= 79857.67421953629 and mod((s_w_id * s_i_id), 10000) = su_suppkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and i_data like '%b' and r_name like 'Europ%' and i_id=m_i_id and s_quantity = m_s_quantity order by n_name, su_name, i_id; +select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from customer, new_order, orders, order_line where c_state like 'A%' and o_w_id >= 42.7486611465113 and ol_amount <= 99.70737680321619 and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and no_w_id = o_w_id and no_d_id = o_d_id and no_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by ol_o_id, ol_w_id, ol_d_id, o_entry_d order by revenue desc, o_entry_d; +select o_ol_cnt, count(*) as order_count from orders where o_carrier_id <= 6.749521428520183 and exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt; +select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and n_nationkey <= 99.21862547006236 and s_order_cnt >= 42.71343586058127 and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(substring(c_state,1,1)) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc; +select sum(ol_amount) as revenue from order_line where ol_quantity <= 5.0 and ol_quantity between 1 and 100000; +WITH inner_query AS ( select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and c_id <= 2665.5792747107366 and s_quantity <= 72.4600053847094 and su_suppkey >= 528.9934672447876 and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ( (n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany') ) ) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; +select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and o_w_id >= 583.5206747942913 and s_order_cnt <= 81.86012188607452 and su_suppkey >= 689.8054116558625 and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = 
ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year; +select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and o_id <= 2939.378308830152 and s_quantity <= 86.59608959532211 and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc; +select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and n_nationkey <= 116.40294250401558 and o_id <= 1560.082691309974 and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc; +select s_i_id, sum(s_order_cnt) as ordercount from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_suppkey >= 2406.641955682944 and su_nationkey = n_nationkey and n_name = 'Germany' group by s_i_id having sum(s_order_cnt) > (select sum(s_order_cnt) * .005 from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and s_quantity >= 27.35152833573742 and su_nationkey = n_nationkey and n_name = 'Germany') order by ordercount desc; +select o_ol_cnt, sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count from orders, order_line where ol_w_id = o_w_id and ol_amount <= 36.57742392006392 and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d group by o_ol_cnt order by o_ol_cnt; +select c_count, count(*) as custdist from (select c_id, count(o_id) from customer left outer join orders on ( c_w_id = o_w_id and c_d_id = o_d_id and c_id = o_c_id and o_carrier_id >= 4.392919247526648 ) group by c_id) as c_orders (c_id, c_count) group by c_count order by custdist desc, c_count desc; +select 100.00 * sum(case when i_data like 'PR%' then ol_amount else 0 end) / (1+sum(ol_amount)) as promo_revenue from order_line, item where ol_i_id = i_id and i_id <= 82830.86056286634; +with revenue (supplier_no, total_revenue) as ( select mod((s_w_id * s_i_id),10000) as supplier_no, sum(ol_amount) as total_revenue from order_line, stock where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and s_quantity >= 52.409029036617504 group by mod((s_w_id * s_i_id),10000)) select su_suppkey, su_name, su_address, su_phone, total_revenue from supplier, revenue where su_suppkey = supplier_no and total_revenue = (select max(total_revenue) from revenue) and su_suppkey >= 600.4811082997699 order by su_suppkey; +select i_name, substring(i_data, 1, 3) as brand, i_price, count(distinct (mod((s_w_id * s_i_id),10000))) as supplier_cnt from stock, item where i_id = s_i_id and i_data not like 'zz%' and i_price >= 17.67656701310919 and (mod((s_w_id * s_i_id),10000) not in (select su_suppkey from supplier where su_comment like '%bad%')) group by i_name, substring(i_data, 1, 3), i_price order by supplier_cnt desc; +select sum(ol_amount) / 2.0 as 
avg_yearly from order_line, (select i_id, avg(ol_quantity) as a from item, order_line where i_data like '%b' and ol_quantity <= 5.0 and ol_i_id = i_id group by i_id) t where ol_i_id = t.i_id and ol_quantity < t.a; +select c_last, c_id o_id, o_entry_d, o_ol_cnt, sum(ol_amount) from customer, orders, order_line where c_id = o_c_id and c_d_id <= 6.652803502875462 and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by o_id, o_w_id, o_d_id, c_id, c_last, o_entry_d, o_ol_cnt having sum(ol_amount) > 200 order by sum(ol_amount) desc, o_entry_d; +select sum(ol_amount) as revenue from order_line, item where ( ol_i_id = i_id and i_data like '%a' and ol_quantity >= 1 and ol_quantity <= 10 and i_price <= 97.10832030687996 and ol_w_id in (1,2,3) ) or ( ol_i_id = i_id and i_data like '%b' and ol_quantity >= 1 and ol_quantity <= 10 and i_price <= 79.78326318023088 and ol_w_id in (1,2,4) ) or ( ol_i_id = i_id and i_data like '%c' and ol_quantity >= 1 and ol_quantity <= 10 and i_price >= 5.450023586551352 and ol_w_id in (1,5,3) ); +select su_name, su_address from supplier, nation where su_suppkey in (select mod(s_i_id * s_w_id, 10000) from stock, order_line where s_i_id in (select i_id from item where i_data like 'co%') and ol_i_id=s_i_id group by s_i_id, s_w_id, s_quantity having 2*s_quantity > sum(ol_quantity)) and su_nationkey = n_nationkey and su_suppkey <= 8996.163667412242 and n_name = 'Germany' order by su_name; +select su_name, count(*) as numwait from supplier, order_line l1, orders, stock, nation where ol_o_id = o_id and su_suppkey <= 8526.675416612981 and o_w_id >= 369.02551642220345 and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and l1.ol_delivery_d > o_entry_d and not exists (select * from order_line l2 where l2.ol_o_id = l1.ol_o_id and l2.ol_w_id = l1.ol_w_id and l2.ol_d_id = l1.ol_d_id and l2.ol_delivery_d > l1.ol_delivery_d) and su_nationkey = n_nationkey and n_name = 'Germany' group by su_name order by numwait desc, su_name; +select substring(c_state,1,1) as country, count(*) as numcust, sum(c_balance) as totacctbal from customer where substring(c_phone,1,1) in ('1','2','3','4','5','6','7') and c_balance > (select avg(c_BALANCE) from customer where c_balance > 0.00 and substring(c_phone,1,1) in ('1','2','3','4','5','6','7')) and not exists (select * from orders where o_c_id = c_id and o_w_id = c_w_id and o_d_id = c_d_id and o_w_id <= 1264.1427731874844 ) group by substring(c_state,1,1) order by substring(c_state,1,1); diff --git a/tools/calibration/load_chbench/test_queries.py b/tools/calibration/load_chbench/test_queries.py new file mode 100644 index 00000000..72d807da --- /dev/null +++ b/tools/calibration/load_chbench/test_queries.py @@ -0,0 +1,43 @@ +import argparse +import asyncio +from brad.config.file import ConfigFile +from brad.connection.factory import ConnectionFactory +from brad.config.engine import Engine +from brad.provisioning.directory import Directory + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--schema-name", type=str, required=True) + parser.add_argument("--physical-config-file", type=str, required=True) + parser.add_argument("--query-file", type=str, required=True) + args = parser.parse_args() + + with open(args.query_file, "r", encoding="UTF-8") as file: + queries = [line.strip() for line in file] + + config = ConfigFile.load_from_physical_config(args.physical_config_file) + directory = 
Directory(config) + asyncio.run(directory.refresh()) + connection = ConnectionFactory.connect_to_sync( + Engine.Redshift, args.schema_name, config, directory, autocommit=True + ) + + cursor = connection.cursor_sync() + num_succeeded = 0 + for idx, q in enumerate(queries): + try: + print("Running query", idx, "of", len(queries) - 1) + cursor.execute_sync(q) + num_succeeded += 1 + except Exception as ex: + print("Query", idx, "failed with error", str(ex)) + + if num_succeeded == len(queries): + print("All succeeded.") + else: + print((len(queries) - num_succeeded), "failed.") + + +if __name__ == "__main__": + main() From 903a48fc04892ed300e732b2149677452279f626 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Mon, 29 Apr 2024 16:58:14 -0400 Subject: [PATCH 04/30] Check in TPC-C run time debug code (#503) Part of #487. --- experiments/17-chbenchmark/debug/COND | 12 + .../17-chbenchmark/debug/run_aurora_timing.sh | 18 + .../pytpcc/drivers/auroratimingdriver.py | 701 ++++++++++++++++++ workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py | 3 + 4 files changed, 734 insertions(+) create mode 100755 experiments/17-chbenchmark/debug/run_aurora_timing.sh create mode 100644 workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py diff --git a/experiments/17-chbenchmark/debug/COND b/experiments/17-chbenchmark/debug/COND index f00864cf..4cfa490f 100644 --- a/experiments/17-chbenchmark/debug/COND +++ b/experiments/17-chbenchmark/debug/COND @@ -24,3 +24,15 @@ run_command( "run-for-s": 180, }, ) + +run_experiment( + name="aurora_timing", + run="./run_aurora_timing.sh", + options={ + "txn-config-file": "aurora.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "run-for-s": 30, + }, +) diff --git a/experiments/17-chbenchmark/debug/run_aurora_timing.sh b/experiments/17-chbenchmark/debug/run_aurora_timing.sh new file mode 100755 index 00000000..cb96028a --- /dev/null +++ b/experiments/17-chbenchmark/debug/run_aurora_timing.sh @@ -0,0 +1,18 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) + +cd ../../../workloads/chbenchmark/py-tpcc/ +RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc auroratiming \ + --no-load \ + --config $abs_txn_config_file \ + --warehouses $txn_warehouses \ + --duration $run_for_s \ + --clients $t_clients \ + --scalefactor $txn_scale_factor diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py new file mode 100644 index 00000000..d1d88cf4 --- /dev/null +++ b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py @@ -0,0 +1,701 @@ +import logging +import traceback +import decimal +import os +import time +from typing import Dict, Tuple, Any, Optional, List + +from .abstractdriver import * +from .. 
import constants + +from brad.connection.psycopg_connection import PsycopgConnection +from brad.connection.psycopg_cursor import PsycopgCursor +import conductor.lib as cond + +Config = Dict[str, Tuple[str, Any]] + +logger = logging.getLogger(__name__) + + +TXN_QUERIES = { + "DELIVERY": { + "getNewOrder": "SELECT no_o_id FROM new_order WHERE no_d_id = {} AND no_w_id = {} AND no_o_id > -1 LIMIT 1", # + "deleteNewOrder": "DELETE FROM new_order WHERE no_d_id = {} AND no_w_id = {} AND no_o_id = {}", # d_id, w_id, no_o_id + "getCId": "SELECT o_c_id FROM orders WHERE o_id = {} AND o_d_id = {} AND o_w_id = {}", # no_o_id, d_id, w_id + "updateOrders": "UPDATE orders SET o_carrier_id = {} WHERE o_id = {} AND o_d_id = {} AND o_w_id = {}", # o_carrier_id, no_o_id, d_id, w_id + "updateOrderLine": "UPDATE order_line SET ol_delivery_d = '{}' WHERE ol_o_id = {} AND ol_d_id = {} AND ol_w_id = {}", # o_entry_d, no_o_id, d_id, w_id + "sumOLAmount": "SELECT SUM(ol_amount) FROM order_line WHERE ol_o_id = {} AND ol_d_id = {} AND ol_w_id = {}", # no_o_id, d_id, w_id + "updateCustomer": "UPDATE customer SET c_balance = c_balance + {} WHERE c_id = {} AND c_d_id = {} AND c_w_id = {}", # ol_total, c_id, d_id, w_id + }, + "NEW_ORDER": { + "getWarehouseTaxRate": "SELECT w_tax FROM warehouse WHERE w_id = {}", # w_id + "getDistrict": "SELECT d_tax, d_next_o_id FROM district WHERE d_id = {} AND d_w_id = {}", # d_id, w_id + "incrementNextOrderId": "UPDATE district SET d_next_o_id = {} WHERE d_id = {} AND d_w_id = {}", # d_next_o_id, d_id, w_id + "getCustomer": "SELECT c_discount, c_last, c_credit FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # w_id, d_id, c_id + "createOrder": "INSERT INTO orders (o_id, o_d_id, o_w_id, o_c_id, o_entry_d, o_carrier_id, o_ol_cnt, o_all_local) VALUES ({}, {}, {}, {}, '{}', {}, {}, {})", # d_next_o_id, d_id, w_id, c_id, o_entry_d, o_carrier_id, o_ol_cnt, o_all_local + "createNewOrder": "INSERT INTO new_order (no_o_id, no_d_id, no_w_id) VALUES ({}, {}, {})", # o_id, d_id, w_id + "getItemInfo": "SELECT i_price, i_name, i_data FROM item WHERE i_id = {}", # ol_i_id + "getStockInfo": "SELECT s_quantity, s_data, s_ytd, s_order_cnt, s_remote_cnt, s_dist_{:02d} FROM stock WHERE s_i_id = {} AND s_w_id = {}", # d_id, ol_i_id, ol_supply_w_id + "updateStock": "UPDATE stock SET s_quantity = {}, s_ytd = {}, s_order_cnt = {}, s_remote_cnt = {} WHERE s_i_id = {} AND s_w_id = {}", # s_quantity, s_order_cnt, s_remote_cnt, ol_i_id, ol_supply_w_id + "createOrderLine": "INSERT INTO order_line (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", # o_id, d_id, w_id, ol_number, ol_i_id, ol_supply_w_id, ol_quantity, ol_amount, ol_dist_info + }, + "ORDER_STATUS": { + "getCustomerByCustomerId": "SELECT c_id, c_first, c_middle, c_last, c_balance FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # w_id, d_id, c_id + "getCustomersByLastName": "SELECT c_id, c_first, c_middle, c_last, c_balance FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_last = '{}' ORDER BY c_first", # w_id, d_id, c_last + "getLastOrder": "SELECT o_id, o_carrier_id, o_entry_d FROM orders WHERE o_w_id = {} AND o_d_id = {} AND o_c_id = {} ORDER BY o_id DESC LIMIT 1", # w_id, d_id, c_id + "getOrderLines": "SELECT ol_supply_w_id, ol_i_id, ol_quantity, ol_amount, ol_delivery_d FROM order_line WHERE ol_w_id = {} AND ol_d_id = {} AND ol_o_id = {}", # w_id, d_id, o_id + }, + "PAYMENT": { + "getWarehouse": 
"SELECT w_name, w_street_1, w_street_2, w_city, w_state, w_zip FROM warehouse WHERE w_id = {}", # w_id + "updateWarehouseBalance": "UPDATE warehouse SET w_ytd = w_ytd + {} WHERE w_id = {}", # h_amount, w_id + "getDistrict": "SELECT d_name, d_street_1, d_street_2, d_city, d_state, d_zip FROM district WHERE d_w_id = {} AND d_id = {}", # w_id, d_id + "updateDistrictBalance": "UPDATE district SET d_ytd = d_ytd + {} WHERE d_w_id = {} AND d_id = {}", # h_amount, d_w_id, d_id + "getCustomerByCustomerId": "SELECT c_id, c_first, c_middle, c_last, c_street_1, c_street_2, c_city, c_state, c_zip, c_phone, c_since, c_credit, c_credit_lim, c_discount, c_balance, c_ytd_payment, c_payment_cnt, c_data FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # w_id, d_id, c_id + "getCustomersByLastName": "SELECT c_id, c_first, c_middle, c_last, c_street_1, c_street_2, c_city, c_state, c_zip, c_phone, c_since, c_credit, c_credit_lim, c_discount, c_balance, c_ytd_payment, c_payment_cnt, c_data FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_last = '{}' ORDER BY c_first", # w_id, d_id, c_last + "updateBCCustomer": "UPDATE customer SET c_balance = {}, c_ytd_payment = {}, c_payment_cnt = {}, c_data = '{}' WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # c_balance, c_ytd_payment, c_payment_cnt, c_data, c_w_id, c_d_id, c_id + "updateGCCustomer": "UPDATE customer SET c_balance = {}, c_ytd_payment = {}, c_payment_cnt = {} WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # c_balance, c_ytd_payment, c_payment_cnt, c_w_id, c_d_id, c_id + "insertHistory": "INSERT INTO history (h_c_id, h_c_d_id, h_c_w_id, h_d_id, h_w_id, h_date, h_amount, h_data) VALUES ({}, {}, {}, {}, {}, '{}', {}, '{}')", + }, + "STOCK_LEVEL": { + "getOId": "SELECT d_next_o_id FROM district WHERE d_w_id = {} AND d_id = {}", + "getStockCount": """ + SELECT COUNT(DISTINCT(ol_i_id)) FROM order_line, stock + WHERE ol_w_id = {} + AND ol_d_id = {} + AND ol_o_id < {} + AND ol_o_id >= {} + AND s_w_id = {} + AND s_i_id = ol_i_id + AND s_quantity < {} + """, + }, +} + + +class AuroraTimingDriver(AbstractDriver): + DEFAULT_CONFIG = { + "host": ("Host running the database.", "localhost"), + "port": ("Port on which the database is listening.", 5432), + "user": ("Username", "postgres"), + "password": ("Password", ""), + "database": ("Database", "chbenchmark"), + "isolation_level": ("The isolation level to use.", "REPEATABLE READ"), + } + + def __init__(self, ddl: str) -> None: + super().__init__("brad", ddl) + self._connection: Optional[PsycopgConnection] = None + self._cursor: Optional[PsycopgCursor] = None + self._config: Dict[str, Any] = {} + self._nonsilent_errs = constants.NONSILENT_ERRORS_VAR in os.environ + self._measure_file = None + self._wdc_stats_file = None + self._ol_stats_file = None + self._ins_ol_counter = 0 + + if "LOG_QUERIES" in os.environ: + query_log_file_path = cond.in_output_dir("queries.log") + self._query_log_file = open(query_log_file_path, "w", encoding="UTF-8") + else: + self._query_log_file = None + + def makeDefaultConfig(self) -> Config: + return AuroraTimingDriver.DEFAULT_CONFIG + + def loadConfig(self, config: Config) -> None: + self._config = config + address = self._config["host"] + port = int(self._config["port"]) + user = self._config["user"] + password = self._config["password"] + database = self._config["database"] + cstr = f"host={address} port={port} user={user} password={password} dbname={database}" + self._connection = PsycopgConnection.connect_sync(cstr, autocommit=True) + self._cursor = 
self._connection.cursor_sync() + + def loadTuples(self, tableName: str, tuples) -> None: + # We don't support data loading directly here. + pass + + def executeStart(self): + assert self._cursor is not None + # We use this callback to set the isolation level. + logger.info("Setting isolation level to %s", self._config["isolation_level"]) + self._cursor.execute_sync( + f"SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL {self._config['isolation_level']}" + ) + measure_file_path = cond.in_output_dir("aurora_timing.csv") + self._measure_file = open(measure_file_path, "w", encoding="UTF-8") + print( + "init,begin,getitems,getwdc,getorder,insertorder,commit,collect,total", + file=self._measure_file, + ) + + stats_file = cond.in_output_dir("wdc_stats.csv") + self._wdc_stats_file = open(stats_file, "w", encoding="UTF-8") + print("tax_rate,district,customer,total", file=self._wdc_stats_file) + + stats_file2 = cond.in_output_dir("item_stats.csv") + self._ol_stats_file = open(stats_file2, "w", encoding="UTF-8") + print( + "txn_counter,init,fetch_stock,stock_prep,update_stock,ol_prep,ol_insert,ol_append,total", + file=self._ol_stats_file, + ) + + def __del__(self): + if self._measure_file is not None: + self._measure_file.close() + self._measure_file = None + + if self._wdc_stats_file is not None: + self._wdc_stats_file.close() + self._wdc_stats_file = None + + if self._ol_stats_file is not None: + self._ol_stats_file.close() + self._ol_stats_file = None + + if self._query_log_file is not None: + self._query_log_file.close() + self._query_log_file = None + + def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: + try: + assert self._cursor is not None + + q = TXN_QUERIES["DELIVERY"] + w_id = params["w_id"] + o_carrier_id = params["o_carrier_id"] + ol_delivery_d = params["ol_delivery_d"] + + result: List[Tuple[Any, ...]] = [] + self._cursor.execute_sync("BEGIN") + for d_id in range(1, constants.DISTRICTS_PER_WAREHOUSE + 1): + self._cursor.execute_sync(q["getNewOrder"].format(d_id, w_id)) + r = self._cursor.fetchall_sync() + if len(r) == 0: + ## No orders for this district: skip it. Note: This must be reported if > 1% + continue + no_o_id = r[0][0] + + self._cursor.execute_sync(q["getCId"].format(no_o_id, d_id, w_id)) + r = self._cursor.fetchall_sync() + c_id = r[0][0] + + self._cursor.execute_sync(q["sumOLAmount"].format(no_o_id, d_id, w_id)) + r = self._cursor.fetchall_sync() + ol_total = decimal.Decimal(r[0][0]) + + self._cursor.execute_sync( + q["deleteNewOrder"].format(d_id, w_id, no_o_id) + ) + updateOrders = q["updateOrders"].format( + o_carrier_id, no_o_id, d_id, w_id + ) + self._cursor.execute_sync(updateOrders) + updateOrderLine = q["updateOrderLine"].format( + ol_delivery_d.strftime("%Y-%m-%d %H:%M:%S"), no_o_id, d_id, w_id + ) + self._cursor.execute_sync(updateOrderLine) + + # These must be logged in the "result file" according to TPC-C 2.7.2.2 (page 39) + # We remove the queued time, completed time, w_id, and o_carrier_id: the client can figure + # them out + # If there are no order lines, SUM returns null. There should always be order lines. + assert ( + ol_total != None + ), "ol_total is NULL: there are no order lines. 
This should not happen" + assert ol_total > 0.0 + + self._cursor.execute_sync( + q["updateCustomer"].format( + ol_total.quantize(decimal.Decimal("1.00")), c_id, d_id, w_id + ) + ) + + result.append((d_id, no_o_id)) + + self._cursor.execute_sync("COMMIT") + return result + + except Exception as ex: + if self._nonsilent_errs: + print("Error in DELIVERY", str(ex)) + print(traceback.format_exc()) + raise + + def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: + try: + assert self._cursor is not None + + no_start = time.time() + q = TXN_QUERIES["NEW_ORDER"] + w_id = params["w_id"] + d_id = params["d_id"] + c_id = params["c_id"] + o_entry_d = params["o_entry_d"] + i_ids = params["i_ids"] + i_w_ids = params["i_w_ids"] + i_qtys = params["i_qtys"] + + assert len(i_ids) > 0 + assert len(i_ids) == len(i_w_ids) + assert len(i_ids) == len(i_qtys) + + no_pbegin = time.time() + self._cursor.execute_sync("BEGIN") + no_abegin = time.time() + all_local = True + items = [] + for i in range(len(i_ids)): + ## Determine if this is an all local order or not + all_local = all_local and i_w_ids[i] == w_id + self._cursor.execute_sync(q["getItemInfo"].format(i_ids[i])) + r = self._cursor.fetchone_sync() + items.append(r) + assert len(items) == len(i_ids) + no_getitems = time.time() + + ## TPCC defines 1% of neworder gives a wrong itemid, causing rollback. + ## Note that this will happen with 1% of transactions on purpose. + for item in items: + if len(item) == 0: + self._cursor.execute_sync("ROLLBACK") + return + ## FOR + + ## ---------------- + ## Collect Information from WAREHOUSE, DISTRICT, and CUSTOMER + ## ---------------- + wdc_start = time.time() + get_warehouse = q["getWarehouseTaxRate"].format(w_id) + self._cursor.execute_sync(get_warehouse) + r = self._cursor.fetchone_sync() + w_tax = r[0] + wdc_warehouse_tax_rate = time.time() + + get_district = q["getDistrict"].format(d_id, w_id) + self._cursor.execute_sync(get_district) + r = self._cursor.fetchone_sync() + district_info = r + d_tax = district_info[0] + d_next_o_id = district_info[1] + wdc_district = time.time() + + get_customer = q["getCustomer"].format(w_id, d_id, c_id) + self._cursor.execute_sync(get_customer) + r = self._cursor.fetchone_sync() + customer_info = r + c_discount = customer_info[0] + no_get_wdc_info = time.time() + + if self._query_log_file is not None: + print(get_warehouse, file=self._query_log_file) + print(get_district, file=self._query_log_file) + print(get_customer, file=self._query_log_file) + + ## ---------------- + ## Insert Order Information + ## ---------------- + ol_cnt = len(i_ids) + o_carrier_id = constants.NULL_CARRIER_ID + + self._cursor.execute_sync( + q["incrementNextOrderId"].format(d_next_o_id + 1, d_id, w_id) + ) + createOrder = q["createOrder"].format( + d_next_o_id, + d_id, + w_id, + c_id, + o_entry_d.strftime("%Y-%m-%d %H:%M:%S"), + o_carrier_id, + ol_cnt, + 1 if all_local else 0, + ) + self._cursor.execute_sync(createOrder) + self._cursor.execute_sync( + q["createNewOrder"].format(d_next_o_id, d_id, w_id) + ) + no_ins_order_info = time.time() + + ## ---------------- + ## Insert Order Item Information + ## ---------------- + item_data = [] + total = 0 + insert_metadata = [] + for i in range(len(i_ids)): + io_start = time.time() + ol_number = i + 1 + ol_supply_w_id = i_w_ids[i] + ol_i_id = i_ids[i] + ol_quantity = i_qtys[i] + + itemInfo = items[i] + i_name = itemInfo[1] + i_data = itemInfo[2] + i_price = decimal.Decimal(itemInfo[0]) + io_init = time.time() + + get_stock_info = 
q["getStockInfo"].format(d_id, ol_i_id, ol_supply_w_id) + self._cursor.execute_sync(get_stock_info) + r = self._cursor.fetchone_sync() + io_fetch_stock = time.time() + if r is None: + logger.warning( + "No STOCK record for (ol_i_id=%d, ol_supply_w_id=%d)", + ol_i_id, + ol_supply_w_id, + ) + continue + stockInfo = r + s_quantity = stockInfo[0] + s_ytd = decimal.Decimal(stockInfo[2]) + s_order_cnt = int(stockInfo[3]) + s_remote_cnt = int(stockInfo[4]) + s_data = stockInfo[1] + s_dist_xx = stockInfo[5] # Fetches data from the s_dist_[d_id] column + + ## Update stock + s_ytd += ol_quantity + if s_quantity >= ol_quantity + 10: + s_quantity = s_quantity - ol_quantity + else: + s_quantity = s_quantity + 91 - ol_quantity + s_order_cnt += 1 + + if ol_supply_w_id != w_id: + s_remote_cnt += 1 + io_stock_prep = time.time() + + update_stock = q["updateStock"].format( + s_quantity, + s_ytd.quantize(decimal.Decimal("1.00")), + s_order_cnt, + s_remote_cnt, + ol_i_id, + ol_supply_w_id, + ) + self._cursor.execute_sync(update_stock) + io_update_stock = time.time() + + if ( + i_data.find(constants.ORIGINAL_STRING) != -1 + and s_data.find(constants.ORIGINAL_STRING) != -1 + ): + brand_generic = "B" + else: + brand_generic = "G" + + ## Transaction profile states to use "ol_quantity * i_price" + ol_amount = ol_quantity * i_price + total += ol_amount + io_ol_prep = time.time() + + createOrderLine = q["createOrderLine"].format( + d_next_o_id, + d_id, + w_id, + ol_number, + ol_i_id, + ol_supply_w_id, + o_entry_d.strftime("%Y-%m-%d %H:%M:%S"), + ol_quantity, + ol_amount, + s_dist_xx, + ) + self._cursor.execute_sync(createOrderLine) + io_ol_insert = time.time() + + ## Add the info to be returned + item_data.append( + (i_name, s_quantity, brand_generic, i_price, ol_amount) + ) + io_ol_append = time.time() + + insert_metadata.append( + ( + io_init - io_start, + io_fetch_stock - io_init, + io_stock_prep - io_fetch_stock, + io_update_stock - io_stock_prep, + io_ol_prep - io_update_stock, + io_ol_insert - io_ol_prep, + io_ol_append - io_ol_insert, + io_ol_append - io_start, + ) + ) + + if self._query_log_file is not None: + print(get_stock_info, file=self._query_log_file) + print(update_stock, file=self._query_log_file) + print(createOrderLine, file=self._query_log_file) + + ## FOR + no_insert_order_line = time.time() + + ## Commit! 
+ self._cursor.execute_sync("COMMIT") + no_commit = time.time() + + ## Adjust the total for the discount + # print "c_discount:", c_discount, type(c_discount) + # print "w_tax:", w_tax, type(w_tax) + # print "d_tax:", d_tax, type(d_tax) + total = int( + total + * (1 - decimal.Decimal(c_discount)) + * (1 + decimal.Decimal(w_tax) + decimal.Decimal(d_tax)) + ) + + ## Pack up values the client is missing (see TPC-C 2.4.3.5) + misc = [(w_tax, d_tax, d_next_o_id, total)] + no_collect = time.time() + + if self._measure_file is not None: + init_time = no_pbegin - no_start + begin_time = no_abegin - no_pbegin + getitems_time = no_getitems - no_abegin + getwdc_time = no_get_wdc_info - no_getitems + getorder_time = no_ins_order_info - no_get_wdc_info + insertorder_time = no_insert_order_line - no_ins_order_info + commit_time = no_commit - no_insert_order_line + collect_time = no_collect - no_commit + total_time = no_collect - no_start + print( + f"{init_time},{begin_time},{getitems_time},{getwdc_time},{getorder_time},{insertorder_time},{commit_time},{collect_time},{total_time}", + file=self._measure_file, + ) + + if self._wdc_stats_file is not None: + tax_rate_time = wdc_warehouse_tax_rate - wdc_start + district_time = wdc_district - wdc_warehouse_tax_rate + customer_time = no_get_wdc_info - wdc_district + total_time = no_get_wdc_info - wdc_start + print( + f"{tax_rate_time},{district_time},{customer_time},{total_time}", + file=self._wdc_stats_file, + ) + + if self._ol_stats_file is not None: + for im in insert_metadata: + print( + "{},{},{},{},{},{},{},{},{}".format(self._ins_ol_counter, *im), + file=self._ol_stats_file, + ) + self._ins_ol_counter += 1 + + return [customer_info, misc, item_data] + + except Exception as ex: + if self._nonsilent_errs: + print("Error in NEWORDER", str(ex)) + print(traceback.format_exc()) + raise + + def doOrderStatus(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: + try: + assert self._cursor is not None + + q = TXN_QUERIES["ORDER_STATUS"] + w_id = params["w_id"] + d_id = params["d_id"] + c_id = params["c_id"] + c_last = params["c_last"] + + self._cursor.execute_sync("BEGIN") + if c_id != None: + self._cursor.execute_sync( + q["getCustomerByCustomerId"].format(w_id, d_id, c_id) + ) + r = self._cursor.fetchall_sync() + customer = r[0] + else: + # Get the midpoint customer's id + self._cursor.execute_sync( + q["getCustomersByLastName"].format(w_id, d_id, c_last) + ) + r = self._cursor.fetchall_sync() + all_customers = r + assert len(all_customers) > 0 + namecnt = len(all_customers) + index = (namecnt - 1) // 2 + customer = all_customers[index] + c_id = customer[0] + assert len(customer) > 0 + assert c_id != None + + getLastOrder = q["getLastOrder"].format(w_id, d_id, c_id) + self._cursor.execute_sync(getLastOrder) + r = self._cursor.fetchall_sync() + order = r[0] + if order: + self._cursor.execute_sync( + q["getOrderLines"].format(w_id, d_id, order[0]) + ) + r = self._cursor.fetchall_sync() + orderLines = r + else: + orderLines = [] + + self._cursor.execute_sync("COMMIT") + return [customer, order, orderLines] + + except Exception as ex: + if self._nonsilent_errs: + print("Error in ORDER_STATUS", str(ex)) + print(traceback.format_exc()) + raise + + def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: + try: + assert self._cursor is not None + + q = TXN_QUERIES["PAYMENT"] + w_id = params["w_id"] + d_id = params["d_id"] + h_amount = decimal.Decimal(params["h_amount"]) + c_w_id = params["c_w_id"] + c_d_id = params["c_d_id"] + c_id = params["c_id"] + 
c_last = params["c_last"] + h_date = params["h_date"] # Python datetime + + self._cursor.execute_sync("BEGIN") + if c_id != None: + self._cursor.execute_sync( + q["getCustomerByCustomerId"].format(w_id, d_id, c_id) + ) + r = self._cursor.fetchall_sync() + customer = r[0] + else: + # Get the midpoint customer's id + self._cursor.execute_sync( + q["getCustomersByLastName"].format(w_id, d_id, c_last) + ) + r = self._cursor.fetchall_sync() + all_customers = r + assert len(all_customers) > 0 + namecnt = len(all_customers) + index = (namecnt - 1) // 2 + customer = all_customers[index] + c_id = customer[0] + assert len(customer) > 0 + c_balance = decimal.Decimal(customer[14]) - h_amount + c_ytd_payment = decimal.Decimal(customer[15]) + h_amount + c_payment_cnt = int(customer[16]) + 1 + c_data = customer[17] + + self._cursor.execute_sync(q["getWarehouse"].format(w_id)) + r = self._cursor.fetchall_sync() + warehouse = r[0] + + self._cursor.execute_sync(q["getDistrict"].format(w_id, d_id)) + r = self._cursor.fetchall_sync() + district = r[0] + + self._cursor.execute_sync( + q["updateWarehouseBalance"].format(h_amount, w_id) + ) + self._cursor.execute_sync( + q["updateDistrictBalance"].format(h_amount, w_id, d_id) + ) + + # Customer Credit Information + if customer[11] == constants.BAD_CREDIT: + newData = " ".join( + map(str, [c_id, c_d_id, c_w_id, d_id, w_id, h_amount]) + ) + c_data = newData + "|" + c_data + if len(c_data) > constants.MAX_C_DATA: + c_data = c_data[: constants.MAX_C_DATA] + updateCustomer = q["updateBCCustomer"].format( + c_balance, + c_ytd_payment, + c_payment_cnt, + c_data, + c_w_id, + c_d_id, + c_id, + ) + self._cursor.execute_sync(updateCustomer) + else: + c_data = "" + self._cursor.execute_sync( + q["updateGCCustomer"].format( + c_balance, c_ytd_payment, c_payment_cnt, c_w_id, c_d_id, c_id + ), + ) + + # Concatenate w_name, four spaces, d_name + h_data = "%s %s" % (warehouse[0], district[0]) + # Create the history record + insertHistory = q["insertHistory"].format( + c_id, + c_d_id, + c_w_id, + d_id, + w_id, + h_date.strftime("%Y-%m-%d %H:%M:%S"), + h_amount.quantize(decimal.Decimal("1.00")), + h_data, + ) + self._cursor.execute_sync(insertHistory) + + self._cursor.execute_sync("COMMIT") + + # TPC-C 2.5.3.3: Must display the following fields: + # W_ID, D_ID, C_ID, C_D_ID, C_W_ID, W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, + # D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, + # C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_SINCE, C_CREDIT, C_CREDIT_LIM, + # C_DISCOUNT, C_BALANCE, the first 200 characters of C_DATA (only if C_CREDIT = "BC"), + # H_AMOUNT, and H_DATE. 
+ + # Hand back all the warehouse, district, and customer data + return [warehouse, district, customer] + + except Exception as ex: + if self._nonsilent_errs: + print("Error in PAYMENT", str(ex)) + print(traceback.format_exc()) + raise + + def doStockLevel(self, params: Dict[str, Any]) -> int: + try: + assert self._cursor is not None + + q = TXN_QUERIES["STOCK_LEVEL"] + w_id = params["w_id"] + d_id = params["d_id"] + threshold = params["threshold"] + + self._cursor.execute_sync("BEGIN") + self._cursor.execute_sync(q["getOId"].format(w_id, d_id)) + r = self._cursor.fetchall_sync() + result = r[0] + assert result + o_id = result[0] + + self._cursor.execute_sync( + q["getStockCount"].format( + w_id, d_id, o_id, (o_id - 20), w_id, threshold + ) + ) + r = self._cursor.fetchall_sync() + result = r[0] + + self._cursor.execute_sync("COMMIT") + return int(result[0]) + + except Exception as ex: + if self._nonsilent_errs: + print("Error in STOCK_LEVEL", str(ex)) + print(traceback.format_exc()) + raise + + def ensureRollback(self) -> None: + """ + Makes sure the transaction has rolled back. + """ + self._cursor.execute_sync("ROLLBACK") diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py b/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py index 2273933f..027d4fb3 100755 --- a/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py @@ -43,6 +43,7 @@ from .util import * from .runtime import * from .drivers.auroradriver import AuroraDriver +from .drivers.auroratimingdriver import AuroraTimingDriver from .drivers.braddriver import BradDriver logging.basicConfig( @@ -61,6 +62,8 @@ def createDriverClass(name): return BradDriver elif name == "aurora": return AuroraDriver + elif name == "auroratiming": + return AuroraTimingDriver else: raise NotImplementedError From b4a54a9c8e2acd1de415c39a421a5359f53ab63b Mon Sep 17 00:00:00 2001 From: Sophie Zhang <88999452+sopzha@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:39:50 -0400 Subject: [PATCH 05/30] Create RecordBatch in BradStatement from query result and schema exposed from underlying connections (#502) Co-authored-by: Sophie Zhang --- cpp/server/brad_server_simple.cc | 124 ++++++++++++++++++++++++++----- cpp/server/brad_server_simple.h | 1 + cpp/server/brad_statement.cc | 89 +++------------------- cpp/server/brad_statement.h | 8 +- 4 files changed, 120 insertions(+), 102 deletions(-) diff --git a/cpp/server/brad_server_simple.cc b/cpp/server/brad_server_simple.cc index 6c4260bc..5cc7594d 100644 --- a/cpp/server/brad_server_simple.cc +++ b/cpp/server/brad_server_simple.cc @@ -7,6 +7,7 @@ #include #include +#include #include #include "brad_sql_info.h" #include "brad_statement.h" @@ -50,23 +51,108 @@ arrow::Result> DecodeTransactionQuery( return std::make_pair(std::move(autoincrement_id), std::move(transaction_id)); } -std::vector> TransformQueryResult( - std::vector query_result) { - std::vector> transformed_query_result; - for (const auto &row : query_result) { - std::vector transformed_row{}; - for (const auto &field : row) { - if (py::isinstance(field)) { - transformed_row.push_back(std::make_any(py::cast(field))); - } else if (py::isinstance(field)) { - transformed_row.push_back(std::make_any(py::cast(field))); - } else { - transformed_row.push_back(std::make_any(py::cast(field))); +arrow::Result> ResultToRecordBatch( + const std::vector &query_result, + const std::shared_ptr &schema) { + const size_t num_rows = query_result.size(); + + const size_t num_columns = schema->num_fields(); + std::vector> columns; + 
columns.reserve(num_columns); + + for (int field_ix = 0; field_ix < num_columns; ++field_ix) { + const auto &field_type = schema->field(field_ix)->type(); + if (field_type->Equals(arrow::int64())) { + arrow::Int64Builder int64builder; + for (int row_ix = 0; row_ix < num_rows; ++row_ix) { + const std::optional val = + py::cast>(query_result[row_ix][field_ix]); + if (val) { + ARROW_RETURN_NOT_OK(int64builder.Append(*val)); + } else { + ARROW_RETURN_NOT_OK(int64builder.AppendNull()); + } } + std::shared_ptr values; + ARROW_ASSIGN_OR_RAISE(values, int64builder.Finish()); + columns.push_back(values); + + } else if (field_type->Equals(arrow::float32())) { + arrow::FloatBuilder floatbuilder; + for (int row_ix = 0; row_ix < num_rows; ++row_ix) { + const std::optional val = + py::cast>(query_result[row_ix][field_ix]); + if (val) { + ARROW_RETURN_NOT_OK(floatbuilder.Append(*val)); + } else { + ARROW_RETURN_NOT_OK(floatbuilder.AppendNull()); + } + } + std::shared_ptr values; + ARROW_ASSIGN_OR_RAISE(values, floatbuilder.Finish()); + columns.push_back(values); + + } else if (field_type->Equals(arrow::decimal(/*precision=*/10, /*scale=*/2))) { + arrow::Decimal128Builder decimalbuilder(arrow::decimal(/*precision=*/10, /*scale=*/2)); + for (int row_ix = 0; row_ix < num_rows; ++row_ix) { + const std::optional val = + py::cast>(query_result[row_ix][field_ix]); + if (val) { + ARROW_RETURN_NOT_OK( + decimalbuilder.Append(arrow::Decimal128::FromString(*val).ValueOrDie())); + } else { + ARROW_RETURN_NOT_OK(decimalbuilder.AppendNull()); + } + } + std::shared_ptr values; + ARROW_ASSIGN_OR_RAISE(values, decimalbuilder.Finish()); + columns.push_back(values); + + } else if (field_type->Equals(arrow::utf8())) { + arrow::StringBuilder stringbuilder; + for (int row_ix = 0; row_ix < num_rows; ++row_ix) { + const std::optional str = + py::cast>(query_result[row_ix][field_ix]); + if (str) { + ARROW_RETURN_NOT_OK(stringbuilder.Append(str->data(), str->size())); + } else { + ARROW_RETURN_NOT_OK(stringbuilder.AppendNull()); + } + } + std::shared_ptr values; + ARROW_ASSIGN_OR_RAISE(values, stringbuilder.Finish()); + columns.push_back(values); + + } else if (field_type->Equals(arrow::date64())) { + arrow::Date64Builder datebuilder; + for (int row_ix = 0; row_ix < num_rows; ++row_ix) { + const std::optional val = + py::cast>(query_result[row_ix][field_ix]); + if (val) { + ARROW_RETURN_NOT_OK(datebuilder.Append(*val)); + } else { + ARROW_RETURN_NOT_OK(datebuilder.AppendNull()); + } + } + std::shared_ptr values; + ARROW_ASSIGN_OR_RAISE(values, datebuilder.Finish()); + columns.push_back(values); + + } else if (field_type->Equals(arrow::null())) { + arrow::NullBuilder nullbuilder; + for (int row_ix = 0; row_ix < num_rows; ++row_ix) { + ARROW_RETURN_NOT_OK(nullbuilder.AppendNull()); + } + std::shared_ptr values; + ARROW_ASSIGN_OR_RAISE(values, nullbuilder.Finish()); + columns.push_back(values); } - transformed_query_result.push_back(transformed_row); } - return transformed_query_result; + + std::shared_ptr result_record_batch = + arrow::RecordBatch::Make(schema, num_rows, columns); + + return result_record_batch; } BradFlightSqlServer::BradFlightSqlServer() : autoincrement_id_(0ULL) {} @@ -125,25 +211,23 @@ arrow::Result> EncodeTransactionQuery(query_ticket)); std::shared_ptr result_schema; - std::vector> transformed_query_result; + std::shared_ptr result_record_batch; { py::gil_scoped_acquire guard; auto result = handle_query_(query); result_schema = ArrowSchemaFromBradSchema(result.second); - transformed_query_result = 
TransformQueryResult(result.first); + result_record_batch = ResultToRecordBatch(result.first, result_schema).ValueOrDie(); } - ARROW_ASSIGN_OR_RAISE(auto statement, BradStatement::Create(transformed_query_result)); + ARROW_ASSIGN_OR_RAISE(auto statement, BradStatement::Create(std::move(result_record_batch), result_schema)); query_data_.insert(query_ticket, statement); - ARROW_ASSIGN_OR_RAISE(auto schema, statement->GetSchema()); - std::vector endpoints{ FlightEndpoint{std::move(ticket), {}, std::nullopt, ""}}; const bool ordered = false; - ARROW_ASSIGN_OR_RAISE(auto result, FlightInfo::Make(*schema, + ARROW_ASSIGN_OR_RAISE(auto result, FlightInfo::Make(*result_schema, descriptor, endpoints, -1, diff --git a/cpp/server/brad_server_simple.h b/cpp/server/brad_server_simple.h index 484ea216..ee6eaf21 100644 --- a/cpp/server/brad_server_simple.h +++ b/cpp/server/brad_server_simple.h @@ -15,6 +15,7 @@ #include "libcuckoo/cuckoohash_map.hh" #include +#include namespace brad { diff --git a/cpp/server/brad_statement.cc b/cpp/server/brad_statement.cc index e9ce1588..0db4a786 100644 --- a/cpp/server/brad_statement.cc +++ b/cpp/server/brad_statement.cc @@ -25,96 +25,27 @@ arrow::Result> BradStatement::Create( } arrow::Result> BradStatement::Create( - std::vector> query_result) { - std::shared_ptr result( - std::make_shared(query_result)); - return result; + std::shared_ptr result_record_batch, + std::shared_ptr schema) { + std::shared_ptr result( + std::make_shared(result_record_batch, schema)); + return result; } -BradStatement::BradStatement(std::vector> query_result) : - query_result_(std::move(query_result)) {} +BradStatement::BradStatement(std::shared_ptr result_record_batch, + std::shared_ptr schema) : + result_record_batch_(std::move(result_record_batch)), + schema_(std::move(schema)) {} BradStatement::~BradStatement() { } arrow::Result> BradStatement::GetSchema() const { - if (schema_) { - return schema_; - } - - std::vector> fields; - - if (query_result_.size() > 0) { - const std::vector &row = query_result_[0]; - - int counter = 0; - for (const auto &field : row) { - std::string field_type = field.type().name(); - if (field_type == "i") { - fields.push_back(arrow::field("INT FIELD " + std::to_string(++counter), arrow::int8())); - } else if (field_type == "f") { - fields.push_back(arrow::field("FLOAT FIELD " + std::to_string(++counter), arrow::float32())); - } else { - fields.push_back(arrow::field("STRING FIELD " + std::to_string(++counter), arrow::utf8())); - } - } - } - - schema_ = arrow::schema(fields); return schema_; } arrow::Result> BradStatement::FetchResult() { - std::shared_ptr schema = GetSchema().ValueOrDie(); - - const int num_rows = query_result_.size(); - - std::vector> columns; - columns.reserve(schema->num_fields()); - - for (int field_ix = 0; field_ix < schema->num_fields(); ++field_ix) { - const auto &field = schema->fields()[field_ix]; - if (field->type() == arrow::int8()) { - arrow::Int8Builder int8builder; - int8_t values_raw[num_rows]; - for (int row_ix = 0; row_ix < num_rows; ++row_ix) { - values_raw[row_ix] = std::any_cast(query_result_[row_ix][field_ix]); - } - ARROW_RETURN_NOT_OK(int8builder.AppendValues(values_raw, num_rows)); - - std::shared_ptr values; - ARROW_ASSIGN_OR_RAISE(values, int8builder.Finish()); - - columns.push_back(values); - } else if (field->type() == arrow::float32()) { - arrow::FloatBuilder floatbuilder; - float values_raw[num_rows]; - for (int row_ix = 0; row_ix < num_rows; ++row_ix) { - values_raw[row_ix] = 
std::any_cast(query_result_[row_ix][field_ix]); - } - ARROW_RETURN_NOT_OK(floatbuilder.AppendValues(values_raw, num_rows)); - - std::shared_ptr values; - ARROW_ASSIGN_OR_RAISE(values, floatbuilder.Finish()); - - columns.push_back(values); - } else if (field->type() == arrow::utf8()) { - arrow::StringBuilder stringbuilder; - for (int row_ix = 0; row_ix < num_rows; ++row_ix) { - const std::string* str = std::any_cast(&(query_result_[row_ix][field_ix])); - ARROW_RETURN_NOT_OK(stringbuilder.Append(str->data(), str->size())); - } - - std::shared_ptr values; - ARROW_ASSIGN_OR_RAISE(values, stringbuilder.Finish()); - } - } - - std::shared_ptr record_batch = - arrow::RecordBatch::Make(schema, - num_rows, - columns); - return record_batch; + return result_record_batch_; } std::string* BradStatement::GetBradStmt() const { return stmt_; } diff --git a/cpp/server/brad_statement.h b/cpp/server/brad_statement.h index b3dba2cc..6d296c16 100644 --- a/cpp/server/brad_statement.h +++ b/cpp/server/brad_statement.h @@ -26,9 +26,11 @@ class BradStatement { const std::string& sql); static arrow::Result> Create( - const std::vector>); + std::shared_ptr result_record_batch, + std::shared_ptr schema); - BradStatement(std::vector>); + BradStatement(std::shared_ptr, + std::shared_ptr); ~BradStatement(); @@ -41,7 +43,7 @@ class BradStatement { std::string* GetBradStmt() const; private: - std::vector> query_result_; + std::shared_ptr result_record_batch_; mutable std::shared_ptr schema_; From b9ba0f398723f56f7e85c3562570a35ff8730372 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Wed, 1 May 2024 18:55:30 -0400 Subject: [PATCH 06/30] Various improvements to the TPC-C runner (#504) - Batch inserts - Add Zipfian skew to how keys are selected - Remove use of fetchall() where not needed (does not really impact BRAD though) These changes reduce new order latency from ~95 ms down to ~45 ms. Part of #487. --- experiments/17-chbenchmark/common.sh | 23 +- experiments/17-chbenchmark/debug/COND | 45 +++ .../17-chbenchmark/debug/run_aurora_direct.sh | 20 +- .../17-chbenchmark/debug/run_aurora_timing.sh | 20 +- .../calibration/transactions/chbenchmark/COND | 3 + .../transactions/chbenchmark/run_instance.sh | 7 +- .../py-tpcc/pytpcc/drivers/auroradriver.py | 81 ++--- .../pytpcc/drivers/auroratimingdriver.py | 287 +++++++++++++++++- .../py-tpcc/pytpcc/drivers/braddriver.py | 61 ++-- .../py-tpcc/pytpcc/runtime/executor.py | 39 ++- workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py | 12 +- 11 files changed, 510 insertions(+), 88 deletions(-) diff --git a/experiments/17-chbenchmark/common.sh b/experiments/17-chbenchmark/common.sh index 04ad6a0b..2db49e0e 100644 --- a/experiments/17-chbenchmark/common.sh +++ b/experiments/17-chbenchmark/common.sh @@ -14,13 +14,18 @@ function start_brad() { function run_tpcc() { pushd ../../../workloads/chbenchmark/py-tpcc/ - RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc brad \ - --no-load \ - --config $abs_txn_config_file \ - --warehouses $txn_warehouses \ - --duration $run_for_s \ - --clients $t_clients \ - --scalefactor $txn_scale_factor & + local args=( + --no-load + --config $abs_txn_config_file + --warehouses $txn_warehouses + --duration $run_for_s + --clients $t_clients + --scalefactor $txn_scale_factor + ) + if [[ ! -z $txn_zipfian_alpha ]]; then + args+=(--zipfian-alpha $txn_zipfian_alpha) + fi + RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc brad "${args[@]}" & tpcc_pid=$! 
popd } @@ -91,6 +96,10 @@ function extract_named_arguments() { if [[ $phys_arg =~ --txn-config-file=.+ ]]; then txn_config_file=${phys_arg:18} fi + + if [[ $phys_arg =~ --txn-zipfian-alpha=.+ ]]; then + txn_zipfian_alpha=${phys_arg:20} + fi done } diff --git a/experiments/17-chbenchmark/debug/COND b/experiments/17-chbenchmark/debug/COND index 4cfa490f..7feaa352 100644 --- a/experiments/17-chbenchmark/debug/COND +++ b/experiments/17-chbenchmark/debug/COND @@ -1,3 +1,6 @@ +ZIPFIAN_ALPHA = 5.0 + + run_command( name="txn_lat", run="./run_tpcc.sh", @@ -13,6 +16,22 @@ run_command( }, ) +run_command( + name="txn_lat_zipf", + run="./run_tpcc.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "debug_config.yml", # Relative to one level up. + "txn-config-file": "brad.config", + "schema-name": "chbenchmark", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "run-for-s": 180, + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + }, +) + run_command( name="aurora_direct", run="./run_aurora_direct.sh", @@ -25,6 +44,19 @@ run_command( }, ) +run_command( + name="aurora_direct_zipf", + run="./run_aurora_direct.sh", + options={ + "txn-config-file": "aurora.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "run-for-s": 180, + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + }, +) + run_experiment( name="aurora_timing", run="./run_aurora_timing.sh", @@ -36,3 +68,16 @@ run_experiment( "run-for-s": 30, }, ) + +run_experiment( + name="aurora_timing_zipf", + run="./run_aurora_timing.sh", + options={ + "txn-config-file": "aurora.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "run-for-s": 30, + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + }, +) diff --git a/experiments/17-chbenchmark/debug/run_aurora_direct.sh b/experiments/17-chbenchmark/debug/run_aurora_direct.sh index df6b232a..36d85f2b 100755 --- a/experiments/17-chbenchmark/debug/run_aurora_direct.sh +++ b/experiments/17-chbenchmark/debug/run_aurora_direct.sh @@ -9,10 +9,18 @@ extract_named_arguments $@ abs_txn_config_file=$(realpath $txn_config_file) cd ../../../workloads/chbenchmark/py-tpcc/ -RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc aurora \ - --no-load \ - --config $abs_txn_config_file \ - --warehouses $txn_warehouses \ - --duration $run_for_s \ - --clients $t_clients \ + +args=( + --no-load + --config $abs_txn_config_file + --warehouses $txn_warehouses + --duration $run_for_s + --clients $t_clients --scalefactor $txn_scale_factor +) + +if [[ ! -z $txn_zipfian_alpha ]]; then + args+=(--zipfian-alpha $txn_zipfian_alpha) +fi + +RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc aurora "${args[@]}" diff --git a/experiments/17-chbenchmark/debug/run_aurora_timing.sh b/experiments/17-chbenchmark/debug/run_aurora_timing.sh index cb96028a..d28f1633 100755 --- a/experiments/17-chbenchmark/debug/run_aurora_timing.sh +++ b/experiments/17-chbenchmark/debug/run_aurora_timing.sh @@ -9,10 +9,18 @@ extract_named_arguments $@ abs_txn_config_file=$(realpath $txn_config_file) cd ../../../workloads/chbenchmark/py-tpcc/ -RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc auroratiming \ - --no-load \ - --config $abs_txn_config_file \ - --warehouses $txn_warehouses \ - --duration $run_for_s \ - --clients $t_clients \ + +args=( + --no-load + --config $abs_txn_config_file + --warehouses $txn_warehouses + --duration $run_for_s + --clients $t_clients --scalefactor $txn_scale_factor +) + +if [[ ! 
-z $txn_zipfian_alpha ]]; then + args+=(--zipfian-alpha $txn_zipfian_alpha) +fi + +RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc auroratiming "${args[@]}" diff --git a/tools/calibration/transactions/chbenchmark/COND b/tools/calibration/transactions/chbenchmark/COND index f59e559a..f8901f59 100644 --- a/tools/calibration/transactions/chbenchmark/COND +++ b/tools/calibration/transactions/chbenchmark/COND @@ -13,6 +13,8 @@ COND_INSTANCES = { instance: instance.replace(".", "_").replace("db.", "") for instance in INSTANCES } +ZIPFIAN_ALPHA = 5.0 + combine( name="all", deps=[ @@ -36,6 +38,7 @@ for instance in INSTANCES: "txn-warehouses": 1740, "txn-config-file": "aurora.config", "schema-name": "chbenchmark", + "txn-zipfian-alpha": ZIPFIAN_ALPHA, "instance": instance, }, ) diff --git a/tools/calibration/transactions/chbenchmark/run_instance.sh b/tools/calibration/transactions/chbenchmark/run_instance.sh index dfe0c6b5..3890358c 100755 --- a/tools/calibration/transactions/chbenchmark/run_instance.sh +++ b/tools/calibration/transactions/chbenchmark/run_instance.sh @@ -38,6 +38,10 @@ function extract_named_arguments() { if [[ $phys_arg =~ --instance=.+ ]]; then instance=${phys_arg:11} fi + + if [[ $phys_arg =~ --txn-zipfian-alpha=.+ ]]; then + txn_zipfian_alpha=${phys_arg:20} + fi done } @@ -74,7 +78,8 @@ RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc aurora \ --duration $run_for_s \ --clients $t_clients \ --scalefactor 1 \ - --lat-sample-prob 0.25 + --lat-sample-prob 0.25 \ + --txn-zipfian-alpha $txn_zipfian_alpha popd >&2 echo "Waiting 10 seconds before retrieving metrics..." diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroradriver.py b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroradriver.py index 79e65ebc..26d10812 100644 --- a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroradriver.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroradriver.py @@ -36,6 +36,8 @@ "getStockInfo": "SELECT s_quantity, s_data, s_ytd, s_order_cnt, s_remote_cnt, s_dist_{:02d} FROM stock WHERE s_i_id = {} AND s_w_id = {}", # d_id, ol_i_id, ol_supply_w_id "updateStock": "UPDATE stock SET s_quantity = {}, s_ytd = {}, s_order_cnt = {}, s_remote_cnt = {} WHERE s_i_id = {} AND s_w_id = {}", # s_quantity, s_order_cnt, s_remote_cnt, ol_i_id, ol_supply_w_id "createOrderLine": "INSERT INTO order_line (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", # o_id, d_id, w_id, ol_number, ol_i_id, ol_supply_w_id, ol_quantity, ol_amount, ol_dist_info + "createOrderLineMultivalue": "INSERT INTO order_line (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ", + "createOrderLineValues": "({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", }, "ORDER_STATUS": { "getCustomerByCustomerId": "SELECT c_id, c_first, c_middle, c_last, c_balance FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # w_id, d_id, c_id @@ -81,7 +83,7 @@ class AuroraDriver(AbstractDriver): } def __init__(self, ddl: str) -> None: - super().__init__("brad", ddl) + super().__init__("aurora", ddl) self._connection: Optional[PsycopgConnection] = None self._cursor: Optional[PsycopgCursor] = None self._config: Dict[str, Any] = {} @@ -127,19 +129,19 @@ def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: self._cursor.execute_sync("BEGIN") for d_id in range(1, constants.DISTRICTS_PER_WAREHOUSE + 1): 
self._cursor.execute_sync(q["getNewOrder"].format(d_id, w_id)) - r = self._cursor.fetchall_sync() - if len(r) == 0: + r = self._cursor.fetchone_sync() + if r is None: ## No orders for this district: skip it. Note: This must be reported if > 1% continue - no_o_id = r[0][0] + no_o_id = r[0] self._cursor.execute_sync(q["getCId"].format(no_o_id, d_id, w_id)) - r = self._cursor.fetchall_sync() - c_id = r[0][0] + r = self._cursor.fetchone_sync() + c_id = r[0] self._cursor.execute_sync(q["sumOLAmount"].format(no_o_id, d_id, w_id)) - r = self._cursor.fetchall_sync() - ol_total = decimal.Decimal(r[0][0]) + r = self._cursor.fetchone_sync() + ol_total = decimal.Decimal(r[0]) self._cursor.execute_sync( q["deleteNewOrder"].format(d_id, w_id, no_o_id) @@ -203,8 +205,8 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ## Determine if this is an all local order or not all_local = all_local and i_w_ids[i] == w_id self._cursor.execute_sync(q["getItemInfo"].format(i_ids[i])) - r = self._cursor.fetchall_sync() - items.append(r[0]) + r = self._cursor.fetchone_sync() + items.append(r) assert len(items) == len(i_ids) ## TPCC defines 1% of neworder gives a wrong itemid, causing rollback. @@ -219,18 +221,18 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ## Collect Information from WAREHOUSE, DISTRICT, and CUSTOMER ## ---------------- self._cursor.execute_sync(q["getWarehouseTaxRate"].format(w_id)) - r = self._cursor.fetchall_sync() - w_tax = r[0][0] + r = self._cursor.fetchone_sync() + w_tax = r[0] self._cursor.execute_sync(q["getDistrict"].format(d_id, w_id)) - r = self._cursor.fetchall_sync() - district_info = r[0] + r = self._cursor.fetchone_sync() + district_info = r d_tax = district_info[0] d_next_o_id = district_info[1] self._cursor.execute_sync(q["getCustomer"].format(w_id, d_id, c_id)) - r = self._cursor.fetchall_sync() - customer_info = r[0] + r = self._cursor.fetchone_sync() + customer_info = r c_discount = customer_info[0] ## ---------------- @@ -261,6 +263,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ## Insert Order Item Information ## ---------------- item_data = [] + insert_value_strings = [] total = 0 for i in range(len(i_ids)): ol_number = i + 1 @@ -276,15 +279,15 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: self._cursor.execute_sync( q["getStockInfo"].format(d_id, ol_i_id, ol_supply_w_id) ) - r = self._cursor.fetchall_sync() - if len(r) == 0: + r = self._cursor.fetchone_sync() + if r is None: logger.warning( "No STOCK record for (ol_i_id=%d, ol_supply_w_id=%d)", ol_i_id, ol_supply_w_id, ) continue - stockInfo = r[0] + stockInfo = r s_quantity = stockInfo[0] s_ytd = decimal.Decimal(stockInfo[2]) s_order_cnt = int(stockInfo[3]) @@ -326,7 +329,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_amount = ol_quantity * i_price total += ol_amount - createOrderLine = q["createOrderLine"].format( + createOrderLineValues = q["createOrderLineValues"].format( d_next_o_id, d_id, w_id, @@ -338,7 +341,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_amount, s_dist_xx, ) - self._cursor.execute_sync(createOrderLine) + insert_value_strings.append(createOrderLineValues) ## Add the info to be returned item_data.append( @@ -346,6 +349,12 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ) ## FOR + # Do one multivalue insert. 
+ insertOrderLines = q["createOrderLineMultivalue"] + ", ".join( + insert_value_strings + ) + self._cursor.execute_sync(insertOrderLines) + ## Commit! self._cursor.execute_sync("COMMIT") @@ -385,8 +394,8 @@ def doOrderStatus(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: self._cursor.execute_sync( q["getCustomerByCustomerId"].format(w_id, d_id, c_id) ) - r = self._cursor.fetchall_sync() - customer = r[0] + r = self._cursor.fetchone_sync() + customer = r else: # Get the midpoint customer's id self._cursor.execute_sync( @@ -404,13 +413,13 @@ def doOrderStatus(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: getLastOrder = q["getLastOrder"].format(w_id, d_id, c_id) self._cursor.execute_sync(getLastOrder) - r = self._cursor.fetchall_sync() - order = r[0] + r = self._cursor.fetchone_sync() + order = r if order: self._cursor.execute_sync( q["getOrderLines"].format(w_id, d_id, order[0]) ) - r = self._cursor.fetchall_sync() + r = self._cursor.fetchone_sync() orderLines = r else: orderLines = [] @@ -443,8 +452,8 @@ def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: self._cursor.execute_sync( q["getCustomerByCustomerId"].format(w_id, d_id, c_id) ) - r = self._cursor.fetchall_sync() - customer = r[0] + r = self._cursor.fetchone_sync() + customer = r else: # Get the midpoint customer's id self._cursor.execute_sync( @@ -464,12 +473,12 @@ def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: c_data = customer[17] self._cursor.execute_sync(q["getWarehouse"].format(w_id)) - r = self._cursor.fetchall_sync() - warehouse = r[0] + r = self._cursor.fetchone_sync() + warehouse = r self._cursor.execute_sync(q["getDistrict"].format(w_id, d_id)) - r = self._cursor.fetchall_sync() - district = r[0] + r = self._cursor.fetchone_sync() + district = r self._cursor.execute_sync( q["updateWarehouseBalance"].format(h_amount, w_id) @@ -548,8 +557,8 @@ def doStockLevel(self, params: Dict[str, Any]) -> int: self._cursor.execute_sync("BEGIN") self._cursor.execute_sync(q["getOId"].format(w_id, d_id)) - r = self._cursor.fetchall_sync() - result = r[0] + r = self._cursor.fetchone_sync() + result = r assert result o_id = result[0] @@ -558,8 +567,8 @@ def doStockLevel(self, params: Dict[str, Any]) -> int: w_id, d_id, o_id, (o_id - 20), w_id, threshold ) ) - r = self._cursor.fetchall_sync() - result = r[0] + r = self._cursor.fetchone_sync() + result = r self._cursor.execute_sync("COMMIT") return int(result[0]) diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py index d1d88cf4..8443fd53 100644 --- a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/auroratimingdriver.py @@ -38,6 +38,8 @@ "getStockInfo": "SELECT s_quantity, s_data, s_ytd, s_order_cnt, s_remote_cnt, s_dist_{:02d} FROM stock WHERE s_i_id = {} AND s_w_id = {}", # d_id, ol_i_id, ol_supply_w_id "updateStock": "UPDATE stock SET s_quantity = {}, s_ytd = {}, s_order_cnt = {}, s_remote_cnt = {} WHERE s_i_id = {} AND s_w_id = {}", # s_quantity, s_order_cnt, s_remote_cnt, ol_i_id, ol_supply_w_id "createOrderLine": "INSERT INTO order_line (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", # o_id, d_id, w_id, ol_number, ol_i_id, ol_supply_w_id, ol_quantity, ol_amount, ol_dist_info + "createOrderLineMultivalue": "INSERT INTO order_line (ol_o_id, 
ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ", + "createOrderLineValues": "({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", }, "ORDER_STATUS": { "getCustomerByCustomerId": "SELECT c_id, c_first, c_middle, c_last, c_balance FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # w_id, d_id, c_id @@ -83,7 +85,7 @@ class AuroraTimingDriver(AbstractDriver): } def __init__(self, ddl: str) -> None: - super().__init__("brad", ddl) + super().__init__("aurora timing", ddl) self._connection: Optional[PsycopgConnection] = None self._cursor: Optional[PsycopgCursor] = None self._config: Dict[str, Any] = {} @@ -127,7 +129,7 @@ def executeStart(self): measure_file_path = cond.in_output_dir("aurora_timing.csv") self._measure_file = open(measure_file_path, "w", encoding="UTF-8") print( - "init,begin,getitems,getwdc,getorder,insertorder,commit,collect,total", + "init,begin,getitems,getwdc,getorder,insertorder,commit,collect,multi_insert_time,total", file=self._measure_file, ) @@ -224,7 +226,7 @@ def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: print(traceback.format_exc()) raise - def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: + def doNewOrderOriginal(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: try: assert self._cursor is not None @@ -494,6 +496,285 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: print(traceback.format_exc()) raise + def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: + try: + assert self._cursor is not None + + no_start = time.time() + q = TXN_QUERIES["NEW_ORDER"] + w_id = params["w_id"] + d_id = params["d_id"] + c_id = params["c_id"] + o_entry_d = params["o_entry_d"] + i_ids = params["i_ids"] + i_w_ids = params["i_w_ids"] + i_qtys = params["i_qtys"] + + assert len(i_ids) > 0 + assert len(i_ids) == len(i_w_ids) + assert len(i_ids) == len(i_qtys) + + no_pbegin = time.time() + self._cursor.execute_sync("BEGIN") + no_abegin = time.time() + all_local = True + items = [] + for i in range(len(i_ids)): + ## Determine if this is an all local order or not + all_local = all_local and i_w_ids[i] == w_id + self._cursor.execute_sync(q["getItemInfo"].format(i_ids[i])) + r = self._cursor.fetchone_sync() + items.append(r) + assert len(items) == len(i_ids) + no_getitems = time.time() + + ## TPCC defines 1% of neworder gives a wrong itemid, causing rollback. + ## Note that this will happen with 1% of transactions on purpose. 
+ for item in items: + if item is None or len(item) == 0: + self._cursor.execute_sync("ROLLBACK") + return + ## FOR + + ## ---------------- + ## Collect Information from WAREHOUSE, DISTRICT, and CUSTOMER + ## ---------------- + wdc_start = time.time() + get_warehouse = q["getWarehouseTaxRate"].format(w_id) + self._cursor.execute_sync(get_warehouse) + r = self._cursor.fetchone_sync() + w_tax = r[0] + wdc_warehouse_tax_rate = time.time() + + get_district = q["getDistrict"].format(d_id, w_id) + self._cursor.execute_sync(get_district) + r = self._cursor.fetchone_sync() + district_info = r + d_tax = district_info[0] + d_next_o_id = district_info[1] + wdc_district = time.time() + + get_customer = q["getCustomer"].format(w_id, d_id, c_id) + self._cursor.execute_sync(get_customer) + r = self._cursor.fetchone_sync() + customer_info = r + c_discount = customer_info[0] + no_get_wdc_info = time.time() + + if self._query_log_file is not None: + print(get_warehouse, file=self._query_log_file) + print(get_district, file=self._query_log_file) + print(get_customer, file=self._query_log_file) + + ## ---------------- + ## Insert Order Information + ## ---------------- + ol_cnt = len(i_ids) + o_carrier_id = constants.NULL_CARRIER_ID + + self._cursor.execute_sync( + q["incrementNextOrderId"].format(d_next_o_id + 1, d_id, w_id) + ) + createOrder = q["createOrder"].format( + d_next_o_id, + d_id, + w_id, + c_id, + o_entry_d.strftime("%Y-%m-%d %H:%M:%S"), + o_carrier_id, + ol_cnt, + 1 if all_local else 0, + ) + self._cursor.execute_sync(createOrder) + self._cursor.execute_sync( + q["createNewOrder"].format(d_next_o_id, d_id, w_id) + ) + no_ins_order_info = time.time() + + ## ---------------- + ## Insert Order Item Information + ## ---------------- + item_data = [] + total = 0 + insert_metadata = [] + insert_value_strs = [] + for i in range(len(i_ids)): + io_start = time.time() + ol_number = i + 1 + ol_supply_w_id = i_w_ids[i] + ol_i_id = i_ids[i] + ol_quantity = i_qtys[i] + + itemInfo = items[i] + i_name = itemInfo[1] + i_data = itemInfo[2] + i_price = decimal.Decimal(itemInfo[0]) + io_init = time.time() + + get_stock_info = q["getStockInfo"].format(d_id, ol_i_id, ol_supply_w_id) + self._cursor.execute_sync(get_stock_info) + r = self._cursor.fetchone_sync() + io_fetch_stock = time.time() + if r is None: + logger.warning( + "No STOCK record for (ol_i_id=%d, ol_supply_w_id=%d)", + ol_i_id, + ol_supply_w_id, + ) + continue + stockInfo = r + s_quantity = stockInfo[0] + s_ytd = decimal.Decimal(stockInfo[2]) + s_order_cnt = int(stockInfo[3]) + s_remote_cnt = int(stockInfo[4]) + s_data = stockInfo[1] + s_dist_xx = stockInfo[5] # Fetches data from the s_dist_[d_id] column + + ## Update stock + s_ytd += ol_quantity + if s_quantity >= ol_quantity + 10: + s_quantity = s_quantity - ol_quantity + else: + s_quantity = s_quantity + 91 - ol_quantity + s_order_cnt += 1 + + if ol_supply_w_id != w_id: + s_remote_cnt += 1 + io_stock_prep = time.time() + + update_stock = q["updateStock"].format( + s_quantity, + s_ytd.quantize(decimal.Decimal("1.00")), + s_order_cnt, + s_remote_cnt, + ol_i_id, + ol_supply_w_id, + ) + self._cursor.execute_sync(update_stock) + io_update_stock = time.time() + + if ( + i_data.find(constants.ORIGINAL_STRING) != -1 + and s_data.find(constants.ORIGINAL_STRING) != -1 + ): + brand_generic = "B" + else: + brand_generic = "G" + + ## Transaction profile states to use "ol_quantity * i_price" + ol_amount = ol_quantity * i_price + total += ol_amount + io_ol_prep = time.time() + + createOrderLineValues = 
q["createOrderLineValues"].format( + d_next_o_id, + d_id, + w_id, + ol_number, + ol_i_id, + ol_supply_w_id, + o_entry_d.strftime("%Y-%m-%d %H:%M:%S"), + ol_quantity, + ol_amount, + s_dist_xx, + ) + insert_value_strs.append(createOrderLineValues) + io_ol_insert = time.time() + + ## Add the info to be returned + item_data.append( + (i_name, s_quantity, brand_generic, i_price, ol_amount) + ) + io_ol_append = time.time() + + insert_metadata.append( + ( + io_init - io_start, + io_fetch_stock - io_init, + io_stock_prep - io_fetch_stock, + io_update_stock - io_stock_prep, + io_ol_prep - io_update_stock, + io_ol_insert - io_ol_prep, + io_ol_append - io_ol_insert, + io_ol_append - io_start, + ) + ) + + if self._query_log_file is not None: + print(get_stock_info, file=self._query_log_file) + print(update_stock, file=self._query_log_file) + + no_mv_insert_pre = time.time() + ## FOR + insert_order_line_query = q["createOrderLineMultivalue"] + ", ".join( + insert_value_strs + ) + self._cursor.execute_sync(insert_order_line_query) + no_mv_insert_after = time.time() + if self._query_log_file is not None: + print(insert_order_line_query, file=self._query_log_file) + no_insert_order_line = time.time() + + ## Commit! + self._cursor.execute_sync("COMMIT") + no_commit = time.time() + + ## Adjust the total for the discount + # print "c_discount:", c_discount, type(c_discount) + # print "w_tax:", w_tax, type(w_tax) + # print "d_tax:", d_tax, type(d_tax) + total = int( + total + * (1 - decimal.Decimal(c_discount)) + * (1 + decimal.Decimal(w_tax) + decimal.Decimal(d_tax)) + ) + + ## Pack up values the client is missing (see TPC-C 2.4.3.5) + misc = [(w_tax, d_tax, d_next_o_id, total)] + no_collect = time.time() + + if self._measure_file is not None: + init_time = no_pbegin - no_start + begin_time = no_abegin - no_pbegin + getitems_time = no_getitems - no_abegin + getwdc_time = no_get_wdc_info - no_getitems + getorder_time = no_ins_order_info - no_get_wdc_info + insertorder_time = no_insert_order_line - no_ins_order_info + commit_time = no_commit - no_insert_order_line + collect_time = no_collect - no_commit + total_time = no_collect - no_start + multi_insert_time = no_mv_insert_after - no_mv_insert_pre + print( + f"{init_time},{begin_time},{getitems_time},{getwdc_time},{getorder_time},{insertorder_time},{commit_time},{collect_time},{multi_insert_time},{total_time}", + file=self._measure_file, + ) + + if self._wdc_stats_file is not None: + tax_rate_time = wdc_warehouse_tax_rate - wdc_start + district_time = wdc_district - wdc_warehouse_tax_rate + customer_time = no_get_wdc_info - wdc_district + total_time = no_get_wdc_info - wdc_start + print( + f"{tax_rate_time},{district_time},{customer_time},{total_time}", + file=self._wdc_stats_file, + ) + + if self._ol_stats_file is not None: + for im in insert_metadata: + print( + "{},{},{},{},{},{},{},{},{}".format(self._ins_ol_counter, *im), + file=self._ol_stats_file, + ) + self._ins_ol_counter += 1 + + return [customer_info, misc, item_data] + + except Exception as ex: + if self._nonsilent_errs: + print("Error in NEWORDER", str(ex)) + print(traceback.format_exc()) + raise + def doOrderStatus(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: try: assert self._cursor is not None diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/braddriver.py b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/braddriver.py index 9458a0c1..fa6e678f 100644 --- a/workloads/chbenchmark/py-tpcc/pytpcc/drivers/braddriver.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/drivers/braddriver.py 
@@ -35,6 +35,8 @@ "getStockInfo": "SELECT s_quantity, s_data, s_ytd, s_order_cnt, s_remote_cnt, s_dist_{:02d} FROM stock WHERE s_i_id = {} AND s_w_id = {}", # d_id, ol_i_id, ol_supply_w_id "updateStock": "UPDATE stock SET s_quantity = {}, s_ytd = {}, s_order_cnt = {}, s_remote_cnt = {} WHERE s_i_id = {} AND s_w_id = {}", # s_quantity, s_order_cnt, s_remote_cnt, ol_i_id, ol_supply_w_id "createOrderLine": "INSERT INTO order_line (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", # o_id, d_id, w_id, ol_number, ol_i_id, ol_supply_w_id, ol_quantity, ol_amount, ol_dist_info + "createOrderLineMultivalue": "INSERT INTO order_line (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info) VALUES ", + "createOrderLineValues": "({}, {}, {}, {}, {}, {}, '{}', {}, {}, '{}')", }, "ORDER_STATUS": { "getCustomerByCustomerId": "SELECT c_id, c_first, c_middle, c_last, c_balance FROM customer WHERE c_w_id = {} AND c_d_id = {} AND c_id = {}", # w_id, d_id, c_id @@ -119,7 +121,7 @@ def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_delivery_d = params["ol_delivery_d"] result: List[Tuple[Any, ...]] = [] - self._client.run_query_json("BEGIN") + self._client.run_query_ignore_results("BEGIN") for d_id in range(1, constants.DISTRICTS_PER_WAREHOUSE + 1): r, _ = self._client.run_query_json(q["getNewOrder"].format(d_id, w_id)) if len(r) == 0: @@ -137,17 +139,17 @@ def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ) ol_total = decimal.Decimal(r[0][0]) - self._client.run_query_json( + self._client.run_query_ignore_results( q["deleteNewOrder"].format(d_id, w_id, no_o_id) ) updateOrders = q["updateOrders"].format( o_carrier_id, no_o_id, d_id, w_id ) - self._client.run_query_json(updateOrders) + self._client.run_query_ignore_results(updateOrders) updateOrderLine = q["updateOrderLine"].format( ol_delivery_d.strftime("%Y-%m-%d %H:%M:%S"), no_o_id, d_id, w_id ) - self._client.run_query_json(updateOrderLine) + self._client.run_query_ignore_results(updateOrderLine) # These must be logged in the "result file" according to TPC-C 2.7.2.2 (page 39) # We remove the queued time, completed time, w_id, and o_carrier_id: the client can figure @@ -158,7 +160,7 @@ def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ), "ol_total is NULL: there are no order lines. This should not happen" assert ol_total > 0.0 - self._client.run_query_json( + self._client.run_query_ignore_results( q["updateCustomer"].format( ol_total.quantize(decimal.Decimal("1.00")), c_id, d_id, w_id ) @@ -166,7 +168,7 @@ def doDelivery(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: result.append((d_id, no_o_id)) - self._client.run_query_json("COMMIT") + self._client.run_query_ignore_results("COMMIT") return result except Exception as ex: @@ -192,7 +194,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: assert len(i_ids) == len(i_w_ids) assert len(i_ids) == len(i_qtys) - self._client.run_query_json("BEGIN") + self._client.run_query_ignore_results("BEGIN") all_local = True items = [] for i in range(len(i_ids)): @@ -206,7 +208,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ## Note that this will happen with 1% of transactions on purpose. 
for item in items: if len(item) == 0: - self._client.run_query_json("ROLLBACK") + self._client.run_query_ignore_results("ROLLBACK") return ## FOR @@ -233,7 +235,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_cnt = len(i_ids) o_carrier_id = constants.NULL_CARRIER_ID - self._client.run_query_json( + self._client.run_query_ignore_results( q["incrementNextOrderId"].format(d_next_o_id + 1, d_id, w_id) ) createOrder = q["createOrder"].format( @@ -246,8 +248,8 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_cnt, 1 if all_local else 0, ) - self._client.run_query_json(createOrder) - self._client.run_query_json( + self._client.run_query_ignore_results(createOrder) + self._client.run_query_ignore_results( q["createNewOrder"].format(d_next_o_id, d_id, w_id) ) @@ -256,6 +258,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ## ---------------- item_data = [] total = 0 + insert_value_strings = [] for i in range(len(i_ids)): ol_number = i + 1 ol_supply_w_id = i_w_ids[i] @@ -296,7 +299,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: if ol_supply_w_id != w_id: s_remote_cnt += 1 - self._client.run_query_json( + self._client.run_query_ignore_results( q["updateStock"].format( s_quantity, s_ytd.quantize(decimal.Decimal("1.00")), @@ -319,7 +322,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_amount = ol_quantity * i_price total += ol_amount - createOrderLine = q["createOrderLine"].format( + createOrderLineValues = q["createOrderLineValues"].format( d_next_o_id, d_id, w_id, @@ -331,7 +334,7 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ol_amount, s_dist_xx, ) - self._client.run_query_json(createOrderLine) + insert_value_strings.append(createOrderLineValues) ## Add the info to be returned item_data.append( @@ -339,8 +342,14 @@ def doNewOrder(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: ) ## FOR + # Do one multivalue insert. + insertOrderLines = q["createOrderLineMultivalue"] + ", ".join( + insert_value_strings + ) + self._client.run_query_ignore_results(insertOrderLines) + ## Commit! 
- self._client.run_query_json("COMMIT") + self._client.run_query_ignore_results("COMMIT") ## Adjust the total for the discount # print "c_discount:", c_discount, type(c_discount) @@ -373,7 +382,7 @@ def doOrderStatus(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: c_id = params["c_id"] c_last = params["c_last"] - self._client.run_query_json("BEGIN") + self._client.run_query_ignore_results("BEGIN") if c_id != None: r, _ = self._client.run_query_json( q["getCustomerByCustomerId"].format(w_id, d_id, c_id) @@ -404,7 +413,7 @@ def doOrderStatus(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: else: orderLines = [] - self._client.run_query_json("COMMIT") + self._client.run_query_ignore_results("COMMIT") return [customer, order, orderLines] except Exception as ex: @@ -427,7 +436,7 @@ def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: c_last = params["c_last"] h_date = params["h_date"] # Python datetime - self._client.run_query_json("BEGIN") + self._client.run_query_ignore_results("BEGIN") if c_id != None: r, _ = self._client.run_query_json( q["getCustomerByCustomerId"].format(w_id, d_id, c_id) @@ -456,10 +465,10 @@ def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: r, _ = self._client.run_query_json(q["getDistrict"].format(w_id, d_id)) district = r[0] - self._client.run_query_json( + self._client.run_query_ignore_results( q["updateWarehouseBalance"].format(h_amount, w_id) ) - self._client.run_query_json( + self._client.run_query_ignore_results( q["updateDistrictBalance"].format(h_amount, w_id, d_id) ) @@ -480,10 +489,10 @@ def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: c_d_id, c_id, ) - self._client.run_query_json(updateCustomer) + self._client.run_query_ignore_results(updateCustomer) else: c_data = "" - self._client.run_query_json( + self._client.run_query_ignore_results( q["updateGCCustomer"].format( c_balance, c_ytd_payment, c_payment_cnt, c_w_id, c_d_id, c_id ), @@ -502,9 +511,9 @@ def doPayment(self, params: Dict[str, Any]) -> List[Tuple[Any, ...]]: h_amount.quantize(decimal.Decimal("1.00")), h_data, ) - self._client.run_query_json(insertHistory) + self._client.run_query_ignore_results(insertHistory) - self._client.run_query_json("COMMIT") + self._client.run_query_ignore_results("COMMIT") # TPC-C 2.5.3.3: Must display the following fields: # W_ID, D_ID, C_ID, C_D_ID, C_W_ID, W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, @@ -531,7 +540,7 @@ def doStockLevel(self, params: Dict[str, Any]) -> int: d_id = params["d_id"] threshold = params["threshold"] - self._client.run_query_json("BEGIN") + self._client.run_query_ignore_results("BEGIN") r, _ = self._client.run_query_json(q["getOId"].format(w_id, d_id)) result = r[0] assert result @@ -544,7 +553,7 @@ def doStockLevel(self, params: Dict[str, Any]) -> int: ) result = r[0] - self._client.run_query_json("COMMIT") + self._client.run_query_ignore_results("COMMIT") return int(result[0]) except Exception as ex: diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py b/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py index c25bce1c..f10f111f 100644 --- a/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py @@ -37,6 +37,7 @@ import logging import os import pathlib +import numpy as np from datetime import datetime from pprint import pprint, pformat from brad.utils.rand_exponential_backoff import RandomizedExponentialBackoff @@ -62,6 +63,9 @@ def __init__(self, driver, scaleParameters, 
stop_on_error=False, pct_remote=0.1) self.total_workers = 1 self.worker_index = 0 + self.skew_alpha = None + self.skew_prng = None + ## DEF def execute( @@ -70,6 +74,7 @@ def execute( worker_index: int, total_workers: int, lat_sample_prob: float, + zipfian_alpha: Optional[float], ) -> results.Results: if RECORD_DETAILED_STATS_VAR in os.environ: import conductor.lib as cond @@ -115,6 +120,17 @@ def execute( *self.local_warehouse_range ) + if zipfian_alpha is not None: + self.skew_alpha = zipfian_alpha + self.skew_prng = np.random.default_rng(seed=42 ^ worker_index) + logging.info( + "Worker index %d - Selecting warehouse and items using a Zipfian distribution; a = %.2f", + worker_index, + self.skew_alpha, + ) + else: + logging.info("Worker index %d - Not using a Zipfian distribution") + r = results.Results(options) assert r logging.info("Executing benchmark for %d seconds" % duration) @@ -370,7 +386,19 @@ def makeWarehouseId(self): ): break else: - w_id = rand.number(*self.local_warehouse_range) + if self.skew_prng is not None: + # Skewed warehouse choice + min_warehouse, max_warehouse = self.local_warehouse_range + warehouse_span = max_warehouse - min_warehouse + 1 + while True: + # Chosen in range [1, inf) + candidate = self.skew_prng.zipf(a=self.skew_alpha) + if candidate <= warehouse_span: + break + return min_warehouse + (candidate - 1) + else: + # Uniformly randomly chosen warehouse + w_id = rand.number(*self.local_warehouse_range) assert w_id >= self.scaleParameters.starting_warehouse, ( "Invalid W_ID: %d" % w_id @@ -391,7 +419,14 @@ def makeCustomerId(self): ## DEF def makeItemId(self): - return rand.NURand(8191, 1, self.scaleParameters.items) + if self.skew_alpha is None: + return rand.NURand(8191, 1, self.scaleParameters.items) + else: + # Select item ID using a zipfian distribution. 
+ while True: + candidate = self.skew_prng.zipf(a=self.skew_alpha) + if candidate <= self.scaleParameters.items: + return candidate ## DEF diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py b/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py index 027d4fb3..57ff0910 100755 --- a/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py @@ -218,7 +218,11 @@ def executorFunc( ) driver.executeStart() results = e.execute( - args["duration"], worker_index, total_workers, args["lat_sample_prob"] + args["duration"], + worker_index, + total_workers, + args["lat_sample_prob"], + args["zipfian_alpha"], ) driver.executeFinish() @@ -304,6 +308,11 @@ def executorFunc( default=0.1, help="The fraction of the transaction latencies to record.", ) + aparser.add_argument( + "--zipfian-alpha", + type=float, + help="The alpha parameter to use in a Zipfian distribution when selecting warehouse and item IDs.", + ) args = vars(aparser.parse_args()) if args["debug"]: @@ -386,6 +395,7 @@ def executorFunc( worker_index=0, total_workers=1, lat_sample_prob=args["lat_sample_prob"], + zipfian_alpha=args["zipfian_alpha"], ) driver.executeFinish() else: From 30119b8f29e55338dc7d5544c29398fda4ed83fc Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Wed, 1 May 2024 23:12:25 +0000 Subject: [PATCH 07/30] Fix command line argument --- tools/calibration/transactions/chbenchmark/run_instance.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/calibration/transactions/chbenchmark/run_instance.sh b/tools/calibration/transactions/chbenchmark/run_instance.sh index 3890358c..74a0cb8b 100755 --- a/tools/calibration/transactions/chbenchmark/run_instance.sh +++ b/tools/calibration/transactions/chbenchmark/run_instance.sh @@ -79,7 +79,7 @@ RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc aurora \ --clients $t_clients \ --scalefactor 1 \ --lat-sample-prob 0.25 \ - --txn-zipfian-alpha $txn_zipfian_alpha + --zipfian-alpha $txn_zipfian_alpha popd >&2 echo "Waiting 10 seconds before retrieving metrics..." From 025dff6cef87d07c4024582a5326575616abf2a0 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Thu, 2 May 2024 19:17:20 -0400 Subject: [PATCH 08/30] Various changes to support the table movement experiments (#505) This is to make running them slightly less painful: - We avoid deleting tables from Athena - Add experiment configs for table movement (specialized scenario first) - Add tool to make physical alterations to the blueprint and placement Part of #487. 
--- .../15-e2e-scenarios-v2/specialized/COND | 10 ++ .../specialized/run_vector_workload_tm.sh | 64 +++++++ .../specialized/specialized_config_tm.yml | 166 ++++++++++++++++++ src/brad/admin/table_adjustments.py | 112 ++++++++++++ src/brad/blueprint/sql_gen/table.py | 17 ++ src/brad/config/file.py | 16 ++ src/brad/daemon/transition_orchestrator.py | 25 ++- src/brad/exec/admin.py | 2 + 8 files changed, 403 insertions(+), 9 deletions(-) create mode 100755 experiments/15-e2e-scenarios-v2/specialized/run_vector_workload_tm.sh create mode 100644 experiments/15-e2e-scenarios-v2/specialized/specialized_config_tm.yml create mode 100644 src/brad/admin/table_adjustments.py diff --git a/experiments/15-e2e-scenarios-v2/specialized/COND b/experiments/15-e2e-scenarios-v2/specialized/COND index 439d850b..c7d38ced 100644 --- a/experiments/15-e2e-scenarios-v2/specialized/COND +++ b/experiments/15-e2e-scenarios-v2/specialized/COND @@ -19,6 +19,16 @@ run_experiment( }, ) +run_experiment( + name="brad_100g_vector_tm", + run="./run_vector_workload_tm.sh", + options={ + # NOTE: This has table movement enabled. + "system-config-file": "specialized_config_tm.yml", + **COMMON_CONFIGS, + }, +) + run_experiment( name="hand_designed_100g_vector", run="./run_vector_workload.sh", diff --git a/experiments/15-e2e-scenarios-v2/specialized/run_vector_workload_tm.sh b/experiments/15-e2e-scenarios-v2/specialized/run_vector_workload_tm.sh new file mode 100755 index 00000000..a6209ec4 --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/specialized/run_vector_workload_tm.sh @@ -0,0 +1,64 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +# Arguments: +# --config-file +# --planner-config-file +# --query-indexes +extract_named_arguments $@ + +# Repeating query indexes: +# 51, 53, 58, 61, 62, 64, 65, 66, 69, 72, 73, 74, 77, 86, 91 +# +# Touch `title`: +# 65, 69, 73 +# +# Heavy repeating query indexes: +# 14, 54, 59, 60, 71, 75 +# +# Touch `title`: +# 14, 54, 59, 75 + +# General scenario: +# Aurora is being used for queries involving `title` because of the vector +# similarity queries that also touch `title`. After deploying BRAD, it realizes +# that it's better to replicate `title` and route the rest of the queries onto +# Redshift. + +query_indices="62,64,65,66,69,72,73,74,91,59" +heavier_queries="14,54,60,71,75" +all_queries="${query_indices},${heavier_queries}" + +start_brad $system_config_file $physical_config_file +log_workload_point "brad_start_initiated" +sleep 30 + +log_workload_point "clients_starting" +start_repeating_olap_runner 8 5 5 $all_queries "ra_8" +rana_pid=$runner_pid + +start_other_repeating_runner 2 8 5 "ra_vector" 8 +other_pid=$runner_pid + +start_txn_runner_serial 4 # Implicit: --dataset-type +txn_pid=$runner_pid +log_workload_point "clients_started" + +function inner_cancel_experiment() { + cancel_experiment $rana_pid $txn_pid $other_pid +} + +trap "inner_cancel_experiment" INT +trap "inner_cancel_experiment" TERM + +# Note that this line is different from the TM-disabled version (3 hours instead of 2). +sleep $((3 * 60 * 60)) # Wait for 3 hours. +log_workload_point "experiment_done" + +# Shut down everything now. +>&2 echo "Experiment done. Shutting down runners..." 
+graceful_shutdown $rana_pid $txn_pid $other_pid +log_workload_point "shutdown_complete" diff --git a/experiments/15-e2e-scenarios-v2/specialized/specialized_config_tm.yml b/experiments/15-e2e-scenarios-v2/specialized/specialized_config_tm.yml new file mode 100644 index 00000000..41d3ca39 --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/specialized/specialized_config_tm.yml @@ -0,0 +1,166 @@ +# This file contains configurations that are used by BRAD. These are default +# values and should be customized for specific situations. + +# BRAD's front end servers will listen for client connections on this interface +# and port. If `num_front_ends` is greater than one, subsequent front ends will +# listen on successive ports (e.g., 6584, 6585, etc.). +front_end_interface: "0.0.0.0" +front_end_port: 6583 +num_front_ends: 16 + +# Logging paths. If the value is in ALL_CAPS (with underscores), it is +# interpreted as an environment variable (BRAD will log to the path stored in +# the environment variable). + +# Where BRAD's daemon process will write its logs. +daemon_log_file: COND_OUT + +# Where BRAD's front end processes will write their logs. +front_end_log_path: COND_OUT + +# Where BRAD's blueprint planner will write debug logs. +planner_log_path: COND_OUT + +# Where BRAD's metrics loggers will write their logs. +metrics_log_path: COND_OUT + +# Probability that each transactional query will be logged. +txn_log_prob: 0.01 + +# Set to a non-zero value enable automatic data syncing. When this is set to 0, +# automatic syncing is disabled. +data_sync_period_seconds: 0 + +# BRAD's front end servers will report their metrics at regular intervals. +front_end_metrics_reporting_period_seconds: 30 +front_end_query_latency_buffer_size: 100 + +# `default` means to use the policy encoded in the blueprint. Other values will +# override the blueprint. +routing_policy: default + +# Whether to disable table movement for benchmark purposes (i.e., keep all +# tables on all engines.) +disable_table_movement: false +skip_sync_before_table_movement: true + +# Epoch length for metrics and forecasting. This is the granularity at which +# metrics/forecasting will be performed. +epoch_length: + weeks: 0 + days: 0 + hours: 0 + minutes: 1 + +# Blueprint planning strategy. +strategy: fp_query_based_beam + +# Used to specify the period of time over which to use data for planning. +# Currrently, this is a "look behind" window for the workload. +planning_window: + weeks: 0 + days: 0 + hours: 1 + minutes: 0 + +# Used to aggregate metrics collected in the planning window. +metrics_agg: + method: ewm # 'mean' is another option + alpha: 0.86466472 # 1 - 1 / e^2 + +# Used during planning. +reinterpret_second_as: 1 + +# The query distribution must change by at least this much for a new blueprint +# to be accepted. +query_dist_change_frac: 0.1 + +# The search bound for the provisioning. +max_provisioning_multiplier: 2.5 + +# Flag options for blueprint planning. +use_io_optimized_aurora: true +use_recorded_routing_if_available: true +ensure_tables_together_on_one_engine: true + +# Loads used to prime the system when no information is available. +aurora_initialize_load_fraction: 0.25 +redshift_initialize_load_fraction: 0.25 + +# BRAD will not reduce predicted load lower than these values. Raise these +# values to be more conservative against mispredictions. 
+aurora_min_load_removal_fraction: 0.8 +redshift_min_load_removal_fraction: 0.9 + +aurora_max_query_factor: 4.0 +aurora_max_query_factor_replace: 10000.0 +redshift_peak_load_threshold: 99.0 +redshift_peak_load_multiplier: 1.5 + +# Blueprint planning performance ceilings. +query_latency_p90_ceiling_s: 30.0 +txn_latency_p90_ceiling_s: 0.030 + +# Used for ordering blueprints during planning. +comparator: + type: benefit_perf_ceiling # or `perf_ceiling` + + benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator + weeks: 0 + days: 0 + hours: 24 + minutes: 0 + + penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator + penalty_power: 2 # Only used by the `benefit_perf_ceiling` comparator + +# Used for precomputed predictions. +std_datasets: + - name: regular + path: workloads/IMDB_100GB/regular_test/ + - name: adhoc + path: workloads/IMDB_100GB/adhoc_test/ + +use_preset_redshift_clusters: false + +aurora_provisioning_search_distance: 1500.0 +redshift_provisioning_search_distance: 400.0 + +planner_max_workers: 16 + +# Blueprint planning trigger configs. + +triggers: + enabled: true + check_period_s: 90 # Triggers are checked every X seconds. + check_period_offset_s: 360 # Wait 6 mins before starting. + observe_new_blueprint_mins: 3 + + elapsed_time: + disabled: true + multiplier: 60 # Multiplier over `planning_window`. + + redshift_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + aurora_cpu: + lo: 10 + hi: 85 + sustained_epochs: 3 + + variable_costs: + disabled: true + threshold: 1.0 + + query_latency_ceiling: + ceiling_s: 30.0 + sustained_epochs: 3 + + txn_latency_ceiling: + ceiling_s: 0.030 + sustained_epochs: 3 + + recent_change: + delay_epochs: 5 diff --git a/src/brad/admin/table_adjustments.py b/src/brad/admin/table_adjustments.py new file mode 100644 index 00000000..75c35b4f --- /dev/null +++ b/src/brad/admin/table_adjustments.py @@ -0,0 +1,112 @@ +import asyncio +import logging + +from brad.asset_manager import AssetManager +from brad.blueprint.manager import BlueprintManager +from brad.config.engine import Engine +from brad.config.file import ConfigFile +from brad.blueprint.blueprint import Blueprint +from brad.blueprint.sql_gen.table import TableSqlGenerator +from brad.front_end.engine_connections import EngineConnections + +logger = logging.getLogger(__name__) + + +def register_admin_action(subparser) -> None: + parser = subparser.add_parser( + "table_adjustments", + help="Used to manually modify the physical tables in BRAD's underlying infrastructure.", + ) + parser.add_argument( + "--physical-config-file", + type=str, + required=True, + help="Path to BRAD's physical configuration file.", + ) + parser.add_argument( + "--schema-name", + type=str, + required=True, + help="The schema name to use.", + ) + parser.add_argument( + "action", + type=str, + help="The action to run {remove_blueprint_table, rename_table}.", + ) + parser.add_argument( + "--table-name", type=str, help="The name of the table.", required=True + ) + parser.add_argument("--engines", type=str, nargs="+", help="The engines involved.") + parser.add_argument( + "--new-table-name", type=str, help="The new table name, when applicable." + ) + parser.set_defaults(admin_action=table_adjustments) + + +async def table_adjustments_impl(args) -> None: + # 1. Load the config, blueprint, and provisioning. 
+ config = ConfigFile.load_from_physical_config(phys_config=args.physical_config_file) + assets = AssetManager(config) + + blueprint_mgr = BlueprintManager(config, assets, args.schema_name) + await blueprint_mgr.load() + blueprint = blueprint_mgr.get_blueprint() + directory = blueprint_mgr.get_directory() + + if args.action == "remove_blueprint_table": + # NOTE: This only removes the table from the blueprint. You need to + # manually remove it from the physical engines (if appropriate). + table_to_remove = args.table_name + new_blueprint = Blueprint( + schema_name=blueprint.schema_name(), + table_schemas=[ + table for table in blueprint.tables() if table.name != table_to_remove + ], + table_locations={ + table_name: locations + for table_name, locations in blueprint.table_locations().items() + if table_name != table_to_remove + }, + aurora_provisioning=blueprint.aurora_provisioning(), + redshift_provisioning=blueprint.redshift_provisioning(), + full_routing_policy=blueprint.get_routing_policy(), + ) + blueprint_mgr.force_new_blueprint_sync(new_blueprint, score=None) + + elif args.action == "rename_table": + engines = {Engine.from_str(engine_str) for engine_str in args.engines} + connections = EngineConnections.connect_sync( + config, + directory, + schema_name=args.schema_name, + autocommit=False, + specific_engines=engines, + ) + sqlgen = TableSqlGenerator(config, blueprint) + for engine in engines: + table = blueprint.get_table(args.table_name) + logger.info( + "On %s: Renaming table %s to %s", + str(engine), + table.name, + args.new_table_name, + ) + statements, run_on = sqlgen.generate_rename_table_sql( + table, engine, args.new_table_name + ) + conn = connections.get_connection(run_on) + cursor = conn.cursor_sync() + for stmt in statements: + cursor.execute_sync(stmt) + cursor.commit_sync() + + else: + logger.error("Unknown action %s", args.action) + + logger.info("Done.") + + +# This method is called by `brad.exec.admin.main`. +def table_adjustments(args): + asyncio.run(table_adjustments_impl(args)) diff --git a/src/brad/blueprint/sql_gen/table.py b/src/brad/blueprint/sql_gen/table.py index efc8e2c4..5ae0ab6d 100644 --- a/src/brad/blueprint/sql_gen/table.py +++ b/src/brad/blueprint/sql_gen/table.py @@ -232,6 +232,23 @@ def generate_extraction_progress_init( queries.append(initialize_template.format(table_name=table_name)) return (queries, Engine.Aurora) + def generate_rename_table_sql( + self, table: Table, location: Engine, new_name: str + ) -> Tuple[List[str], Engine]: + """ + Generates the SQL statements needed to rename a table on the given engine. + """ + if location == Engine.Aurora: + # Aurora is more complicated because we use a view with other + # metadata too. This is not currently needed. + raise RuntimeError("Aurora renames are currently unimplemented.") + + elif location == Engine.Redshift or location == Engine.Athena: + return ([f"ALTER TABLE {table.name} RENAME TO {new_name}"], location) + + else: + raise RuntimeError(f"Unsupported location {str(location)}") + def generate_create_index_sql( table: Table, indexes: List[Tuple[Column, ...]] diff --git a/src/brad/config/file.py b/src/brad/config/file.py index e7eda3d3..b8ef4054 100644 --- a/src/brad/config/file.py +++ b/src/brad/config/file.py @@ -190,6 +190,22 @@ def disable_table_movement(self) -> bool: # Table movement disabled by default. return True + @property + def skip_sync_before_movement(self) -> bool: + try: + return self._raw["skip_sync_before_table_movement"] + except KeyError: + # Skip by default. 
+ return True + + @property + def skip_athena_table_deletion(self) -> bool: + try: + return self._raw["skip_athena_table_deletion"] + except KeyError: + # Skip by default. + return True + @property def use_preset_redshift_clusters(self) -> bool: try: diff --git a/src/brad/daemon/transition_orchestrator.py b/src/brad/daemon/transition_orchestrator.py index 06bda3d9..4b2d02cc 100644 --- a/src/brad/daemon/transition_orchestrator.py +++ b/src/brad/daemon/transition_orchestrator.py @@ -131,14 +131,17 @@ async def run_prepare_then_transition( # 2. Sync tables (TODO: discuss more efficient alternatives - # possibly add a filter of tables to run_sync) - await self._data_sync_executor.establish_connections() - ran_sync = await self._data_sync_executor.run_sync( - self._blueprint_mgr.get_blueprint() - ) - logger.debug( - """Completed data sync step during transition. """ - f"""There were {'some' if ran_sync else 'no'} new writes to sync""" - ) + if not self._config.skip_sync_before_movement: + await self._data_sync_executor.establish_connections() + ran_sync = await self._data_sync_executor.run_sync( + self._blueprint_mgr.get_blueprint() + ) + logger.debug( + """Completed data sync step during transition. """ + f"""There were {'some' if ran_sync else 'no'} new writes to sync""" + ) + else: + logger.info("Not running table sync before movement.") # 3. Create tables in new locations as needed directory = self._blueprint_mgr.get_directory() @@ -628,7 +631,11 @@ async def _run_athena_post_transition( ) -> None: # Drop removed tables to_drop = [] - if table_diffs is not None and self._config.disable_table_movement is False: + if ( + table_diffs is not None + and self._config.disable_table_movement is False + and self._config.skip_athena_table_deletion is False + ): for table_diff in table_diffs: if Engine.Athena in table_diff.removed_locations(): to_drop.append(table_diff.table_name()) diff --git a/src/brad/exec/admin.py b/src/brad/exec/admin.py index 13a970ed..d70e7d7d 100644 --- a/src/brad/exec/admin.py +++ b/src/brad/exec/admin.py @@ -15,6 +15,7 @@ import brad.admin.replay_planner as replay_planner import brad.admin.clean_dataset as clean_dataset import brad.admin.alter_schema as alter_schema +import brad.admin.table_adjustments as table_adjustments logger = logging.getLogger(__name__) @@ -43,6 +44,7 @@ def register_command(subparsers) -> None: replay_planner.register_admin_action(admin_subparsers) clean_dataset.register_admin_action(admin_subparsers) alter_schema.register_admin_action(admin_subparsers) + table_adjustments.register_admin_action(admin_subparsers) parser.set_defaults(func=main) From e2971153f62edba80d6d87bea16b9140dc6821a8 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 3 May 2024 11:00:35 -0400 Subject: [PATCH 09/30] Additional table movement experiment improvements (#506) This is primarily for the specialized scenario. - Remove the embeddings table from Athena - Exclude it from the all tables constraint (in the VDBE abstraction, this table will have different constraints anyways, so this is natural) - Add a defensive data type conversion for the vector data type (unsure if unloading will even work) Part of #487. 
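One operational note, hedged: with the defaults added in the previous patch (skip_athena_table_deletion), the transition will not drop the Athena copy of embeddings even after it is removed from the blueprint placement. If that stale copy should also go away, it has to be dropped by hand. A minimal sketch using the AWS CLI, with the database name and query-results bucket as placeholders:

    # Placeholders: fill in the Athena database and query-results bucket.
    ATHENA_DB=brad_imdb
    ATHENA_OUT=s3://my-athena-results/brad/
    aws athena start-query-execution \
        --query-string "DROP TABLE IF EXISTS embeddings" \
        --query-execution-context Database=$ATHENA_DB \
        --result-configuration OutputLocation=$ATHENA_OUT

(Equivalently, the DROP can be issued from the Athena console; the point is only that this patch does not perform it automatically under the default skip flags.)
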
--- .../specialized/set_up_vector_blueprint.py | 4 +++- src/brad/blueprint/sql_gen/table.py | 7 +++++++ src/brad/planner/beam/fpqb.py | 6 +++++- src/brad/planner/beam/query_based.py | 6 +++++- src/brad/planner/beam/table_based.py | 6 +++++- 5 files changed, 25 insertions(+), 4 deletions(-) diff --git a/experiments/15-e2e-scenarios-v2/specialized/set_up_vector_blueprint.py b/experiments/15-e2e-scenarios-v2/specialized/set_up_vector_blueprint.py index cd79ecb3..53a834a5 100644 --- a/experiments/15-e2e-scenarios-v2/specialized/set_up_vector_blueprint.py +++ b/experiments/15-e2e-scenarios-v2/specialized/set_up_vector_blueprint.py @@ -133,8 +133,10 @@ def main(): new_placement[table.name] = [Engine.Aurora, Engine.Athena, Engine.Redshift] if table.name == "telemetry": new_placement[table.name] = [Engine.Athena] - if table.name == "embeddings" or table.name == "title": + if table.name == "title": new_placement[table.name] = [Engine.Aurora, Engine.Athena] + if table.name == "embeddings": + new_placement[table.name] = [Engine.Aurora] enum_blueprint.set_table_locations(new_placement) # 6. Transition to the new blueprint. diff --git a/src/brad/blueprint/sql_gen/table.py b/src/brad/blueprint/sql_gen/table.py index 5ae0ab6d..59b2d082 100644 --- a/src/brad/blueprint/sql_gen/table.py +++ b/src/brad/blueprint/sql_gen/table.py @@ -318,5 +318,12 @@ def _type_for(data_type: str, for_db: Engine) -> str: return "BIGINT" elif data_type_upper.startswith("VARCHAR") and for_db == Engine.Athena: return "STRING" + elif data_type_upper.startswith("VECTOR"): + if for_db == Engine.Athena: + return "BINARY" + elif for_db == Engine.Redshift: + return "VARBYTE" + else: + return data_type else: return data_type diff --git a/src/brad/planner/beam/fpqb.py b/src/brad/planner/beam/fpqb.py index d61c4718..8ac887c3 100644 --- a/src/brad/planner/beam/fpqb.py +++ b/src/brad/planner/beam/fpqb.py @@ -74,7 +74,11 @@ async def _run_replan_impl( # on at least one engine. This ensures that arbitrary unseen join # templates can always be immediately handled. all_tables = ", ".join( - [table.name for table in self._current_blueprint.tables()] + [ + table.name + for table in self._current_blueprint.tables() + if table.name != "embeddings" + ] ) next_workload.add_priming_analytical_query( f"SELECT 1 FROM {all_tables} LIMIT 1" diff --git a/src/brad/planner/beam/query_based.py b/src/brad/planner/beam/query_based.py index 7f6c8e95..eebfa834 100644 --- a/src/brad/planner/beam/query_based.py +++ b/src/brad/planner/beam/query_based.py @@ -75,7 +75,11 @@ async def _run_replan_impl( # on at least one engine. This ensures that arbitrary unseen join # templates can always be immediately handled. all_tables = ", ".join( - [table.name for table in self._current_blueprint.tables()] + [ + table.name + for table in self._current_blueprint.tables() + if table.name != "embeddings" + ] ) next_workload.add_priming_analytical_query( f"SELECT 1 FROM {all_tables} LIMIT 1" diff --git a/src/brad/planner/beam/table_based.py b/src/brad/planner/beam/table_based.py index 36cc6138..2040598f 100644 --- a/src/brad/planner/beam/table_based.py +++ b/src/brad/planner/beam/table_based.py @@ -75,7 +75,11 @@ async def _run_replan_impl( # on at least one engine. This ensures that arbitrary unseen join # templates can always be immediately handled. 
all_tables = ", ".join( - [table.name for table in self._current_blueprint.tables()] + [ + table.name + for table in self._current_blueprint.tables() + if table.name != "embeddings" + ] ) next_workload.add_priming_analytical_query( f"SELECT 1 FROM {all_tables} LIMIT 1" From dc45218cb6a7fb7fb01860288eb083c5ab6d48f3 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 3 May 2024 11:46:41 -0400 Subject: [PATCH 10/30] Additional table movement support fixes - Ensure consistent serialized schema names - Make drop table operator tolerant of missing tables - Print serialized schema name --- src/brad/admin/modify_blueprint.py | 11 ++++++++++- src/brad/blueprint/blueprint.py | 2 ++ src/brad/data_sync/operators/drop_tables.py | 2 +- src/brad/planner/enumeration/blueprint.py | 4 ++-- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/brad/admin/modify_blueprint.py b/src/brad/admin/modify_blueprint.py index 7b0a530c..64ca6c75 100644 --- a/src/brad/admin/modify_blueprint.py +++ b/src/brad/admin/modify_blueprint.py @@ -149,6 +149,13 @@ def register_admin_action(subparser) -> None: help="Set to abort an in-progress transition. " "Only do this if you know what you are doing!", ) + parser.add_argument( + "--reset-schema-name", + action="store_true", + help="Set to ensure the serialized schema name is the same as the " + "passed-in schema name. Sometimes there may be a mismatch, which can " + "cause problems.", + ) parser.set_defaults(admin_action=modify_blueprint) @@ -350,7 +357,9 @@ def modify_blueprint(args) -> None: enum_blueprint.set_routing_policy(full_policy) # 6. Write the changes back. - modified_blueprint = enum_blueprint.to_blueprint() + modified_blueprint = enum_blueprint.to_blueprint( + forced_schema_name=args.schema_name if args.reset_schema_name else None + ) if blueprint == modified_blueprint: logger.info("No changes made to the blueprint.") return diff --git a/src/brad/blueprint/blueprint.py b/src/brad/blueprint/blueprint.py index ba169a97..2610278a 100644 --- a/src/brad/blueprint/blueprint.py +++ b/src/brad/blueprint/blueprint.py @@ -140,6 +140,8 @@ def __repr__(self) -> str: "---", indefinite_policies, definite_policy, + "---", + f"Schema name: {self.schema_name()}", ] ) diff --git a/src/brad/data_sync/operators/drop_tables.py b/src/brad/data_sync/operators/drop_tables.py index c8d41ed6..35f7b5df 100644 --- a/src/brad/data_sync/operators/drop_tables.py +++ b/src/brad/data_sync/operators/drop_tables.py @@ -30,7 +30,7 @@ def __repr__(self) -> str: ) async def execute(self, ctx: ExecutionContext) -> "Operator": - query_template = "DROP TABLE {}" + query_template = "DROP TABLE IF EXISTS {}" if self._engine == Engine.Aurora: for table in self._table_names: diff --git a/src/brad/planner/enumeration/blueprint.py b/src/brad/planner/enumeration/blueprint.py index 8fff27dd..f166f29d 100644 --- a/src/brad/planner/enumeration/blueprint.py +++ b/src/brad/planner/enumeration/blueprint.py @@ -51,13 +51,13 @@ def set_routing_policy( self._current_routing_policy = routing_policy return self - def to_blueprint(self) -> Blueprint: + def to_blueprint(self, forced_schema_name: Optional[str] = None) -> Blueprint: """ Makes a copy of this object as a `Blueprint`. 
""" return Blueprint( - self.schema_name(), + self.schema_name() if forced_schema_name is None else forced_schema_name, self.tables(), table_locations={ name: locations.copy() From 938796d7bd12cf0cc313faa11fdb91afc9cf6410 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 3 May 2024 13:24:07 -0400 Subject: [PATCH 11/30] Make S3 loads to Redshift more permissive, add more logs to table movement --- src/brad/daemon/transition_orchestrator.py | 42 ++++++++++++++------ src/brad/data_sync/operators/load_from_s3.py | 2 + 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/brad/daemon/transition_orchestrator.py b/src/brad/daemon/transition_orchestrator.py index 4b2d02cc..9c065721 100644 --- a/src/brad/daemon/transition_orchestrator.py +++ b/src/brad/daemon/transition_orchestrator.py @@ -191,7 +191,7 @@ async def run_prepare_then_transition( table_awaitables.append(self._enforce_table_diff_additions(diff)) await asyncio.gather(*table_awaitables) - logger.debug("Table movement complete.") + logger.info("Table movement complete.") # Close connections await self._cxns.close() @@ -490,6 +490,12 @@ async def _run_aurora_post_transition( tables_to_drop.append(source_table_name(table_diff.table_name())) tables_to_drop.append(shadow_table_name(table_diff.table_name())) + logger.info("In transition: Dropping Aurora views %s", str(views_to_drop)) + logger.info( + "In transition: Dropping Aurora triggers %s", str(triggers_to_drop) + ) + logger.info("In transition: Dropping Aurora tables %s", str(tables_to_drop)) + ctx = self._new_execution_context() dv = DropViews(views_to_drop, Engine.Aurora) @@ -609,7 +615,7 @@ async def _run_redshift_post_transition( for table_diff in table_diffs: if Engine.Redshift in table_diff.removed_locations(): to_drop.append(table_diff.table_name()) - logger.debug(f"Tables to drop: {to_drop}") + logger.info("In transition: Dropping Redshift tables %s", str(to_drop)) d = DropTables(to_drop, Engine.Redshift) ctx = self._new_execution_context() await d.execute(ctx) @@ -639,6 +645,7 @@ async def _run_athena_post_transition( for table_diff in table_diffs: if Engine.Athena in table_diff.removed_locations(): to_drop.append(table_diff.table_name()) + logger.info("In transition: Dropping Athena tables %s", str(to_drop)) d = DropTables(to_drop, Engine.Athena) ctx = self._new_execution_context() await d.execute(ctx) @@ -677,28 +684,28 @@ async def _unload_table(self, table_name: str, s3_path: str) -> None: if Engine.Redshift in curr_locations: # Faster to write out from Redshift u = UnloadToS3(table_name, s3_path, engine=Engine.Redshift, delimiter=",") ctx = self._new_execution_context() - await u.execute(ctx) - logger.debug( - f"In transition: table {table_name} written to S3 from Redshift." + logger.info( + "In transition: table %s being written to S3 from Redshift.", table_name ) + await u.execute(ctx) elif Engine.Aurora in curr_locations: u = UnloadToS3(table_name, s3_path, engine=Engine.Aurora, delimiter=",") ctx = self._new_execution_context() - await u.execute(ctx) - logger.debug( - f"In transition: table {table_name} written to S3 from Aurora." + logger.info( + "In transition: table %s being written to S3 from Aurora.", table_name ) + await u.execute(ctx) elif Engine.Athena in curr_locations: u = UnloadToS3(table_name, s3_path, engine=Engine.Athena) ctx = self._new_execution_context() - await u.execute(ctx) - logger.debug( - f"In transition: table {table_name} written to S3 from Athena." 
+ logger.info( + "In transition: table %s being written to S3 from Athena.", table_name ) + await u.execute(ctx) else: logger.error( - f"""In transition: table {table_name} does not exist - on any engine in current blueprint.""" + "In transition: table %s does not exist on any engine in current blueprint.", + table_name, ) async def _load_table_to_engine(self, table_name: str, e: Engine) -> None: @@ -715,6 +722,9 @@ async def _load_table_to_engine(self, table_name: str, e: Engine) -> None: if e == Engine.Aurora: # Load table to aurora from S3 + logger.info( + "In transition: loading table %s into Aurora from S3", table_name + ) response = ctx.s3_client().list_objects_v2( Bucket=ctx.s3_bucket(), Prefix=ctx.s3_path() + s3_path_prefix ) @@ -758,12 +768,18 @@ async def _load_table_to_engine(self, table_name: str, e: Engine) -> None: await cursor.commit() elif e == Engine.Redshift: + logger.info( + "In transition: loading table %s into Redshift from S3", table_name + ) l = LoadFromS3(table_name, s3_path_prefix, e, delimiter=",", header_rows=1) await l.execute(ctx) nonsilent_assert(self._cxns is not None) assert self._cxns is not None self._cxns.get_connection(Engine.Redshift).cursor_sync().commit_sync() elif e == Engine.Athena: + logger.info( + "In transition: loading table %s into Athena from S3", table_name + ) l = LoadFromS3(table_name, s3_path_prefix, e, delimiter=",", header_rows=1) await l.execute(ctx) diff --git a/src/brad/data_sync/operators/load_from_s3.py b/src/brad/data_sync/operators/load_from_s3.py index ff984efd..248209f6 100644 --- a/src/brad/data_sync/operators/load_from_s3.py +++ b/src/brad/data_sync/operators/load_from_s3.py @@ -29,6 +29,8 @@ DELIMITER '{delimiter}' IGNOREHEADER {header_rows} REMOVEQUOTES + BLANKASNULL + IGNOREALLERRORS """ _ATHENA_CREATE_LOAD_TABLE = """ From 0fac52164e3903d6abf14483848c16430db42cc9 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 3 May 2024 14:39:30 -0400 Subject: [PATCH 12/30] Additional load argument fix --- src/brad/data_sync/operators/load_from_s3.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/brad/data_sync/operators/load_from_s3.py b/src/brad/data_sync/operators/load_from_s3.py index 248209f6..98b25397 100644 --- a/src/brad/data_sync/operators/load_from_s3.py +++ b/src/brad/data_sync/operators/load_from_s3.py @@ -29,7 +29,6 @@ DELIMITER '{delimiter}' IGNOREHEADER {header_rows} REMOVEQUOTES - BLANKASNULL IGNOREALLERRORS """ From 68954e773116de8a90bc3bb5608f4e532b980abe Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 4 May 2024 10:14:53 -0400 Subject: [PATCH 13/30] Add table movement enabled scale down experiment and log movement progress (#507) Part of #487. 
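The movement-progress logging added below brackets each phase with a pair of system events. A minimal sketch of the pattern, assuming the `SystemEvent` members and the `SystemEventLogger.log(event, extra)` call shape shown in the diff that follows; the wrapper function itself is hypothetical.

    from brad.config.system_event import SystemEvent  # Module extended in this patch.

    def with_movement_events(event_logger, engine_name, do_movement) -> None:
        # Hypothetical wrapper: emit "started", run the movement work, emit "completed".
        if event_logger is not None:
            event_logger.log(SystemEvent.PostTableMovementStarted, engine_name)
        do_movement()
        if event_logger is not None:
            event_logger.log(SystemEvent.PostTableMovementCompleted, engine_name)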
--- .../15-e2e-scenarios-v2/scale_down/COND | 9 + .../scale_down/scale_down_config_tm.yml | 164 ++++++++++++++++++ .../scale_down/set_up_starting_blueprint.py | 3 +- src/brad/config/system_event.py | 6 + src/brad/daemon/transition_orchestrator.py | 35 ++++ 5 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 experiments/15-e2e-scenarios-v2/scale_down/scale_down_config_tm.yml diff --git a/experiments/15-e2e-scenarios-v2/scale_down/COND b/experiments/15-e2e-scenarios-v2/scale_down/COND index 22bdb68a..87694f9e 100644 --- a/experiments/15-e2e-scenarios-v2/scale_down/COND +++ b/experiments/15-e2e-scenarios-v2/scale_down/COND @@ -58,6 +58,15 @@ run_experiment( }, ) +run_experiment( + name="brad_100g_tm", + run="./run_workload.sh", + options={ + "system-config-file": "scale_down_config_tm.yml", + **COMMON_100G_CONFIGS, + }, +) + run_command( name="brad_100g_debug", run="./run_workload_debug.sh", diff --git a/experiments/15-e2e-scenarios-v2/scale_down/scale_down_config_tm.yml b/experiments/15-e2e-scenarios-v2/scale_down/scale_down_config_tm.yml new file mode 100644 index 00000000..2bdf9bfa --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/scale_down/scale_down_config_tm.yml @@ -0,0 +1,164 @@ +# This file contains configurations that are used by BRAD. These are default +# values and should be customized for specific situations. + +# BRAD's front end servers will listen for client connections on this interface +# and port. If `num_front_ends` is greater than one, subsequent front ends will +# listen on successive ports (e.g., 6584, 6585, etc.). +front_end_interface: "0.0.0.0" +front_end_port: 6583 +num_front_ends: 8 + +# Logging paths. If the value is in ALL_CAPS (with underscores), it is +# interpreted as an environment variable (BRAD will log to the path stored in +# the environment variable). + +# Where BRAD's daemon process will write its logs. +daemon_log_file: COND_OUT + +# Where BRAD's front end processes will write their logs. +front_end_log_path: COND_OUT + +# Where BRAD's blueprint planner will write debug logs. +planner_log_path: COND_OUT + +# Where BRAD's metrics loggers will write their logs. +metrics_log_path: COND_OUT + +# Probability that each transactional query will be logged. +txn_log_prob: 0.01 + +# Set to a non-zero value enable automatic data syncing. When this is set to 0, +# automatic syncing is disabled. +data_sync_period_seconds: 0 + +# BRAD's front end servers will report their metrics at regular intervals. +front_end_metrics_reporting_period_seconds: 30 +front_end_query_latency_buffer_size: 100 + +# `default` means to use the policy encoded in the blueprint. Other values will +# override the blueprint. +routing_policy: default + +# Whether to disable table movement for benchmark purposes (i.e., keep all +# tables on all engines.) +disable_table_movement: false +skip_sync_before_table_movement: true + +# Epoch length for metrics and forecasting. This is the granularity at which +# metrics/forecasting will be performed. +epoch_length: + weeks: 0 + days: 0 + hours: 0 + minutes: 1 + +# Blueprint planning strategy. +strategy: fp_query_based_beam + +# Used to specify the period of time over which to use data for planning. +# Currrently, this is a "look behind" window for the workload. +planning_window: + weeks: 0 + days: 0 + hours: 1 + minutes: 0 + +# Used to aggregate metrics collected in the planning window. +metrics_agg: + method: ewm # 'mean' is another option + alpha: 0.86466472 # 1 - 1 / e^2 + +# Used during planning. 
+reinterpret_second_as: 1 + +# The query distribution must change by at least this much for a new blueprint +# to be accepted. +query_dist_change_frac: 0.1 + +# The search bound for the provisioning. +max_provisioning_multiplier: 2.5 + +# Flag options for blueprint planning. +use_io_optimized_aurora: true +use_recorded_routing_if_available: true +ensure_tables_together_on_one_engine: true + +# Loads used to prime the system when no information is available. +aurora_initialize_load_fraction: 0.25 +redshift_initialize_load_fraction: 0.25 + +# BRAD will not reduce predicted load lower than these values. Raise these +# values to be more conservative against mispredictions. +aurora_min_load_removal_fraction: 0.8 +redshift_min_load_removal_fraction: 0.8 + +# Blueprint planning performance ceilings. +query_latency_p90_ceiling_s: 30.0 +txn_latency_p90_ceiling_s: 0.030 + +aurora_provisioning_search_distance: 900.0 +redshift_provisioning_search_distance: 900.0 + +# Used for ordering blueprints during planning. +comparator: + type: benefit_perf_ceiling # or `perf_ceiling` + + benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator + weeks: 0 + days: 0 + hours: 24 + minutes: 0 + + penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator + penalty_power: 2 # Only used by the `benefit_perf_ceiling` comparator + +aurora_max_query_factor: 4.0 +aurora_max_query_factor_replace: 10000.0 +redshift_peak_load_threshold: 99.0 +redshift_peak_load_multiplier: 1.5 + +planner_max_workers: 16 + +# Used for precomputed predictions. +std_datasets: + - name: regular + path: workloads/IMDB_100GB/regular_test/ + - name: adhoc + path: workloads/IMDB_100GB/adhoc_test/ + +# Blueprint planning trigger configs. + +triggers: + enabled: true + check_period_s: 90 # Triggers are checked every X seconds. + check_period_offset_s: 360 # Wait 6 mins before starting. + observe_new_blueprint_mins: 3 + + elapsed_time: + disabled: true + multiplier: 60 # Multiplier over `planning_window`. 
+ + redshift_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + aurora_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + variable_costs: + disabled: true + threshold: 1.0 + + query_latency_ceiling: + ceiling_s: 30.0 + sustained_epochs: 3 + + txn_latency_ceiling: + ceiling_s: 0.030 + sustained_epochs: 3 + + recent_change: + delay_epochs: 5 diff --git a/experiments/15-e2e-scenarios-v2/scale_down/set_up_starting_blueprint.py b/experiments/15-e2e-scenarios-v2/scale_down/set_up_starting_blueprint.py index c96eec27..542c6afa 100644 --- a/experiments/15-e2e-scenarios-v2/scale_down/set_up_starting_blueprint.py +++ b/experiments/15-e2e-scenarios-v2/scale_down/set_up_starting_blueprint.py @@ -71,6 +71,7 @@ def main(): help="Comma separated list of indices.", default="99,56,32,92,91,49,30,83,94,38,87,86,76,37,31,46", ) + parser.add_argument("--place-tables-both", action="store_true") args = parser.parse_args() set_up_logging(debug_mode=True) @@ -130,7 +131,7 @@ def main(): new_placement = {} aurora_txn = ["theatres", "showings", "ticket_orders", "movie_info", "aka_title"] for table in blueprint.tables(): - if table.name in aurora_txn: + if args.place_tables_both or table.name in aurora_txn: new_placement[table.name] = [Engine.Aurora, Engine.Redshift] else: new_placement[table.name] = [Engine.Redshift] diff --git a/src/brad/config/system_event.py b/src/brad/config/system_event.py index 68c125f5..71574149 100644 --- a/src/brad/config/system_event.py +++ b/src/brad/config/system_event.py @@ -37,3 +37,9 @@ class SystemEvent(enum.Enum): # Used when a service level objective is changed while BRAD is running (used # for experiments). ChangedSlos = "changed_slos" + + # Used to mark table movement progress. + PreTableMovementStarted = "pre_table_movement_started" + PreTableMovementCompleted = "pre_table_movement_completed" + PostTableMovementStarted = "post_table_movement_started" + PostTableMovementCompleted = "post_table_movement_completed" diff --git a/src/brad/daemon/transition_orchestrator.py b/src/brad/daemon/transition_orchestrator.py index 9c065721..8a94165e 100644 --- a/src/brad/daemon/transition_orchestrator.py +++ b/src/brad/daemon/transition_orchestrator.py @@ -143,6 +143,9 @@ async def run_prepare_then_transition( else: logger.info("Not running table sync before movement.") + if self._system_event_logger is not None: + self._system_event_logger.log(SystemEvent.PreTableMovementStarted) + # 3. Create tables in new locations as needed directory = self._blueprint_mgr.get_directory() @@ -192,6 +195,8 @@ async def run_prepare_then_transition( await asyncio.gather(*table_awaitables) logger.info("Table movement complete.") + if self._system_event_logger is not None: + self._system_event_logger.log(SystemEvent.PreTableMovementCompleted) # Close connections await self._cxns.close() @@ -480,6 +485,11 @@ async def _run_aurora_post_transition( and len(table_diffs) > 0 and self._config.disable_table_movement is False ): + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.PostTableMovementStarted, "aurora" + ) + views_to_drop = [] triggers_to_drop = [] tables_to_drop = [] @@ -511,6 +521,11 @@ async def _run_aurora_post_transition( assert self._cxns is not None self._cxns.get_connection(Engine.Aurora).cursor_sync().commit_sync() + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.PostTableMovementCompleted, "aurora" + ) + # Change the provisioning. 
if diff is not None: if new.num_nodes() == 0: @@ -611,6 +626,11 @@ async def _run_redshift_post_transition( ) -> None: # Drop removed tables if table_diffs is not None and self._config.disable_table_movement is False: + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.PostTableMovementStarted, "redshift" + ) + to_drop = [] for table_diff in table_diffs: if Engine.Redshift in table_diff.removed_locations(): @@ -623,6 +643,11 @@ async def _run_redshift_post_transition( assert self._cxns is not None self._cxns.get_connection(Engine.Redshift).cursor_sync().commit_sync() + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.PostTableMovementCompleted, "redshift" + ) + # Pause the cluster if we are transitioning to 0 nodes. if diff is not None: if diff.new_num_nodes() == 0: @@ -642,6 +667,11 @@ async def _run_athena_post_transition( and self._config.disable_table_movement is False and self._config.skip_athena_table_deletion is False ): + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.PostTableMovementStarted, "athena" + ) + for table_diff in table_diffs: if Engine.Athena in table_diff.removed_locations(): to_drop.append(table_diff.table_name()) @@ -650,6 +680,11 @@ async def _run_athena_post_transition( ctx = self._new_execution_context() await d.execute(ctx) + if self._system_event_logger is not None: + self._system_event_logger.log( + SystemEvent.PostTableMovementCompleted, "athena" + ) + async def _enforce_table_diff_additions(self, diff: TableDiff) -> None: # Unload table to S3 table_name = diff.table_name() From 7fb971f2519b9a58115875e51647eb8c17bf93bb Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 4 May 2024 10:33:51 -0400 Subject: [PATCH 14/30] Skip Aurora table deletion to speed up restarts --- src/brad/config/file.py | 8 ++++++++ src/brad/daemon/transition_orchestrator.py | 1 + 2 files changed, 9 insertions(+) diff --git a/src/brad/config/file.py b/src/brad/config/file.py index b8ef4054..c14facc2 100644 --- a/src/brad/config/file.py +++ b/src/brad/config/file.py @@ -206,6 +206,14 @@ def skip_athena_table_deletion(self) -> bool: # Skip by default. return True + @property + def skip_aurora_table_deletion(self) -> bool: + try: + return self._raw["skip_aurora_table_deletion"] + except KeyError: + # Skip by default. 
+ return True + @property def use_preset_redshift_clusters(self) -> bool: try: diff --git a/src/brad/daemon/transition_orchestrator.py b/src/brad/daemon/transition_orchestrator.py index 8a94165e..a08f5e26 100644 --- a/src/brad/daemon/transition_orchestrator.py +++ b/src/brad/daemon/transition_orchestrator.py @@ -484,6 +484,7 @@ async def _run_aurora_post_transition( table_diffs is not None and len(table_diffs) > 0 and self._config.disable_table_movement is False + and self._config.skip_aurora_table_deletion is False ): if self._system_event_logger is not None: self._system_event_logger.log( From cdc391af83f43737d6f08f12cac408980b6bda4c Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 4 May 2024 17:21:14 -0400 Subject: [PATCH 15/30] Fix table movement connection context --- src/brad/daemon/transition_orchestrator.py | 19 ++++++++++++++----- src/brad/front_end/engine_connections.py | 6 ++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/brad/daemon/transition_orchestrator.py b/src/brad/daemon/transition_orchestrator.py index a08f5e26..75b6c208 100644 --- a/src/brad/daemon/transition_orchestrator.py +++ b/src/brad/daemon/transition_orchestrator.py @@ -480,8 +480,11 @@ async def _run_aurora_post_transition( table_diffs: Optional[list[TableDiff]], ) -> None: # Drop removed tables. + assert self._curr_blueprint is not None + aurora_on = self._curr_blueprint.aurora_provisioning().num_nodes() > 0 if ( - table_diffs is not None + aurora_on + and table_diffs is not None and len(table_diffs) > 0 and self._config.disable_table_movement is False and self._config.skip_aurora_table_deletion is False @@ -626,7 +629,13 @@ async def _run_redshift_post_transition( self, diff: Optional[ProvisioningDiff], table_diffs: Optional[list[TableDiff]] ) -> None: # Drop removed tables - if table_diffs is not None and self._config.disable_table_movement is False: + assert self._curr_blueprint is not None + redshift_on = self._curr_blueprint.redshift_provisioning().num_nodes() > 0 + if ( + redshift_on + and table_diffs is not None + and self._config.disable_table_movement is False + ): if self._system_event_logger is not None: self._system_event_logger.log( SystemEvent.PostTableMovementStarted, "redshift" @@ -825,9 +834,9 @@ def _new_execution_context(self) -> ExecutionContext: nonsilent_assert(self._cxns is not None) assert self._cxns is not None return ExecutionContext( - aurora=self._cxns.get_connection(Engine.Aurora), - athena=self._cxns.get_connection(Engine.Athena), - redshift=self._cxns.get_connection(Engine.Redshift), + aurora=self._cxns.get_connection_if_exists(Engine.Aurora), + athena=self._cxns.get_connection_if_exists(Engine.Athena), + redshift=self._cxns.get_connection_if_exists(Engine.Redshift), blueprint=self._blueprint_mgr.get_blueprint(), config=self._config, ) diff --git a/src/brad/front_end/engine_connections.py b/src/brad/front_end/engine_connections.py index 5c63dd3c..b1aa041c 100644 --- a/src/brad/front_end/engine_connections.py +++ b/src/brad/front_end/engine_connections.py @@ -274,6 +274,12 @@ def get_connection(self, engine: Engine) -> Connection: except KeyError as ex: raise RuntimeError("Not connected to {}".format(engine)) from ex + def get_connection_if_exists(self, engine: Engine) -> Optional[Connection]: + try: + return self._connection_map[engine] + except KeyError: + return None + def get_reader_connection( self, engine: Engine, specific_index: Optional[int] = None ) -> Connection: From 531599b8e34e3fe229988aeb5b70bfdb2efb2014 Mon Sep 17 00:00:00 2001 From: 
Geoffrey Yu Date: Sun, 5 May 2024 17:12:49 -0400 Subject: [PATCH 16/30] Scoring fixes for scale down and type fixes in scoring methods (#508) Part of #487. --- .../scoring/performance/unified_aurora.py | 16 +++++++++------- .../scoring/performance/unified_redshift.py | 14 +++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/brad/planner/scoring/performance/unified_aurora.py b/src/brad/planner/scoring/performance/unified_aurora.py index f015126c..eedb59ba 100644 --- a/src/brad/planner/scoring/performance/unified_aurora.py +++ b/src/brad/planner/scoring/performance/unified_aurora.py @@ -114,8 +114,10 @@ def predict_loads( current_has_replicas = curr_prov.num_nodes() > 1 next_has_replicas = next_prov.num_nodes() > 1 - curr_writer_cpu_util = ctx.metrics.aurora_writer_cpu_avg / 100 - curr_writer_cpu_util_denorm = curr_writer_cpu_util * aurora_num_cpus(curr_prov) + curr_writer_cpu_util = float(ctx.metrics.aurora_writer_cpu_avg / 100) + curr_writer_cpu_util_denorm = float( + curr_writer_cpu_util * aurora_num_cpus(curr_prov) + ) # We take a very conservative approach to query movement. If new queries # are added onto Aurora, we increase the load. But if queries are @@ -209,7 +211,7 @@ def predict_loads( # We currently have read replicas. curr_num_read_replicas = curr_prov.num_nodes() - 1 total_reader_cpu_denorm = ( - (ctx.metrics.aurora_reader_cpu_avg / 100) + float(ctx.metrics.aurora_reader_cpu_avg / 100) * aurora_num_cpus(curr_prov) * curr_num_read_replicas ) @@ -277,11 +279,11 @@ def compute_direct_cpu_denorm( per_query_cpu_denorm = np.clip( query_run_times * alpha, a_min=0.0, a_max=load_max ) - total_denorm = np.dot(per_query_cpu_denorm, arrival_weights) - max_query_cpu_denorm = per_query_cpu_denorm.max() + total_denorm = np.dot(per_query_cpu_denorm, arrival_weights).item() + max_query_cpu_denorm = (per_query_cpu_denorm * arrival_weights).max().item() else: # Edge case: Query with 0 arrival count (used as a constraint). - total_denorm = np.zeros_like(query_run_times) + total_denorm = 0.0 max_query_cpu_denorm = 0.0 if debug_dict is not None: debug_dict["aurora_total_cpu_denorm"] = total_denorm @@ -309,7 +311,7 @@ def query_movement_factor( total_next_latency = np.dot( curr_query_run_times, workload.get_arrival_counts_batch(query_indices) ) - return total_next_latency / norm_factor + return total_next_latency.item() / norm_factor @classmethod def predict_query_latency_load_resources( diff --git a/src/brad/planner/scoring/performance/unified_redshift.py b/src/brad/planner/scoring/performance/unified_redshift.py index 4f51b85b..e509cc9d 100644 --- a/src/brad/planner/scoring/performance/unified_redshift.py +++ b/src/brad/planner/scoring/performance/unified_redshift.py @@ -53,10 +53,10 @@ def compute( ctx.metrics.redshift_cpu_list is not None and ctx.metrics.redshift_cpu_list.shape[0] > 0 ): - avg_cpu = ctx.metrics.redshift_cpu_list.mean() + avg_cpu: float = ctx.metrics.redshift_cpu_list.mean().item() else: # This won't be used. This is actually max. 
- avg_cpu = ctx.metrics.redshift_cpu_avg + avg_cpu = float(ctx.metrics.redshift_cpu_avg) gamma_norm_factor = HotConfig.instance().get_value( "query_lat_p90", default=30.0 @@ -180,7 +180,7 @@ def predict_max_node_cpu_util( curr_cpu_util *= gamma curr_cpu_denorm = curr_cpu_util * redshift_num_cpus(curr_prov) - curr_max_cpu_denorm = curr_cpu_denorm.max() + curr_max_cpu_denorm = curr_cpu_denorm.max().item() ( peak_load, @@ -262,11 +262,11 @@ def compute_direct_cpu_denorm( per_query_cpu_denorm = np.clip( query_run_times * alpha, a_min=0.0, a_max=load_max ) - total_denorm = np.dot(per_query_cpu_denorm, arrival_weights) - max_query_cpu_denorm = per_query_cpu_denorm.max() + total_denorm = np.dot(per_query_cpu_denorm, arrival_weights).item() + max_query_cpu_denorm = (per_query_cpu_denorm * arrival_weights).max().item() else: # Edge case: Query with 0 arrival count (used as a constraint). - total_denorm = np.zeros_like(query_run_times) + total_denorm = 0.0 max_query_cpu_denorm = 0.0 if debug_dict is not None: debug_dict["redshift_total_cpu_denorm"] = total_denorm @@ -294,7 +294,7 @@ def query_movement_factor( total_next_latency = np.dot( curr_query_run_times, workload.get_arrival_counts_batch(query_indices) ) - return total_next_latency / norm_factor + return total_next_latency.item() / norm_factor @staticmethod def predict_query_latency_load_resources( From 3ea86e3b7a59e9387473012902902817dc4e976b Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Mon, 6 May 2024 10:00:21 -0400 Subject: [PATCH 17/30] Add A+R baseline to the scale down scenario --- .../15-e2e-scenarios-v2/scale_down/COND | 9 +++++ .../scale_down/run_ar_baseline.sh | 40 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100755 experiments/15-e2e-scenarios-v2/scale_down/run_ar_baseline.sh diff --git a/experiments/15-e2e-scenarios-v2/scale_down/COND b/experiments/15-e2e-scenarios-v2/scale_down/COND index 87694f9e..978d8cbf 100644 --- a/experiments/15-e2e-scenarios-v2/scale_down/COND +++ b/experiments/15-e2e-scenarios-v2/scale_down/COND @@ -85,3 +85,12 @@ run_experiment( **COMMON_100G_CONFIGS, }, ) + +run_experiment( + name="ar_100g", + run="./run_ar_baseline.sh", + options={ + # System config file not needed. + **COMMON_100G_CONFIGS, + }, +) diff --git a/experiments/15-e2e-scenarios-v2/scale_down/run_ar_baseline.sh b/experiments/15-e2e-scenarios-v2/scale_down/run_ar_baseline.sh new file mode 100755 index 00000000..be81b7a0 --- /dev/null +++ b/experiments/15-e2e-scenarios-v2/scale_down/run_ar_baseline.sh @@ -0,0 +1,40 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +# Arguments: +# --system-config-file +# --physical-config-file +# --query-indexes +extract_named_arguments $@ + +schema_name="imdb_extended_100g" + +log_workload_point "clients_starting" +start_redshift_serverless_olap_runner 8 15 5 $ra_query_indexes "ra_8" $schema_name +rana_pid=$runner_pid + +start_aurora_serverless_txn_runner_serial 4 $schema_name # Implicit: --dataset-type +txn_pid=$runner_pid + +log_workload_point "clients_started" + +function inner_cancel_experiment() { + cancel_experiment $rana_pid $txn_pid +} + +trap "inner_cancel_experiment" INT +trap "inner_cancel_experiment" TERM + +# The workload should run for 90 minutes. +# We will run for ~100 mins to add some buffer. +sleep $(( 100 * 60 )) + +# Shut down everything now. +log_workload_point "experiment_workload_done" +>&2 echo "Experiment done. Shutting down runners..." 
+graceful_shutdown $rana_pid $txn_pid +log_workload_point "shutdown_complete" + From 3578922f3a9e793757cf1fdd49097ddf57317142 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Mon, 6 May 2024 13:45:42 -0400 Subject: [PATCH 18/30] Add transaction model constants for CH-BenCHmark --- src/brad/planner/constants.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/brad/planner/constants.yml b/src/brad/planner/constants.yml index 4c393d51..87eb9e24 100644 --- a/src/brad/planner/constants.yml +++ b/src/brad/planner/constants.yml @@ -366,6 +366,16 @@ aurora_txns: b_p50: 0.0008631267119199038 b_p90: 0.002251814818009734 + # These constants are for the W = 1740 version of the dataset. + chbenchmark: + # Note that C_1, C_2 are meant to be the same for this dataset. + C_1: 0.008586008776871991 + C_2: 0.008586008776871991 + + K: 1.0293710231781006 + b_p50: 0.011220300570130348 + b_p90: 0.022309081628918648 + aurora_scaling: # [Deprecated] From a242352cdff3c5cc82db3a5583be9e3501e1de7e Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Mon, 6 May 2024 18:26:03 -0400 Subject: [PATCH 19/30] Check in CH-BenCHmark precomputed predictions Co-authored-by: Ziniu Wu --- workloads/chbenchmark/data_accessed-athena.npy | Bin 0 -> 216 bytes .../pred-data_accessed-athena-aurora.npy | Bin 0 -> 304 bytes .../chbenchmark/pred-data_accessed-athena.npy | Bin 0 -> 216 bytes .../pred-run_time_s-athena-aurora-redshift.npy | Bin 0 -> 656 bytes .../run_time_s-athena-aurora-redshift.npy | Bin 0 -> 656 bytes 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 workloads/chbenchmark/data_accessed-athena.npy create mode 100644 workloads/chbenchmark/pred-data_accessed-athena-aurora.npy create mode 100644 workloads/chbenchmark/pred-data_accessed-athena.npy create mode 100644 workloads/chbenchmark/pred-run_time_s-athena-aurora-redshift.npy create mode 100644 workloads/chbenchmark/run_time_s-athena-aurora-redshift.npy diff --git a/workloads/chbenchmark/data_accessed-athena.npy b/workloads/chbenchmark/data_accessed-athena.npy new file mode 100644 index 0000000000000000000000000000000000000000..591d89d8d60c2088a461e7df9bacc1db37af8ef1 GIT binary patch literal 216 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+l>qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7Mmm~03bhL411@8m82=m7`257TG5RmhdE}>m;JV*30~6m=>y>_`5g+_& z?OJ?e0xtMI(46O+^0wRW&B|6^mQ}ufVb`bnaBN`ozx%_<*Xv}Qe^Q}@f9kFuzCyVV Kd``PF_yYi1H9^Aw literal 0 HcmV?d00001 diff --git a/workloads/chbenchmark/pred-data_accessed-athena-aurora.npy b/workloads/chbenchmark/pred-data_accessed-athena-aurora.npy new file mode 100644 index 0000000000000000000000000000000000000000..157ba57ed9480f9129714e191b0595ff7b8f3634 GIT binary patch literal 304 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+l>qoAIaUsO_*m=~X4l#&V(4=E~51qv5u zBo?Fsxf(`BItoUbItsN4WCN~(EsTC!<(T|duVeO~!S&nEaHp`}jeIxXy`{_i&h2FL zHwDUv05OQZQ!)cgYa56AStL#KaXS3cPdLNPm)+mZKU&tr|1!6~-^HfuK1W~u_hVpa I*iQ=#0LJ@S-~a#s literal 0 HcmV?d00001 diff --git a/workloads/chbenchmark/pred-data_accessed-athena.npy b/workloads/chbenchmark/pred-data_accessed-athena.npy new file mode 100644 index 0000000000000000000000000000000000000000..86e2bda3632ee5853d5dca2c9a78e6e0c05ba02b GIT binary patch literal 216 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+l>qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7Mmm~03bhL41FnKCjDB0?nEY0+WA>lH_1n*Ir?B6Rd^g{{rOW)z?PT*e z1iI{ea4IK$1C-QUeWTGqt>GPl3q#ir{%M_>K-0|2(g BK92wZ literal 0 HcmV?d00001 diff --git a/workloads/chbenchmark/pred-run_time_s-athena-aurora-redshift.npy b/workloads/chbenchmark/pred-run_time_s-athena-aurora-redshift.npy new file 
mode 100644 index 0000000000000000000000000000000000000000..be06e870c346f46313950f8df84b30449bcbc680 GIT binary patch literal 656 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7Mmh?{nmP)#3giMV1_p)$*Goklfb;`K2?r3(z~C?+L@Ts?b_UT2XU=7U zXqY%dX_^Lze;_sjtPdvsV7;g&NPL0lt~3x06F+b?0Bp{J2MbZmS@2xQ9;EJp;c*o8 z3!VzegTxn`J3=)4(tAye&T)Jz5kb X-Vm|?>`J3=)4(tAye&T)Jz5kb X-Vm|?> Date: Mon, 6 May 2024 20:11:57 -0400 Subject: [PATCH 20/30] Check in updated predictions Co-authored-by: Ziniu Wu --- .../pred-run_time_s-athena-aurora-redshift.npy | Bin 656 -> 656 bytes .../run_time_s-athena-aurora-redshift.npy | Bin 656 -> 656 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/workloads/chbenchmark/pred-run_time_s-athena-aurora-redshift.npy b/workloads/chbenchmark/pred-run_time_s-athena-aurora-redshift.npy index be06e870c346f46313950f8df84b30449bcbc680..c13e41f82a9451888a907a59066e5d28ad55e383 100644 GIT binary patch literal 656 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7Mmh?{nmP)#3giMV1_p)$*GoklfHVVx!+a3^fKkE$L@Ts?b_UTf@q{zy zGQs@PGz~BfRevBh0<8bR22o99@dnXdX(0ZAqX8)9EO@XGY|et`LiWh&9~d47t7mvB zB#$i4P{AMo=1*cru{YtY2iTm3Z(oH_)H5)Aux9{*ZC`|u#UIovf&GO`{Q>Ed3drgi z>W#%g`~;(N6mt}Il^cM>7bN6MfM|I5eA&wi;y*|{gW?_sDPALxc*2V~6!$AUmd*i* zGfdY<2?qx~NH{1wO;QER!{Q~W8SEa0%mcE>;tCvT;P5|C@l*v_{6S+CI6N0@j}}E1 WZwOfc_RoUV{}hqMA8dIl>;M1?hKH#D literal 656 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7Mmh?{nmP)#3giMV1_p)$*Goklfb;`K2?r3(z~C?+L@Ts?b_UT2XU=7U zXqY%dX_^Lze;_sjtPdvsV7;g&NPL0lt~3x06F+b?0Bp{J2MbZmS@2xQ9;EJp;c*o8 z3!VzegTxn`J3=)4(tAye&T)Jz5kb X-Vm|?>yRK3Ht$pK)#ri~%8c*AMobP#{R$E*ls@dxb_3P5~?O|mM;;tq*796|gC zRaa2l^FVvYYkQ#hfwF%n;s?ra+k@0A=*J3xXqY<_w2yrUsb^So2SvTZ#eZObHN3uv zqJF^x7cmE*@&iR0svsI>jslD2Kd?Cr3=n@q{R>n8<^#= y^$I7F!Qu`J3=)4(tAye&T)Jz5kb X-Vm|?> Date: Mon, 6 May 2024 23:34:20 -0400 Subject: [PATCH 21/30] Check in hardware/load model constants for CH-BenCHmark --- src/brad/config/planner.py | 14 ++---- src/brad/planner/constants.yml | 43 ++++++++++++++----- .../scoring/performance/unified_aurora.py | 4 +- .../scoring/performance/unified_redshift.py | 4 +- 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/src/brad/config/planner.py b/src/brad/config/planner.py index 0017b95a..7771f2c8 100644 --- a/src/brad/config/planner.py +++ b/src/brad/config/planner.py @@ -255,15 +255,12 @@ def aurora_scaling_coefs(self) -> npt.NDArray: def aurora_txn_coefs(self, schema_name: str) -> Dict[str, float]: return self._raw["aurora_txns"][schema_name] - def aurora_new_scaling_coefs(self) -> npt.NDArray: + def aurora_new_scaling_coefs(self, schema_name: str) -> npt.NDArray: if self._aurora_new_scaling_coefs is None: - coefs = self._raw["aurora_scaling_new"] + coefs = self._raw["aurora_scaling_new"][schema_name] self._aurora_new_scaling_coefs = np.array([coefs["coef1"], coefs["coef2"]]) return self._aurora_new_scaling_coefs - def aurora_new_scaling_alpha(self) -> float: - return self._raw["aurora_scaling_new"]["alpha"] - ### ### Unified Redshift scaling ### @@ -275,17 +272,14 @@ def redshift_scaling_coefs(self) -> npt.NDArray: ) return self._redshift_scaling_coefs - def redshift_new_scaling_coefs(self) -> npt.NDArray: + def redshift_new_scaling_coefs(self, schema_name: str) -> npt.NDArray: if self._redshift_new_scaling_coefs is None: - coefs = self._raw["redshift_scaling_new"] + coefs = self._raw["redshift_scaling_new"][schema_name] self._redshift_new_scaling_coefs = np.array( [coefs["coef1"], coefs["coef2"]] ) 
return self._redshift_new_scaling_coefs - def redshift_new_scaling_alpha(self) -> float: - return self._raw["redshift_scaling_new"]["alpha"] - def use_io_optimized_aurora(self) -> bool: if "use_io_optimized_aurora" not in self._raw: # By default. diff --git a/src/brad/planner/constants.yml b/src/brad/planner/constants.yml index 87eb9e24..7b65852b 100644 --- a/src/brad/planner/constants.yml +++ b/src/brad/planner/constants.yml @@ -202,20 +202,43 @@ table_extract_bytes_per_row: ### aurora_scaling_new: - # Wait time (from queuing theory) - # alpha * avg_query_time * (u / (1 - u)) + base - alpha: 0.0464553 + imdb_extended_100g: + # Wait time (from queuing theory) + # alpha * avg_query_time * (u / (1 - u)) + base + alpha: 0.0464553 + + # Resources + # [coef1 (s/d) + coef2] * base + coef1: 0.75851053 + coef2: 0.5486482 - # Resources - # [coef1 (s/d) + coef2] * base - coef1: 0.75851053 - coef2: 0.5486482 + imdb_specialized_100g: + alpha: 0.0464553 + coef1: 0.75851053 + coef2: 0.5486482 + + chbenchmark: + # Queries cannot complete in time on Aurora. + alpha: 1.0 + coef1: 0.0 + coef2: 1.0 redshift_scaling_new: # Same model as above. - alpha: 0.730064 - coef1: 0.89125617 - coef2: 0.1139099 + imdb_extended_100g: + alpha: 0.730064 + coef1: 0.89125617 + coef2: 0.1139099 + + imdb_specialized_100g: + alpha: 0.730064 + coef1: 0.89125617 + coef2: 0.1139099 + + chbenchmark: + alpha: 1.0 # Now unused + coef1: 0.16853629 + coef2: 0.61977525 run_time_to_denorm_cpu: aurora: diff --git a/src/brad/planner/scoring/performance/unified_aurora.py b/src/brad/planner/scoring/performance/unified_aurora.py index eedb59ba..474aa033 100644 --- a/src/brad/planner/scoring/performance/unified_aurora.py +++ b/src/brad/planner/scoring/performance/unified_aurora.py @@ -376,7 +376,7 @@ def predict_query_latency_resources_batch( rf = np.array(resource_factors) basis = np.stack([rf, np.ones_like(rf)]) basis = np.transpose(basis) - coefs = ctx.planner_config.aurora_new_scaling_coefs() + coefs = ctx.planner_config.aurora_new_scaling_coefs(ctx.schema_name) coefs = np.multiply(coefs, basis) num_coefs = coefs.shape[1] @@ -467,7 +467,7 @@ def predict_base_latency( return np.ones_like(latency) * np.inf # Ideally we should adjust for load as well. 
resource_factor = _AURORA_BASE_RESOURCE_VALUE / aurora_num_cpus(prov) - coefs = ctx.planner_config.aurora_new_scaling_coefs() + coefs = ctx.planner_config.aurora_new_scaling_coefs(ctx.schema_name) coefs[0] *= resource_factor return latency / coefs.sum() diff --git a/src/brad/planner/scoring/performance/unified_redshift.py b/src/brad/planner/scoring/performance/unified_redshift.py index e509cc9d..2965b172 100644 --- a/src/brad/planner/scoring/performance/unified_redshift.py +++ b/src/brad/planner/scoring/performance/unified_redshift.py @@ -365,7 +365,7 @@ def predict_query_latency_resources_batch( rf = np.array(resource_factors) basis = np.stack([rf, np.ones_like(rf)]) basis = np.transpose(basis) - coefs = ctx.planner_config.redshift_new_scaling_coefs() + coefs = ctx.planner_config.redshift_new_scaling_coefs(ctx.schema_name) coefs = np.multiply(coefs, basis) num_coefs = coefs.shape[1] @@ -415,7 +415,7 @@ def predict_base_latency( resource_factor = _REDSHIFT_BASE_RESOURCE_VALUE / ( redshift_num_cpus(prov) * prov.num_nodes() ) - coefs = ctx.planner_config.redshift_new_scaling_coefs() + coefs = ctx.planner_config.redshift_new_scaling_coefs(ctx.schema_name) coefs[0] *= resource_factor return latency / coefs.sum() From ccd45c438ae61c15d7bbe5e5c641d07300ef2a13 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Thu, 9 May 2024 23:43:14 -0400 Subject: [PATCH 22/30] Add experiment configs for CH-BenCHmark scenario (#509) Part of #487. --- experiments/17-chbenchmark/common.sh | 58 +++++- experiments/17-chbenchmark/debug/COND | 25 +++ .../17-chbenchmark/debug/debug_config.yml | 12 +- experiments/17-chbenchmark/debug/run_full.sh | 25 +++ .../debug/set_up_starting_blueprint.sh | 20 +++ experiments/17-chbenchmark/scale_down/COND | 24 +++ .../17-chbenchmark/scale_down/brad.config | 6 + .../scale_down/ch_scale_down_config.yml | 167 ++++++++++++++++++ .../17-chbenchmark/scale_down/run_full.sh | 25 +++ .../scale_down/set_up_starting_blueprint.sh | 21 +++ src/brad/config/file.py | 6 + src/brad/daemon/daemon.py | 6 +- src/brad/front_end/front_end.py | 34 +++- src/brad/front_end/session.py | 5 +- .../set_up_starting_blueprint.py | 18 +- workloads/chbenchmark/queries.sql | 22 +++ 16 files changed, 457 insertions(+), 17 deletions(-) create mode 100755 experiments/17-chbenchmark/debug/run_full.sh create mode 100755 experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh create mode 100644 experiments/17-chbenchmark/scale_down/COND create mode 100644 experiments/17-chbenchmark/scale_down/brad.config create mode 100644 experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml create mode 100755 experiments/17-chbenchmark/scale_down/run_full.sh create mode 100755 experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh create mode 100644 workloads/chbenchmark/queries.sql diff --git a/experiments/17-chbenchmark/common.sh b/experiments/17-chbenchmark/common.sh index 2db49e0e..95ee520c 100644 --- a/experiments/17-chbenchmark/common.sh +++ b/experiments/17-chbenchmark/common.sh @@ -13,6 +13,7 @@ function start_brad() { } function run_tpcc() { + local results_name=$1 pushd ../../../workloads/chbenchmark/py-tpcc/ local args=( --no-load @@ -25,11 +26,66 @@ function run_tpcc() { if [[ ! -z $txn_zipfian_alpha ]]; then args+=(--zipfian-alpha $txn_zipfian_alpha) fi - RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc brad "${args[@]}" & + mkdir -p $COND_OUT/$results_name + RECORD_DETAILED_STATS=1 COND_OUT=$COND_OUT/$results_name python3 -m pytpcc.tpcc brad "${args[@]}" & tpcc_pid=$! 
popd } +function log_workload_point() { + msg=$1 + now=$(date --utc "+%Y-%m-%d %H:%M:%S") + echo "$now,$msg" >> $COND_OUT/points.log +} + +function start_repeating_olap_runner() { + local ra_clients=$1 + local ra_gap_s=$2 + local ra_gap_std_s=$3 + local query_indexes=$4 + local results_name=$5 + local client_offset=$6 + + local args=( + --num-clients $ra_clients + --num-front-ends $num_front_ends + --query-indexes $query_indexes + --query-bank-file $ra_query_bank_file + --avg-gap-s $ra_gap_s + --avg-gap-std-s $ra_gap_std_s + ) + + if [[ ! -z $ra_query_frequency_path ]]; then + args+=(--query-frequency-path $ra_query_frequency_path) + fi + + if [[ ! -z $client_offset ]]; then + args+=(--client-offset $client_offset) + fi + + >&2 echo "[Serial Repeating Analytics] Running with $ra_clients..." + results_dir=$COND_OUT/$results_name + mkdir -p $results_dir + + log_workload_point $results_name + COND_OUT=$results_dir python3.11 ../../../workloads/IMDB_extended/run_repeating_analytics_serial.py "${args[@]}" & + + # This is a special return value variable that we use. + runner_pid=$! +} + +function graceful_shutdown() { + for pid_var in "$@"; do + kill -INT $pid_var + done + for pid_var in "$@"; do + wait $pid_var + done + + kill -INT $brad_pid + wait $brad_pid +} + function extract_named_arguments() { # Evaluates any environment variables in this script's arguments. This script # should only be run on trusted input. diff --git a/experiments/17-chbenchmark/debug/COND b/experiments/17-chbenchmark/debug/COND index 7feaa352..7f403f67 100644 --- a/experiments/17-chbenchmark/debug/COND +++ b/experiments/17-chbenchmark/debug/COND @@ -81,3 +81,28 @@ run_experiment( "txn-zipfian-alpha": ZIPFIAN_ALPHA, }, ) + +# Query indices. +QUERIES = list(range(22)) +QUERIES.remove(4) +QUERIES.remove(13) +QUERIES_STR = ",".join([str(v) for v in QUERIES]) + +run_experiment( + name="run_full", + run="./run_full.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "debug_config.yml", # Relative to one level up. + "schema-name": "chbenchmark", + "txn-config-file": "brad.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "num-front-ends": 2, # TBD + "run-for-s": 60 * 60, # One hour + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) diff --git a/experiments/17-chbenchmark/debug/debug_config.yml b/experiments/17-chbenchmark/debug/debug_config.yml index c279878d..3b8a6015 100644 --- a/experiments/17-chbenchmark/debug/debug_config.yml +++ b/experiments/17-chbenchmark/debug/debug_config.yml @@ -6,7 +6,7 @@ # listen on successive ports (e.g., 6584, 6585, etc.). front_end_interface: "0.0.0.0" front_end_port: 6583 -num_front_ends: 1 +num_front_ends: 2 # If installed and enabled, BRAD will serve its UI from a webserver that listens # for connections on this network interface and port. @@ -42,7 +42,7 @@ front_end_query_latency_buffer_size: 100 # `default` means to use the policy encoded in the blueprint. Other values will # override the blueprint. -routing_policy: always_aurora +routing_policy: default # Whether to disable table movement for benchmark purposes (i.e., keep all # tables on all engines.) @@ -104,6 +104,8 @@ txn_latency_p90_ceiling_s: 0.030 # clusters instead of resizing the main Redshift cluster. use_preset_redshift_clusters: false +result_row_limit: 10 + # Used for ordering blueprints during planning. 
comparator: type: benefit_perf_ceiling # or `perf_ceiling` @@ -119,10 +121,8 @@ comparator: # Used for precomputed predictions. std_datasets: - - name: regular - path: workloads/IMDB_100GB/regular_test/ - - name: adhoc - path: workloads/IMDB_100GB/adhoc_test/ + - name: chbenchmark + path: workloads/chbenchmark/ # Blueprint planning trigger configs. diff --git a/experiments/17-chbenchmark/debug/run_full.sh b/experiments/17-chbenchmark/debug/run_full.sh new file mode 100755 index 00000000..f06a0504 --- /dev/null +++ b/experiments/17-chbenchmark/debug/run_full.sh @@ -0,0 +1,25 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_system_config_file=$(realpath $system_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad $abs_system_config_file $abs_physical_config_file + +sleep 30 + +run_tpcc "t_1" +start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. +graceful_shutdown $tpcc_pid $ra_pid diff --git a/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh new file mode 100755 index 00000000..2e7c9986 --- /dev/null +++ b/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh @@ -0,0 +1,20 @@ +#! /bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name chbenchmark \ + --query-bank-file ../../../workloads/chbenchmark/queries.sql \ + --redshift-queries "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ + --redshift-provisioning "dc2.large:16" \ + --aurora-provisioning "db.r6g.xlarge:1" \ + --system-config-file debug_config.yml \ + --physical-config-file $1 \ + --override-definite-routing redshift diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND new file mode 100644 index 00000000..f62230bc --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/COND @@ -0,0 +1,24 @@ +ZIPFIAN_ALPHA = 5.0 + +# Query indices. +QUERIES = list(range(22)) +QUERIES_STR = ",".join([str(v) for v in QUERIES]) + +run_experiment( + name="run_full", + run="./run_full.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "ch_scale_down_config.yml", # Relative to one level up. 
+ "schema-name": "chbenchmark", + "txn-config-file": "brad.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "num-front-ends": 2, # TBD + "run-for-s": 2 * 60 * 60, # 2 hours + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) diff --git a/experiments/17-chbenchmark/scale_down/brad.config b/experiments/17-chbenchmark/scale_down/brad.config new file mode 100644 index 00000000..c71fe1e5 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/brad.config @@ -0,0 +1,6 @@ +# BradDriver Configuration File +[brad] +host = localhost +port = 6583 +isolation_level = REPEATABLE READ +use_worker_offset = true diff --git a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml new file mode 100644 index 00000000..3e40530d --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml @@ -0,0 +1,167 @@ +# This file contains configurations that are used by BRAD. These are default +# values and should be customized for specific situations. + +# BRAD's front end servers will listen for client connections on this interface +# and port. If `num_front_ends` is greater than one, subsequent front ends will +# listen on successive ports (e.g., 6584, 6585, etc.). +front_end_interface: "0.0.0.0" +front_end_port: 6583 +num_front_ends: 2 + +# If installed and enabled, BRAD will serve its UI from a webserver that listens +# for connections on this network interface and port. +ui_interface: "0.0.0.0" +ui_port: 7583 + +# Logging paths. If the value is in ALL_CAPS (with underscores), it is +# interpreted as an environment variable (BRAD will log to the path stored in +# the environment variable). + +# Where BRAD's daemon process will write its logs. +daemon_log_file: COND_OUT + +# Where BRAD's front end processes will write their logs. +front_end_log_path: COND_OUT + +# Where BRAD's blueprint planner will write debug logs. +planner_log_path: COND_OUT + +# Where BRAD's metrics loggers will write their logs. +metrics_log_path: COND_OUT + +# Probability that each transactional query will be logged. +txn_log_prob: 0.10 + +# Set to a non-zero value enable automatic data syncing. When this is set to 0, +# automatic syncing is disabled. +data_sync_period_seconds: 0 + +# BRAD's front end servers will report their metrics at regular intervals. +front_end_metrics_reporting_period_seconds: 30 +front_end_query_latency_buffer_size: 100 + +# `default` means to use the policy encoded in the blueprint. Other values will +# override the blueprint. +routing_policy: default + +# Whether to disable table movement for benchmark purposes (i.e., keep all +# tables on all engines.) +disable_table_movement: true + +# Epoch length for metrics and forecasting. This is the granularity at which +# metrics/forecasting will be performed. +epoch_length: + weeks: 0 + days: 0 + hours: 0 + minutes: 1 + +# Blueprint planning strategy. +strategy: fp_query_based_beam + +# Used to specify the period of time over which to use data for planning. +# Currrently, this is a "look behind" window for the workload. +planning_window: + weeks: 0 + days: 0 + hours: 1 + minutes: 0 + +# Used to aggregate metrics collected in the planning window. +metrics_agg: + method: ewm # 'mean' is another option + alpha: 0.86466472 # 1 - 1 / e^2 + +# Used during planning. 
+reinterpret_second_as: 1 + +# The query distribution must change by at least this much for a new blueprint +# to be accepted. +query_dist_change_frac: 0.1 + +# The search bound for the provisioning. +max_provisioning_multiplier: 2.5 + +# Flag options for blueprint planning. +use_io_optimized_aurora: true +use_recorded_routing_if_available: true +ensure_tables_together_on_one_engine: true + +# Loads used to prime the system when no information is available. +aurora_initialize_load_fraction: 0.25 +redshift_initialize_load_fraction: 0.25 + +# BRAD will not reduce predicted load lower than these values. Raise these +# values to be more conservative against mispredictions. +aurora_min_load_removal_fraction: 0.8 +redshift_min_load_removal_fraction: 0.8 + +# Blueprint planning performance ceilings. +query_latency_p90_ceiling_s: 360.0 +txn_latency_p90_ceiling_s: 0.080 + +# If set to true, BRAD will attempt to use the specified preset Redshift +# clusters instead of resizing the main Redshift cluster. +use_preset_redshift_clusters: false + +result_row_limit: 10 + +# Used for ordering blueprints during planning. +comparator: + type: benefit_perf_ceiling # or `perf_ceiling` + + benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator + weeks: 0 + days: 0 + hours: 24 + minutes: 0 + + penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator + penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator + +# Used for precomputed predictions. +std_datasets: + - name: chbenchmark + path: workloads/chbenchmark/ + +# Blueprint planning trigger configs. + +triggers: + enabled: false + check_period_s: 90 # Triggers are checked every X seconds. + check_period_offset_s: 360 # Wait 6 mins before starting. + + # Triggers will not fire for at least this many minutes after a new blueprint + # takes effect. Usually this should be greater than zero to give BRAD + # sufficient time to observe the effect of the blueprint on the workload. BRAD + # may wait longer to ensure metrics are also available for this many minutes. + observe_new_blueprint_mins: 5 + + elapsed_time: + disabled: true + multiplier: 60 # Multiplier over `planning_window`. + + redshift_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + aurora_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + variable_costs: + disabled: true + threshold: 1.0 + + query_latency_ceiling: + ceiling_s: 360.0 + sustained_epochs: 3 + + txn_latency_ceiling: + ceiling_s: 0.080 + sustained_epochs: 3 + + recent_change: + delay_epochs: 5 diff --git a/experiments/17-chbenchmark/scale_down/run_full.sh b/experiments/17-chbenchmark/scale_down/run_full.sh new file mode 100755 index 00000000..f06a0504 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/run_full.sh @@ -0,0 +1,25 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_system_config_file=$(realpath $system_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad $abs_system_config_file $abs_physical_config_file + +sleep 30 + +run_tpcc "t_1" +start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. 
+graceful_shutdown $tpcc_pid $ra_pid diff --git a/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh new file mode 100755 index 00000000..1735545e --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh @@ -0,0 +1,21 @@ +#! /bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name chbenchmark \ + --query-bank-file ../../../workloads/chbenchmark/queries.sql \ + --redshift-queries "0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ + --athena-queries "4" \ + --redshift-provisioning "dc2.large:16" \ + --aurora-provisioning "db.r6g.2xlarge:1" \ + --system-config-file ch_scale_down_config.yml \ + --physical-config-file $1 \ + --override-definite-routing redshift diff --git a/src/brad/config/file.py b/src/brad/config/file.py index c14facc2..fe781c23 100644 --- a/src/brad/config/file.py +++ b/src/brad/config/file.py @@ -288,6 +288,12 @@ def ui_port(self) -> int: else: return 7583 + def result_row_limit(self) -> Optional[int]: + try: + return self._raw["result_row_limit"] + except KeyError: + return None + def _extract_log_path(self, config_key: str) -> Optional[pathlib.Path]: if config_key not in self._raw: return None diff --git a/src/brad/daemon/daemon.py b/src/brad/daemon/daemon.py index f634eb05..56045c7b 100644 --- a/src/brad/daemon/daemon.py +++ b/src/brad/daemon/daemon.py @@ -66,6 +66,7 @@ from brad.planner.workload.builder import WorkloadBuilder from brad.planner.workload.provider import LoggedWorkloadProvider from brad.routing.policy import RoutingPolicy +from brad.routing.tree_based.forest_policy import ForestPolicy from brad.row_list import RowList from brad.utils.time_periods import period_start, universal_now from brad.ui.manager import UiManager @@ -328,7 +329,10 @@ async def _run_setup(self) -> None: or self._config.routing_policy == RoutingPolicy.Default ): logger.info("Setting up the cardinality estimator...") - if is_stub_mode: + blueprint = self._blueprint_mgr.get_blueprint() + policy = blueprint.get_routing_policy() + requires_estimator = isinstance(policy.definite_policy, ForestPolicy) + if is_stub_mode or not requires_estimator: estimator: Estimator = StubEstimator() else: estimator = await PostgresEstimator.connect( diff --git a/src/brad/front_end/front_end.py b/src/brad/front_end/front_end.py index f7f871fe..560ba211 100644 --- a/src/brad/front_end/front_end.py +++ b/src/brad/front_end/front_end.py @@ -453,8 +453,20 @@ async def _run_query_impl( else: connection = session.engines.get_reader_connection(engine_to_use) cursor = connection.cursor_sync() + # HACK: To work around dialect differences between + # Athena/Aurora/Redshift for now. This should be replaced by + # a more robust translation layer. + if ( + engine_to_use == Engine.Athena + and "ascii" in query_rep.raw_query + ): + translated_query = query_rep.raw_query.replace( + "ascii", "codepoint" + ) + else: + translated_query = query_rep.raw_query start = universal_now() - await cursor.execute(query_rep.raw_query) + await cursor.execute(translated_query) end = universal_now() except ( pyodbc.ProgrammingError, @@ -513,9 +525,23 @@ async def _run_query_impl( # Extract and return the results, if any. try: - # Using `fetchall_sync()` is lower overhead than the async interface. 
-            results = [tuple(row) for row in cursor.fetchall_sync()]
-            log_verbose(logger, "Responded with %d rows.", len(results))
+            result_row_limit = self._config.result_row_limit()
+            if result_row_limit is not None:
+                results = []
+                for _ in range(result_row_limit):
+                    row = cursor.fetchone_sync()
+                    if row is None:
+                        break
+                    results.append(tuple(row))
+                log_verbose(
+                    logger,
+                    "Responded with %d rows (limited to %d rows).",
+                    len(results), result_row_limit,
+                )
+            else:
+                # Using `fetchall_sync()` is lower overhead than the async interface.
+                results = [tuple(row) for row in cursor.fetchall_sync()]
+                log_verbose(logger, "Responded with %d rows.", len(results))
             return (
                 results,
                 (cursor.result_schema(results) if retrieve_schema else None),
diff --git a/src/brad/front_end/session.py b/src/brad/front_end/session.py
index 09ae5311..416e2515 100644
--- a/src/brad/front_end/session.py
+++ b/src/brad/front_end/session.py
@@ -11,6 +11,7 @@ from brad.front_end.engine_connections import EngineConnections
 from brad.planner.estimator import Estimator
 from brad.routing.policy import RoutingPolicy
+from brad.routing.tree_based.forest_policy import ForestPolicy
 from brad.data_stats.postgres_estimator import PostgresEstimator
 from brad.data_stats.stub_estimator import StubEstimator
 from brad.utils.time_periods import universal_now
@@ -117,7 +118,9 @@ async def create_new_session(self) -> Tuple[SessionId, Session]:
             routing_policy_override == RoutingPolicy.ForestTableSelectivity
             or routing_policy_override == RoutingPolicy.Default
         ):
-            if self._config.stub_mode_path() is None:
+            policy = blueprint.get_routing_policy()
+            requires_estimator = isinstance(policy.definite_policy, ForestPolicy)
+            if self._config.stub_mode_path() is None and requires_estimator:
                 estimator: Optional[Estimator] = await PostgresEstimator.connect(
                     self._schema_name, self._config
                 )
diff --git a/workloads/IMDB_extended/set_up_starting_blueprint.py b/workloads/IMDB_extended/set_up_starting_blueprint.py
index be5bf2c1..62589379 100644
--- a/workloads/IMDB_extended/set_up_starting_blueprint.py
+++ b/workloads/IMDB_extended/set_up_starting_blueprint.py
@@ -15,6 +15,7 @@
 from brad.routing.cached import CachedLocationPolicy
 from brad.routing.policy import RoutingPolicy
 from brad.routing.tree_based.forest_policy import ForestPolicy
+from brad.routing.always_one import AlwaysOneRouter
 from brad.utils import set_up_logging
 logger = logging.getLogger(__name__)
@@ -83,6 +84,11 @@ def main():
     parser.add_argument(
         "--aurora-provisioning", type=str, help="Format: <instance type>:<count>"
     )
+    parser.add_argument(
+        "--override-definite-routing",
+        type=str,
+        help="An engine to always route queries to when the indefinite policy does not capture them.",
+    )
     args = parser.parse_args()
     set_up_logging(debug_mode=True)
@@ -124,11 +130,15 @@ def main():
     # 5. Replace the policy.
enum_blueprint = EnumeratedBlueprint(blueprint) - definite_policy = asyncio.run( - ForestPolicy.from_assets( - args.schema_name, RoutingPolicy.ForestTableCardinality, assets + if args.override_definite_routing is not None: + routing_engine = Engine.from_str(args.override_definite_routing) + definite_policy = AlwaysOneRouter(routing_engine) + else: + definite_policy = asyncio.run( + ForestPolicy.from_assets( + args.schema_name, RoutingPolicy.ForestTableCardinality, assets + ) ) - ) replaced_policy = FullRoutingPolicy( indefinite_policies=[clp], definite_policy=definite_policy ) diff --git a/workloads/chbenchmark/queries.sql b/workloads/chbenchmark/queries.sql new file mode 100644 index 00000000..6ced3e67 --- /dev/null +++ b/workloads/chbenchmark/queries.sql @@ -0,0 +1,22 @@ +select ol_number, sum(ol_quantity) as sum_qty, sum(ol_amount) as sum_amount, avg(ol_quantity) as avg_qty, avg(ol_amount) as avg_amount, count(*) as count_order from order_line group by ol_number order by ol_number; +select su_suppkey, su_name, n_name, i_id, i_name, su_address, su_phone, su_comment from item, supplier, stock, nation, region, (select s_i_id as m_i_id, min(s_quantity) as m_s_quantity from stock, supplier, nation, region where mod((s_w_id*s_i_id),10000)=su_suppkey and su_nationkey=n_nationkey and n_regionkey=r_regionkey and r_name like 'Europ%' group by s_i_id) m where i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and i_data like '%b' and r_name like 'Europ%' and i_id=m_i_id and s_quantity = m_s_quantity order by n_name, su_name, i_id; +select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from customer, new_order, orders, order_line where c_state like 'A%' and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and no_w_id = o_w_id and no_d_id = o_d_id and no_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by ol_o_id, ol_w_id, ol_d_id, o_entry_d order by revenue desc, o_entry_d; +select o_ol_cnt, count(*) as order_count from orders where exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt; +select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(cast(substring(c_state,1,1) as varchar(1))) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc; +select sum(ol_amount) as revenue from order_line where ol_quantity between 1 and 100000; +WITH inner_query AS (select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ((n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany'))) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue 
FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; +select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year; +select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc; +select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc; +select s_i_id, sum(s_order_cnt) as ordercount from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_nationkey = n_nationkey and n_name = 'Germany' group by s_i_id having sum(s_order_cnt) > (select sum(s_order_cnt) * .005 from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_nationkey = n_nationkey and n_name = 'Germany') order by ordercount desc; +select o_ol_cnt, sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count from orders, order_line where ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d group by o_ol_cnt order by o_ol_cnt; +select c_count, count(*) as custdist from (select c_id, count(o_id) from customer left outer join orders on ( c_w_id = o_w_id and c_d_id = o_d_id and c_id = o_c_id and o_carrier_id > 8) group by c_id) as c_orders (c_id, c_count) group by c_count order by custdist desc, c_count desc; +select 100.00 * sum(case when i_data like 'PR%' then ol_amount else 0 end) / (1+sum(ol_amount)) as promo_revenue from order_line, item where ol_i_id = i_id; +with revenue (supplier_no, total_revenue) as (select mod((s_w_id * s_i_id),10000) as supplier_no, sum(ol_amount) as total_revenue from order_line, stock where ol_i_id = s_i_id and ol_supply_w_id = s_w_id group by mod((s_w_id * s_i_id),10000)) select su_suppkey, su_name, su_address, su_phone, total_revenue from supplier, revenue where su_suppkey = supplier_no and total_revenue = (select max(total_revenue) from revenue) order by su_suppkey; +select i_name, substring(i_data, 1, 3) as brand, i_price, count(distinct (mod((s_w_id * s_i_id),10000))) as supplier_cnt from stock, item where i_id = s_i_id and i_data not like 'zz%' and (mod((s_w_id * 
s_i_id),10000) not in (select su_suppkey from supplier where su_comment like '%bad%')) group by i_name, substring(i_data, 1, 3), i_price order by supplier_cnt desc; +select sum(ol_amount) / 2.0 as avg_yearly from order_line, (select i_id, avg(ol_quantity) as a from item, order_line where i_data like '%b' and ol_i_id = i_id group by i_id) t where ol_i_id = t.i_id and ol_quantity < t.a; +select c_last, c_id o_id, o_entry_d, o_ol_cnt, sum(ol_amount) from customer, orders, order_line where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by o_id, o_w_id, o_d_id, c_id, c_last, o_entry_d, o_ol_cnt having sum(ol_amount) > 200 order by sum(ol_amount) desc, o_entry_d; +select sum(ol_amount) as revenue from order_line, item where (ol_i_id = i_id and i_data like '%a' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,2,3)) or ( ol_i_id = i_id and i_data like '%b' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,2,4)) or ( ol_i_id = i_id and i_data like '%c' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,5,3)); +select su_name, su_address from supplier, nation where su_suppkey in (select mod(s_i_id * s_w_id, 10000) from stock, order_line where s_i_id in (select i_id from item where i_data like 'co%') and ol_i_id=s_i_id group by s_i_id, s_w_id, s_quantity having 2*s_quantity > sum(ol_quantity)) and su_nationkey = n_nationkey and n_name = 'Germany' order by su_name; +select su_name, count(*) as numwait from supplier, order_line l1, orders, stock, nation where ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and l1.ol_delivery_d > o_entry_d and not exists (select * from order_line l2 where l2.ol_o_id = l1.ol_o_id and l2.ol_w_id = l1.ol_w_id and l2.ol_d_id = l1.ol_d_id and l2.ol_delivery_d > l1.ol_delivery_d) and su_nationkey = n_nationkey and n_name = 'Germany' group by su_name order by numwait desc, su_name; +select substring(c_state,1,1) as country, count(*) as numcust, sum(c_balance) as totacctbal from customer where substring(c_phone,1,1) in ('1','2','3','4','5','6','7') and c_balance > (select avg(c_BALANCE) from customer where c_balance > 0.00 and substring(c_phone,1,1) in ('1','2','3','4','5','6','7')) and not exists (select * from orders where o_c_id = c_id and o_w_id = c_w_id and o_d_id = c_d_id) group by substring(c_state,1,1) order by substring(c_state,1,1); From 4f760a2fdc8e97d3e53b10c5ed2057576512a526 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 10 May 2024 12:38:37 -0400 Subject: [PATCH 23/30] Adjust initial CH-BenCHmark scenario --- experiments/17-chbenchmark/scale_down/COND | 6 +++--- .../17-chbenchmark/scale_down/ch_scale_down_config.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND index f62230bc..ee3a78a8 100644 --- a/experiments/17-chbenchmark/scale_down/COND +++ b/experiments/17-chbenchmark/scale_down/COND @@ -14,9 +14,9 @@ run_experiment( "txn-config-file": "brad.config", "txn-warehouses": 1740, "txn-scale-factor": 1, # TBD - "t-clients": 1, # TBD - "num-front-ends": 2, # TBD - "run-for-s": 2 * 60 * 60, # 2 hours + "t-clients": 4, # TBD + "num-front-ends": 5, # TBD + "run-for-s": 1 * 60 * 60, # 1 hour "txn-zipfian-alpha": ZIPFIAN_ALPHA, "ra-query-indexes": 
QUERIES_STR, "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", diff --git a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml index 3e40530d..ca39cceb 100644 --- a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml +++ b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml @@ -6,7 +6,7 @@ # listen on successive ports (e.g., 6584, 6585, etc.). front_end_interface: "0.0.0.0" front_end_port: 6583 -num_front_ends: 2 +num_front_ends: 5 # If installed and enabled, BRAD will serve its UI from a webserver that listens # for connections on this network interface and port. @@ -127,7 +127,7 @@ std_datasets: # Blueprint planning trigger configs. triggers: - enabled: false + enabled: true check_period_s: 90 # Triggers are checked every X seconds. check_period_offset_s: 360 # Wait 6 mins before starting. From ddd44d24cef5c402309f757fdb96176c71eca8aa Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Fri, 10 May 2024 14:57:14 -0400 Subject: [PATCH 24/30] CH-BenCHmark: Adjust starting config again --- experiments/17-chbenchmark/scale_down/COND | 2 +- experiments/17-chbenchmark/scale_down/run_full.sh | 3 +-- .../17-chbenchmark/scale_down/set_up_starting_blueprint.sh | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND index ee3a78a8..fc283ad9 100644 --- a/experiments/17-chbenchmark/scale_down/COND +++ b/experiments/17-chbenchmark/scale_down/COND @@ -16,7 +16,7 @@ run_experiment( "txn-scale-factor": 1, # TBD "t-clients": 4, # TBD "num-front-ends": 5, # TBD - "run-for-s": 1 * 60 * 60, # 1 hour + "run-for-s": 2 * 60 * 60, # 1 hour "txn-zipfian-alpha": ZIPFIAN_ALPHA, "ra-query-indexes": QUERIES_STR, "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", diff --git a/experiments/17-chbenchmark/scale_down/run_full.sh b/experiments/17-chbenchmark/scale_down/run_full.sh index f06a0504..4d9f43ee 100755 --- a/experiments/17-chbenchmark/scale_down/run_full.sh +++ b/experiments/17-chbenchmark/scale_down/run_full.sh @@ -10,12 +10,11 @@ abs_txn_config_file=$(realpath $txn_config_file) abs_system_config_file=$(realpath $system_config_file) abs_physical_config_file=$(realpath $physical_config_file) -export BRAD_IGNORE_BLUEPRINT=1 start_brad $abs_system_config_file $abs_physical_config_file sleep 30 -run_tpcc "t_1" +run_tpcc "t_4" start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients ra_pid=$runner_pid diff --git a/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh index 1735545e..77ac577b 100755 --- a/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh +++ b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh @@ -12,8 +12,8 @@ source ../common.sh python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ --schema-name chbenchmark \ --query-bank-file ../../../workloads/chbenchmark/queries.sql \ - --redshift-queries "0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ - --athena-queries "4" \ + --redshift-queries "0,1,2,3,5,6,7,8,10,11,12,13,14,15,16,18,19,20,21" \ + --athena-queries "4,9,17" \ --redshift-provisioning "dc2.large:16" \ --aurora-provisioning "db.r6g.2xlarge:1" \ --system-config-file ch_scale_down_config.yml \ From d72f117574bf84496446491b5bafc70eb61de97c Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 11 May 2024 19:43:13 
-0400 Subject: [PATCH 25/30] Additional workload adjustments --- experiments/17-chbenchmark/scale_down/COND | 3 +++ 1 file changed, 3 insertions(+) diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND index fc283ad9..eff507cc 100644 --- a/experiments/17-chbenchmark/scale_down/COND +++ b/experiments/17-chbenchmark/scale_down/COND @@ -2,6 +2,9 @@ ZIPFIAN_ALPHA = 5.0 # Query indices. QUERIES = list(range(22)) +QUERIES.remove(4) +QUERIES.remove(9) +QUERIES.remove(17) QUERIES_STR = ",".join([str(v) for v in QUERIES]) run_experiment( From 5aa6bd30db65487d2bd00ab71343abcf17b888ef Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 11 May 2024 23:51:55 +0000 Subject: [PATCH 26/30] Decrease query latency ceiling --- .../17-chbenchmark/scale_down/ch_scale_down_config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml index ca39cceb..bdc3986c 100644 --- a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml +++ b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml @@ -97,7 +97,7 @@ aurora_min_load_removal_fraction: 0.8 redshift_min_load_removal_fraction: 0.8 # Blueprint planning performance ceilings. -query_latency_p90_ceiling_s: 360.0 +query_latency_p90_ceiling_s: 50.0 txn_latency_p90_ceiling_s: 0.080 # If set to true, BRAD will attempt to use the specified preset Redshift @@ -156,7 +156,7 @@ triggers: threshold: 1.0 query_latency_ceiling: - ceiling_s: 360.0 + ceiling_s: 50.0 sustained_epochs: 3 txn_latency_ceiling: From a081664249cc08931447dac706a983d953f907af Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sun, 12 May 2024 16:22:03 -0400 Subject: [PATCH 27/30] Add verbose logging to the transaction runner --- .../py-tpcc/pytpcc/runtime/executor.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py b/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py index f10f111f..3ea3eec9 100644 --- a/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py @@ -41,6 +41,7 @@ from datetime import datetime from pprint import pprint, pformat from brad.utils.rand_exponential_backoff import RandomizedExponentialBackoff +from brad.utils import create_custom_logger from typing import Optional from .. import constants @@ -103,6 +104,13 @@ def execute( logging.info("Not recording detailed stats.") options = {} + verbose_log_dir = out_path / "verbose_logs" + verbose_log_dir.mkdir(exist_ok=True) + verbose_logger = create_custom_logger( + "txn_runner_verbose", str(verbose_log_dir / f"runner_{worker_index}.log") + ) + verbose_logger.info("[T %d] Workload starting...", worker_index) + # Compute warehouse ranges. 
self.worker_index = worker_index self.total_workers = total_workers @@ -117,7 +125,7 @@ def execute( logging.info( "Worker index %d - Warehouse range: %d to %d (inclusive)", self.worker_index, - *self.local_warehouse_range + *self.local_warehouse_range, ) if zipfian_alpha is not None: @@ -145,9 +153,19 @@ def execute( if debug: logging.debug("Executing '%s' transaction" % txn) try: + verbose_logger.info("[T %d] Issuing transaction %s", worker_index, txn) val = self.driver.executeTransaction(txn, params) backoff = None + # if debug: logging.debug("%s\nParameters:\n%s\nResult:\n%s" % (txn, pformat(params), pformat(val))) + r.stopTransaction(txn_id) + verbose_logger.info( + "[T %d] Finished transaction %s, %d", worker_index, txn, txn_id + ) + except KeyboardInterrupt: + verbose_logger.info( + "[T %d] Aborting early due to KeyboardInterrupt", worker_index + ) return -1 except (Exception, AssertionError) as ex: if debug: @@ -156,6 +174,7 @@ def execute( elif random.random() < 0.01: logging.warning("Aborted transaction: %s: %s", txn, ex) traceback.print_exc(file=sys.stdout) + verbose_logger.exception("[T %d] Ran into error", worker_index) if self.stop_on_error: raise r.abortTransaction(txn_id) @@ -168,16 +187,16 @@ def execute( ) wait_s = backoff.wait_time_s() if wait_s is not None: + verbose_logger.info( + "[T %d] Backing off for %.4f seconds", worker_index, wait_s + ) time.sleep(wait_s) - continue - - # if debug: logging.debug("%s\nParameters:\n%s\nResult:\n%s" % (txn, pformat(params), pformat(val))) - - r.stopTransaction(txn_id) ## WHILE + verbose_logger.info("[T %d] Benchmark stopping...", worker_index) r.stopBenchmark() + verbose_logger.info("[T %d] Benchmark done.", worker_index) return r ## DEF From 24fb2c1dce1f2893188f8b8fec02f0180a3be0a7 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sun, 12 May 2024 16:28:10 -0400 Subject: [PATCH 28/30] Handle exceptions when attempting rollback --- workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py b/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py index 3ea3eec9..010ae97a 100644 --- a/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py +++ b/workloads/chbenchmark/py-tpcc/pytpcc/runtime/executor.py @@ -178,7 +178,14 @@ def execute( if self.stop_on_error: raise r.abortTransaction(txn_id) - self.driver.ensureRollback() + + try: + self.driver.ensureRollback() + except: # pylint: disable=bare-except + # This may happen if we try to issue a rollback when the connection has dropped. + verbose_logger.exception( + "[T %d] Ran into error when running rollback.", worker_index + ) # Back off slightly. 
if backoff is None: From 4a793e57cbed7f6fe93ac3e5f9681ca1d13bcbf6 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sun, 12 May 2024 16:45:48 -0400 Subject: [PATCH 29/30] Additional workload adjustments --- experiments/17-chbenchmark/scale_down/COND | 1 + 1 file changed, 1 insertion(+) diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND index eff507cc..a8a2a218 100644 --- a/experiments/17-chbenchmark/scale_down/COND +++ b/experiments/17-chbenchmark/scale_down/COND @@ -5,6 +5,7 @@ QUERIES = list(range(22)) QUERIES.remove(4) QUERIES.remove(9) QUERIES.remove(17) +QUERIES.remove(20) QUERIES_STR = ",".join([str(v) for v in QUERIES]) run_experiment( From 04e8112347f96704eed13bfe19bffcb81b4b920d Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sun, 12 May 2024 23:04:55 -0400 Subject: [PATCH 30/30] CH-BenCHmark: Add support for A+R baseline --- experiments/17-chbenchmark/common.sh | 52 +++++++++++++++++++ .../17-chbenchmark/scale_down/.gitignore | 1 + experiments/17-chbenchmark/scale_down/COND | 20 ++++++- .../scale_down/run_full_ar_baseline.sh | 24 +++++++++ workloads/chbenchmark/queries.sql | 2 +- 5 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 experiments/17-chbenchmark/scale_down/.gitignore create mode 100755 experiments/17-chbenchmark/scale_down/run_full_ar_baseline.sh diff --git a/experiments/17-chbenchmark/common.sh b/experiments/17-chbenchmark/common.sh index 95ee520c..56b91ca2 100644 --- a/experiments/17-chbenchmark/common.sh +++ b/experiments/17-chbenchmark/common.sh @@ -32,6 +32,26 @@ function run_tpcc() { popd } +function run_tpcc_aurora_serverless() { + local results_name=$1 + pushd ../../../workloads/chbenchmark/py-tpcc/ + local args=( + --no-load + --config $abs_txn_config_file + --warehouses $txn_warehouses + --duration $run_for_s + --clients $t_clients + --scalefactor $txn_scale_factor + ) + if [[ ! -z $txn_zipfian_alpha ]]; then + args+=(--zipfian-alpha $txn_zipfian_alpha) + fi + mkdir -p $COND_OUT/$results_name + RECORD_DETAILED_STATS=1 COND_OUT=$COND_OUT/$results_name python3 -m pytpcc.tpcc aurora "${args[@]}" & + tpcc_pid=$! + popd +} + function log_workload_point() { msg=$1 now=$(date --utc "+%Y-%m-%d %H:%M:%S") @@ -74,6 +94,38 @@ function start_repeating_olap_runner() { runner_pid=$! } +function start_repeating_olap_runner_redshift_serverless() { + local ra_clients=$1 + local ra_gap_s=$2 + local ra_gap_std_s=$3 + local query_indexes=$4 + local results_name=$5 + + local args=( + --num-clients $ra_clients + --num-front-ends $num_front_ends + --query-indexes $query_indexes + --query-bank-file $ra_query_bank_file + --avg-gap-s $ra_gap_s + --avg-gap-std-s $ra_gap_std_s + --brad-direct + --engine redshift + --serverless-redshift + --schema-name $schema_name + --config-file $abs_physical_config_file + ) + + >&2 echo "[Serial Repeating Analytics] Running with $ra_clients..." + results_dir=$COND_OUT/$results_name + mkdir -p $results_dir + + log_workload_point $results_name + COND_OUT=$results_dir python3.11 ../../../workloads/IMDB_extended/run_repeating_analytics_serial.py "${args[@]}" & + + # This is a special return value variable that we use. + runner_pid=$! 
+} + function graceful_shutdown() { for pid_var in "$@"; do kill -INT $pid_var diff --git a/experiments/17-chbenchmark/scale_down/.gitignore b/experiments/17-chbenchmark/scale_down/.gitignore new file mode 100644 index 00000000..0949a3cb --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/.gitignore @@ -0,0 +1 @@ +aurora.config diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND index a8a2a218..a78e79fb 100644 --- a/experiments/17-chbenchmark/scale_down/COND +++ b/experiments/17-chbenchmark/scale_down/COND @@ -20,7 +20,25 @@ run_experiment( "txn-scale-factor": 1, # TBD "t-clients": 4, # TBD "num-front-ends": 5, # TBD - "run-for-s": 2 * 60 * 60, # 1 hour + "run-for-s": 2 * 60 * 60, # 2 hours + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) + + +run_experiment( + name="run_full_ar", + run="./run_full_ar_baseline.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "schema-name": "chbenchmark", + "txn-config-file": "aurora.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 4, # TBD + "run-for-s": 2 * 60 * 60, # 2 hours "txn-zipfian-alpha": ZIPFIAN_ALPHA, "ra-query-indexes": QUERIES_STR, "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", diff --git a/experiments/17-chbenchmark/scale_down/run_full_ar_baseline.sh b/experiments/17-chbenchmark/scale_down/run_full_ar_baseline.sh new file mode 100755 index 00000000..4e904032 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/run_full_ar_baseline.sh @@ -0,0 +1,24 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +sleep 30 + +run_tpcc_aurora_serverless "t_4" +start_repeating_olap_runner_redshift_serverless 1 10 5 $ra_query_indexes "ch_1" +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. 
+kill $tpcc_pid +kill $ra_pid +wait $tpcc_pid +wait $ra_pid diff --git a/workloads/chbenchmark/queries.sql b/workloads/chbenchmark/queries.sql index 6ced3e67..c21976be 100644 --- a/workloads/chbenchmark/queries.sql +++ b/workloads/chbenchmark/queries.sql @@ -4,7 +4,7 @@ select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from cust select o_ol_cnt, count(*) as order_count from orders where exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt; select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(cast(substring(c_state,1,1) as varchar(1))) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc; select sum(ol_amount) as revenue from order_line where ol_quantity between 1 and 100000; -WITH inner_query AS (select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ((n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany'))) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; +WITH inner_query AS (select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(cast(substring(c_state,1,1) as varchar(1))) = n2.n_nationkey and ((n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany'))) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year; 
select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc; select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc;
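The final queries.sql hunk above, like the `ascii`-to-`codepoint` rewrite added to `front_end.py` earlier in this series, works around SQL dialect differences: Aurora and Redshift accept `ascii(...)`, while Athena expects `codepoint(...)`. Below is a minimal, illustrative sketch of a slightly more targeted rewrite; it is not part of any commit, and the `translate_ascii_for_athena` helper name is hypothetical.

import re

# Hypothetical sketch: rewrite ascii(...) calls as codepoint(...) for Athena.
# Matching a word boundary plus the opening parenthesis avoids rewriting
# identifiers that merely contain the letters "ascii".
def translate_ascii_for_athena(sql: str) -> str:
    return re.sub(r"\bascii\s*\(", "codepoint(", sql, flags=re.IGNORECASE)

if __name__ == "__main__":
    q = "select ascii(cast(substring(c_state,1,1) as varchar(1))) from customer"
    print(translate_ascii_for_athena(q))
    # -> select codepoint(cast(substring(c_state,1,1) as varchar(1))) from customer

A small set of per-engine rewrite rules along these lines could eventually replace the substring-based HACK in the front end, as its comment already suggests.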