From e8fd93e8c5528917f933f7d651e033a359c655cb Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Sat, 27 Apr 2024 20:15:51 -0400 Subject: [PATCH] Add data gathering scripts for CH-BenCHmark (load and instance types) (#501) Part of #487. --- .../calibration/load_chbench/cond_config.toml | 0 .../load_chbench/gather_redshift.sh | 86 +++++++++++++++++++ tools/calibration/load_chbench/redshift/COND | 49 +++++++++++ .../load_chbench/sample_full_queries.py | 29 +++++++ .../load_chbench/selected_queries.sql | 22 +++++ .../calibration/load_chbench/test_queries.py | 43 ++++++++++ 6 files changed, 229 insertions(+) create mode 100644 tools/calibration/load_chbench/cond_config.toml create mode 100755 tools/calibration/load_chbench/gather_redshift.sh create mode 100644 tools/calibration/load_chbench/redshift/COND create mode 100644 tools/calibration/load_chbench/sample_full_queries.py create mode 100644 tools/calibration/load_chbench/selected_queries.sql create mode 100644 tools/calibration/load_chbench/test_queries.py diff --git a/tools/calibration/load_chbench/cond_config.toml b/tools/calibration/load_chbench/cond_config.toml new file mode 100644 index 00000000..e69de29b diff --git a/tools/calibration/load_chbench/gather_redshift.sh b/tools/calibration/load_chbench/gather_redshift.sh new file mode 100755 index 00000000..37bd23aa --- /dev/null +++ b/tools/calibration/load_chbench/gather_redshift.sh @@ -0,0 +1,86 @@ +#! /bin/bash + +if [ -z $2 ]; then + >&2 echo "Usage: $0 " + >&2 echo "The config path should be relative to the redshift/ subdirectory." + exit 1 +fi + +export BRAD_CONFIG=$1 +cluster_identifier=$2 + +export BRAD_SCHEMA="chbenchmark" + +function run_warm_up() { + >&2 echo "Running warm up..." + pushd redshift + python3 -m brad.calibration.measure_load --run-warmup --engine redshift --query-file ../../../../tools/calibration/load_chbench/selected_queries.sql + popd +} + +function sync_redshift_resize() { + raw_instance=$1 + target_instance_type=${raw_instance//_/.} + target_node_count=$2 + + if [[ $target_node_count = "2" ]] && [[ $raw_instance = "dc2_large" ]]; then + >&2 echo "Skipping initial resize to $raw_instance $target_node_count (special case)" + return + fi + + # Try an elastic resize first. + >&2 echo "Resizing Redshift cluster to $target_instance_type with $target_node_count nodes (attempt elastic)" + aws redshift resize-cluster --cluster-identifier "$cluster_identifier" --cluster-type multi-node --node-type "$target_instance_type" --number-of-nodes "$target_node_count" --no-classic --region us-east-1 > /dev/null + result=$? + + # Resize Redshift cluster + if [ $result -ne 0 ]; then + >&2 echo "Classic resizing Redshift cluster to $target_instance_type with $target_node_count nodes" + aws redshift modify-cluster --cluster-identifier "$cluster_identifier" --node-type "$target_instance_type" --number-of-nodes "$target_node_count" > /dev/null + fi + + sleep 60 + + # Wait for resize to complete + while true; do + cluster_status=$(aws redshift describe-clusters --cluster-identifier "$cluster_identifier" --query 'Clusters[0].ClusterStatus' --output text) + if [[ $cluster_status == "available" ]]; then + break + fi + >&2 echo "Waiting for resize to complete..." + sleep 10 + done +} + +function run_cfg() { + instance_type=$1 + num_nodes=$2 + + >&2 echo "$instance_type $num_nodes" + sync_redshift_resize $instance_type $num_nodes + >&2 echo "Warming up..." + run_warm_up + >&2 echo "Running..." + cond run "//redshift:${instance_type}-${num_nodes}" +} + +>&2 echo "Running $cluster_identifier" +>&2 echo "Config $BRAD_CONFIG" +>&2 echo "Cluster id $cluster_identifier" +sleep 10 + +run_cfg "dc2_large" 2 +run_cfg "dc2_large" 4 +run_cfg "dc2_large" 8 +run_cfg "dc2_large" 16 +run_cfg "ra3_xlplus" 2 +run_cfg "ra3_xlplus" 4 +run_cfg "ra3_xlplus" 8 +run_cfg "ra3_4xlarge" 8 +run_cfg "ra3_4xlarge" 4 +run_cfg "ra3_4xlarge" 2 + +sleep 60 + +>&2 echo "Done. Pausing $cluster_identifier..." +aws redshift pause-cluster --cluster-identifier "$cluster_identifier" diff --git a/tools/calibration/load_chbench/redshift/COND b/tools/calibration/load_chbench/redshift/COND new file mode 100644 index 00000000..7dfa96a0 --- /dev/null +++ b/tools/calibration/load_chbench/redshift/COND @@ -0,0 +1,49 @@ +from itertools import product + + +AVG_GAP_S = 3 +RUN_FOR_S = 5 * 60 # 5 minutes +NUM_CLIENTS = [1, 2, 4, 6] +WAIT_BEFORE_START = 10 +NUM_QUERIES = 22 + + +# Relative to experiment definition directories. +QUERY_BANK = "../selected_queries.sql" + + +CLUSTER_CONFIGS = [ + ("dc2_large", 2), + ("dc2_large", 4), + ("dc2_large", 8), + ("dc2_large", 16), + ("ra3_xlplus", 2), + ("ra3_xlplus", 4), + ("ra3_xlplus", 8), + ("ra3_4xlarge", 2), + ("ra3_4xlarge", 4), + ("ra3_4xlarge", 8), +] + + +for inst, nodes in CLUSTER_CONFIGS: + cfg_name = f"{inst}-{nodes}" + run_experiment_group( + name=cfg_name, + run="python3 -m brad.calibration.measure_load", + experiments=[ + ExperimentInstance( + name=f"{cfg_name}-{clients}-q{query_idx}", + options={ + "num-clients": clients, + "specific-query-idx": query_idx, + "run-for-s": RUN_FOR_S, + "avg-gap-s": AVG_GAP_S, + "wait-before-start": WAIT_BEFORE_START, + "query-file": QUERY_BANK, + "engine": "redshift", + }, + ) + for query_idx, clients in product(range(NUM_QUERIES), NUM_CLIENTS) + ], + ) diff --git a/tools/calibration/load_chbench/sample_full_queries.py b/tools/calibration/load_chbench/sample_full_queries.py new file mode 100644 index 00000000..02ee1385 --- /dev/null +++ b/tools/calibration/load_chbench/sample_full_queries.py @@ -0,0 +1,29 @@ +import argparse +import random + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--query-file", type=str, required=True) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num-query-blocks", type=int, default=22) + parser.add_argument("--queries-per-block", type=int, default=200) + args = parser.parse_args() + + prng = random.Random(args.seed) + + with open(args.query_file, "r", encoding="UTF-8") as file: + queries = [line.strip() for line in file] + + selected = [] + for qidx in range(args.num_query_blocks): + offset = prng.randint(0, args.queries_per_block - 1) + selected.append(queries[qidx * args.queries_per_block + offset]) + + with open("selected_queries.sql", "w", encoding="UTF-8") as file: + for q in selected: + print(q, file=file) + + +if __name__ == "__main__": + main() diff --git a/tools/calibration/load_chbench/selected_queries.sql b/tools/calibration/load_chbench/selected_queries.sql new file mode 100644 index 00000000..8700fcf9 --- /dev/null +++ b/tools/calibration/load_chbench/selected_queries.sql @@ -0,0 +1,22 @@ +select ol_number, sum(ol_quantity) as sum_qty, sum(ol_amount) as sum_amount, avg(ol_quantity) as avg_qty, avg(ol_amount) as avg_amount, count(*) as count_order from order_line where ol_amount <= 33.06648003661816 group by ol_number order by ol_number; +select su_suppkey, su_name, n_name, i_id, i_name, su_address, su_phone, su_comment from item, supplier, stock, nation, region, (select s_i_id as m_i_id, min(s_quantity) as m_s_quantity from stock, supplier, nation, region where mod((s_w_id*s_i_id),10000)=su_suppkey and s_quantity >= 11.777062936461403 and su_nationkey=n_nationkey and n_regionkey=r_regionkey and r_name like 'Europ%' group by s_i_id) m where i_id = s_i_id and su_suppkey >= 2941.508980163913 and i_id <= 79857.67421953629 and mod((s_w_id * s_i_id), 10000) = su_suppkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and i_data like '%b' and r_name like 'Europ%' and i_id=m_i_id and s_quantity = m_s_quantity order by n_name, su_name, i_id; +select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from customer, new_order, orders, order_line where c_state like 'A%' and o_w_id >= 42.7486611465113 and ol_amount <= 99.70737680321619 and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and no_w_id = o_w_id and no_d_id = o_d_id and no_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by ol_o_id, ol_w_id, ol_d_id, o_entry_d order by revenue desc, o_entry_d; +select o_ol_cnt, count(*) as order_count from orders where o_carrier_id <= 6.749521428520183 and exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt; +select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and n_nationkey <= 99.21862547006236 and s_order_cnt >= 42.71343586058127 and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(substring(c_state,1,1)) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc; +select sum(ol_amount) as revenue from order_line where ol_quantity <= 5.0 and ol_quantity between 1 and 100000; +WITH inner_query AS ( select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and c_id <= 2665.5792747107366 and s_quantity <= 72.4600053847094 and su_suppkey >= 528.9934672447876 and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ( (n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany') ) ) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; +select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and o_w_id >= 583.5206747942913 and s_order_cnt <= 81.86012188607452 and su_suppkey >= 689.8054116558625 and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year; +select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and o_id <= 2939.378308830152 and s_quantity <= 86.59608959532211 and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc; +select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and n_nationkey <= 116.40294250401558 and o_id <= 1560.082691309974 and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc; +select s_i_id, sum(s_order_cnt) as ordercount from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_suppkey >= 2406.641955682944 and su_nationkey = n_nationkey and n_name = 'Germany' group by s_i_id having sum(s_order_cnt) > (select sum(s_order_cnt) * .005 from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and s_quantity >= 27.35152833573742 and su_nationkey = n_nationkey and n_name = 'Germany') order by ordercount desc; +select o_ol_cnt, sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count from orders, order_line where ol_w_id = o_w_id and ol_amount <= 36.57742392006392 and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d group by o_ol_cnt order by o_ol_cnt; +select c_count, count(*) as custdist from (select c_id, count(o_id) from customer left outer join orders on ( c_w_id = o_w_id and c_d_id = o_d_id and c_id = o_c_id and o_carrier_id >= 4.392919247526648 ) group by c_id) as c_orders (c_id, c_count) group by c_count order by custdist desc, c_count desc; +select 100.00 * sum(case when i_data like 'PR%' then ol_amount else 0 end) / (1+sum(ol_amount)) as promo_revenue from order_line, item where ol_i_id = i_id and i_id <= 82830.86056286634; +with revenue (supplier_no, total_revenue) as ( select mod((s_w_id * s_i_id),10000) as supplier_no, sum(ol_amount) as total_revenue from order_line, stock where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and s_quantity >= 52.409029036617504 group by mod((s_w_id * s_i_id),10000)) select su_suppkey, su_name, su_address, su_phone, total_revenue from supplier, revenue where su_suppkey = supplier_no and total_revenue = (select max(total_revenue) from revenue) and su_suppkey >= 600.4811082997699 order by su_suppkey; +select i_name, substring(i_data, 1, 3) as brand, i_price, count(distinct (mod((s_w_id * s_i_id),10000))) as supplier_cnt from stock, item where i_id = s_i_id and i_data not like 'zz%' and i_price >= 17.67656701310919 and (mod((s_w_id * s_i_id),10000) not in (select su_suppkey from supplier where su_comment like '%bad%')) group by i_name, substring(i_data, 1, 3), i_price order by supplier_cnt desc; +select sum(ol_amount) / 2.0 as avg_yearly from order_line, (select i_id, avg(ol_quantity) as a from item, order_line where i_data like '%b' and ol_quantity <= 5.0 and ol_i_id = i_id group by i_id) t where ol_i_id = t.i_id and ol_quantity < t.a; +select c_last, c_id o_id, o_entry_d, o_ol_cnt, sum(ol_amount) from customer, orders, order_line where c_id = o_c_id and c_d_id <= 6.652803502875462 and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by o_id, o_w_id, o_d_id, c_id, c_last, o_entry_d, o_ol_cnt having sum(ol_amount) > 200 order by sum(ol_amount) desc, o_entry_d; +select sum(ol_amount) as revenue from order_line, item where ( ol_i_id = i_id and i_data like '%a' and ol_quantity >= 1 and ol_quantity <= 10 and i_price <= 97.10832030687996 and ol_w_id in (1,2,3) ) or ( ol_i_id = i_id and i_data like '%b' and ol_quantity >= 1 and ol_quantity <= 10 and i_price <= 79.78326318023088 and ol_w_id in (1,2,4) ) or ( ol_i_id = i_id and i_data like '%c' and ol_quantity >= 1 and ol_quantity <= 10 and i_price >= 5.450023586551352 and ol_w_id in (1,5,3) ); +select su_name, su_address from supplier, nation where su_suppkey in (select mod(s_i_id * s_w_id, 10000) from stock, order_line where s_i_id in (select i_id from item where i_data like 'co%') and ol_i_id=s_i_id group by s_i_id, s_w_id, s_quantity having 2*s_quantity > sum(ol_quantity)) and su_nationkey = n_nationkey and su_suppkey <= 8996.163667412242 and n_name = 'Germany' order by su_name; +select su_name, count(*) as numwait from supplier, order_line l1, orders, stock, nation where ol_o_id = o_id and su_suppkey <= 8526.675416612981 and o_w_id >= 369.02551642220345 and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and l1.ol_delivery_d > o_entry_d and not exists (select * from order_line l2 where l2.ol_o_id = l1.ol_o_id and l2.ol_w_id = l1.ol_w_id and l2.ol_d_id = l1.ol_d_id and l2.ol_delivery_d > l1.ol_delivery_d) and su_nationkey = n_nationkey and n_name = 'Germany' group by su_name order by numwait desc, su_name; +select substring(c_state,1,1) as country, count(*) as numcust, sum(c_balance) as totacctbal from customer where substring(c_phone,1,1) in ('1','2','3','4','5','6','7') and c_balance > (select avg(c_BALANCE) from customer where c_balance > 0.00 and substring(c_phone,1,1) in ('1','2','3','4','5','6','7')) and not exists (select * from orders where o_c_id = c_id and o_w_id = c_w_id and o_d_id = c_d_id and o_w_id <= 1264.1427731874844 ) group by substring(c_state,1,1) order by substring(c_state,1,1); diff --git a/tools/calibration/load_chbench/test_queries.py b/tools/calibration/load_chbench/test_queries.py new file mode 100644 index 00000000..72d807da --- /dev/null +++ b/tools/calibration/load_chbench/test_queries.py @@ -0,0 +1,43 @@ +import argparse +import asyncio +from brad.config.file import ConfigFile +from brad.connection.factory import ConnectionFactory +from brad.config.engine import Engine +from brad.provisioning.directory import Directory + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--schema-name", type=str, required=True) + parser.add_argument("--physical-config-file", type=str, required=True) + parser.add_argument("--query-file", type=str, required=True) + args = parser.parse_args() + + with open(args.query_file, "r", encoding="UTF-8") as file: + queries = [line.strip() for line in file] + + config = ConfigFile.load_from_physical_config(args.physical_config_file) + directory = Directory(config) + asyncio.run(directory.refresh()) + connection = ConnectionFactory.connect_to_sync( + Engine.Redshift, args.schema_name, config, directory, autocommit=True + ) + + cursor = connection.cursor_sync() + num_succeeded = 0 + for idx, q in enumerate(queries): + try: + print("Running query", idx, "of", len(queries) - 1) + cursor.execute_sync(q) + num_succeeded += 1 + except Exception as ex: + print("Query", idx, "failed with error", str(ex)) + + if num_succeeded == len(queries): + print("All succeeded.") + else: + print((len(queries) - num_succeeded), "failed.") + + +if __name__ == "__main__": + main()