Skip to content

Commit

Permalink
Add data gathering scripts for CH-BenCHmark (load and instance types) (
Browse files Browse the repository at this point in the history
…#501)

Part of #487.
  • Loading branch information
geoffxy authored Apr 28, 2024
1 parent f5cb084 commit e8fd93e
Show file tree
Hide file tree
Showing 6 changed files with 229 additions and 0 deletions.
Empty file.
86 changes: 86 additions & 0 deletions tools/calibration/load_chbench/gather_redshift.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#! /bin/bash

if [ -z $2 ]; then
>&2 echo "Usage: $0 <config_path> <cluster id>"
>&2 echo "The config path should be relative to the redshift/ subdirectory."
exit 1
fi

export BRAD_CONFIG=$1
cluster_identifier=$2

export BRAD_SCHEMA="chbenchmark"

function run_warm_up() {
>&2 echo "Running warm up..."
pushd redshift
python3 -m brad.calibration.measure_load --run-warmup --engine redshift --query-file ../../../../tools/calibration/load_chbench/selected_queries.sql
popd
}

function sync_redshift_resize() {
raw_instance=$1
target_instance_type=${raw_instance//_/.}
target_node_count=$2

if [[ $target_node_count = "2" ]] && [[ $raw_instance = "dc2_large" ]]; then
>&2 echo "Skipping initial resize to $raw_instance $target_node_count (special case)"
return
fi

# Try an elastic resize first.
>&2 echo "Resizing Redshift cluster to $target_instance_type with $target_node_count nodes (attempt elastic)"
aws redshift resize-cluster --cluster-identifier "$cluster_identifier" --cluster-type multi-node --node-type "$target_instance_type" --number-of-nodes "$target_node_count" --no-classic --region us-east-1 > /dev/null
result=$?

# Resize Redshift cluster
if [ $result -ne 0 ]; then
>&2 echo "Classic resizing Redshift cluster to $target_instance_type with $target_node_count nodes"
aws redshift modify-cluster --cluster-identifier "$cluster_identifier" --node-type "$target_instance_type" --number-of-nodes "$target_node_count" > /dev/null
fi

sleep 60

# Wait for resize to complete
while true; do
cluster_status=$(aws redshift describe-clusters --cluster-identifier "$cluster_identifier" --query 'Clusters[0].ClusterStatus' --output text)
if [[ $cluster_status == "available" ]]; then
break
fi
>&2 echo "Waiting for resize to complete..."
sleep 10
done
}

function run_cfg() {
instance_type=$1
num_nodes=$2

>&2 echo "$instance_type $num_nodes"
sync_redshift_resize $instance_type $num_nodes
>&2 echo "Warming up..."
run_warm_up
>&2 echo "Running..."
cond run "//redshift:${instance_type}-${num_nodes}"
}

>&2 echo "Running $cluster_identifier"
>&2 echo "Config $BRAD_CONFIG"
>&2 echo "Cluster id $cluster_identifier"
sleep 10

run_cfg "dc2_large" 2
run_cfg "dc2_large" 4
run_cfg "dc2_large" 8
run_cfg "dc2_large" 16
run_cfg "ra3_xlplus" 2
run_cfg "ra3_xlplus" 4
run_cfg "ra3_xlplus" 8
run_cfg "ra3_4xlarge" 8
run_cfg "ra3_4xlarge" 4
run_cfg "ra3_4xlarge" 2

sleep 60

>&2 echo "Done. Pausing $cluster_identifier..."
aws redshift pause-cluster --cluster-identifier "$cluster_identifier"
49 changes: 49 additions & 0 deletions tools/calibration/load_chbench/redshift/COND
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from itertools import product


AVG_GAP_S = 3
RUN_FOR_S = 5 * 60 # 5 minutes
NUM_CLIENTS = [1, 2, 4, 6]
WAIT_BEFORE_START = 10
NUM_QUERIES = 22


# Relative to experiment definition directories.
QUERY_BANK = "../selected_queries.sql"


CLUSTER_CONFIGS = [
("dc2_large", 2),
("dc2_large", 4),
("dc2_large", 8),
("dc2_large", 16),
("ra3_xlplus", 2),
("ra3_xlplus", 4),
("ra3_xlplus", 8),
("ra3_4xlarge", 2),
("ra3_4xlarge", 4),
("ra3_4xlarge", 8),
]


for inst, nodes in CLUSTER_CONFIGS:
cfg_name = f"{inst}-{nodes}"
run_experiment_group(
name=cfg_name,
run="python3 -m brad.calibration.measure_load",
experiments=[
ExperimentInstance(
name=f"{cfg_name}-{clients}-q{query_idx}",
options={
"num-clients": clients,
"specific-query-idx": query_idx,
"run-for-s": RUN_FOR_S,
"avg-gap-s": AVG_GAP_S,
"wait-before-start": WAIT_BEFORE_START,
"query-file": QUERY_BANK,
"engine": "redshift",
},
)
for query_idx, clients in product(range(NUM_QUERIES), NUM_CLIENTS)
],
)
29 changes: 29 additions & 0 deletions tools/calibration/load_chbench/sample_full_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import argparse
import random


def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--query-file", type=str, required=True)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--num-query-blocks", type=int, default=22)
parser.add_argument("--queries-per-block", type=int, default=200)
args = parser.parse_args()

prng = random.Random(args.seed)

with open(args.query_file, "r", encoding="UTF-8") as file:
queries = [line.strip() for line in file]

selected = []
for qidx in range(args.num_query_blocks):
offset = prng.randint(0, args.queries_per_block - 1)
selected.append(queries[qidx * args.queries_per_block + offset])

with open("selected_queries.sql", "w", encoding="UTF-8") as file:
for q in selected:
print(q, file=file)


if __name__ == "__main__":
main()
22 changes: 22 additions & 0 deletions tools/calibration/load_chbench/selected_queries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
select ol_number, sum(ol_quantity) as sum_qty, sum(ol_amount) as sum_amount, avg(ol_quantity) as avg_qty, avg(ol_amount) as avg_amount, count(*) as count_order from order_line where ol_amount <= 33.06648003661816 group by ol_number order by ol_number;
select su_suppkey, su_name, n_name, i_id, i_name, su_address, su_phone, su_comment from item, supplier, stock, nation, region, (select s_i_id as m_i_id, min(s_quantity) as m_s_quantity from stock, supplier, nation, region where mod((s_w_id*s_i_id),10000)=su_suppkey and s_quantity >= 11.777062936461403 and su_nationkey=n_nationkey and n_regionkey=r_regionkey and r_name like 'Europ%' group by s_i_id) m where i_id = s_i_id and su_suppkey >= 2941.508980163913 and i_id <= 79857.67421953629 and mod((s_w_id * s_i_id), 10000) = su_suppkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and i_data like '%b' and r_name like 'Europ%' and i_id=m_i_id and s_quantity = m_s_quantity order by n_name, su_name, i_id;
select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from customer, new_order, orders, order_line where c_state like 'A%' and o_w_id >= 42.7486611465113 and ol_amount <= 99.70737680321619 and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and no_w_id = o_w_id and no_d_id = o_d_id and no_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by ol_o_id, ol_w_id, ol_d_id, o_entry_d order by revenue desc, o_entry_d;
select o_ol_cnt, count(*) as order_count from orders where o_carrier_id <= 6.749521428520183 and exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt;
select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and n_nationkey <= 99.21862547006236 and s_order_cnt >= 42.71343586058127 and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(substring(c_state,1,1)) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc;
select sum(ol_amount) as revenue from order_line where ol_quantity <= 5.0 and ol_quantity between 1 and 100000;
WITH inner_query AS ( select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and c_id <= 2665.5792747107366 and s_quantity <= 72.4600053847094 and su_suppkey >= 528.9934672447876 and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ( (n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany') ) ) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year;
select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and o_w_id >= 583.5206747942913 and s_order_cnt <= 81.86012188607452 and su_suppkey >= 689.8054116558625 and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year;
select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and o_id <= 2939.378308830152 and s_quantity <= 86.59608959532211 and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc;
select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and n_nationkey <= 116.40294250401558 and o_id <= 1560.082691309974 and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc;
select s_i_id, sum(s_order_cnt) as ordercount from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_suppkey >= 2406.641955682944 and su_nationkey = n_nationkey and n_name = 'Germany' group by s_i_id having sum(s_order_cnt) > (select sum(s_order_cnt) * .005 from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and s_quantity >= 27.35152833573742 and su_nationkey = n_nationkey and n_name = 'Germany') order by ordercount desc;
select o_ol_cnt, sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count from orders, order_line where ol_w_id = o_w_id and ol_amount <= 36.57742392006392 and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d group by o_ol_cnt order by o_ol_cnt;
select c_count, count(*) as custdist from (select c_id, count(o_id) from customer left outer join orders on ( c_w_id = o_w_id and c_d_id = o_d_id and c_id = o_c_id and o_carrier_id >= 4.392919247526648 ) group by c_id) as c_orders (c_id, c_count) group by c_count order by custdist desc, c_count desc;
select 100.00 * sum(case when i_data like 'PR%' then ol_amount else 0 end) / (1+sum(ol_amount)) as promo_revenue from order_line, item where ol_i_id = i_id and i_id <= 82830.86056286634;
with revenue (supplier_no, total_revenue) as ( select mod((s_w_id * s_i_id),10000) as supplier_no, sum(ol_amount) as total_revenue from order_line, stock where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and s_quantity >= 52.409029036617504 group by mod((s_w_id * s_i_id),10000)) select su_suppkey, su_name, su_address, su_phone, total_revenue from supplier, revenue where su_suppkey = supplier_no and total_revenue = (select max(total_revenue) from revenue) and su_suppkey >= 600.4811082997699 order by su_suppkey;
select i_name, substring(i_data, 1, 3) as brand, i_price, count(distinct (mod((s_w_id * s_i_id),10000))) as supplier_cnt from stock, item where i_id = s_i_id and i_data not like 'zz%' and i_price >= 17.67656701310919 and (mod((s_w_id * s_i_id),10000) not in (select su_suppkey from supplier where su_comment like '%bad%')) group by i_name, substring(i_data, 1, 3), i_price order by supplier_cnt desc;
select sum(ol_amount) / 2.0 as avg_yearly from order_line, (select i_id, avg(ol_quantity) as a from item, order_line where i_data like '%b' and ol_quantity <= 5.0 and ol_i_id = i_id group by i_id) t where ol_i_id = t.i_id and ol_quantity < t.a;
select c_last, c_id o_id, o_entry_d, o_ol_cnt, sum(ol_amount) from customer, orders, order_line where c_id = o_c_id and c_d_id <= 6.652803502875462 and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by o_id, o_w_id, o_d_id, c_id, c_last, o_entry_d, o_ol_cnt having sum(ol_amount) > 200 order by sum(ol_amount) desc, o_entry_d;
select sum(ol_amount) as revenue from order_line, item where ( ol_i_id = i_id and i_data like '%a' and ol_quantity >= 1 and ol_quantity <= 10 and i_price <= 97.10832030687996 and ol_w_id in (1,2,3) ) or ( ol_i_id = i_id and i_data like '%b' and ol_quantity >= 1 and ol_quantity <= 10 and i_price <= 79.78326318023088 and ol_w_id in (1,2,4) ) or ( ol_i_id = i_id and i_data like '%c' and ol_quantity >= 1 and ol_quantity <= 10 and i_price >= 5.450023586551352 and ol_w_id in (1,5,3) );
select su_name, su_address from supplier, nation where su_suppkey in (select mod(s_i_id * s_w_id, 10000) from stock, order_line where s_i_id in (select i_id from item where i_data like 'co%') and ol_i_id=s_i_id group by s_i_id, s_w_id, s_quantity having 2*s_quantity > sum(ol_quantity)) and su_nationkey = n_nationkey and su_suppkey <= 8996.163667412242 and n_name = 'Germany' order by su_name;
select su_name, count(*) as numwait from supplier, order_line l1, orders, stock, nation where ol_o_id = o_id and su_suppkey <= 8526.675416612981 and o_w_id >= 369.02551642220345 and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and l1.ol_delivery_d > o_entry_d and not exists (select * from order_line l2 where l2.ol_o_id = l1.ol_o_id and l2.ol_w_id = l1.ol_w_id and l2.ol_d_id = l1.ol_d_id and l2.ol_delivery_d > l1.ol_delivery_d) and su_nationkey = n_nationkey and n_name = 'Germany' group by su_name order by numwait desc, su_name;
select substring(c_state,1,1) as country, count(*) as numcust, sum(c_balance) as totacctbal from customer where substring(c_phone,1,1) in ('1','2','3','4','5','6','7') and c_balance > (select avg(c_BALANCE) from customer where c_balance > 0.00 and substring(c_phone,1,1) in ('1','2','3','4','5','6','7')) and not exists (select * from orders where o_c_id = c_id and o_w_id = c_w_id and o_d_id = c_d_id and o_w_id <= 1264.1427731874844 ) group by substring(c_state,1,1) order by substring(c_state,1,1);
43 changes: 43 additions & 0 deletions tools/calibration/load_chbench/test_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse
import asyncio
from brad.config.file import ConfigFile
from brad.connection.factory import ConnectionFactory
from brad.config.engine import Engine
from brad.provisioning.directory import Directory


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--schema-name", type=str, required=True)
parser.add_argument("--physical-config-file", type=str, required=True)
parser.add_argument("--query-file", type=str, required=True)
args = parser.parse_args()

with open(args.query_file, "r", encoding="UTF-8") as file:
queries = [line.strip() for line in file]

config = ConfigFile.load_from_physical_config(args.physical_config_file)
directory = Directory(config)
asyncio.run(directory.refresh())
connection = ConnectionFactory.connect_to_sync(
Engine.Redshift, args.schema_name, config, directory, autocommit=True
)

cursor = connection.cursor_sync()
num_succeeded = 0
for idx, q in enumerate(queries):
try:
print("Running query", idx, "of", len(queries) - 1)
cursor.execute_sync(q)
num_succeeded += 1
except Exception as ex:
print("Query", idx, "failed with error", str(ex))

if num_succeeded == len(queries):
print("All succeeded.")
else:
print((len(queries) - num_succeeded), "failed.")


if __name__ == "__main__":
main()

0 comments on commit e8fd93e

Please sign in to comment.