From ccd45c438ae61c15d7bbe5e5c641d07300ef2a13 Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Thu, 9 May 2024 23:43:14 -0400 Subject: [PATCH] Add experiment configs for CH-BenCHmark scenario (#509) Part of #487. --- experiments/17-chbenchmark/common.sh | 58 +++++- experiments/17-chbenchmark/debug/COND | 25 +++ .../17-chbenchmark/debug/debug_config.yml | 12 +- experiments/17-chbenchmark/debug/run_full.sh | 25 +++ .../debug/set_up_starting_blueprint.sh | 20 +++ experiments/17-chbenchmark/scale_down/COND | 24 +++ .../17-chbenchmark/scale_down/brad.config | 6 + .../scale_down/ch_scale_down_config.yml | 167 ++++++++++++++++++ .../17-chbenchmark/scale_down/run_full.sh | 25 +++ .../scale_down/set_up_starting_blueprint.sh | 21 +++ src/brad/config/file.py | 6 + src/brad/daemon/daemon.py | 6 +- src/brad/front_end/front_end.py | 34 +++- src/brad/front_end/session.py | 5 +- .../set_up_starting_blueprint.py | 18 +- workloads/chbenchmark/queries.sql | 22 +++ 16 files changed, 457 insertions(+), 17 deletions(-) create mode 100755 experiments/17-chbenchmark/debug/run_full.sh create mode 100755 experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh create mode 100644 experiments/17-chbenchmark/scale_down/COND create mode 100644 experiments/17-chbenchmark/scale_down/brad.config create mode 100644 experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml create mode 100755 experiments/17-chbenchmark/scale_down/run_full.sh create mode 100755 experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh create mode 100644 workloads/chbenchmark/queries.sql diff --git a/experiments/17-chbenchmark/common.sh b/experiments/17-chbenchmark/common.sh index 2db49e0e..95ee520c 100644 --- a/experiments/17-chbenchmark/common.sh +++ b/experiments/17-chbenchmark/common.sh @@ -13,6 +13,7 @@ function start_brad() { } function run_tpcc() { + local results_name=$1 pushd ../../../workloads/chbenchmark/py-tpcc/ local args=( --no-load @@ -25,11 +26,66 @@ function run_tpcc() { if [[ ! -z $txn_zipfian_alpha ]]; then args+=(--zipfian-alpha $txn_zipfian_alpha) fi - RECORD_DETAILED_STATS=1 python3 -m pytpcc.tpcc brad "${args[@]}" & + mkdir -p $COND_OUT/$results_name + RECORD_DETAILED_STATS=1 COND_OUT=$COND_OUT/$results_name python3 -m pytpcc.tpcc brad "${args[@]}" & tpcc_pid=$! popd } +function log_workload_point() { + msg=$1 + now=$(date --utc "+%Y-%m-%d %H:%M:%S") + echo "$now,$msg" >> $COND_OUT/points.log +} + +function start_repeating_olap_runner() { + local ra_clients=$1 + local ra_gap_s=$2 + local ra_gap_std_s=$3 + local query_indexes=$4 + local results_name=$5 + local client_offset=$6 + + local args=( + --num-clients $ra_clients + --num-front-ends $num_front_ends + --query-indexes $query_indexes + --query-bank-file $ra_query_bank_file + --avg-gap-s $ra_gap_s + --avg-gap-std-s $ra_gap_std_s + ) + + if [[ ! -z $ra_query_frequency_path ]]; then + args+=(--query-frequency-path $ra_query_frequency_path) + fi + + if [[ ! -z $client_offset ]]; then + args+=(--client-offset $client_offset) + fi + + >&2 echo "[Serial Repeating Analytics] Running with $ra_clients..." + results_dir=$COND_OUT/$results_name + mkdir -p $results_dir + + log_workload_point $results_name + COND_OUT=$results_dir python3.11 ../../../workloads/IMDB_extended/run_repeating_analytics_serial.py "${args[@]}" & + + # This is a special return value variable that we use. + runner_pid=$! 
+} + +function graceful_shutdown() { + for pid_var in "$@"; do + kill -INT $pid_var + done + for pid_var in "$@"; do + wait $pid_var + done + + kill -INT $brad_pid + wait $brad_pid +} + function extract_named_arguments() { # Evaluates any environment variables in this script's arguments. This script # should only be run on trusted input. diff --git a/experiments/17-chbenchmark/debug/COND b/experiments/17-chbenchmark/debug/COND index 7feaa352..7f403f67 100644 --- a/experiments/17-chbenchmark/debug/COND +++ b/experiments/17-chbenchmark/debug/COND @@ -81,3 +81,28 @@ run_experiment( "txn-zipfian-alpha": ZIPFIAN_ALPHA, }, ) + +# Query indices. +QUERIES = list(range(22)) +QUERIES.remove(4) +QUERIES.remove(13) +QUERIES_STR = ",".join([str(v) for v in QUERIES]) + +run_experiment( + name="run_full", + run="./run_full.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "debug_config.yml", # Relative to one level up. + "schema-name": "chbenchmark", + "txn-config-file": "brad.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "num-front-ends": 2, # TBD + "run-for-s": 60 * 60, # One hour + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) diff --git a/experiments/17-chbenchmark/debug/debug_config.yml b/experiments/17-chbenchmark/debug/debug_config.yml index c279878d..3b8a6015 100644 --- a/experiments/17-chbenchmark/debug/debug_config.yml +++ b/experiments/17-chbenchmark/debug/debug_config.yml @@ -6,7 +6,7 @@ # listen on successive ports (e.g., 6584, 6585, etc.). front_end_interface: "0.0.0.0" front_end_port: 6583 -num_front_ends: 1 +num_front_ends: 2 # If installed and enabled, BRAD will serve its UI from a webserver that listens # for connections on this network interface and port. @@ -42,7 +42,7 @@ front_end_query_latency_buffer_size: 100 # `default` means to use the policy encoded in the blueprint. Other values will # override the blueprint. -routing_policy: always_aurora +routing_policy: default # Whether to disable table movement for benchmark purposes (i.e., keep all # tables on all engines.) @@ -104,6 +104,8 @@ txn_latency_p90_ceiling_s: 0.030 # clusters instead of resizing the main Redshift cluster. use_preset_redshift_clusters: false +result_row_limit: 10 + # Used for ordering blueprints during planning. comparator: type: benefit_perf_ceiling # or `perf_ceiling` @@ -119,10 +121,8 @@ comparator: # Used for precomputed predictions. std_datasets: - - name: regular - path: workloads/IMDB_100GB/regular_test/ - - name: adhoc - path: workloads/IMDB_100GB/adhoc_test/ + - name: chbenchmark + path: workloads/chbenchmark/ # Blueprint planning trigger configs. diff --git a/experiments/17-chbenchmark/debug/run_full.sh b/experiments/17-chbenchmark/debug/run_full.sh new file mode 100755 index 00000000..f06a0504 --- /dev/null +++ b/experiments/17-chbenchmark/debug/run_full.sh @@ -0,0 +1,25 @@ +#! 
/bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_system_config_file=$(realpath $system_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad $abs_system_config_file $abs_physical_config_file + +sleep 30 + +run_tpcc "t_1" +start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. +graceful_shutdown $tpcc_pid $ra_pid diff --git a/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh new file mode 100755 index 00000000..2e7c9986 --- /dev/null +++ b/experiments/17-chbenchmark/debug/set_up_starting_blueprint.sh @@ -0,0 +1,20 @@ +#! /bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name chbenchmark \ + --query-bank-file ../../../workloads/chbenchmark/queries.sql \ + --redshift-queries "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ + --redshift-provisioning "dc2.large:16" \ + --aurora-provisioning "db.r6g.xlarge:1" \ + --system-config-file debug_config.yml \ + --physical-config-file $1 \ + --override-definite-routing redshift diff --git a/experiments/17-chbenchmark/scale_down/COND b/experiments/17-chbenchmark/scale_down/COND new file mode 100644 index 00000000..f62230bc --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/COND @@ -0,0 +1,24 @@ +ZIPFIAN_ALPHA = 5.0 + +# Query indices. +QUERIES = list(range(22)) +QUERIES_STR = ",".join([str(v) for v in QUERIES]) + +run_experiment( + name="run_full", + run="./run_full.sh", + options={ + "physical-config-file": "../../../config/physical_config_chbench.yml", + "system-config-file": "ch_scale_down_config.yml", # Relative to one level up. + "schema-name": "chbenchmark", + "txn-config-file": "brad.config", + "txn-warehouses": 1740, + "txn-scale-factor": 1, # TBD + "t-clients": 1, # TBD + "num-front-ends": 2, # TBD + "run-for-s": 2 * 60 * 60, # 2 hours + "txn-zipfian-alpha": ZIPFIAN_ALPHA, + "ra-query-indexes": QUERIES_STR, + "ra-query-bank-file": "../../../workloads/chbenchmark/queries.sql", + }, +) diff --git a/experiments/17-chbenchmark/scale_down/brad.config b/experiments/17-chbenchmark/scale_down/brad.config new file mode 100644 index 00000000..c71fe1e5 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/brad.config @@ -0,0 +1,6 @@ +# BradDriver Configuration File +[brad] +host = localhost +port = 6583 +isolation_level = REPEATABLE READ +use_worker_offset = true diff --git a/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml new file mode 100644 index 00000000..3e40530d --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/ch_scale_down_config.yml @@ -0,0 +1,167 @@ +# This file contains configurations that are used by BRAD. These are default +# values and should be customized for specific situations. + +# BRAD's front end servers will listen for client connections on this interface +# and port. If `num_front_ends` is greater than one, subsequent front ends will +# listen on successive ports (e.g., 6584, 6585, etc.). 
+front_end_interface: "0.0.0.0"
+front_end_port: 6583
+num_front_ends: 2
+
+# If installed and enabled, BRAD will serve its UI from a webserver that listens
+# for connections on this network interface and port.
+ui_interface: "0.0.0.0"
+ui_port: 7583
+
+# Logging paths. If the value is in ALL_CAPS (with underscores), it is
+# interpreted as an environment variable (BRAD will log to the path stored in
+# the environment variable).
+
+# Where BRAD's daemon process will write its logs.
+daemon_log_file: COND_OUT
+
+# Where BRAD's front end processes will write their logs.
+front_end_log_path: COND_OUT
+
+# Where BRAD's blueprint planner will write debug logs.
+planner_log_path: COND_OUT
+
+# Where BRAD's metrics loggers will write their logs.
+metrics_log_path: COND_OUT
+
+# Probability that each transactional query will be logged.
+txn_log_prob: 0.10
+
+# Set to a non-zero value to enable automatic data syncing. When this is set to
+# 0, automatic syncing is disabled.
+data_sync_period_seconds: 0
+
+# BRAD's front end servers will report their metrics at regular intervals.
+front_end_metrics_reporting_period_seconds: 30
+front_end_query_latency_buffer_size: 100
+
+# `default` means to use the policy encoded in the blueprint. Other values will
+# override the blueprint.
+routing_policy: default
+
+# Whether to disable table movement for benchmark purposes (i.e., keep all
+# tables on all engines.)
+disable_table_movement: true
+
+# Epoch length for metrics and forecasting. This is the granularity at which
+# metrics/forecasting will be performed.
+epoch_length:
+  weeks: 0
+  days: 0
+  hours: 0
+  minutes: 1
+
+# Blueprint planning strategy.
+strategy: fp_query_based_beam
+
+# Used to specify the period of time over which to use data for planning.
+# Currently, this is a "look behind" window for the workload.
+planning_window:
+  weeks: 0
+  days: 0
+  hours: 1
+  minutes: 0
+
+# Used to aggregate metrics collected in the planning window.
+metrics_agg:
+  method: ewm # 'mean' is another option
+  alpha: 0.86466472 # 1 - 1 / e^2
+
+# Used during planning.
+reinterpret_second_as: 1
+
+# The query distribution must change by at least this much for a new blueprint
+# to be accepted.
+query_dist_change_frac: 0.1
+
+# The search bound for the provisioning.
+max_provisioning_multiplier: 2.5
+
+# Flag options for blueprint planning.
+use_io_optimized_aurora: true
+use_recorded_routing_if_available: true
+ensure_tables_together_on_one_engine: true
+
+# Loads used to prime the system when no information is available.
+aurora_initialize_load_fraction: 0.25
+redshift_initialize_load_fraction: 0.25
+
+# BRAD will not reduce predicted load lower than these values. Raise these
+# values to be more conservative against mispredictions.
+aurora_min_load_removal_fraction: 0.8
+redshift_min_load_removal_fraction: 0.8
+
+# Blueprint planning performance ceilings.
+query_latency_p90_ceiling_s: 360.0
+txn_latency_p90_ceiling_s: 0.080
+
+# If set to true, BRAD will attempt to use the specified preset Redshift
+# clusters instead of resizing the main Redshift cluster.
+use_preset_redshift_clusters: false
+
+result_row_limit: 10
+
+# Used for ordering blueprints during planning.
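+# A rough reading of the knobs below, inferred from the parameter names in
+# this file rather than from the planner source: `benefit_horizon` is the
+# window over which a candidate blueprint's expected benefit is accumulated,
+# and the penalty terms discourage blueprints whose predicted latencies
+# approach the ceilings configured above.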
+comparator: + type: benefit_perf_ceiling # or `perf_ceiling` + + benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator + weeks: 0 + days: 0 + hours: 24 + minutes: 0 + + penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator + penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator + +# Used for precomputed predictions. +std_datasets: + - name: chbenchmark + path: workloads/chbenchmark/ + +# Blueprint planning trigger configs. + +triggers: + enabled: false + check_period_s: 90 # Triggers are checked every X seconds. + check_period_offset_s: 360 # Wait 6 mins before starting. + + # Triggers will not fire for at least this many minutes after a new blueprint + # takes effect. Usually this should be greater than zero to give BRAD + # sufficient time to observe the effect of the blueprint on the workload. BRAD + # may wait longer to ensure metrics are also available for this many minutes. + observe_new_blueprint_mins: 5 + + elapsed_time: + disabled: true + multiplier: 60 # Multiplier over `planning_window`. + + redshift_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + aurora_cpu: + lo: 15 + hi: 85 + sustained_epochs: 3 + + variable_costs: + disabled: true + threshold: 1.0 + + query_latency_ceiling: + ceiling_s: 360.0 + sustained_epochs: 3 + + txn_latency_ceiling: + ceiling_s: 0.080 + sustained_epochs: 3 + + recent_change: + delay_epochs: 5 diff --git a/experiments/17-chbenchmark/scale_down/run_full.sh b/experiments/17-chbenchmark/scale_down/run_full.sh new file mode 100755 index 00000000..f06a0504 --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/run_full.sh @@ -0,0 +1,25 @@ +#! /bin/bash + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh +extract_named_arguments $@ + +# Resolve paths into absolute paths +abs_txn_config_file=$(realpath $txn_config_file) +abs_system_config_file=$(realpath $system_config_file) +abs_physical_config_file=$(realpath $physical_config_file) + +export BRAD_IGNORE_BLUEPRINT=1 +start_brad $abs_system_config_file $abs_physical_config_file + +sleep 30 + +run_tpcc "t_1" +start_repeating_olap_runner 1 10 5 $ra_query_indexes "ch_1" $t_clients +ra_pid=$runner_pid + +sleep $run_for_s + +# Shut down. +graceful_shutdown $tpcc_pid $ra_pid diff --git a/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh new file mode 100755 index 00000000..1735545e --- /dev/null +++ b/experiments/17-chbenchmark/scale_down/set_up_starting_blueprint.sh @@ -0,0 +1,21 @@ +#! 
/bin/bash + +if [ -z $1 ]; then + >&2 echo "Usage: $0 path/to/physical/config.yml" + exit 1 +fi + +script_loc=$(cd $(dirname $0) && pwd -P) +cd $script_loc +source ../common.sh + +python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \ + --schema-name chbenchmark \ + --query-bank-file ../../../workloads/chbenchmark/queries.sql \ + --redshift-queries "0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" \ + --athena-queries "4" \ + --redshift-provisioning "dc2.large:16" \ + --aurora-provisioning "db.r6g.2xlarge:1" \ + --system-config-file ch_scale_down_config.yml \ + --physical-config-file $1 \ + --override-definite-routing redshift diff --git a/src/brad/config/file.py b/src/brad/config/file.py index c14facc2..fe781c23 100644 --- a/src/brad/config/file.py +++ b/src/brad/config/file.py @@ -288,6 +288,12 @@ def ui_port(self) -> int: else: return 7583 + def result_row_limit(self) -> Optional[int]: + try: + return self._raw["result_row_limit"] + except KeyError: + return None + def _extract_log_path(self, config_key: str) -> Optional[pathlib.Path]: if config_key not in self._raw: return None diff --git a/src/brad/daemon/daemon.py b/src/brad/daemon/daemon.py index f634eb05..56045c7b 100644 --- a/src/brad/daemon/daemon.py +++ b/src/brad/daemon/daemon.py @@ -66,6 +66,7 @@ from brad.planner.workload.builder import WorkloadBuilder from brad.planner.workload.provider import LoggedWorkloadProvider from brad.routing.policy import RoutingPolicy +from brad.routing.tree_based.forest_policy import ForestPolicy from brad.row_list import RowList from brad.utils.time_periods import period_start, universal_now from brad.ui.manager import UiManager @@ -328,7 +329,10 @@ async def _run_setup(self) -> None: or self._config.routing_policy == RoutingPolicy.Default ): logger.info("Setting up the cardinality estimator...") - if is_stub_mode: + blueprint = self._blueprint_mgr.get_blueprint() + policy = blueprint.get_routing_policy() + requires_estimator = isinstance(policy.definite_policy, ForestPolicy) + if is_stub_mode or not requires_estimator: estimator: Estimator = StubEstimator() else: estimator = await PostgresEstimator.connect( diff --git a/src/brad/front_end/front_end.py b/src/brad/front_end/front_end.py index f7f871fe..560ba211 100644 --- a/src/brad/front_end/front_end.py +++ b/src/brad/front_end/front_end.py @@ -453,8 +453,20 @@ async def _run_query_impl( else: connection = session.engines.get_reader_connection(engine_to_use) cursor = connection.cursor_sync() + # HACK: To work around dialect differences between + # Athena/Aurora/Redshift for now. This should be replaced by + # a more robust translation layer. + if ( + engine_to_use == Engine.Athena + and "ascii" in query_rep.raw_query + ): + translated_query = query_rep.raw_query.replace( + "ascii", "codepoint" + ) + else: + translated_query = query_rep.raw_query start = universal_now() - await cursor.execute(query_rep.raw_query) + await cursor.execute(translated_query) end = universal_now() except ( pyodbc.ProgrammingError, @@ -513,9 +525,23 @@ async def _run_query_impl( # Extract and return the results, if any. try: - # Using `fetchall_sync()` is lower overhead than the async interface. 
-                results = [tuple(row) for row in cursor.fetchall_sync()]
-                log_verbose(logger, "Responded with %d rows.", len(results))
+                result_row_limit = self._config.result_row_limit()
+                if result_row_limit is not None:
+                    results = []
+                    for _ in range(result_row_limit):
+                        row = cursor.fetchone_sync()
+                        if row is None:
+                            break
+                        results.append(tuple(row))
+                    log_verbose(
+                        logger,
+                        "Responded with %d rows (limited to %d rows).",
+                        len(results),
+                        result_row_limit,
+                    )
+                else:
+                    # Using `fetchall_sync()` has lower overhead than the async interface.
+                    results = [tuple(row) for row in cursor.fetchall_sync()]
+                    log_verbose(logger, "Responded with %d rows.", len(results))
                 return (
                     results,
                     (cursor.result_schema(results) if retrieve_schema else None),
diff --git a/src/brad/front_end/session.py b/src/brad/front_end/session.py
index 09ae5311..416e2515 100644
--- a/src/brad/front_end/session.py
+++ b/src/brad/front_end/session.py
@@ -11,6 +11,7 @@
 from brad.front_end.engine_connections import EngineConnections
 from brad.planner.estimator import Estimator
 from brad.routing.policy import RoutingPolicy
+from brad.routing.tree_based.forest_policy import ForestPolicy
 from brad.data_stats.postgres_estimator import PostgresEstimator
 from brad.data_stats.stub_estimator import StubEstimator
 from brad.utils.time_periods import universal_now
@@ -117,7 +118,9 @@ async def create_new_session(self) -> Tuple[SessionId, Session]:
             routing_policy_override == RoutingPolicy.ForestTableSelectivity
             or routing_policy_override == RoutingPolicy.Default
         ):
-            if self._config.stub_mode_path() is None:
+            policy = blueprint.get_routing_policy()
+            requires_estimator = isinstance(policy.definite_policy, ForestPolicy)
+            if self._config.stub_mode_path() is None and requires_estimator:
                 estimator: Optional[Estimator] = await PostgresEstimator.connect(
                     self._schema_name, self._config
                 )
diff --git a/workloads/IMDB_extended/set_up_starting_blueprint.py b/workloads/IMDB_extended/set_up_starting_blueprint.py
index be5bf2c1..62589379 100644
--- a/workloads/IMDB_extended/set_up_starting_blueprint.py
+++ b/workloads/IMDB_extended/set_up_starting_blueprint.py
@@ -15,6 +15,7 @@
 from brad.routing.cached import CachedLocationPolicy
 from brad.routing.policy import RoutingPolicy
 from brad.routing.tree_based.forest_policy import ForestPolicy
+from brad.routing.always_one import AlwaysOneRouter
 from brad.utils import set_up_logging
 
 logger = logging.getLogger(__name__)
@@ -83,6 +84,11 @@ def main():
     parser.add_argument(
         "--aurora-provisioning", type=str, help="Format: :"
     )
+    parser.add_argument(
+        "--override-definite-routing",
+        type=str,
+        help="An engine to which queries are always routed when the indefinite policies do not capture them.",
+    )
     args = parser.parse_args()
 
     set_up_logging(debug_mode=True)
@@ -124,11 +130,15 @@ def main():
     # 5. Replace the policy.
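+    # The cached location policy above pins the listed queries to specific
+    # engines; the "definite" policy chosen below handles every other query.
+    # With --override-definite-routing, that fallback becomes a fixed
+    # single-engine router (AlwaysOneRouter) instead of the learned
+    # ForestPolicy.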
enum_blueprint = EnumeratedBlueprint(blueprint) - definite_policy = asyncio.run( - ForestPolicy.from_assets( - args.schema_name, RoutingPolicy.ForestTableCardinality, assets + if args.override_definite_routing is not None: + routing_engine = Engine.from_str(args.override_definite_routing) + definite_policy = AlwaysOneRouter(routing_engine) + else: + definite_policy = asyncio.run( + ForestPolicy.from_assets( + args.schema_name, RoutingPolicy.ForestTableCardinality, assets + ) ) - ) replaced_policy = FullRoutingPolicy( indefinite_policies=[clp], definite_policy=definite_policy ) diff --git a/workloads/chbenchmark/queries.sql b/workloads/chbenchmark/queries.sql new file mode 100644 index 00000000..6ced3e67 --- /dev/null +++ b/workloads/chbenchmark/queries.sql @@ -0,0 +1,22 @@ +select ol_number, sum(ol_quantity) as sum_qty, sum(ol_amount) as sum_amount, avg(ol_quantity) as avg_qty, avg(ol_amount) as avg_amount, count(*) as count_order from order_line group by ol_number order by ol_number; +select su_suppkey, su_name, n_name, i_id, i_name, su_address, su_phone, su_comment from item, supplier, stock, nation, region, (select s_i_id as m_i_id, min(s_quantity) as m_s_quantity from stock, supplier, nation, region where mod((s_w_id*s_i_id),10000)=su_suppkey and su_nationkey=n_nationkey and n_regionkey=r_regionkey and r_name like 'Europ%' group by s_i_id) m where i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and i_data like '%b' and r_name like 'Europ%' and i_id=m_i_id and s_quantity = m_s_quantity order by n_name, su_name, i_id; +select ol_o_id, ol_w_id, ol_d_id, sum(ol_amount) as revenue, o_entry_d from customer, new_order, orders, order_line where c_state like 'A%' and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and no_w_id = o_w_id and no_d_id = o_d_id and no_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by ol_o_id, ol_w_id, ol_d_id, o_entry_d order by revenue desc, o_entry_d; +select o_ol_cnt, count(*) as order_count from orders where exists (select * from order_line where o_id = ol_o_id and o_w_id = ol_w_id and o_d_id = ol_d_id and ol_delivery_d >= o_entry_d) group by o_ol_cnt order by o_ol_cnt; +select n_name, sum(ol_amount) as revenue from customer, orders, order_line, stock, supplier, nation, region where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id=o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ascii(cast(substring(c_state,1,1) as varchar(1))) = su_nationkey and su_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'Europe' group by n_name order by revenue desc; +select sum(ol_amount) as revenue from order_line where ol_quantity between 1 and 100000; +WITH inner_query AS (select su_nationkey as supp_nation, substring(c_state,1,1) as cust_nation, extract(year from o_entry_d) as l_year, ol_amount as revenue from supplier, stock, order_line, orders, customer, nation n1, nation n2 where ol_supply_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and su_nationkey = n1.n_nationkey and ascii(substring(c_state,1,1)) = n2.n_nationkey and ((n1.n_name = 'Germany' and n2.n_name = 'Cambodia') or (n1.n_name = 'Cambodia' and n2.n_name = 'Germany'))) SELECT supp_nation, cust_nation, l_year, sum(revenue) as revenue 
FROM inner_query group by supp_nation, cust_nation, l_year order by supp_nation, cust_nation, l_year; +select extract(year from o_entry_d) as l_year, sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share from item, supplier, stock, order_line, orders, customer, nation n1, nation n2, region where i_id = s_i_id and ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id),10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and n1.n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) and n1.n_regionkey = r_regionkey and ol_i_id < 1000 and r_name = 'Europe' and su_nationkey = n2.n_nationkey and i_data like '%b' and i_id = ol_i_id group by extract(year from o_entry_d) order by l_year; +select n_name, extract(year from o_entry_d) as l_year, sum(ol_amount) as sum_profit from item, stock, supplier, order_line, orders, nation where ol_i_id = s_i_id and ol_supply_w_id = s_w_id and mod((s_w_id * s_i_id), 10000) = su_suppkey and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and ol_i_id = i_id and su_nationkey = n_nationkey and i_data like '%BB' group by n_name, extract(year from o_entry_d) order by n_name, l_year desc; +select c_id, c_last, sum(ol_amount) as revenue, c_city, c_phone, n_name from customer, orders, order_line, nation where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d and n_nationkey = ascii(cast(substring(c_state,1,1) as varchar(1))) group by c_id, c_last, c_city, c_phone, n_name order by revenue desc; +select s_i_id, sum(s_order_cnt) as ordercount from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_nationkey = n_nationkey and n_name = 'Germany' group by s_i_id having sum(s_order_cnt) > (select sum(s_order_cnt) * .005 from stock, supplier, nation where mod((s_w_id * s_i_id),10000) = su_suppkey and su_nationkey = n_nationkey and n_name = 'Germany') order by ordercount desc; +select o_ol_cnt, sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count from orders, order_line where ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id and o_entry_d <= ol_delivery_d group by o_ol_cnt order by o_ol_cnt; +select c_count, count(*) as custdist from (select c_id, count(o_id) from customer left outer join orders on ( c_w_id = o_w_id and c_d_id = o_d_id and c_id = o_c_id and o_carrier_id > 8) group by c_id) as c_orders (c_id, c_count) group by c_count order by custdist desc, c_count desc; +select 100.00 * sum(case when i_data like 'PR%' then ol_amount else 0 end) / (1+sum(ol_amount)) as promo_revenue from order_line, item where ol_i_id = i_id; +with revenue (supplier_no, total_revenue) as (select mod((s_w_id * s_i_id),10000) as supplier_no, sum(ol_amount) as total_revenue from order_line, stock where ol_i_id = s_i_id and ol_supply_w_id = s_w_id group by mod((s_w_id * s_i_id),10000)) select su_suppkey, su_name, su_address, su_phone, total_revenue from supplier, revenue where su_suppkey = supplier_no and total_revenue = (select max(total_revenue) from revenue) order by su_suppkey; +select i_name, substring(i_data, 1, 3) as brand, i_price, count(distinct (mod((s_w_id * s_i_id),10000))) as supplier_cnt from stock, item where i_id = s_i_id and i_data not like 'zz%' and (mod((s_w_id * 
s_i_id),10000) not in (select su_suppkey from supplier where su_comment like '%bad%')) group by i_name, substring(i_data, 1, 3), i_price order by supplier_cnt desc; +select sum(ol_amount) / 2.0 as avg_yearly from order_line, (select i_id, avg(ol_quantity) as a from item, order_line where i_data like '%b' and ol_i_id = i_id group by i_id) t where ol_i_id = t.i_id and ol_quantity < t.a; +select c_last, c_id o_id, o_entry_d, o_ol_cnt, sum(ol_amount) from customer, orders, order_line where c_id = o_c_id and c_w_id = o_w_id and c_d_id = o_d_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_o_id = o_id group by o_id, o_w_id, o_d_id, c_id, c_last, o_entry_d, o_ol_cnt having sum(ol_amount) > 200 order by sum(ol_amount) desc, o_entry_d; +select sum(ol_amount) as revenue from order_line, item where (ol_i_id = i_id and i_data like '%a' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,2,3)) or ( ol_i_id = i_id and i_data like '%b' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,2,4)) or ( ol_i_id = i_id and i_data like '%c' and ol_quantity >= 1 and ol_quantity <= 10 and i_price between 1 and 400000 and ol_w_id in (1,5,3)); +select su_name, su_address from supplier, nation where su_suppkey in (select mod(s_i_id * s_w_id, 10000) from stock, order_line where s_i_id in (select i_id from item where i_data like 'co%') and ol_i_id=s_i_id group by s_i_id, s_w_id, s_quantity having 2*s_quantity > sum(ol_quantity)) and su_nationkey = n_nationkey and n_name = 'Germany' order by su_name; +select su_name, count(*) as numwait from supplier, order_line l1, orders, stock, nation where ol_o_id = o_id and ol_w_id = o_w_id and ol_d_id = o_d_id and ol_w_id = s_w_id and ol_i_id = s_i_id and mod((s_w_id * s_i_id),10000) = su_suppkey and l1.ol_delivery_d > o_entry_d and not exists (select * from order_line l2 where l2.ol_o_id = l1.ol_o_id and l2.ol_w_id = l1.ol_w_id and l2.ol_d_id = l1.ol_d_id and l2.ol_delivery_d > l1.ol_delivery_d) and su_nationkey = n_nationkey and n_name = 'Germany' group by su_name order by numwait desc, su_name; +select substring(c_state,1,1) as country, count(*) as numcust, sum(c_balance) as totacctbal from customer where substring(c_phone,1,1) in ('1','2','3','4','5','6','7') and c_balance > (select avg(c_BALANCE) from customer where c_balance > 0.00 and substring(c_phone,1,1) in ('1','2','3','4','5','6','7')) and not exists (select * from orders where o_c_id = c_id and o_w_id = c_w_id and o_d_id = c_d_id) group by substring(c_state,1,1) order by substring(c_state,1,1);
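
For reference, the capped fetch path added to `src/brad/front_end/front_end.py`
above boils down to the following minimal sketch. It is illustrative only: it
uses generic DB-API cursor methods (`fetchone`/`fetchall`) in place of BRAD's
`fetchone_sync`/`fetchall_sync`, and the `fetch_limited` helper is hypothetical
(it does not exist in the codebase).

    from typing import Any, List, Optional, Tuple

    def fetch_limited(cursor: Any, limit: Optional[int]) -> List[Tuple]:
        if limit is None:
            # No row cap configured: fetch the full result set in one call.
            return [tuple(row) for row in cursor.fetchall()]
        # Cap configured (e.g., `result_row_limit: 10` in the configs above):
        # pull rows one at a time and stop at the cap or when the result set
        # is exhausted, whichever comes first.
        results: List[Tuple] = []
        for _ in range(limit):
            row = cursor.fetchone()
            if row is None:
                break
            results.append(tuple(row))
        return results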