diff --git a/datafusion/README.md b/datafusion/README.md index 788cad30e..1b9583e6d 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -2,7 +2,7 @@ DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check -We use parquet file here and create an external table for it; and then do the queries. +We use a Parquet file here, create an external table for it, and then execute the queries. ## Generate benchmark results @@ -10,29 +10,29 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe 1. manually start a AWS EC2 instance - `c6a.4xlarge` - - Amazon Linux 2 AMI + - Ubuntu 22.04 or later - Root 500GB gp2 SSD - no EBS optimized - no instance store -1. wait for status check passed, then ssh to EC2 `ssh ec2-user@{ip}` -1. `sudo yum update -y` and `sudo yum install gcc git -y` +1. wait for the status checks to pass, then ssh to EC2 `ssh ubuntu@{ip}` 1. `git clone https://github.com/ClickHouse/ClickBench` 1. `cd ClickBench/datafusion` 1. `vi benchmark.sh` and modify following line to target Datafusion version + + ```bash + git checkout 46.0.0 ``` - git checkout 45.0.0 - ``` + 1. `bash benchmark.sh` -### Know Issues: +### Known Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) 3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 - ## Generate full human readable results (for debugging) 1. install datafusion-cli 2. 
download the parquet ```wget --continue https://datasets.clickhouse.com/hits_compatible/hits.parquet``` -3. execute it ```datafusion-cli -f create.sh queries.sh``` or ```bash run2.sh``` +3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index c38784333..8d064f2a4 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -11,9 +11,9 @@ sudo apt-get install --yes gcc echo "Install DataFusion main branch" git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/datafusion-cli -git checkout 45.0.0 -CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release +cd arrow-datafusion/ +git checkout 46.0.0 +CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli export PATH="`pwd`/target/release:$PATH" cd ../.. diff --git a/datafusion/queries.sql b/datafusion/queries.sql index a70d081b6..9a183cd6e 100644 --- a/datafusion/queries.sql +++ b/datafusion/queries.sql @@ -26,7 +26,7 @@ SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10; SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 
ORDER BY l DESC LIMIT 25; SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), 
SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; diff --git a/datafusion/results/partitioned.json b/datafusion/results/partitioned.json index 996ff7bb4..b8dbd91c3 100644 --- a/datafusion/results/partitioned.json +++ b/datafusion/results/partitioned.json @@ -1,55 +1,55 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2024-03-29", + "date": "2024-04-18", "machine": "c6a.4xlarge, 500gb gp2", "cluster_size": 1, - "comment": "v45.0.0 (26058ac)", + "comment": "v46.0.0 (d5ca830)", "tags": ["Rust", "column-oriented", "embedded", "stateless"], "load_time": 0, - "data_size": 14779976446, + "data_size": 14737666736, "result": [ -[0.060, 0.022, 0.021], -[0.109, 0.034, 0.035], -[0.198, 0.085, 0.083], -[0.391, 0.088, 0.084], -[1.143, 0.846, 0.872], -[1.020, 0.856, 0.855], -[0.086, 0.032, 0.028], -[0.118, 0.037, 0.037], -[1.102, 0.962, 0.942], -[1.353, 1.070, 1.045], -[0.487, 0.260, 0.263], -[0.663, 0.291, 0.286], -[1.114, 0.893, 0.901], -[2.596, 1.410, 1.360], -[1.133, 0.860, 0.854], -[1.132, 1.020, 1.001], 
-[2.668, 1.835, 1.866], -[2.557, 1.694, 1.704], -[5.337, 3.714, 3.794], -[0.263, 0.082, 0.082], -[9.891, 1.109, 1.125], -[11.284, 1.331, 1.348], -[21.820, 2.617, 2.631], -[55.448, 9.609, 9.630], -[2.687, 0.452, 0.453], -[0.804, 0.368, 0.364], -[2.704, 0.517, 0.520], -[9.662, 1.553, 1.507], -[9.988, 9.801, 9.769], -[0.526, 0.421, 0.403], -[2.371, 0.802, 0.812], -[5.944, 0.904, 0.903], -[4.827, 3.645, 3.565], -[10.196, 3.767, 3.792], -[10.234, 3.823, 3.844], -[1.397, 1.270, 1.303], -[0.328, 0.146, 0.147], -[0.196, 0.085, 0.105], -[0.328, 0.147, 0.150], -[0.482, 0.220, 0.219], -[0.198, 0.076, 0.076], -[0.189, 0.088, 0.076], -[0.179, 0.064, 0.075] +[0.059, 0.020, 0.020], +[0.117, 0.035, 0.035], +[0.209, 0.086, 0.079], +[0.385, 0.088, 0.096], +[1.009, 0.867, 0.877], +[1.017, 0.852, 0.856], +[0.088, 0.031, 0.032], +[0.129, 0.038, 0.039], +[1.113, 0.950, 0.953], +[1.329, 1.036, 1.046], +[0.504, 0.268, 0.259], +[0.610, 0.284, 0.284], +[1.095, 0.895, 0.900], +[2.546, 1.359, 1.371], +[1.089, 0.869, 0.851], +[1.117, 1.004, 0.990], +[2.606, 1.808, 1.835], +[2.547, 1.681, 1.683], +[5.335, 3.695, 3.710], +[0.252, 0.082, 0.080], +[9.956, 1.101, 1.101], +[11.266, 1.306, 1.279], +[21.829, 2.602, 2.560], +[55.527, 9.548, 9.435], +[2.696, 0.458, 0.449], +[0.808, 0.365, 0.355], +[2.703, 0.534, 0.506], +[9.639, 1.475, 1.485], +[10.099, 9.803, 9.646], +[0.530, 0.436, 0.443], +[2.379, 0.813, 0.798], +[5.949, 0.879, 0.886], +[4.779, 3.684, 3.607], +[10.204, 3.673, 3.732], +[10.217, 3.722, 3.728], +[1.340, 1.237, 1.231], +[0.308, 0.145, 0.146], +[0.196, 0.104, 0.086], +[0.303, 0.152, 0.148], +[0.455, 0.212, 0.231], +[0.206, 0.096, 0.077], +[0.194, 0.074, 0.073], +[0.177, 0.065, 0.065] ] } diff --git a/datafusion/results/single.json b/datafusion/results/single.json index ff02df720..17533c1d9 100644 --- a/datafusion/results/single.json +++ b/datafusion/results/single.json @@ -1,55 +1,55 @@ { "system": "DataFusion (Parquet, single)", - "date": "2024-03-29", + "date": "2024-04-18", "machine": 
"c6a.4xlarge, 500gb gp2", "cluster_size": 1, - "comment": "v45.0.0 (26058ac)", + "comment": "v46.0.0 (d5ca830)", "tags": ["Rust", "column-oriented", "embedded", "stateless"], "load_time": 0, "data_size": 14779976446, "result": [ -[0.103, 0.063, 0.070], -[0.136, 0.080, 0.079], -[0.219, 0.117, 0.116], -[0.348, 0.128, 0.128], -[1.045, 0.913, 0.924], -[1.102, 0.959, 0.964], -[0.114, 0.066, 0.078], -[0.152, 0.088, 0.087], -[1.160, 1.021, 1.005], -[1.324, 1.077, 1.110], -[0.470, 0.297, 0.297], -[0.567, 0.317, 0.312], -[1.140, 0.984, 0.984], -[2.681, 1.388, 1.459], -[1.106, 0.952, 0.939], -[1.185, 1.062, 1.067], -[2.647, 1.943, 1.937], -[2.524, 1.787, 1.787], -[5.212, 3.749, 3.825], -[0.272, 0.115, 0.122], -[9.741, 1.205, 1.190], -[11.298, 1.552, 1.497], -[22.086, 3.670, 3.620], -[55.936, 10.118, 10.120], -[2.553, 0.572, 0.591], -[0.792, 0.519, 0.512], -[2.561, 0.639, 0.634], -[9.600, 1.650, 1.682], -[10.898, 10.343, 10.278], -[0.556, 0.455, 0.459], -[2.282, 0.938, 0.932], -[5.685, 1.033, 1.025], -[4.576, 3.773, 3.780], -[10.309, 3.906, 3.927], -[10.317, 3.969, 4.025], -[1.395, 1.251, 1.253], -[0.364, 0.202, 0.199], -[0.284, 0.163, 0.164], -[0.385, 0.216, 0.198], -[0.541, 0.302, 0.295], -[0.224, 0.115, 0.111], -[0.215, 0.108, 0.111], -[0.193, 0.102, 0.100] +[0.106, 0.057, 0.057], +[0.142, 0.072, 0.072], +[0.212, 0.119, 0.115], +[0.341, 0.132, 0.120], +[0.993, 0.892, 0.899], +[1.081, 0.961, 0.958], +[0.125, 0.076, 0.078], +[0.159, 0.092, 0.083], +[1.134, 0.990, 0.999], +[1.306, 1.108, 1.068], +[0.480, 0.291, 0.298], +[0.583, 0.314, 0.317], +[1.170, 0.989, 0.969], +[2.625, 1.568, 1.491], +[1.108, 0.940, 0.941], +[1.155, 1.050, 1.052], +[2.631, 1.924, 1.944], +[2.517, 1.772, 1.777], +[5.173, 3.765, 3.733], +[0.422, 0.115, 0.117], +[9.540, 1.131, 1.131], +[11.292, 1.457, 1.445], +[22.116, 3.578, 3.502], +[55.971, 9.908, 9.810], +[2.567, 0.584, 0.577], +[0.805, 0.512, 0.533], +[2.582, 0.646, 0.650], +[9.643, 1.626, 1.602], +[10.635, 10.439, 10.218], +[0.547, 0.485, 0.479], 
+[2.288, 0.927, 0.942], +[5.684, 1.036, 1.000], +[4.545, 3.741, 3.723], +[10.211, 3.884, 3.853], +[10.251, 3.871, 3.879], +[1.405, 1.281, 1.282], +[0.360, 0.198, 0.211], +[0.280, 0.182, 0.169], +[0.345, 0.199, 0.209], +[0.508, 0.281, 0.296], +[0.214, 0.110, 0.114], +[0.206, 0.112, 0.110], +[0.190, 0.103, 0.103] ] } diff --git a/datafusion/run.sh b/datafusion/run.sh index ac429d570..c9234f07b 100755 --- a/datafusion/run.sh +++ b/datafusion/run.sh @@ -31,7 +31,7 @@ cat queries.sql | while read -r query; do # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines # 3. use sed to take the second line # 4. use awk to take the number we want - RES=`datafusion-cli -f $CREATE_SQL_FILE /tmp/query.sql 2>&1 | grep "Elapsed" |sed -n 2p | awk '{ print $2 }'` + RES=$(datafusion-cli -f $CREATE_SQL_FILE /tmp/query.sql 2>&1 | grep "Elapsed" |sed -n 2p | awk '{ print $2 }') [[ $RES != "" ]] && \ echo -n "$RES" || \ echo -n "null"