From d9c8ec61eaf6655ed4bae2ac68773c3654ee62ec Mon Sep 17 00:00:00 2001 From: Albert Xing Date: Wed, 11 Sep 2024 13:52:45 -0700 Subject: [PATCH] Add learned index (RadixSpline) Rust bindings (#511) Add end to end tpch benchmarking code add build script that compiles and links c++ radixspline library add safe rust module for running radixspline Note: may need to manually delete radixspline.o and libradixspline.a from sandbox/qe/RadixSplineLib to force recompilation when changes are made to the C++ radixspline code. --------- Co-authored-by: Geoffrey Yu --- .gitignore | 4 + .gitmodules | 3 + sandbox/qe/Cargo.lock | 142 +++++++++++- sandbox/qe/Cargo.toml | 17 +- sandbox/qe/RadixSpline | 1 + sandbox/qe/RadixSplineLib/radixspline.cpp | 27 +++ sandbox/qe/RadixSplineLib/radixspline.h | 18 ++ sandbox/qe/build_bindings.rs | 86 ++++++++ sandbox/qe/redshift.py | 254 ++++++++++++++++++++++ sandbox/qe/src/bin/bench_tpch.rs | 17 +- sandbox/qe/src/lib.rs | 3 + sandbox/qe/src/radixspline.rs | 50 +++++ sandbox/qe/tpch-every.sh | 3 +- sandbox/qe/tpch-individual.sh | 26 ++- sandbox/qe/{tpch.sh => tpch-setup.sh} | 12 +- sandbox/qe/tpch/column_to_binary.py | 21 ++ sandbox/qe/tpch/headers.json | 18 +- sandbox/qe/tpch/tbl_to_csv.py | 5 +- 18 files changed, 665 insertions(+), 42 deletions(-) create mode 100644 .gitmodules create mode 160000 sandbox/qe/RadixSpline create mode 100644 sandbox/qe/RadixSplineLib/radixspline.cpp create mode 100644 sandbox/qe/RadixSplineLib/radixspline.h create mode 100644 sandbox/qe/build_bindings.rs create mode 100644 sandbox/qe/redshift.py create mode 100644 sandbox/qe/src/radixspline.rs rename sandbox/qe/{tpch.sh => tpch-setup.sh} (76%) create mode 100644 sandbox/qe/tpch/column_to_binary.py diff --git a/.gitignore b/.gitignore index 2b56a765..d1370fed 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,7 @@ sandbox/qe/.brad_qe_repl_history # BRAD UI. ui/dist/ ui/node_modules/ + +*.tmp +*.o +*.a diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..1d0f7f1c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "sandbox/qe/RadixSpline"] + path = sandbox/qe/RadixSpline + url = https://github.com/learnedsystems/RadixSpline diff --git a/sandbox/qe/Cargo.lock b/sandbox/qe/Cargo.lock index c3210ece..01aea2eb 100644 --- a/sandbox/qe/Cargo.lock +++ b/sandbox/qe/Cargo.lock @@ -407,6 +407,29 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "bindgen" +version = "0.69.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +dependencies = [ + "bitflags 2.4.2", + "cexpr", + "clang-sys", + "itertools", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.50", + "which", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -455,8 +478,11 @@ name = "brad_qe" version = "0.1.0" dependencies = [ "arrow", + "bindgen", + "cc", "clap", "csv", + "cty", "datafusion", "futures", "rand", @@ -526,11 +552,22 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.86" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9fa1897e4325be0d68d48df6aa1a71ac2ed4d27723887e7754192705350730" +checksum = "065a29261d53ba54260972629f9ca6bffa69bac13cd1fed61420f7fa68b9f8bd" dependencies = [ + "jobserver", "libc", + "once_cell", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", ] [[package]] @@ -573,6 +610,17 @@ dependencies = [ "phf_codegen", ] +[[package]] +name = "clang-sys" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.1" @@ -728,6 +776,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "dashmap" version = "5.5.3" @@ -1265,6 +1319,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "humantime" version = "2.1.0" @@ -1335,6 +1398,15 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "jobserver" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.68" @@ -1350,6 +1422,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "lexical-core" version = "0.8.5" @@ -1420,6 +1498,16 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libloading" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" +dependencies = [ + "cfg-if", + "windows-targets 0.52.0", +] + [[package]] name = "libm" version = "0.2.8" @@ -1495,6 +1583,12 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.2" @@ -1536,6 +1630,16 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num" version = "0.4.1" @@ -1818,6 +1922,16 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "prettyplease" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" +dependencies = [ + "proc-macro2", + "syn 2.0.50", +] + [[package]] name = "proc-macro2" version = "1.0.78" @@ -1931,6 +2045,12 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.4.0" @@ -2057,6 +2177,12 @@ dependencies = [ "digest", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -2525,6 +2651,18 @@ version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/sandbox/qe/Cargo.toml b/sandbox/qe/Cargo.toml index 9b9650eb..f02fdada 100644 --- a/sandbox/qe/Cargo.toml +++ b/sandbox/qe/Cargo.toml @@ -2,6 +2,8 @@ name = "brad_qe" version = "0.1.0" edition = "2021" +# NOTE: This currently does not work. (C++ standard library linking error) +# build = "build.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -14,11 +16,20 @@ arrow = "50.0.0" rustyline = "10.0.0" rand = { version = "0.8.5", features = ["small_rng"] } csv = "1.3.0" +cty = "0.2.2" -[[bin]] -name = "brad_qe_repl" -path = "src/bin/repl.rs" +[build-dependencies] +bindgen = "0.69.4" +cc = "1.0.96" [[bin]] name = "bench_q3" path = "src/bin/bench_q3.rs" + +[[bin]] +name = "brad_qe_repl" +path = "src/bin/repl.rs" + +# [[bin]] +# name = "test_radixspline" +# path = "src/bin/test_radixspline.rs" diff --git a/sandbox/qe/RadixSpline b/sandbox/qe/RadixSpline new file mode 160000 index 00000000..ab96aa59 --- /dev/null +++ b/sandbox/qe/RadixSpline @@ -0,0 +1 @@ +Subproject commit ab96aa59d429e7423beba2350bdcdf88952df282 diff --git a/sandbox/qe/RadixSplineLib/radixspline.cpp b/sandbox/qe/RadixSplineLib/radixspline.cpp new file mode 100644 index 00000000..38b55492 --- /dev/null +++ b/sandbox/qe/RadixSplineLib/radixspline.cpp @@ -0,0 +1,27 @@ +#include "radixspline.h" + +void* build(const uint64_t* ks, uint64_t size) { + RSData* rs = new RSData; + rs->keys = std::vector(size); + memcpy(rs->keys.data(), ks, size * sizeof(uint64_t)); + uint64_t min = rs->keys.front(); + uint64_t max = rs->keys.back(); + rs::Builder rsb(min, max); + for (const auto& key : rs->keys) rsb.AddKey(key); + rs::RadixSpline rso = rsb.Finalize(); + rs->rspline = rso; + return (void*)rs; +} + +bool lookup(void* ptr, uint64_t key) { + RSData* rs = (RSData*) ptr; + rs::SearchBound bound = rs->rspline.GetSearchBound(key); + auto start = begin(rs->keys) + bound.begin, last = begin(rs->keys) + bound.end; + auto iter = std::lower_bound(start, last, key); + return iter != rs->keys.end() && *iter == key; +} + +void clear(void* ptr) { + RSData* rs = (RSData*) ptr; + delete rs; +} diff --git a/sandbox/qe/RadixSplineLib/radixspline.h b/sandbox/qe/RadixSplineLib/radixspline.h new file mode 100644 index 00000000..e5b6b6c9 --- /dev/null +++ b/sandbox/qe/RadixSplineLib/radixspline.h @@ -0,0 +1,18 @@ +#include "../RadixSpline/include/rs/builder.h" +#include + + +struct RSData { + std::vector keys; + rs::RadixSpline rspline; +}; + +extern "C" { + + int32_t add(int32_t a, int32_t b); + void* build(const uint64_t* ks, uint64_t size); + + bool lookup(void* ptr, uint64_t key); + + void clear(void* ptr); +} diff --git a/sandbox/qe/build_bindings.rs b/sandbox/qe/build_bindings.rs new file mode 100644 index 00000000..2b422f74 --- /dev/null +++ b/sandbox/qe/build_bindings.rs @@ -0,0 +1,86 @@ +use std::env; +use std::path::PathBuf; + +// NOTE: This file should be named `build.rs` once it is working. + +fn main() { + // This is the directory where the `c` library is located. + let libdir_path = PathBuf::from("RadixSplineLib") + // Canonicalize the path as `rustc-link-search` requires an absolute + // path. + .canonicalize() + .expect("cannot canonicalize path"); + + // This is the path to the `c` headers file. + let headers_path = libdir_path.join("radixspline.h"); + let headers_path_str = headers_path.to_str().expect("Path is not a valid string"); + + // This is the path to the intermediate object file for our library. + let obj_path = libdir_path.join("radixspline.o"); + // This is the path to the static library file. + let lib_path = libdir_path.join("libradixspline.a"); + + // Run `clang` to compile the `radixspline.cpp` file into a `radixspline.o` object file. + // Unwrap if it is not possible to spawn the process. + if !std::process::Command::new("clang++") + .arg("-c") + .arg("-o") + .arg(&obj_path) + .arg(libdir_path.join("radixspline.cpp")) + .output() + .expect("could not spawn `clang`") + .status + .success() + { + // Panic if the command was not successful. + panic!("could not compile object file"); + } + + // Run `ar` to generate the `libradixspline.a` file from the `radixspline.o` file. + // Unwrap if it is not possible to spawn the process. + if !std::process::Command::new("ar") + .arg("rcus") + .arg(lib_path) + .arg(obj_path) + .output() + .expect("could not spawn `ar`") + .status + .success() + { + // Panic if the command was not successful. + panic!("could not emit library file"); + } + + // Tell cargo to look for shared libraries in the specified directory + println!("cargo:rustc-link-search=native={}", libdir_path.to_str().unwrap()); + + // Tell cargo to tell rustc to link our `radixspline` library. Cargo will + // automatically know it must look for a `libradixspline.a` file. + println!("cargo:rustc-link-lib=radixspline"); + println!("cargo:rustc-link-lib=stdc++"); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + .opaque_type("^(std::.*)$") + .allowlist_function("build") + .allowlist_function("lookup") + .allowlist_function("clear") + // The input header we would like to generate + // bindings for. + .header(headers_path_str) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"); + bindings + .write_to_file(out_path) + .expect("Couldn't write bindings!"); +} diff --git a/sandbox/qe/redshift.py b/sandbox/qe/redshift.py new file mode 100644 index 00000000..a7426230 --- /dev/null +++ b/sandbox/qe/redshift.py @@ -0,0 +1,254 @@ +import argparse +import os +import redshift_connector +import time + + +def init(cs): + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "nation" ( + "n_nationkey" INT, + "n_name" CHAR(25), + "n_regionkey" INT, + "n_comment" VARCHAR(152), + PRIMARY KEY ("n_nationkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "region" ( + "r_regionkey" INT, + "r_name" CHAR(25), + "r_comment" VARCHAR(152), + PRIMARY KEY ("r_regionkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "supplier" ( + "s_suppkey" INT, + "s_name" CHAR(25), + "s_address" VARCHAR(40), + "s_nationkey" INT, + "s_phone" CHAR(15), + "s_acctbal" DECIMAL(15,2), + "s_comment" VARCHAR(101), + PRIMARY KEY ("s_suppkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "customer" ( + "c_custkey" INT, + "c_name" VARCHAR(25), + "c_address" VARCHAR(40), + "c_nationkey" INT, + "c_phone" CHAR(15), + "c_acctbal" DECIMAL(15,2), + "c_mktsegment" CHAR(10), + "c_comment" VARCHAR(117), + PRIMARY KEY ("c_custkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "part" ( + "p_partkey" INT, + "p_name" VARCHAR(55), + "p_mfgr" CHAR(25), + "p_brand" CHAR(10), + "p_type" VARCHAR(25), + "p_size" INT, + "p_container" CHAR(10), + "p_retailprice" DECIMAL(15,2) , + "p_comment" VARCHAR(23) , + PRIMARY KEY ("p_partkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "partsupp" ( + "ps_partkey" INT, + "ps_suppkey" INT, + "ps_availqty" INT, + "ps_supplycost" DECIMAL(15,2), + "ps_comment" VARCHAR(199), + PRIMARY KEY ("ps_partkey", "ps_suppkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "orders" ( + "o_orderkey" INT, + "o_custkey" INT, + "o_orderstatus" CHAR(1), + "o_totalprice" DECIMAL(15,2), + "o_orderdate" DATE, + "o_orderpriority" CHAR(15), + "o_clerk" CHAR(15), + "o_shippriority" INT, + "o_comment" VARCHAR(79), + PRIMARY KEY ("o_orderkey")); + """ + ) + + cs.execute( + """ + CREATE TABLE IF NOT EXISTS "lineitem"( + "l_orderkey" INT, + "l_partkey" INT, + "l_suppkey" INT, + "l_linenumber" INT, + "l_quantity" DECIMAL(15,2), + "l_extendedprice" DECIMAL(15,2), + "l_discount" DECIMAL(15,2), + "l_tax" DECIMAL(15,2), + "l_returnflag" CHAR(1), + "l_linestatus" CHAR(1), + "l_shipdate" DATE, + "l_commitdate" DATE, + "l_receiptdate" DATE, + "l_shipinstruct" CHAR(25), + "l_shipmode" CHAR(10), + "l_comment" VARCHAR(44) + ); + """ + ) + cs.execute( + """ + COPY nation + FROM 's3://geoffxy-research/shared/sf1/nation.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY region + FROM 's3://geoffxy-research/shared/sf1/region.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY supplier + FROM 's3://geoffxy-research/shared/sf1/supplier.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY customer + FROM 's3://geoffxy-research/shared/sf1/customer.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY part + FROM 's3://geoffxy-research/shared/sf1/part.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY partsupp + FROM 's3://geoffxy-research/shared/sf1/partsupp.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY orders + FROM 's3://geoffxy-research/shared/sf1/orders.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + cs.execute( + """ + COPY lineitem + FROM 's3://geoffxy-research/shared/sf1/lineitem.csv' + IAM_ROLE 'arn:aws:iam::498725316081:role/service-role/AmazonRedshift-CommandsAccessRole-20230606T052021' + CSV + IGNOREHEADER 1; + """ + ) + + +def time_query(cs, query_dir, i): + query_path = os.path.join(query_dir, f"{i}.sql") + if not os.path.exists(query_path): + print(f"File {query_path} does not exist.") + return + + with open(query_path, "r") as file: + query = file.read() + start_time = time.time() + cs.execute(query) + end_time = time.time() + elapsed_time = end_time - start_time + print(f"Query in file {i}.sql took {elapsed_time:.4f} seconds to execute.") + + +def run(cs): + # Where TPC-H queries are located + query_dir = "/spinning/axing/queries" + + for i in range(1, 23): + time_query(cs, query_dir, i) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Benchmark TPC-H on Redshift. takes in a argument "init"|"run"' + ) + parser.add_argument( + "task", + choices=["init", "run"], + help="init on first time running, run to run the benchmark", + ) + + args = parser.parse_args() + + conn = redshift_connector.connect( + host="redshift-axing.cv1pkocptzr2.us-east-1.redshift.amazonaws.com", + database="tpch", + port=5439, + user="awsuser", + password="axingUROP2024", + ) + + conn.rollback() + conn.autocommit = True + conn.run("VACUUM") + + cs = conn.cursor() + + cs.execute("SET enable_result_cache_for_session TO OFF") + + if args.task == "init": + init(cs) + elif args.task == "run": + run(cs) + else: + print(f"Unknown task: {args.task}") diff --git a/sandbox/qe/src/bin/bench_tpch.rs b/sandbox/qe/src/bin/bench_tpch.rs index 462183c4..3a22fcd8 100644 --- a/sandbox/qe/src/bin/bench_tpch.rs +++ b/sandbox/qe/src/bin/bench_tpch.rs @@ -85,22 +85,19 @@ async fn run_query_and_print_results( async fn run_tpch_queries(db: &DB, repetitions: u32) -> Result<(), DataFusionError> { let mut writer = csv::Writer::from_writer(io::stdout()); writer - .write_record(&["query", "avg_run_time_ms"]) + .write_record(&["query", "run_time_ms"]) .map_err(|e| DataFusionError::External(Box::new(e)))?; for q in 1..23 { let path = format!("./tpch/queries/{q}.sql"); let query = fs::read_to_string(path)?; - let mut total_time = 0; // Discard first run to allow for caching/initialization overhead run_timed_query(&db, &query).await?; for _ in 0..repetitions { let rt = run_timed_query(&db, &query).await?; - total_time += &rt.as_millis(); + writer + .write_record(&[&q.to_string(), &rt.as_millis().to_string()]) + .map_err(|e| DataFusionError::External(Box::new(e)))?; } - let avg_time = total_time / (repetitions as u128); - writer - .write_record(&[&q.to_string(), &avg_time.to_string()]) - .map_err(|e| DataFusionError::External(Box::new(e)))?; } writer.flush()?; Ok(()) @@ -240,6 +237,12 @@ async fn main() -> Result<(), DataFusionError> { eprintln!("\nNo modifications."); } } + Some(ref s) if s == "qs_explain" => { + let query = String::from(QUERY_SIMPLE); + let orig_physical_plan = db.to_physical_plan(&query).await?; + let dpp = displayable(orig_physical_plan.as_ref()); + eprintln!("\nOriginal plan\n{}", dpp.indent(false)); + } _ => (), } diff --git a/sandbox/qe/src/lib.rs b/sandbox/qe/src/lib.rs index 564c28ca..44aee2e2 100644 --- a/sandbox/qe/src/lib.rs +++ b/sandbox/qe/src/lib.rs @@ -23,6 +23,9 @@ pub mod ops; /// Utilities for rewriting DataFusion `ExecutionPlan`s. pub mod rewrite; +// RadixSpline bindings. Currently not working. +// pub mod radixspline; + /// Represents an "open" IOHTAP database. Eventually, the DB should run as a /// daemon process. For now it is just an embedded DB (similar to SQLite). pub struct DB { diff --git a/sandbox/qe/src/radixspline.rs b/sandbox/qe/src/radixspline.rs new file mode 100644 index 00000000..bee9e69a --- /dev/null +++ b/sandbox/qe/src/radixspline.rs @@ -0,0 +1,50 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] + +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +use arrow::{array::{UInt64Array, Array}, record_batch::RecordBatch}; +use std::os::raw::c_void; + +pub struct RadixSpline { + rs_ptr: *mut c_void, +} + +impl RadixSpline { + + pub fn build_simple(data: [u64; 15]) -> RadixSpline { + unsafe { + let rs_ptr = build(data.as_ptr(), 15); + RadixSpline { + rs_ptr, + } + } + } + + pub fn build(record_batch: &RecordBatch, column_index: usize) -> RadixSpline { + let column = record_batch.column(column_index); + let u64_array = column.as_any().downcast_ref::().unwrap(); + + let ptr = u64_array.values().as_ptr(); + let size = column.len() as u64; + unsafe { + let rs_ptr = build(ptr, size); + RadixSpline { + rs_ptr, + } + } + } + + pub fn lookup(&self, key: u64) -> bool { + unsafe { + lookup(self.rs_ptr, key) + } + } + + pub fn clear(&self) { + unsafe { + clear(self.rs_ptr) + } + } +} diff --git a/sandbox/qe/tpch-every.sh b/sandbox/qe/tpch-every.sh index e65fb254..fbb43a6b 100755 --- a/sandbox/qe/tpch-every.sh +++ b/sandbox/qe/tpch-every.sh @@ -4,7 +4,6 @@ cd ~/brad/sandbox/qe for q in `seq 1 22`; do echo "timing query $q"; - timeout --foreground 2m ./tpch-individual.sh $q + timeout --foreground 5m ./tpch-individual.sh $q # timeout --foreground 30s ./tpch-individual.sh 2; done; - diff --git a/sandbox/qe/tpch-individual.sh b/sandbox/qe/tpch-individual.sh index 5b39f914..57d00021 100755 --- a/sandbox/qe/tpch-individual.sh +++ b/sandbox/qe/tpch-individual.sh @@ -6,21 +6,23 @@ # mv *.sql ~/brad/sandbox/qe/queries/ # cd ~ -NUMRUNS=10 +NUMRUNS=3 + +total_time=0 if [ $# -gt 0 ]; then cd ~ sleep 10 docker exec -ti postgres psql -U postgres -d tpch -o /dev/null -c '\i /data/queries/'$1'.sql' | cat - start=$(date +%s.%N) # Capture the start time with nanoseconds as a decimal - for i in `seq 1 $NUMRUNS`;do docker exec -ti postgres psql -U postgres -d tpch -o /dev/null -c '\i /data/queries/'$1'.sql' | cat; done; - end=$(date +%s.%N) # Capture the end time with nanoseconds as a decimal - elapsed=$(awk "BEGIN{print $end - $start}") # Calculate the total elapsed time in seconds with awk - average=$(awk "BEGIN{printf \"%.3f\", ($elapsed / $NUMRUNS)}") # Calculate the average and format to 3 decimal places - echo "Average time to run query "$1": "$average" seconds" + for i in `seq 1 $NUMRUNS`; + do + start=$(date +%s.%N); # Capture the start time with nanoseconds as a decimal + docker exec postgres psql -U postgres -d tpch -o /dev/null -c '\i /data/queries/'$1'.sql' | cat; + end=$(date +%s.%N); # Capture the end time with nanoseconds as a decimal + elapsed=$(awk "BEGIN{print $end - $start}"); # Calculate the total elapsed time in seconds with awk + total_time=$(awk "BEGIN{print $total_time + $elapsed}"); + echo "$1,$elapsed" + done; + average=$(awk "BEGIN{printf \"%.3f\", ($total_time / $NUMRUNS)}") # Calculate the average and format to 3 decimal places + # echo "Average time to run query "$1": "$average" seconds" fi - - - - - diff --git a/sandbox/qe/tpch.sh b/sandbox/qe/tpch-setup.sh similarity index 76% rename from sandbox/qe/tpch.sh rename to sandbox/qe/tpch-setup.sh index c4e6f0d8..c87f8186 100755 --- a/sandbox/qe/tpch.sh +++ b/sandbox/qe/tpch-setup.sh @@ -9,9 +9,14 @@ if [ $# -gt 0 ]; then # generate data for specified Scale Factor cd ~/TPC-Hv3.0.1/dbgen - ./dbgen -f -s $1 - mv *.tbl /tmp/tpcdata - cd ~ + # ./dbgen -f -s $1 + # mv *.tbl /tmp/tpcdata + # python3 ~/brad/sandbox/qe/tpch/tbl_to_csv.py /tmp/tpcdata + cd - + + # Uncomment if want to save generated info somewhere other than /tmp + # mkdir -p csvsf$1 + # cp /tmp/tpcdata/*.csv csvsf$1 docker stop postgres docker rm postgres @@ -27,4 +32,3 @@ fi # time { # for q in `seq 1 22`;do docker exec -ti postgres psql -U postgres -d tpch -o /dev/null -c '\i /data/queries/'$q'.sql' | cat; done; # } - diff --git a/sandbox/qe/tpch/column_to_binary.py b/sandbox/qe/tpch/column_to_binary.py new file mode 100644 index 00000000..6006e97a --- /dev/null +++ b/sandbox/qe/tpch/column_to_binary.py @@ -0,0 +1,21 @@ +import csv +import struct + + +def process_csv_to_binary(input_csv, output_bin, column_index): + with open(input_csv, "r", newline="") as csvfile: + reader = csv.reader(csvfile, delimiter="|") + data = [ + int(row[column_index]) for row in reader + ] # Extract the column and convert to integers + + with open(output_bin, "wb") as binfile: + # Write the number of items as a 64-bit unsigned integer + binfile.write(struct.pack("