diff --git a/Cargo.lock b/Cargo.lock index 90b2ba3..89614db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,7 +19,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -245,6 +245,10 @@ name = "arrow-schema" version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +dependencies = [ + "serde", + "serde_json", +] [[package]] name = "arrow-select" @@ -325,6 +329,19 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bitflags" version = "2.9.4" @@ -395,7 +412,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0acb89ccf798a28683f00089d0630dfaceec087234eae0d308c05ddeaa941b40" dependencies = [ "ambient-authority", - "rand", + "rand 0.8.5", ] [[package]] @@ -426,9 +443,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.40" +version = "1.2.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1d05d92f4b1fd76aad469d46cdd858ca761576082cd37df81416691e50199fb" +checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" dependencies = [ "find-msvc-tools", "shlex", @@ -436,9 +453,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chrono" @@ -665,6 +682,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crunchy" version = "0.2.4" @@ -692,6 +715,113 @@ dependencies = [ "memchr", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "arrow-ipc", + "arrow-schema", + "async-trait", + "bytes", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "datafusion-catalog" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "log", + "object_store", + "tokio", +] + [[package]] name = "datafusion-common" version = "49.0.1" @@ -707,17 +837,121 @@ dependencies = [ "indexmap", "libc", "log", + "object_store", "paste", "sqlparser", "tokio", "web-time", ] +[[package]] +name = "datafusion-common-runtime" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "glob", + "itertools", + "log", + "object_store", + "rand 0.9.2", + "tokio", + "url", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + [[package]] name = "datafusion-doc" version = "49.0.1" source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +[[package]] +name = "datafusion-execution" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + [[package]] name = "datafusion-expr" version = "49.0.1" @@ -750,6 +984,50 @@ dependencies = [ "paste", ] +[[package]] +name = "datafusion-functions" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "arrow-buffer", + "base64", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools", + "log", + "rand 0.9.2", + "regex", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + [[package]] name = "datafusion-functions-aggregate-common" version = "49.0.1" @@ -762,6 +1040,38 @@ dependencies = [ "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-functions-table" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + [[package]] name = "datafusion-functions-window-common" version = "49.0.1" @@ -771,6 +1081,55 @@ dependencies = [ "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-macros" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "datafusion-expr", + "quote", + "syn", +] + +[[package]] +name = "datafusion-optimizer" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap", + "itertools", + "log", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools", + "log", + "paste", + "petgraph", +] + [[package]] name = "datafusion-physical-expr-common" version = "49.0.1" @@ -784,6 +1143,108 @@ dependencies = [ "itertools", ] +[[package]] +name = "datafusion-physical-optimizer" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools", + "log", +] + +[[package]] +name = "datafusion-physical-plan" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-pruning" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools", + "log", +] + +[[package]] +name = "datafusion-session" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "49.0.1" +source = "git+https://github.com/influxdata/arrow-datafusion.git?rev=8347a71f62d4fef8d37548f22b93877170039357#8347a71f62d4fef8d37548f22b93877170039357" +dependencies = [ + "arrow", + "bigdecimal", + "datafusion-common", + "datafusion-expr", + "indexmap", + "log", + "regex", + "sqlparser", +] + [[package]] name = "datafusion-udf-wasm-arrow2bytes" version = "0.1.0" @@ -816,11 +1277,14 @@ name = "datafusion-udf-wasm-host" version = "0.1.0" dependencies = [ "arrow", + "datafusion", "datafusion-common", "datafusion-expr", + "datafusion-sql", "datafusion-udf-wasm-arrow2bytes", "datafusion-udf-wasm-bundle", "insta", + "sqlparser", "tar", "tempfile", "tokio", @@ -960,9 +1424,15 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.3" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + +[[package]] +name = "fixedbitset" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0399f9d26e5191ce32c498bebd31e7a3ceabc2745f0ac54af3f335126c3f24b3" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" @@ -1103,19 +1573,19 @@ checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.7+wasi-0.2.4", + "wasip2", ] [[package]] @@ -1129,6 +1599,12 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "h2" version = "0.4.12" @@ -1150,13 +1626,14 @@ dependencies = [ [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "num-traits", + "zerocopy", ] [[package]] @@ -1197,6 +1674,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "http" version = "1.3.1" @@ -1243,6 +1726,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + [[package]] name = "hyper" version = "1.7.0" @@ -1565,9 +2054,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.176" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libm" @@ -1604,6 +2093,15 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.28" @@ -1665,7 +2163,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" dependencies = [ "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "windows-sys 0.59.0", ] @@ -1765,12 +2263,59 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http", + "humantime", + "itertools", + "parking_lot", + "percent-encoding", + "thiserror", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "paste" version = "1.0.15" @@ -1783,6 +2328,18 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap", + "serde", +] + [[package]] name = "phf" version = "0.12.1" @@ -1974,8 +2531,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", ] [[package]] @@ -1985,7 +2552,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -1997,6 +2574,15 @@ dependencies = [ "getrandom 0.2.16", ] +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2022,9 +2608,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.3" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -2034,9 +2620,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -2045,9 +2631,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "ring" @@ -2169,6 +2755,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.27" @@ -2263,12 +2855,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -2294,9 +2886,9 @@ dependencies = [ [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "subtle" @@ -2365,6 +2957,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", + "getrandom 0.3.4", "once_cell", "rustix 1.1.2", "windows-sys 0.61.2", @@ -2565,6 +3158,17 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "uuid" +version = "1.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +dependencies = [ + "getrandom 0.3.4", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "version_check" version = "0.9.5" @@ -2596,15 +3200,6 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" -[[package]] -name = "wasi" -version = "0.14.7+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" -dependencies = [ - "wasip2", -] - [[package]] name = "wasip2" version = "1.0.1+wasi-0.2.4" @@ -2641,6 +3236,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.104" @@ -3013,6 +3621,16 @@ dependencies = [ "wasmtime", ] +[[package]] +name = "web-sys" +version = "0.3.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "web-time" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index ed3372b..95f8392 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,12 +17,15 @@ resolver = "3" [workspace.dependencies] arrow = { version = "55.2.0", default-features = false, features = ["ipc"] } chrono = { version = "0.4.42", default-features = false } +datafusion = { version = "49.0.1", default-features = false } datafusion-common = { version = "49.0.1", default-features = false } datafusion-expr = { version = "49.0.1", default-features = false } +datafusion-sql = { version = "49.0.1", default-features = false } datafusion-udf-wasm-arrow2bytes = { path = "arrow2bytes", version = "0.1.0" } datafusion-udf-wasm-bundle = { path = "guests/bundle", version = "0.1.0" } datafusion-udf-wasm-guest = { path = "guests/rust", version = "0.1.0" } datafusion-udf-wasm-python = { path = "guests/python", version = "0.1.0" } +sqlparser = { version = "0.55.0", default-features = false, features = ["std", "visitor"] } tokio = { version = "1.48.0", default-features = false } pyo3 = { version = "0.27.1", default-features = false, features = ["macros"] } tar = { version = "0.4.44", default-features = false } @@ -61,8 +64,10 @@ private_intra_doc_links = "deny" [patch.crates-io] # use same DataFusion fork as InfluxDB # See https://github.com/influxdata/arrow-datafusion/pull/72 +datafusion = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "8347a71f62d4fef8d37548f22b93877170039357" } datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "8347a71f62d4fef8d37548f22b93877170039357" } datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "8347a71f62d4fef8d37548f22b93877170039357" } +datafusion-sql = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "8347a71f62d4fef8d37548f22b93877170039357" } # faster tests [profile.dev.package] diff --git a/host/Cargo.toml b/host/Cargo.toml index a873730..fc5b09c 100644 --- a/host/Cargo.toml +++ b/host/Cargo.toml @@ -10,9 +10,12 @@ workspace = true [dependencies] arrow.workspace = true +datafusion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true datafusion-udf-wasm-arrow2bytes.workspace = true +datafusion-sql.workspace = true +sqlparser.workspace = true tar.workspace = true tempfile.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread", "sync"] } diff --git a/host/src/lib.rs b/host/src/lib.rs index b49e77d..9f63aed 100644 --- a/host/src/lib.rs +++ b/host/src/lib.rs @@ -2,6 +2,7 @@ //! //! //! [DataFusion]: https://datafusion.apache.org/ +//! use std::{any::Any, io::Cursor, ops::DerefMut, sync::Arc}; use arrow::datatypes::DataType; @@ -36,6 +37,7 @@ mod bindings; mod conversion; mod error; mod tokio_helpers; +pub mod udf_query; /// State of the WASM payload. struct WasmStateImpl { diff --git a/host/src/udf_query.rs b/host/src/udf_query.rs new file mode 100644 index 0000000..6a2cfc7 --- /dev/null +++ b/host/src/udf_query.rs @@ -0,0 +1,210 @@ +//! Embedded SQL approach for executing Python UDFs within SQL queries. + +use std::collections::HashMap; + +use datafusion::execution::TaskContext; +use datafusion_common::{DataFusionError, Result as DataFusionResult}; +use datafusion_sql::parser::{DFParserBuilder, Statement}; +use sqlparser::ast::{CreateFunctionBody, Expr, Statement as SqlStatement, Value}; +use sqlparser::dialect::dialect_from_str; + +use crate::{WasmComponentPrecompiled, WasmScalarUdf}; + +/// Supported UDF languages +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +pub enum UDFLanguage { + /// The Python programming language + Python, + /// An unsupported UDF language + Unsupported, +} + +impl From for UDFLanguage { + fn from(value: String) -> Self { + match value.to_lowercase().as_str() { + "python" => Self::Python, + _ => Self::Unsupported, + } + } +} + +/// A [ParsedQuery] contains the extracted UDFs and SQL query string +#[derive(Debug)] +pub struct ParsedQuery { + /// Extracted UDFs from the query + pub udfs: Vec, + /// SQL query string with UDF definitions removed + pub sql: String, +} + +/// Handles the registration and invocation of UDF queries in DataFusion with a +/// pre-compiled WASM component. +pub struct UdfQueryParser<'a> { + /// Pre-compiled WASM component. + /// Necessary to create UDFs. + components: HashMap, +} + +impl std::fmt::Debug for UdfQueryParser<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("UdfQueryParser") + .field("session_ctx", &"SessionContext { ... }") + .field("components", &self.components) + .finish() + } +} + +impl<'a> UdfQueryParser<'a> { + /// Registers the UDF query in DataFusion. + pub async fn new( + components: HashMap, + ) -> DataFusionResult { + Ok(Self { components }) + } + + /// Parses a SQL query that defines & uses Python UDFs into a [ParsedQuery]. + pub async fn parse( + &self, + udf_query: &str, + task_ctx: &TaskContext, + ) -> DataFusionResult { + let (code, sql, lang) = self.parse_inner(udf_query, task_ctx)?; + + let component = self.components.get(&lang).ok_or_else(|| { + DataFusionError::Plan(format!( + "no WASM component registered for language: {:?}", + lang + )) + })?; + + let udfs = WasmScalarUdf::new(component, code).await?; + Ok(ParsedQuery { udfs, sql }) + } + + /// Parse the combined query to extract the chosen UDF language, UDF + /// definitions, and SQL statements. + fn parse_inner( + &self, + query: &str, + task_ctx: &TaskContext, + ) -> DataFusionResult<(String, String, UDFLanguage)> { + let options = task_ctx.session_config().options(); + + let dialect = dialect_from_str(options.sql_parser.dialect.clone()).expect("valid dialect"); + let recursion_limit = options.sql_parser.recursion_limit; + + let statements = DFParserBuilder::new(query) + .with_dialect(dialect.as_ref()) + .with_recursion_limit(recursion_limit) + .build()? + .parse_statements()?; + + let mut udf_code = String::new(); + let mut sql = String::new(); + + // Python is the only supported UDF language at this time. + let udf_language = UDFLanguage::Python; + for s in statements { + let Statement::Statement(stmt) = s else { + continue; + }; + + match parse_udf(*stmt)? { + Parsed::Udf { code, language } => { + udf_code.push_str(&code); + udf_code.push('\n'); + // FIXME: handle multiple languages in a single query + if language != udf_language { + return Err(DataFusionError::Plan( + "only Python UDFs are supported at this time".to_string(), + )); + } + } + Parsed::Other(statement) => { + sql.push_str(&statement); + sql.push_str(";\n"); + } + } + } + + if udf_code.is_empty() { + return Err(DataFusionError::Plan( + "UDF not defined in query".to_string(), + )); + } + + if sql.is_empty() { + return Err(DataFusionError::Plan("no SQL query found".to_string())); + } + + Ok((udf_code, sql, udf_language)) + } +} + +/// Represents a parsed SQL statement +enum Parsed { + /// A UDF definition + Udf { + /// UDF code + code: String, + /// UDF language + language: UDFLanguage, + }, + /// Any other SQL statement + Other(String), +} + +/// Parse a single SQL statement to extract a UDF +fn parse_udf(stmt: SqlStatement) -> DataFusionResult { + match stmt { + SqlStatement::CreateFunction(cf) => { + let function_body = cf.function_body.as_ref(); + + let language = if let Some(lang) = cf.language.as_ref() { + UDFLanguage::from(lang.to_string()) + } else { + return Err(DataFusionError::Plan( + "function language is required for UDFs".to_string(), + )); + }; + + let code = match function_body { + Some(body) => extract_function_body(body), + None => Err(DataFusionError::Plan( + "function body is required for UDFs".to_string(), + )), + }?; + + Ok(Parsed::Udf { + code: code.to_string(), + language, + }) + } + _ => Ok(Parsed::Other(stmt.to_string())), + } +} + +/// Extracts the code from the function body, adding it to `code`. +fn extract_function_body(body: &CreateFunctionBody) -> DataFusionResult<&str> { + match body { + CreateFunctionBody::AsAfterOptions(e) | CreateFunctionBody::AsBeforeOptions(e) => { + expression_into_str(e) + } + CreateFunctionBody::Return(_) => Err(DataFusionError::Plan( + "`RETURN` function body not supported for Python UDFs".to_string(), + )), + } +} + +/// Attempt to convert an `Expr` into a `str` +fn expression_into_str(expr: &Expr) -> DataFusionResult<&str> { + match expr { + Expr::Value(v) => match &v.value { + Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => Ok(s), + _ => Err(DataFusionError::Plan("expected string value".to_string())), + }, + _ => Err(DataFusionError::Plan( + "expected value expression".to_string(), + )), + } +} diff --git a/host/tests/integration_tests/mod.rs b/host/tests/integration_tests/mod.rs index f0b8705..9d0eec3 100644 --- a/host/tests/integration_tests/mod.rs +++ b/host/tests/integration_tests/mod.rs @@ -1,3 +1,4 @@ mod python; mod rust; mod test_utils; +mod udf_query; diff --git a/host/tests/integration_tests/python/mod.rs b/host/tests/integration_tests/python/mod.rs index 328379d..0cb865f 100644 --- a/host/tests/integration_tests/python/mod.rs +++ b/host/tests/integration_tests/python/mod.rs @@ -3,5 +3,5 @@ mod examples; mod inspection; mod runtime; mod state; -mod test_utils; +pub(crate) mod test_utils; mod types; diff --git a/host/tests/integration_tests/python/test_utils.rs b/host/tests/integration_tests/python/test_utils.rs index a38c79a..94f6741 100644 --- a/host/tests/integration_tests/python/test_utils.rs +++ b/host/tests/integration_tests/python/test_utils.rs @@ -4,7 +4,7 @@ use tokio::sync::OnceCell; static COMPONENT: OnceCell = OnceCell::const_new(); -async fn python_component() -> &'static WasmComponentPrecompiled { +pub(crate) async fn python_component() -> &'static WasmComponentPrecompiled { COMPONENT .get_or_init(async || { WasmComponentPrecompiled::new(datafusion_udf_wasm_bundle::BIN_PYTHON.into()) diff --git a/host/tests/integration_tests/udf_query.rs b/host/tests/integration_tests/udf_query.rs new file mode 100644 index 0000000..09d06af --- /dev/null +++ b/host/tests/integration_tests/udf_query.rs @@ -0,0 +1,170 @@ +use std::collections::HashMap; + +use datafusion::{ + assert_batches_eq, + prelude::{DataFrame, SessionContext}, +}; +use datafusion_common::Result as DataFusionResult; +use datafusion_udf_wasm_host::udf_query::{ParsedQuery, UDFLanguage, UdfQueryParser}; + +use crate::integration_tests::python::test_utils::python_component; + +/// A helper struct for invoking UDF queries and validating their results. +struct UdfQueryInvocator; + +impl UdfQueryInvocator { + async fn invoke( + ctx: &SessionContext, + parsed_query: ParsedQuery, + ) -> DataFusionResult { + for udf in parsed_query.udfs { + let scalar_udf = datafusion_expr::ScalarUDF::new_from_impl(udf); + ctx.register_udf(scalar_udf); + } + + ctx.sql(&parsed_query.sql).await + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_basic() { + let query = r#" +CREATE FUNCTION add_one() +LANGUAGE python +AS ' +def add_one(x: int) -> int: + return x + 1 +'; + +SELECT add_one(1); +"#; + + let ctx = SessionContext::new(); + let component = python_component().await; + + let parser = UdfQueryParser::new(HashMap::from_iter([(UDFLanguage::Python, component)])) + .await + .unwrap(); + let parsed_query = parser.parse(query, ctx.task_ctx().as_ref()).await.unwrap(); + + let df = UdfQueryInvocator::invoke(&ctx, parsed_query).await.unwrap(); + let batch = df.collect().await.unwrap(); + + assert_batches_eq!( + [ + "+-------------------+", + "| add_one(Int64(1)) |", + "+-------------------+", + "| 2 |", + "+-------------------+", + ], + &batch + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_multiple_functions() { + let query = r#" +CREATE FUNCTION add_one() +LANGUAGE python +AS ' +def add_one(x: int) -> int: + return x + 1 +'; + +CREATE FUNCTION multiply_two() +LANGUAGE python +AS ' +def multiply_two(x: int) -> int: + return x * 2 +'; + +SELECT add_one(1), multiply_two(3); +"#; + + let ctx = SessionContext::new(); + let component = python_component().await; + + let parser = UdfQueryParser::new(HashMap::from_iter([(UDFLanguage::Python, component)])) + .await + .unwrap(); + let parsed_query = parser.parse(query, ctx.task_ctx().as_ref()).await.unwrap(); + + let df = UdfQueryInvocator::invoke(&ctx, parsed_query).await.unwrap(); + let batch = df.collect().await.unwrap(); + + assert_batches_eq!( + [ + "+-------------------+------------------------+", + "| add_one(Int64(1)) | multiply_two(Int64(3)) |", + "+-------------------+------------------------+", + "| 2 | 6 |", + "+-------------------+------------------------+", + ], + &batch + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_multiple_functions_single_statement() { + let query = r#" +CREATE FUNCTION add_one() +LANGUAGE python +AS ' +def add_one(x: int) -> int: + return x + 1 + +def multiply_two(x: int) -> int: + return x * 2 +'; + +SELECT add_one(1), multiply_two(3); +"#; + + let ctx = SessionContext::new(); + let component = python_component().await; + + let parser = UdfQueryParser::new(HashMap::from_iter([(UDFLanguage::Python, component)])) + .await + .unwrap(); + let parsed_query = parser.parse(query, ctx.task_ctx().as_ref()).await.unwrap(); + + let df = UdfQueryInvocator::invoke(&ctx, parsed_query).await.unwrap(); + let batch = df.collect().await.unwrap(); + + assert_batches_eq!( + [ + "+-------------------+------------------------+", + "| add_one(Int64(1)) | multiply_two(Int64(3)) |", + "+-------------------+------------------------+", + "| 2 | 6 |", + "+-------------------+------------------------+", + ], + &batch + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_empty_string() { + let query = r#" +CREATE FUNCTION add_one() +LANGUAGE python +AS ''; + +SELECT add_one(1) +"#; + + let ctx = SessionContext::new(); + let component = python_component().await; + + let parser = UdfQueryParser::new(HashMap::from_iter([(UDFLanguage::Python, component)])) + .await + .unwrap(); + let parsed_query = parser.parse(query, ctx.task_ctx().as_ref()).await.unwrap(); + + let r = UdfQueryInvocator::invoke(&ctx, parsed_query).await; + assert!(r.is_err()); + + let err = r.err().unwrap(); + assert!(err.message().contains("Invalid function 'add_one'")); +}