apache · houqp · Jan 29, 2022 · Jan 11, 2022 · Jan 11, 2022 · Jan 11, 2022
diff --git a/.env b/.env
@@ -47,7 +47,7 @@ FEDORA=33
 PYTHON=3.6
 LLVM=11
 CLANG_TOOLS=8
-RUST=nightly-2021-10-23
+RUST=nightly-2022-01-17
 GO=1.15
 NODE=14
 MAVEN=3.5.4

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -3,9 +3,7 @@ updates:
   - package-ecosystem: cargo
     directory: "/"
     schedule:
-      interval: weekly
-      day: sunday
-      time: "7:00"
+      interval: daily
     open-pull-requests-limit: 10
     target-branch: master
-    labels: [auto-dependencies]
+    labels: [auto-dependencies]
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -293,7 +293,7 @@ jobs:
     strategy:
       matrix:
         arch: [amd64]
-        rust: [nightly-2021-10-23]
+        rust: [nightly-2022-01-17]
     steps:
       - uses: actions/checkout@v2
         with:

diff --git a/Cargo.toml b/Cargo.toml
@@ -28,13 +28,11 @@ members = [
     "ballista-examples",
 ]
 
-exclude = ["python"]
-
 [profile.release]
 lto = true
 codegen-units = 1
 
 [patch.crates-io]
-arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "ef7937dfe56033c2cc491482c67587b52cd91554" }
+arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", branch = "main" }
 #arrow2 = { git = "https://github.com/blaze-init/arrow2.git", branch = "shuffle_ipc" }
 #parquet2 = { git = "https://github.com/blaze-init/parquet2.git", branch = "meta_new" }
diff --git a/README.md b/README.md
@@ -49,19 +49,25 @@ the convenience of an SQL interface or a DataFrame API.
 
 ## Known Uses
 
+Projects that adapt to or serve as plugins to DataFusion:
+
+- [datafusion-python](https://github.com/datafusion-contrib/datafusion-python)
+- [datafusion-java](https://github.com/datafusion-contrib/datafusion-java)
+- [datafusion-ruby](https://github.com/j-a-m-l/datafusion-ruby)
+- [datafusion-objectstore-s3](https://github.com/datafusion-contrib/datafusion-objectstore-s3)
+- [datafusion-hdfs-native](https://github.com/datafusion-contrib/datafusion-hdfs-native)
+
 Here are some of the projects known to use DataFusion:
 
 - [Ballista](ballista) Distributed Compute Platform
 - [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust)
 - [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust)
-- [datafusion-python](https://pypi.org/project/datafusion)
-- [datafusion-java](https://github.com/datafusion-contrib/datafusion-java)
-- [datafusion-ruby](https://github.com/j-a-m-l/datafusion-ruby)
 - [delta-rs](https://github.com/delta-io/delta-rs)
 - [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database
 - [ROAPI](https://github.com/roapi/roapi)
 - [Tensorbase](https://github.com/tensorbase/tensorbase)
 - [Squirtle](https://github.com/DSLAM-UMD/Squirtle)
+- [VegaFusion](https://vegafusion.io/) Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar
 
 (if you know of another project, please submit a PR to add a link!)
 
@@ -134,6 +140,60 @@ datafusion = "6.0.0"
 
 DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](https://arrow.apache.org/datafusion/cli/index.html) for more information.
 
+# Roadmap
+
+A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the project's contributors. This roadmap is not binding.
+
+## 2022 Q1
+
+### DataFusion Core
+
+- Publish official Arrow2 branch
+- Implementation of memory manager (i.e. to enable spilling to disk as needed)
+
+### Benchmarking
+
+- Inclusion in Db-Benchmark with all queries covered
+- All TPCH queries covered
+
+### Performance Improvements
+
+- Predicate evaluation
+- Improve multi-column comparisons (that can't be vectorized at the moment)
+- Null constant support
+
+### New Features
+
+- Read JSON as table
+- Simplify DDL with the DataFusion CLI
+- Add Decimal128 data type and the attendant features such as Arrow Kernel and UDF support
+- Add new experimental e-graph based optimizer
+
+### Ballista
+
+- Begin work on design documents and plan / priorities for development
+
+### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib))
+
+- Stable S3 support
+- Begin design discussions and prototyping of a stream provider
+
+## Beyond 2022 Q1
+
+There is no clear timeline for the below, but community members have expressed interest in working on these topics.
+
+### DataFusion Core
+
+- Custom SQL support
+- Split DataFusion into multiple crates
+- Push based query execution and code generation
+
+### Ballista
+
+- Evolve architecture so that it can be deployed in a multi-tenant cloud native environment
+- Ensure Ballista is scalable, elastic, and stable for production usage
+- Develop distributed ML capabilities
+
 # Status
 
 ## General
@@ -266,7 +326,7 @@ This library currently supports many SQL constructs, including
 - `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)`
 - Many mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`.
 - `WHERE` to filter
-- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, `VAR`, `STDDEV` (sample and population)
+- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, `CORR`, `VAR`, `COVAR`, `STDDEV` (sample and population)
 - `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST`
 
 ## Supported Functions

diff --git a/ballista-examples/Cargo.toml b/ballista-examples/Cargo.toml
@@ -26,7 +26,7 @@ license = "Apache-2.0"
 keywords = [ "arrow", "distributed", "query", "sql" ]
 edition = "2021"
 publish = false
-rust-version = "1.57"
+rust-version = "1.58"
 
 [dependencies]
 datafusion = { path = "../datafusion" }

diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml
@@ -24,7 +24,7 @@ homepage = "https://github.com/apache/arrow-datafusion"
 repository = "https://github.com/apache/arrow-datafusion"
 authors = ["Apache Arrow <[email protected]>"]
 edition = "2021"
-rust-version = "1.57"
+rust-version = "1.58"
 
 [dependencies]
 ballista-core = { path = "../core", version = "0.6.0" }
@@ -33,6 +33,8 @@ ballista-scheduler = { path = "../scheduler", version = "0.6.0", optional = true
 futures = "0.3"
 log = "0.4"
 tokio = "1.0"
+tempfile = "3"
+sqlparser = "0.13"
 
 datafusion = { path = "../../../datafusion", version = "6.0.0" }
 

diff --git a/ballista/rust/client/src/columnar_batch.rs b/ballista/rust/client/src/columnar_batch.rs
@@ -23,8 +23,9 @@ use datafusion::arrow::{
     array::ArrayRef,
     compute::aggregate::estimated_bytes_size,
     datatypes::{DataType, Schema},
-    record_batch::RecordBatch,
 };
+use datafusion::field_util::{FieldExt, SchemaExt};
+use datafusion::record_batch::RecordBatch;
 use datafusion::scalar::ScalarValue;
 
 pub type MaybeColumnarBatch = Result<Option<ColumnarBatch>>;
@@ -44,7 +45,7 @@ impl ColumnarBatch {
             .enumerate()
             .map(|(i, array)| {
                 (
-                    batch.schema().field(i).name().clone(),
+                    batch.schema().field(i).name().to_string(),
                     ColumnarValue::Columnar(array.clone()),
                 )
             })
@@ -61,7 +62,7 @@ impl ColumnarBatch {
             .fields()
             .iter()
             .enumerate()
-            .map(|(i, f)| (f.name().clone(), values[i].clone()))
+            .map(|(i, f)| (f.name().to_string(), values[i].clone()))
             .collect();
 
         Self {