Skip to content

WIP: Test enabling Parquet filter pushdown with parquet caching page cache reader #15506

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 85 additions & 72 deletions Cargo.lock

Large diffs are not rendered by default.

32 changes: 25 additions & 7 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,19 @@ ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
apache-avro = { version = "0.17", default-features = false }
arrow = { version = "54.2.1", features = [
arrow = { version = "54.3.1", features = [
"prettyprint",
"chrono-tz",
] }
arrow-buffer = { version = "54.1.0", default-features = false }
arrow-flight = { version = "54.2.1", features = [
arrow-buffer = { version = "54.3.1", default-features = false }
arrow-flight = { version = "54.3.1", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "54.2.0", default-features = false, features = [
arrow-ipc = { version = "54.3.1", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "54.1.0", default-features = false }
arrow-schema = { version = "54.1.0", default-features = false }
arrow-ord = { version = "54.3.1", default-features = false }
arrow-schema = { version = "54.3.1", default-features = false }
async-trait = "0.1.88"
bigdecimal = "0.4.7"
bytes = "1.10"
Expand Down Expand Up @@ -149,7 +149,7 @@ itertools = "0.14"
log = "^0.4"
object_store = { version = "0.11.0", default-features = false }
parking_lot = "0.12"
parquet = { version = "54.2.1", default-features = false, features = [
parquet = { version = "54.3.1", default-features = false, features = [
"arrow",
"async",
"object_store",
Expand Down Expand Up @@ -206,3 +206,21 @@ used_underscore_binding = "warn"
[workspace.lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
unused_qualifications = "deny"


## Temporary arrow-rs patch
## https://github.com/apache/arrow-rs/pull/6921

[patch.crates-io]
arrow = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-array = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-buffer = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-cast = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-data = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-ipc = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-schema = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-select = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-string = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-ord = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
arrow-flight = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
parquet = { git = "https://github.com/XiangpengHao/arrow-rs.git", rev = "7c10b4a93d99328928434462a044098993442a0f" }
2 changes: 1 addition & 1 deletion datafusion/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ log = { workspace = true }
object_store = { workspace = true, optional = true }
parquet = { workspace = true, optional = true, default-features = true }
paste = "1.0.15"
pyo3 = { version = "0.23.5", optional = true }
pyo3 = { version = "0.24.0", optional = true }
recursive = { workspace = true, optional = true }
sqlparser = { workspace = true }
tokio = { workspace = true }
Expand Down
4 changes: 2 additions & 2 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,12 +432,12 @@ config_namespace! {

/// (reading) If true, filter expressions are applied during the parquet decoding operation to
/// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
pub pushdown_filters: bool, default = false
pub pushdown_filters: bool, default = true

/// (reading) If true, filter expressions evaluated during the parquet decoding operation
/// will be reordered heuristically to minimize the cost of evaluation. If false,
/// the filters are applied in the same order as written in the query
pub reorder_filters: bool, default = false
pub reorder_filters: bool, default = true

/// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
/// and `Binary/BinaryLarge` with `BinaryView`.
Expand Down
28 changes: 26 additions & 2 deletions datafusion/functions-aggregate/benches/array_agg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,23 @@ use std::sync::Arc;

use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray, NullBufferBuilder,
PrimitiveArray,
};
use arrow::datatypes::{Field, Int64Type};
use arrow::util::bench_util::create_primitive_array;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::Accumulator;
use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator;

use arrow::buffer::OffsetBuffer;
use arrow::util::test_util::seedable_rng;
use rand::distributions::{Distribution, Standard};
use rand::prelude::StdRng;
use rand::Rng;
use rand::SeedableRng;

/// Build a deterministic RNG so benchmark inputs are reproducible run-to-run.
pub fn seedable_rng() -> StdRng {
    // Fixed seed: any constant works; 42 matches the upstream arrow bench helper.
    const BENCH_SEED: u64 = 42;
    StdRng::seed_from_u64(BENCH_SEED)
}

fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) {
let list_item_data_type = values.as_list::<i32>().values().data_type().clone();
Expand All @@ -46,6 +52,24 @@ fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) {
});
}

/// Create a random primitive array of `size` elements where each slot is null
/// with probability `null_density` (0.0 = no nulls, 1.0 = all nulls).
///
/// Uses the fixed-seed RNG from [`seedable_rng`] so results are deterministic.
pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
where
    T: ArrowPrimitiveType,
    Standard: Distribution<T::Native>,
{
    let mut rng = seedable_rng();

    // Draw the null coin-flip first, then the value only for non-null slots,
    // preserving the exact RNG draw order of the original implementation.
    let mut slots: Vec<Option<T::Native>> = Vec::with_capacity(size);
    for _ in 0..size {
        let is_null = rng.gen::<f32>() < null_density;
        let slot = if is_null { None } else { Some(rng.gen()) };
        slots.push(slot);
    }
    slots.into_iter().collect()
}

/// Create List array with the given item data type, null density, null locations and zero length lists density
/// Creates an random (but fixed-seeded) array of a given size and null density
pub fn create_list_array<T>(
Expand Down
10 changes: 8 additions & 2 deletions datafusion/functions/benches/chr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,21 @@

extern crate criterion;

use arrow::{array::PrimitiveArray, datatypes::Int64Type, util::test_util::seedable_rng};
use arrow::{array::PrimitiveArray, datatypes::Int64Type};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::string::chr;
use rand::Rng;
use rand::{Rng, SeedableRng};

use arrow::datatypes::DataType;
use rand::rngs::StdRng;
use std::sync::Arc;

/// Build a deterministic RNG so benchmark inputs are reproducible run-to-run.
pub fn seedable_rng() -> StdRng {
    // Fixed seed: any constant works; 42 matches the upstream arrow bench helper.
    const BENCH_SEED: u64 = 42;
    StdRng::seed_from_u64(BENCH_SEED)
}

fn criterion_benchmark(c: &mut Criterion) {
let cot_fn = chr();
let size = 1024;
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/dates.slt
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ query error input contains invalid characters
SELECT to_date('2020-09-08 12/00/00+00:00', '%c', '%+')

# to_date with broken formatting
query error bad or unsupported format string
query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input
SELECT to_date('2020-09-08 12/00/00+00:00', '%q')

statement ok
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/expr/date_part.slt
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ SELECT extract(day from arrow_cast('14400 minutes', 'Interval(DayTime)'))
query I
SELECT extract(minute from arrow_cast('14400 minutes', 'Interval(DayTime)'))
----
14400
0

query I
SELECT extract(second from arrow_cast('5.1 seconds', 'Interval(DayTime)'))
Expand All @@ -894,7 +894,7 @@ SELECT extract(second from arrow_cast('5.1 seconds', 'Interval(DayTime)'))
query I
SELECT extract(second from arrow_cast('14400 minutes', 'Interval(DayTime)'))
----
864000
0

query I
SELECT extract(second from arrow_cast('2 months', 'Interval(MonthDayNano)'))
Expand Down Expand Up @@ -954,7 +954,7 @@ from t
order by id;
----
0 0 5
1 0 15
1 0 3
2 0 0
3 2 0
4 0 8
Expand Down
10 changes: 5 additions & 5 deletions datafusion/sqllogictest/test_files/timestamps.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2241,23 +2241,23 @@ query error input contains invalid characters
SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+')

# to_timestamp with broken formatting
query error bad or unsupported format string
query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input
SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q')

# to_timestamp_nanos with broken formatting
query error bad or unsupported format string
query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input
SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q')

# to_timestamp_millis with broken formatting
query error bad or unsupported format string
query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input
SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q')

# to_timestamp_micros with broken formatting
query error bad or unsupported format string
query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input
SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q')

# to_timestamp_seconds with broken formatting
query error bad or unsupported format string
query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input
SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q')

# Create string timestamp table with different formats
Expand Down
Loading