Skip to content

Commit 31acf45

Browse files
blaginincomphead
andauthored
Add configurable normalization for configuration options and preserve case for S3 paths (#13576)
* Do not normalize values * Fix tests & update docs * Prettier * Lowercase config params * Unify transform and parse * Fix tests * Rename `default_transform` and relax boundaries * Make `compression` case-insensitive * Comment to new line * Deprecate and ignore `enable_options_value_normalization` * Update datafusion/common/src/config.rs * fix typo --------- Co-authored-by: Oleks V <[email protected]>
1 parent d7aeb1a commit 31acf45

File tree

13 files changed

+104
-141
lines changed

13 files changed

+104
-141
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-cli/src/object_storage.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -472,12 +472,13 @@ mod tests {
472472

473473
#[tokio::test]
474474
async fn s3_object_store_builder() -> Result<()> {
475-
let access_key_id = "fake_access_key_id";
476-
let secret_access_key = "fake_secret_access_key";
475+
// "fake" is uppercase to ensure the values are not lowercased when parsed
476+
let access_key_id = "FAKE_access_key_id";
477+
let secret_access_key = "FAKE_secret_access_key";
477478
let region = "fake_us-east-2";
478479
let endpoint = "endpoint33";
479-
let session_token = "fake_session_token";
480-
let location = "s3://bucket/path/file.parquet";
480+
let session_token = "FAKE_session_token";
481+
let location = "s3://bucket/path/FAKE/file.parquet";
481482

482483
let table_url = ListingTableUrl::parse(location)?;
483484
let scheme = table_url.scheme();

datafusion/common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ half = { workspace = true }
5757
hashbrown = { workspace = true }
5858
indexmap = { workspace = true }
5959
libc = "0.2.140"
60+
log = { workspace = true }
6061
object_store = { workspace = true, optional = true }
6162
parquet = { workspace = true, optional = true, default-features = true }
6263
paste = "1.0.15"

datafusion/common/src/config.rs

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
2020
use std::any::Any;
2121
use std::collections::{BTreeMap, HashMap};
22+
use std::error::Error;
2223
use std::fmt::{self, Display};
2324
use std::str::FromStr;
2425

@@ -29,7 +30,9 @@ use crate::{DataFusionError, Result};
2930

3031
/// A macro that wraps a configuration struct and automatically derives
3132
/// [`Default`] and [`ConfigField`] for it, allowing it to be used
32-
/// in the [`ConfigOptions`] configuration tree
33+
/// in the [`ConfigOptions`] configuration tree.
34+
///
35+
/// `transform` is used to normalize values before parsing.
3336
///
3437
/// For example,
3538
///
@@ -38,7 +41,7 @@ use crate::{DataFusionError, Result};
3841
/// /// Amazing config
3942
/// pub struct MyConfig {
4043
/// /// Field 1 doc
41-
/// field1: String, default = "".to_string()
44+
/// field1: String, transform = str::to_lowercase, default = "".to_string()
4245
///
4346
/// /// Field 2 doc
4447
/// field2: usize, default = 232
@@ -67,9 +70,12 @@ use crate::{DataFusionError, Result};
6770
/// fn set(&mut self, key: &str, value: &str) -> Result<()> {
6871
/// let (key, rem) = key.split_once('.').unwrap_or((key, ""));
6972
/// match key {
70-
/// "field1" => self.field1.set(rem, value),
71-
/// "field2" => self.field2.set(rem, value),
72-
/// "field3" => self.field3.set(rem, value),
73+
/// "field1" => {
74+
/// let value = str::to_lowercase(value);
75+
/// self.field1.set(rem, value.as_ref())
76+
/// },
77+
/// "field2" => self.field2.set(rem, value.as_ref()),
78+
/// "field3" => self.field3.set(rem, value.as_ref()),
7379
/// _ => _internal_err!(
7480
/// "Config value \"{}\" not found on MyConfig",
7581
/// key
@@ -102,15 +108,14 @@ use crate::{DataFusionError, Result};
102108
/// ```
103109
///
104110
/// NB: Misplaced commas may result in nonsensical errors
105-
///
106111
#[macro_export]
107112
macro_rules! config_namespace {
108113
(
109114
$(#[doc = $struct_d:tt])*
110115
$vis:vis struct $struct_name:ident {
111116
$(
112117
$(#[doc = $d:tt])*
113-
$field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
118+
$field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr
114119
)*$(,)*
115120
}
116121
) => {
@@ -127,9 +132,14 @@ macro_rules! config_namespace {
127132
impl ConfigField for $struct_name {
128133
fn set(&mut self, key: &str, value: &str) -> Result<()> {
129134
let (key, rem) = key.split_once('.').unwrap_or((key, ""));
135+
130136
match key {
131137
$(
132-
stringify!($field_name) => self.$field_name.set(rem, value),
138+
stringify!($field_name) => {
139+
$(let value = $transform(value);)?
140+
$(log::warn!($warn);)?
141+
self.$field_name.set(rem, value.as_ref())
142+
},
133143
)*
134144
_ => return _config_err!(
135145
"Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -211,12 +221,15 @@ config_namespace! {
211221
/// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
212222
pub enable_ident_normalization: bool, default = true
213223

214-
/// When set to true, SQL parser will normalize options value (convert value to lowercase)
215-
pub enable_options_value_normalization: bool, default = true
224+
/// When set to true, SQL parser will normalize options value (convert value to lowercase).
225+
/// Note that this option is ignored and will be removed in the future. All case-insensitive values
226+
/// are normalized automatically.
227+
pub enable_options_value_normalization: bool, warn = "`enable_options_value_normalization` is deprecated and ignored", default = false
216228

217229
/// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
218230
/// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
219231
pub dialect: String, default = "generic".to_string()
232+
// no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive
220233

221234
/// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
222235
/// ignore the length. If false, error if a `VARCHAR` with a length is
@@ -431,7 +444,7 @@ config_namespace! {
431444
///
432445
/// Note that this default setting is not the same as
433446
/// the default parquet writer setting.
434-
pub compression: Option<String>, default = Some("zstd(3)".into())
447+
pub compression: Option<String>, transform = str::to_lowercase, default = Some("zstd(3)".into())
435448

436449
/// (writing) Sets if dictionary encoding is enabled. If NULL, uses
437450
/// default parquet writer setting
@@ -444,7 +457,7 @@ config_namespace! {
444457
/// Valid values are: "none", "chunk", and "page"
445458
/// These values are not case sensitive. If NULL, uses
446459
/// default parquet writer setting
447-
pub statistics_enabled: Option<String>, default = Some("page".into())
460+
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())
448461

449462
/// (writing) Sets max statistics size for any column. If NULL, uses
450463
/// default parquet writer setting
@@ -470,7 +483,7 @@ config_namespace! {
470483
/// delta_byte_array, rle_dictionary, and byte_stream_split.
471484
/// These values are not case sensitive. If NULL, uses
472485
/// default parquet writer setting
473-
pub encoding: Option<String>, default = None
486+
pub encoding: Option<String>, transform = str::to_lowercase, default = None
474487

475488
/// (writing) Use any available bloom filters when reading parquet files
476489
pub bloom_filter_on_read: bool, default = true
@@ -971,29 +984,45 @@ impl<F: ConfigField + Default> ConfigField for Option<F> {
971984
}
972985
}
973986

987+
fn default_transform<T>(input: &str) -> Result<T>
988+
where
989+
T: FromStr,
990+
<T as FromStr>::Err: Sync + Send + Error + 'static,
991+
{
992+
input.parse().map_err(|e| {
993+
DataFusionError::Context(
994+
format!(
995+
"Error parsing '{}' as {}",
996+
input,
997+
std::any::type_name::<T>()
998+
),
999+
Box::new(DataFusionError::External(Box::new(e))),
1000+
)
1001+
})
1002+
}
1003+
9741004
#[macro_export]
9751005
macro_rules! config_field {
9761006
($t:ty) => {
1007+
config_field!($t, value => default_transform(value)?);
1008+
};
1009+
1010+
($t:ty, $arg:ident => $transform:expr) => {
9771011
impl ConfigField for $t {
9781012
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
9791013
v.some(key, self, description)
9801014
}
9811015

982-
fn set(&mut self, _: &str, value: &str) -> Result<()> {
983-
*self = value.parse().map_err(|e| {
984-
DataFusionError::Context(
985-
format!(concat!("Error parsing {} as ", stringify!($t),), value),
986-
Box::new(DataFusionError::External(Box::new(e))),
987-
)
988-
})?;
1016+
fn set(&mut self, _: &str, $arg: &str) -> Result<()> {
1017+
*self = $transform;
9891018
Ok(())
9901019
}
9911020
}
9921021
};
9931022
}
9941023

9951024
config_field!(String);
996-
config_field!(bool);
1025+
config_field!(bool, value => default_transform(value.to_lowercase().as_str())?);
9971026
config_field!(usize);
9981027
config_field!(f64);
9991028
config_field!(u64);
@@ -1508,7 +1537,7 @@ macro_rules! config_namespace_with_hashmap {
15081537
$vis:vis struct $struct_name:ident {
15091538
$(
15101539
$(#[doc = $d:tt])*
1511-
$field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
1540+
$field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr
15121541
)*$(,)*
15131542
}
15141543
) => {
@@ -1527,7 +1556,10 @@ macro_rules! config_namespace_with_hashmap {
15271556
let (key, rem) = key.split_once('.').unwrap_or((key, ""));
15281557
match key {
15291558
$(
1530-
stringify!($field_name) => self.$field_name.set(rem, value),
1559+
stringify!($field_name) => {
1560+
$(let value = $transform(value);)?
1561+
self.$field_name.set(rem, value.as_ref())
1562+
},
15311563
)*
15321564
_ => _config_err!(
15331565
"Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -1606,7 +1638,7 @@ config_namespace_with_hashmap! {
16061638
/// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
16071639
/// These values are not case-sensitive. If NULL, uses
16081640
/// default parquet options
1609-
pub compression: Option<String>, default = None
1641+
pub compression: Option<String>, transform = str::to_lowercase, default = None
16101642

16111643
/// Sets if statistics are enabled for the column
16121644
/// Valid values are: "none", "chunk", and "page"

datafusion/core/src/datasource/stream.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ impl TableProviderFactory for StreamTableFactory {
6262
let header = if let Ok(opt) = cmd
6363
.options
6464
.get("format.has_header")
65-
.map(|has_header| bool::from_str(has_header))
65+
.map(|has_header| bool::from_str(has_header.to_lowercase().as_str()))
6666
.transpose()
6767
{
6868
opt.unwrap_or(false)

datafusion/core/tests/config_from_env.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,19 @@ use std::env;
2222
fn from_env() {
2323
// Note: these must be a single test to avoid interference from concurrent execution
2424
let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS";
25-
env::set_var(env_key, "true");
26-
let config = ConfigOptions::from_env().unwrap();
25+
// valid testing in different cases
26+
for bool_option in ["true", "TRUE", "True", "tRUe"] {
27+
env::set_var(env_key, bool_option);
28+
let config = ConfigOptions::from_env().unwrap();
29+
env::remove_var(env_key);
30+
assert!(config.optimizer.filter_null_join_keys);
31+
}
32+
33+
// invalid testing
34+
env::set_var(env_key, "ttruee");
35+
let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
36+
assert_eq!(err, "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`");
2737
env::remove_var(env_key);
28-
assert!(config.optimizer.filter_null_join_keys);
2938

3039
let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE";
3140

@@ -37,7 +46,7 @@ fn from_env() {
3746
// for invalid testing
3847
env::set_var(env_key, "abc");
3948
let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
40-
assert_eq!(err, "Error parsing abc as usize\ncaused by\nExternal error: invalid digit found in string");
49+
assert_eq!(err, "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string");
4150

4251
env::remove_var(env_key);
4352
let config = ConfigOptions::from_env().unwrap();

datafusion/sql/src/planner.rs

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ use arrow_schema::*;
2424
use datafusion_common::{
2525
field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError,
2626
};
27+
use sqlparser::ast::TimezoneInfo;
2728
use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo};
2829
use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption};
2930
use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias};
30-
use sqlparser::ast::{TimezoneInfo, Value};
3131

3232
use datafusion_common::TableReference;
3333
use datafusion_common::{
@@ -38,7 +38,7 @@ use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder};
3838
use datafusion_expr::utils::find_column_exprs;
3939
use datafusion_expr::{col, Expr};
4040

41-
use crate::utils::{make_decimal_type, value_to_string};
41+
use crate::utils::make_decimal_type;
4242
pub use datafusion_expr::planner::ContextProvider;
4343

4444
/// SQL parser options
@@ -56,7 +56,7 @@ impl Default for ParserOptions {
5656
parse_float_as_decimal: false,
5757
enable_ident_normalization: true,
5858
support_varchar_with_length: true,
59-
enable_options_value_normalization: true,
59+
enable_options_value_normalization: false,
6060
}
6161
}
6262
}
@@ -87,32 +87,6 @@ impl IdentNormalizer {
8787
}
8888
}
8989

90-
/// Value Normalizer
91-
#[derive(Debug)]
92-
pub struct ValueNormalizer {
93-
normalize: bool,
94-
}
95-
96-
impl Default for ValueNormalizer {
97-
fn default() -> Self {
98-
Self { normalize: true }
99-
}
100-
}
101-
102-
impl ValueNormalizer {
103-
pub fn new(normalize: bool) -> Self {
104-
Self { normalize }
105-
}
106-
107-
pub fn normalize(&self, value: Value) -> Option<String> {
108-
match (value_to_string(&value), self.normalize) {
109-
(Some(s), true) => Some(s.to_ascii_lowercase()),
110-
(Some(s), false) => Some(s),
111-
(None, _) => None,
112-
}
113-
}
114-
}
115-
11690
/// Struct to store the states used by the Planner. The Planner will leverage the states to resolve
11791
/// CTEs, Views, subqueries and PREPARE statements. The states include
11892
/// Common Table Expression (CTE) provided with WITH clause and
@@ -254,7 +228,6 @@ pub struct SqlToRel<'a, S: ContextProvider> {
254228
pub(crate) context_provider: &'a S,
255229
pub(crate) options: ParserOptions,
256230
pub(crate) ident_normalizer: IdentNormalizer,
257-
pub(crate) value_normalizer: ValueNormalizer,
258231
}
259232

260233
impl<'a, S: ContextProvider> SqlToRel<'a, S> {
@@ -266,13 +239,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
266239
/// Create a new query planner
267240
pub fn new_with_options(context_provider: &'a S, options: ParserOptions) -> Self {
268241
let ident_normalize = options.enable_ident_normalization;
269-
let options_value_normalize = options.enable_options_value_normalization;
270242

271243
SqlToRel {
272244
context_provider,
273245
options,
274246
ident_normalizer: IdentNormalizer::new(ident_normalize),
275-
value_normalizer: ValueNormalizer::new(options_value_normalize),
276247
}
277248
}
278249

datafusion/sql/src/statement.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,8 +1386,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
13861386
return plan_err!("Option {key} is specified multiple times");
13871387
}
13881388

1389-
let Some(value_string) = self.value_normalizer.normalize(value.clone())
1390-
else {
1389+
let Some(value_string) = crate::utils::value_to_string(&value) else {
13911390
return plan_err!("Unsupported Value {}", value);
13921391
};
13931392

0 commit comments

Comments
 (0)