Skip to content

Commit c6f8524

Browse files
committed
feat: escape quote wrap identifiers in describe
1 parent 3e30f77 commit c6f8524

File tree

4 files changed

+79
-8
lines changed

4 files changed

+79
-8
lines changed

datafusion/core/src/dataframe/mod.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ impl DataFrame {
934934
vec![],
935935
original_schema_fields
936936
.clone()
937-
.map(|f| count(col(f.name())).alias(f.name()))
937+
.map(|f| count(col(format!("\"{}\"", f.name()))).alias(f.name()))
938938
.collect::<Vec<_>>(),
939939
),
940940
// null_count aggregation
@@ -943,7 +943,7 @@ impl DataFrame {
943943
original_schema_fields
944944
.clone()
945945
.map(|f| {
946-
sum(case(is_null(col(f.name())))
946+
sum(case(is_null(col(format!("\"{}\"", f.name()))))
947947
.when(lit(true), lit(1))
948948
.otherwise(lit(0))
949949
.unwrap())
@@ -957,7 +957,7 @@ impl DataFrame {
957957
original_schema_fields
958958
.clone()
959959
.filter(|f| f.data_type().is_numeric())
960-
.map(|f| avg(col(f.name())).alias(f.name()))
960+
.map(|f| avg(col(format!("\"{}\"", f.name()))).alias(f.name()))
961961
.collect::<Vec<_>>(),
962962
),
963963
// std aggregation
@@ -966,7 +966,7 @@ impl DataFrame {
966966
original_schema_fields
967967
.clone()
968968
.filter(|f| f.data_type().is_numeric())
969-
.map(|f| stddev(col(f.name())).alias(f.name()))
969+
.map(|f| stddev(col(format!("\"{}\"", f.name()))).alias(f.name()))
970970
.collect::<Vec<_>>(),
971971
),
972972
// min aggregation
@@ -977,7 +977,7 @@ impl DataFrame {
977977
.filter(|f| {
978978
!matches!(f.data_type(), DataType::Binary | DataType::Boolean)
979979
})
980-
.map(|f| min(col(f.name())).alias(f.name()))
980+
.map(|f| min(col(format!("\"{}\"", f.name()))).alias(f.name()))
981981
.collect::<Vec<_>>(),
982982
),
983983
// max aggregation
@@ -988,7 +988,7 @@ impl DataFrame {
988988
.filter(|f| {
989989
!matches!(f.data_type(), DataType::Binary | DataType::Boolean)
990990
})
991-
.map(|f| max(col(f.name())).alias(f.name()))
991+
.map(|f| max(col(format!("\"{}\"", f.name()))).alias(f.name()))
992992
.collect::<Vec<_>>(),
993993
),
994994
// median aggregation
@@ -997,7 +997,7 @@ impl DataFrame {
997997
original_schema_fields
998998
.clone()
999999
.filter(|f| f.data_type().is_numeric())
1000-
.map(|f| median(col(f.name())).alias(f.name()))
1000+
.map(|f| median(col(format!("\"{}\"", f.name()))).alias(f.name()))
10011001
.collect::<Vec<_>>(),
10021002
),
10031003
];
@@ -1043,7 +1043,10 @@ impl DataFrame {
10431043
{
10441044
Arc::new(StringArray::from(vec!["null"]))
10451045
}
1046-
Err(e) => return exec_err!("{}", e),
1046+
// goes straight to error
1047+
Err(e) => {
1048+
return exec_err!("{}", e)
1049+
},
10471050
};
10481051
array_datas.push(array_ref);
10491052
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"run_id":"1747492618-449486000","line":1878,"new":{"module_name":"core_integration__dataframe","snapshot_name":"describe_lookup_via_quoted_identifier","metadata":{"source":"datafusion/core/tests/dataframe/mod.rs","assertion_line":1878,"expression":"batches_to_sort_string(&describe_result.clone().collect().await?)"},"snapshot":"+------------+----------+\n| describe | CoLu.Mn1 |\n+------------+----------+\n| count | 1 |\n| max | a |\n| mean | null |\n| median | null |\n| min | a |\n| null_count | 0 |\n| std | null |\n+------------+----------+"},"old":{"module_name":"core_integration__dataframe","metadata":{},"snapshot":"+------------+----------+\n| describe | CoLu.Mn1 |\n+------------+----------+\n| count | 1 |\n| null_count | 0 |\n| mean | null |\n| std | null |\n| min | a |\n| max | a |\n| median | null |\n+------------+----------+"}}
2+
{"run_id":"1747537851-743053000","line":1883,"new":{"module_name":"core_integration__dataframe","snapshot_name":"describe_lookup_via_quoted_identifier","metadata":{"source":"datafusion/core/tests/dataframe/mod.rs","assertion_line":1883,"expression":"batches_to_sort_string(&describe_result.clone().collect().await?)"},"snapshot":"+------------+----------+\n| describe | CoLu.Mn1 |\n+------------+----------+\n| count | 1 |\n| max | a |\n| mean | null |\n| median | null |\n| min | a |\n| null_count | 0 |\n| std | null |\n+------------+----------+"},"old":{"module_name":"core_integration__dataframe","metadata":{},"snapshot":"+------------+----------+\n| describe | CoLu.Mn1 |\n+------------+----------+\n| count | 1 |\n| null_count | 0 |\n| mean | null |\n| std | null |\n| min | a |\n| max | a |\n| median | null |\n+------------+----------+"}}
3+
{"run_id":"1747538015-81091000","line":1883,"new":null,"old":null}

datafusion/core/tests/dataframe/mod.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1852,6 +1852,56 @@ async fn with_column_renamed_case_sensitive() -> Result<()> {
18521852
Ok(())
18531853
}
18541854

1855+
#[tokio::test]
1856+
async fn describe_lookup_via_quoted_identifier() -> Result<()> {
1857+
let ctx = SessionContext::new();
1858+
let name = "aggregate_test_100";
1859+
register_aggregate_csv(&ctx, name).await?;
1860+
let df = ctx.table(name);
1861+
1862+
let df = df
1863+
.await?
1864+
.filter(col("c2").eq(lit(3)).and(col("c1").eq(lit("a"))))?
1865+
.limit(0, Some(1))?
1866+
.sort(vec![
1867+
// make the test deterministic
1868+
col("c1").sort(true, true),
1869+
col("c2").sort(true, true),
1870+
col("c3").sort(true, true),
1871+
])?
1872+
.select_columns(&["c1"])?;
1873+
1874+
let df_renamed = df.clone().with_column_renamed("c1", "CoLu.Mn1")?;
1875+
1876+
let describe_result = df_renamed.describe().await?;
1877+
describe_result.clone().sort(
1878+
vec![
1879+
col("describe").sort(true, true),
1880+
col("\"CoLu.Mn1\"").sort(true,true),
1881+
]
1882+
)?.show().await?;
1883+
assert_snapshot!(
1884+
batches_to_sort_string(&describe_result.clone().collect().await?),
1885+
@r###"
1886+
+------------+----------+
1887+
| describe | CoLu.Mn1 |
1888+
+------------+----------+
1889+
| count | 1 |
1890+
| max | a |
1891+
| mean | null |
1892+
| median | null |
1893+
| min | a |
1894+
| null_count | 0 |
1895+
| std | null |
1896+
+------------+----------+
1897+
"###
1898+
);
1899+
1900+
1901+
Ok(())
1902+
}
1903+
1904+
18551905
#[tokio::test]
18561906
async fn cast_expr_test() -> Result<()> {
18571907
let df = test_table()

notes.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# 20250514
2+
3+
LogicalPlan defined in: `datafusion/expr/src/logical_plan/plan.rs`
4+
5+
Dataframe call to describe in: `datafusion/core/src/dataframe/mod.rs`
6+
7+
Dataframe call gets schema name from schema which is called on the logical plan.
8+
9+
Seems like schema call on logicalplan would return some kind of query ast / substrait type thingy.
10+
11+
Gut feeling is that the bug occurs in aggregation of the describe call using a get column by name
12+
command in the final report build but the plan might conform identifiers to lower casing and ignore
13+
non alnum characters to ease schema merging.
14+
15+

0 commit comments

Comments
 (0)