feat: escape quote wrap identifiers in describe

jfahne · jfahne · commit c6f852409f8b · 2025-05-17T23:34:18.000-04:00
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
@@ -934,7 +934,7 @@ impl DataFrame {
                 vec![],
                 original_schema_fields
                     .clone()
-                    .map(|f| count(col(f.name())).alias(f.name()))
+                    .map(|f| count(col(format!("\"{}\"", f.name()))).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
             // null_count aggregation
@@ -943,7 +943,7 @@ impl DataFrame {
                 original_schema_fields
                     .clone()
                     .map(|f| {
-                        sum(case(is_null(col(f.name())))
+                        sum(case(is_null(col(format!("\"{}\"", f.name()))))
                             .when(lit(true), lit(1))
                             .otherwise(lit(0))
                             .unwrap())
@@ -957,7 +957,7 @@ impl DataFrame {
                 original_schema_fields
                     .clone()
                     .filter(|f| f.data_type().is_numeric())
-                    .map(|f| avg(col(f.name())).alias(f.name()))
+                    .map(|f| avg(col(format!("\"{}\"", f.name()))).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
             // std aggregation
@@ -966,7 +966,7 @@ impl DataFrame {
                 original_schema_fields
                     .clone()
                     .filter(|f| f.data_type().is_numeric())
-                    .map(|f| stddev(col(f.name())).alias(f.name()))
+                    .map(|f| stddev(col(format!("\"{}\"", f.name()))).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
             // min aggregation
@@ -977,7 +977,7 @@ impl DataFrame {
                     .filter(|f| {
                         !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
                     })
-                    .map(|f| min(col(f.name())).alias(f.name()))
+                    .map(|f| min(col(format!("\"{}\"", f.name()))).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
             // max aggregation
@@ -988,7 +988,7 @@ impl DataFrame {
                     .filter(|f| {
                         !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
                     })
-                    .map(|f| max(col(f.name())).alias(f.name()))
+                    .map(|f| max(col(format!("\"{}\"", f.name()))).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
             // median aggregation
@@ -997,7 +997,7 @@ impl DataFrame {
                 original_schema_fields
                     .clone()
                     .filter(|f| f.data_type().is_numeric())
-                    .map(|f| median(col(f.name())).alias(f.name()))
+                    .map(|f| median(col(format!("\"{}\"", f.name()))).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
         ];
@@ -1043,7 +1043,10 @@ impl DataFrame {
                     {
                         Arc::new(StringArray::from(vec!["null"]))
                     }
-                    Err(e) => return exec_err!("{}", e),
+                    // goes straight to error
+                    Err(e) => {
+                        return exec_err!("{}", e)
+                    },
                 };
                 array_datas.push(array_ref);
             }
diff --git a/datafusion/core/tests/dataframe/.mod.rs.pending-snap b/datafusion/core/tests/dataframe/.mod.rs.pending-snap
@@ -0,0 +1,3 @@
+{"run_id":"1747492618-449486000","line":1878,"new":{"module_name":"core_integration__dataframe","snapshot_name":"describe_lookup_via_quoted_identifier","metadata":{"source":"datafusion/core/tests/dataframe/mod.rs","assertion_line":1878,"expression":"batches_to_sort_string(&describe_result.clone().collect().await?)"},"snapshot":"+------------+----------+\n| describe   | CoLu.Mn1 |\n+------------+----------+\n| count      | 1        |\n| max        | a        |\n| mean       | null     |\n| median     | null     |\n| min        | a        |\n| null_count | 0        |\n| std        | null     |\n+------------+----------+"},"old":{"module_name":"core_integration__dataframe","metadata":{},"snapshot":"+------------+----------+\n| describe   | CoLu.Mn1 |\n+------------+----------+\n| count      | 1        |\n| null_count | 0        |\n| mean       | null     |\n| std        | null     |\n| min        | a        |\n| max        | a        |\n| median     | null     |\n+------------+----------+"}}
+{"run_id":"1747537851-743053000","line":1883,"new":{"module_name":"core_integration__dataframe","snapshot_name":"describe_lookup_via_quoted_identifier","metadata":{"source":"datafusion/core/tests/dataframe/mod.rs","assertion_line":1883,"expression":"batches_to_sort_string(&describe_result.clone().collect().await?)"},"snapshot":"+------------+----------+\n| describe   | CoLu.Mn1 |\n+------------+----------+\n| count      | 1        |\n| max        | a        |\n| mean       | null     |\n| median     | null     |\n| min        | a        |\n| null_count | 0        |\n| std        | null     |\n+------------+----------+"},"old":{"module_name":"core_integration__dataframe","metadata":{},"snapshot":"+------------+----------+\n| describe   | CoLu.Mn1 |\n+------------+----------+\n| count      | 1        |\n| null_count | 0        |\n| mean       | null     |\n| std        | null     |\n| min        | a        |\n| max        | a        |\n| median     | null     |\n+------------+----------+"}}
+{"run_id":"1747538015-81091000","line":1883,"new":null,"old":null}
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
@@ -1852,6 +1852,56 @@ async fn with_column_renamed_case_sensitive() -> Result<()> {
     Ok(())
 }
 
+/// Checks that `describe()` handles a column name that needs quoting (mixed
+/// case plus a `.`) and that the result is addressable by quoted identifier.
+#[tokio::test]
+async fn describe_lookup_via_quoted_identifier() -> Result<()> {
+    let ctx = SessionContext::new();
+    let name = "aggregate_test_100";
+    register_aggregate_csv(&ctx, name).await?;
+
+    let df = ctx
+        .table(name)
+        .await?
+        .filter(col("c2").eq(lit(3)).and(col("c1").eq(lit("a"))))?
+        .limit(0, Some(1))?
+        .sort(vec![
+            // make the test deterministic
+            col("c1").sort(true, true),
+            col("c2").sort(true, true),
+            col("c3").sort(true, true),
+        ])?
+        .select_columns(&["c1"])?;
+
+    let df_renamed = df.with_column_renamed("c1", "CoLu.Mn1")?;
+
+    // Sort via the quoted identifier and collect instead of `show()`: the
+    // lookup is still exercised, without printing or executing the plan twice.
+    let describe_result = df_renamed.describe().await?.sort(vec![
+        col("describe").sort(true, true),
+        col("\"CoLu.Mn1\"").sort(true, true),
+    ])?;
+    assert_snapshot!(
+        batches_to_sort_string(&describe_result.collect().await?),
+        @r###"
+        +------------+----------+
+        | describe   | CoLu.Mn1 |
+        +------------+----------+
+        | count      | 1        |
+        | max        | a        |
+        | mean       | null     |
+        | median     | null     |
+        | min        | a        |
+        | null_count | 0        |
+        | std        | null     |
+        +------------+----------+
+    "###
+    );
+
+    Ok(())
+}
+
+
 #[tokio::test]
 async fn cast_expr_test() -> Result<()> {
     let df = test_table()
diff --git a/notes.md b/notes.md
@@ -0,0 +1,15 @@
+# 20250514
+
+LogicalPlan defined in: `datafusion/expr/src/logical_plan/plan.rs`
+
+Dataframe call to describe in: `datafusion/core/src/dataframe/mod.rs`
+
+The DataFrame `describe` call takes its column names from the schema, which is obtained from the logical plan.
+
+It seems the schema call on `LogicalPlan` returns some kind of query AST / Substrait-like type (unconfirmed).
+
+Gut feeling: the bug occurs in the aggregation step of `describe`, where columns are looked up by name
+while building the final report. The plan might normalize identifiers to lower case and ignore
+non-alphanumeric characters to ease schema merging, so the original name no longer matches.
+
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+{"run_id":"1747492618-449486000","line":1878,"new":{"module_name":"core_integration__dataframe","snapshot_name":"describe_lookup_via_quoted_identifier","metadata":{"source":"datafusion/core/tests/dataframe/mod.rs","assertion_line":1878,"expression":"batches_to_sort_string(&describe_result.clone().collect().await?)"},"snapshot":"+------------+----------+\n\| describe \| CoLu.Mn1 \|\n+------------+----------+\n\| count \| 1 \|\n\| max \| a \|\n\| mean \| null \|\n\| median \| null \|\n\| min \| a \|\n\| null_count \| 0 \|\n\| std \| null \|\n+------------+----------+"},"old":{"module_name":"core_integration__dataframe","metadata":{},"snapshot":"+------------+----------+\n\| describe \| CoLu.Mn1 \|\n+------------+----------+\n\| count \| 1 \|\n\| null_count \| 0 \|\n\| mean \| null \|\n\| std \| null \|\n\| min \| a \|\n\| max \| a \|\n\| median \| null \|\n+------------+----------+"}}
	`2`	+{"run_id":"1747537851-743053000","line":1883,"new":{"module_name":"core_integration__dataframe","snapshot_name":"describe_lookup_via_quoted_identifier","metadata":{"source":"datafusion/core/tests/dataframe/mod.rs","assertion_line":1883,"expression":"batches_to_sort_string(&describe_result.clone().collect().await?)"},"snapshot":"+------------+----------+\n\| describe \| CoLu.Mn1 \|\n+------------+----------+\n\| count \| 1 \|\n\| max \| a \|\n\| mean \| null \|\n\| median \| null \|\n\| min \| a \|\n\| null_count \| 0 \|\n\| std \| null \|\n+------------+----------+"},"old":{"module_name":"core_integration__dataframe","metadata":{},"snapshot":"+------------+----------+\n\| describe \| CoLu.Mn1 \|\n+------------+----------+\n\| count \| 1 \|\n\| null_count \| 0 \|\n\| mean \| null \|\n\| std \| null \|\n\| min \| a \|\n\| max \| a \|\n\| median \| null \|\n+------------+----------+"}}
	`3`	`+{"run_id":"1747538015-81091000","line":1883,"new":null,"old":null}`