Skip to content
Open
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
6269073
start
paleolimbot Oct 8, 2025
585bba9
builds!
paleolimbot Oct 8, 2025
16ac02d
backward compatible proto
paleolimbot Oct 9, 2025
a702499
move FieldMetadata
paleolimbot Oct 9, 2025
2c98b28
maybe working from proto
paleolimbot Oct 9, 2025
cbf75aa
oof
paleolimbot Oct 9, 2025
9982d32
maybe fix build
paleolimbot Oct 9, 2025
0091bbb
fix doc
paleolimbot Oct 10, 2025
4f01800
fix the name of the list
paleolimbot Oct 10, 2025
9b3a45a
undo trailing whitespace change
paleolimbot Oct 10, 2025
901a011
maybe fix failing tests
paleolimbot Oct 10, 2025
21af946
possibly reusable type equality
paleolimbot Oct 14, 2025
b7404c0
literal class
paleolimbot Oct 14, 2025
94cb418
doc
paleolimbot Oct 14, 2025
b982d4c
one more pass on documenting things
paleolimbot Oct 14, 2025
4f4820b
use the equality checker in one more place
paleolimbot Oct 14, 2025
63ae6fb
update data_types variables that were actually fields
paleolimbot Oct 14, 2025
a085cf1
revert tests
paleolimbot Oct 14, 2025
5475937
fix slt
paleolimbot Oct 14, 2025
05431a2
add a test that fails
paleolimbot Oct 14, 2025
45f15f4
fix test
paleolimbot Oct 16, 2025
4386a85
add tests for expression schema for typed parameters
paleolimbot Oct 16, 2025
8b5622f
more tests
paleolimbot Oct 16, 2025
df5aba8
add placeholder inference test
paleolimbot Oct 16, 2025
08d0e7a
add nullability to test
paleolimbot Oct 16, 2025
2b4b8e7
tests for proto
paleolimbot Oct 16, 2025
57437b9
make sure I don't forget these
paleolimbot Oct 16, 2025
70f3663
fix typo
paleolimbot Oct 16, 2025
58807fd
remove comment
paleolimbot Oct 16, 2025
4bfc5cd
fix
paleolimbot Oct 16, 2025
81c3baf
fmt
paleolimbot Oct 17, 2025
860e013
fix error string comparison
paleolimbot Oct 17, 2025
b6c71b3
add update/insert test
paleolimbot Oct 17, 2025
cfabfd0
Merge branch 'main' into params-with-metadata
paleolimbot Oct 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datafusion/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ pub mod file_options;
pub mod format;
pub mod hash_utils;
pub mod instant;
pub mod metadata;
pub mod nested_struct;
mod null_equality;
pub mod parsers;
Expand Down
363 changes: 363 additions & 0 deletions datafusion/common/src/metadata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,363 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::{collections::BTreeMap, sync::Arc};

use arrow::datatypes::{DataType, Field};
use hashbrown::HashMap;

use crate::{error::_plan_err, DataFusionError, ScalarValue};

/// A [`ScalarValue`] with optional [`FieldMetadata`]
///
/// Pairs a scalar storage value with the field-level metadata that may
/// accompany it (e.g., extension-type annotations), corresponding to the
/// `metadata` field on an Arrow [`Field`].
#[derive(Debug, Clone)]
pub struct Literal {
// The underlying scalar storage value
pub value: ScalarValue,
// Field-level metadata attached to this value, if any
pub metadata: Option<FieldMetadata>,
}

impl Literal {
/// Create a new Literal from a scalar value with optional [`FieldMetadata`]
pub fn new(value: ScalarValue, metadata: Option<FieldMetadata>) -> Self {
Self { value, metadata }
}
Comment on lines +25 to +36
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure Literal is the best name here (we already have the trait datafusion_expr::Literal and datafusion_physical_expr::Literal), but ScalarValueAndMetadata seemed verbose and using (ScalarValue, Option<FieldMetadata>) has been used in enough places that I think having its own struct makes sense.

Other places where we pair a ScalarValue with metadata are Expr::Literal, and ConstSimplifyResult::Simplified.

I don't have strong opinions here and am happy to have this be whatever.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My personal opinion is that Literal is OK and that it should be used in Expr::Literal and ConstSimplifyResult::Simplified. However, I believe this could be done in a follow-up PR and an issue such that others can provide input on that.


/// Access the underlying [ScalarValue] storage
pub fn value(&self) -> &ScalarValue {
&self.value
}

/// Access the [FieldMetadata] attached to this value, if any
pub fn metadata(&self) -> Option<&FieldMetadata> {
self.metadata.as_ref()
}

/// Consume self and return components
pub fn into_inner(self) -> (ScalarValue, Option<FieldMetadata>) {
(self.value, self.metadata)
}

/// Cast this literal's storage type
///
/// This operation assumes that if the underlying [ScalarValue] can be casted
/// to a given type that any extension type represented by the metadata is also
/// valid.
pub fn cast_storage_to(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate on why we need this function? It seems like it is used to cast the provided parameter instances to the type of the placeholder. Can't we for now assume that these types match and return an Err if this is not the case? It just seems like we are here trying something that supports logical types / extension types but then we ignore any restrictions they might have on converting to a particular physical type.

Just some thoughts, don't know if this is actually a problem.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I see the cast was there before so this should be OK for now. Maybe in the future we can consider the allowed casts of a logical type.

&self,
target_type: &DataType,
) -> Result<Self, DataFusionError> {
let new_value = self.value().cast_to(target_type)?;
Ok(Literal::new(new_value, self.metadata.clone()))
}
}

/// Assert equality of data types where one or both sides may have field metadata
///
/// This currently compares absent metadata (e.g., one side was a DataType) and
/// empty metadata (e.g., one side was a field where the field had no metadata)
/// as equal and uses byte-for-byte comparison for the keys and values of the
/// fields, even though this is potentially too strict for some cases (e.g.,
/// extension types where extension metadata is represented by JSON, or cases
/// where field metadata is orthogonal to the interpretation of the data type).
///
/// Returns a planning error with suitably formatted type representations if
/// actual and expected do not compare to equal.
pub fn check_metadata_with_storage_equal(
actual: (
&DataType,
Option<&std::collections::HashMap<String, String>>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason why we're not using FieldMetadata here?

),
expected: (
&DataType,
Option<&std::collections::HashMap<String, String>>,
),
what: &str,
context: &str,
) -> Result<(), DataFusionError> {
Comment on lines +78 to +89
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kind of ugly but there are two places in this PR that do this check (type equality). I figured at least collecting the checks in one place would make it easier to do something smarter later if it comes up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, I think it's better than duplicating the logic. I think a good solution for this problem will require support for registering instances of logical types.

Should we track that somewhere?

if actual.0 != expected.0 {
return _plan_err!(
"Expected {what} of type {}, got {}{context}",
format_type_and_metadata(expected.0, expected.1),
format_type_and_metadata(actual.0, actual.1)
);
}

let metadata_equal = match (actual.1, expected.1) {
(None, None) => true,
(None, Some(expected_metadata)) => expected_metadata.is_empty(),
(Some(actual_metadata), None) => actual_metadata.is_empty(),
(Some(actual_metadata), Some(expected_metadata)) => {
actual_metadata == expected_metadata
}
};

if !metadata_equal {
return _plan_err!(
"Expected {what} of type {}, got {}{context}",
format_type_and_metadata(expected.0, expected.1),
format_type_and_metadata(actual.0, actual.1)
);
}

Ok(())
}

/// Given a data type represented by storage and optional metadata, generate
/// a user-facing string
///
/// This function exists to reduce the number of Field debug strings that are
/// used to communicate type information in error messages and plan explain
/// renderings.
pub fn format_type_and_metadata(
data_type: &DataType,
metadata: Option<&std::collections::HashMap<String, String>>,
) -> String {
Comment on lines +124 to +127
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to keep the LogicalType::Statement(Prepared {..}) output in the plan text from dumping the debug output from Field so that it is mostly unchanged (unless there is, in fact, metadata). Also helps with the error message above.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the best solution would be to delegate the formatting to a registered implementation of a logical type / extension type. This way we can really use type-specific formatting. But I think this is beyond the scope of this PR.

match metadata {
Some(metadata) if !metadata.is_empty() => {
format!("{data_type}<{metadata:?}>")
}
_ => data_type.to_string(),
}
}

/// Literal metadata
///
Stores metadata associated with a literal expression
/// and is designed to be fast to `clone`.
///
/// This structure is used to store metadata associated with a literal expression, and it
/// corresponds to the `metadata` field on [`Field`].
///
/// # Example: Create [`FieldMetadata`] from a [`Field`]
/// ```
/// # use std::collections::HashMap;
/// # use datafusion_common::metadata::FieldMetadata;
/// # use arrow::datatypes::{Field, DataType};
/// # let field = Field::new("c1", DataType::Int32, true)
/// # .with_metadata(HashMap::from([("foo".to_string(), "bar".to_string())]));
/// // Create a new `FieldMetadata` instance from a `Field`
/// let metadata = FieldMetadata::new_from_field(&field);
/// // There is also a `From` impl:
/// let metadata = FieldMetadata::from(&field);
/// ```
///
/// # Example: Update a [`Field`] with [`FieldMetadata`]
/// ```
/// # use datafusion_common::metadata::FieldMetadata;
/// # use arrow::datatypes::{Field, DataType};
/// # let field = Field::new("c1", DataType::Int32, true);
/// # let metadata = FieldMetadata::new_from_field(&field);
/// // Add any metadata from `FieldMetadata` to `Field`
/// let updated_field = metadata.add_to_field(field);
/// ```
///
#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
pub struct FieldMetadata {
Comment on lines +167 to +168
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just moved this from the expr crate so I could use it in ParamValues

/// The inner metadata of a literal expression, which is a map of string
/// keys to string values.
///
/// Note this is not a `HashMap` because `HashMap` does not provide
/// implementations for traits like `Debug` and `Hash`.
inner: Arc<BTreeMap<String, String>>,
}

impl Default for FieldMetadata {
fn default() -> Self {
Self::new_empty()
}
}

impl FieldMetadata {
    /// Create a metadata instance containing no key/value pairs.
    pub fn new_empty() -> Self {
        Self::new(BTreeMap::new())
    }

    /// Merge two optional `FieldMetadata` instances, with keys from `n`
    /// taking precedence over keys from `m` on conflict.
    ///
    /// This is commonly used in alias operations, particularly for literals
    /// with metadata: the metadata from the original expression (`m`) is
    /// combined with any metadata declared on the alias (`n`), and the alias
    /// wins for duplicate keys.
    ///
    /// Returns `None` only when both inputs are `None`; when exactly one side
    /// is present it is returned unchanged.
    ///
    /// # Example usage
    /// ```rust
    /// use datafusion_common::metadata::FieldMetadata;
    /// use std::collections::BTreeMap;
    ///
    /// // Metadata from a literal expression
    /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([
    ///     ("source".to_string(), "constant".to_string()),
    ///     ("type".to_string(), "int".to_string()),
    /// ])));
    ///
    /// // Metadata declared on an alias
    /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([
    ///     ("description".to_string(), "answer".to_string()),
    ///     ("source".to_string(), "user".to_string()), // overrides literal's "source"
    /// ])));
    ///
    /// let merged = FieldMetadata::merge_options(
    ///     literal_metadata.as_ref(),
    ///     alias_metadata.as_ref(),
    /// );
    ///
    /// // Result contains: {"source": "user", "type": "int", "description": "answer"}
    /// assert!(merged.is_some());
    /// ```
    pub fn merge_options(
        m: Option<&FieldMetadata>,
        n: Option<&FieldMetadata>,
    ) -> Option<FieldMetadata> {
        match (m, n) {
            (None, None) => None,
            (Some(only), None) | (None, Some(only)) => Some(only.clone()),
            (Some(base), Some(overrides)) => {
                let mut combined = base.clone();
                combined.extend(overrides.clone());
                Some(combined)
            }
        }
    }

    /// Create a new metadata instance by copying a [`Field`]'s metadata.
    pub fn new_from_field(field: &Field) -> Self {
        let pairs: BTreeMap<String, String> = field
            .metadata()
            .iter()
            .map(|(key, value)| (key.clone(), value.clone()))
            .collect();
        Self::new(pairs)
    }

    /// Create a new metadata instance wrapping an owned map of string keys
    /// to string values.
    pub fn new(inner: BTreeMap<String, String>) -> Self {
        Self {
            inner: Arc::new(inner),
        }
    }

    /// Borrow the inner metadata as a `BTreeMap`.
    pub fn inner(&self) -> &BTreeMap<String, String> {
        self.inner.as_ref()
    }

    /// Consume `self` and return the shared inner map.
    pub fn into_inner(self) -> Arc<BTreeMap<String, String>> {
        self.inner
    }

    /// Copy all entries of `other` into `self`, overwriting duplicate keys.
    pub fn extend(&mut self, other: Self) {
        if other.is_empty() {
            return;
        }
        // Take ownership of the other map (cloning only if it is shared)
        // and splice its entries into our (possibly shared) inner map.
        let entries = Arc::unwrap_or_clone(other.into_inner());
        Arc::make_mut(&mut self.inner).extend(entries);
    }

    /// Returns true when no key/value pairs are stored.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Returns the number of key/value pairs stored.
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Copy the entries into a `std::collections::HashMap<String, String>`.
    pub fn to_hashmap(&self) -> std::collections::HashMap<String, String> {
        self.inner
            .iter()
            .map(|(key, value)| (key.clone(), value.clone()))
            .collect()
    }

    /// Return `field` with this metadata applied, leaving the field untouched
    /// when this metadata is empty.
    pub fn add_to_field(&self, field: Field) -> Field {
        if self.is_empty() {
            field
        } else {
            field.with_metadata(self.to_hashmap())
        }
    }
}

impl From<&Field> for FieldMetadata {
fn from(field: &Field) -> Self {
Self::new_from_field(field)
}
}

impl From<BTreeMap<String, String>> for FieldMetadata {
fn from(inner: BTreeMap<String, String>) -> Self {
Self::new(inner)
}
}

impl From<std::collections::HashMap<String, String>> for FieldMetadata {
fn from(map: std::collections::HashMap<String, String>) -> Self {
Self::new(map.into_iter().collect())
}
}

/// From reference
impl From<&std::collections::HashMap<String, String>> for FieldMetadata {
fn from(map: &std::collections::HashMap<String, String>) -> Self {
let inner = map
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self::new(inner)
}
}

/// From hashbrown map
impl From<HashMap<String, String>> for FieldMetadata {
fn from(map: HashMap<String, String>) -> Self {
let inner = map.into_iter().collect();
Self::new(inner)
}
}

impl From<&HashMap<String, String>> for FieldMetadata {
fn from(map: &HashMap<String, String>) -> Self {
let inner = map
.into_iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self::new(inner)
}
}
Loading
Loading