Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,9 @@ set(ARROW_SRCS
extension/bool8.cc
extension/json.cc
extension/parquet_variant.cc
extension/variant_builder.cc
extension/variant_internal.cc
extension/variant_shredding.cc
extension/uuid.cc
pretty_print.cc
record_batch.cc
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
# specific language governing permissions and limitations
# under the License.

set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc)
set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc
variant_internal_test.cc variant_builder_test.cc
variant_shredding_test.cc)

if(ARROW_JSON)
list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc)
Expand Down
8 changes: 7 additions & 1 deletion cpp/src/arrow/extension/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
# specific language governing permissions and limitations
# under the License.

canonical_extension_tests = ['bool8_test.cc', 'json_test.cc', 'uuid_test.cc']
canonical_extension_tests = ['bool8_test.cc', 'json_test.cc', 'uuid_test.cc',
'variant_internal_test.cc', 'variant_builder_test.cc',
'variant_shredding_test.cc']

if needs_json
canonical_extension_tests += [
Expand All @@ -40,5 +42,9 @@ install_headers(
'parquet_variant.h',
'uuid.h',
'variable_shape_tensor.h',
# variant_internal.h: public API for variant binary encoding/decoding.
# "internal" refers to the binary encoding internals, not visibility.
'variant_internal.h',
'variant_shredding.h',
],
)
102 changes: 62 additions & 40 deletions cpp/src/arrow/extension/parquet_variant.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,20 @@ namespace arrow::extension {

VariantExtensionType::VariantExtensionType(const std::shared_ptr<DataType>& storage_type)
: ExtensionType(storage_type) {
// GH-45948: Shredded variants will need to handle an optional shredded_value as
// well as value_ becoming optional.

// IsSupportedStorageType should have been called already, asserting that both
// metadata and value are present.
if (storage_type->field(0)->name() == "metadata") {
metadata_ = storage_type->field(0);
value_ = storage_type->field(1);
} else {
value_ = storage_type->field(0);
metadata_ = storage_type->field(1);
// Find fields by name (ordering does not matter per spec).
for (int i = 0; i < storage_type->num_fields(); ++i) {
const auto& f = storage_type->field(i);
if (f->name() == "metadata") {
metadata_ = f;
} else if (f->name() == "value") {
value_ = f;
} else if (f->name() == "typed_value") {
typed_value_ = f;
}
}
// IsSupportedStorageType() should have been called before construction.
DCHECK_NE(metadata_, nullptr);
DCHECK_NE(value_, nullptr);
}

bool VariantExtensionType::ExtensionEquals(const ExtensionType& other) const {
Expand Down Expand Up @@ -71,35 +73,52 @@ bool IsBinaryField(const std::shared_ptr<Field> field) {

bool VariantExtensionType::IsSupportedStorageType(
const std::shared_ptr<DataType>& storage_type) {
// For now we only support unshredded variants. Unshredded variant storage
// type should be a struct with a binary metadata and binary value.
//
// GH-45948: In shredded variants, the binary value field can be replaced
// with one or more of the following: object, array, typed_value, and
// variant_value.
if (storage_type->id() == Type::STRUCT) {
if (storage_type->num_fields() == 2) {
// Ordering of metadata and value fields does not matter, as we will assign
// these to the VariantExtensionType's member shared_ptrs in the constructor.
// Here we just need to check that they are both present.

const auto& field0 = storage_type->field(0);
const auto& field1 = storage_type->field(1);

bool metadata_and_value_present =
(field0->name() == "metadata" && field1->name() == "value") ||
(field1->name() == "metadata" && field0->name() == "value");

if (metadata_and_value_present) {
// Both metadata and value must be non-nullable binary types for unshredded
// variants. This will change in GH-46948, when we will require a Visitor
// to traverse the structure of the variant.
return IsBinaryField(field0) && IsBinaryField(field1) && !field0->nullable() &&
!field1->nullable();
}
if (storage_type->id() != Type::STRUCT) {
return false;
}

// Find fields by name
std::shared_ptr<Field> metadata_field;
std::shared_ptr<Field> value_field;
std::shared_ptr<Field> typed_value_field;

for (int i = 0; i < storage_type->num_fields(); ++i) {
const auto& f = storage_type->field(i);
if (f->name() == "metadata") {
metadata_field = f;
} else if (f->name() == "value") {
value_field = f;
} else if (f->name() == "typed_value") {
typed_value_field = f;
}
}

// metadata is always required and must be binary-like
if (!metadata_field || !IsBinaryField(metadata_field)) {
return false;
}

// Unshredded: required metadata + required value (both binary)
if (value_field && !typed_value_field) {
return IsBinaryField(value_field) && !metadata_field->nullable() &&
!value_field->nullable();
}

// Shredded: required metadata + optional value + optional typed_value
if (value_field && typed_value_field) {
// metadata must be non-nullable, value must be nullable binary,
// typed_value must be nullable (any type)
return !metadata_field->nullable() && IsBinaryField(value_field) &&
value_field->nullable() && typed_value_field->nullable();
}

// NOTE: The shredding spec allows leaf schemas where `value` is absent
// (typed_value only, for fully-shredded columns with no residual). We
// reject this case for now because the current shredding implementation
// always produces a `value` column. Supporting value-absent schemas
// requires changes to ShredVariantColumn/ReconstructVariantColumn to
// handle the missing residual path. This can be added in a follow-up
// when Parquet reader integration requires it.
return false;
}

Expand All @@ -113,9 +132,12 @@ Result<std::shared_ptr<DataType>> VariantExtensionType::Make(
return std::make_shared<VariantExtensionType>(std::move(storage_type));
}

/// NOTE: this is still experimental. GH-45948 will add shredding support, at which point
/// we need to separate this into unshredded_variant and shredded_variant helper
/// functions.
/// \brief Return a VariantExtensionType instance.
///
/// Supports both unshredded and shredded storage types:
/// - Unshredded: struct{required binary metadata, required binary value}
/// - Shredded: struct{required binary metadata, optional binary value,
/// optional <T> typed_value}
std::shared_ptr<DataType> variant(std::shared_ptr<DataType> storage_type) {
  // Make() hands back a Result; unwrapping it with ValueOrDie() means a failed
  // Make() is fatal here instead of being reported to the caller.
  auto made = VariantExtensionType::Make(std::move(storage_type));
  return std::move(made).ValueOrDie();
}
Expand Down
17 changes: 15 additions & 2 deletions cpp/src/arrow/extension/parquet_variant.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ class ARROW_EXPORT VariantArray : public ExtensionArray {
/// required binary value;
/// }
///
/// Shredded variant representation:
/// optional group variant_name (VARIANT) {
/// required binary metadata;
/// optional binary value;
/// optional <T> typed_value;
/// }
///
/// To read more about variant encoding, see the variant encoding spec at
/// https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
///
Expand Down Expand Up @@ -69,10 +76,16 @@ class ARROW_EXPORT VariantExtensionType : public ExtensionType {

/// \brief The value field (nullable when the variant is shredded).
std::shared_ptr<Field> value() const { return value_; }

/// \brief The typed_value field of a shredded variant.
///
/// Returns nullptr when the storage type has no child named "typed_value",
/// i.e. the variant is unshredded.
std::shared_ptr<Field> typed_value() const { return typed_value_; }

/// \brief True when the storage type carries a typed_value field.
bool is_shredded() const { return static_cast<bool>(typed_value_); }

private:
// TODO GH-45948 added shredded_value
std::shared_ptr<Field> metadata_;
std::shared_ptr<Field> value_;
std::shared_ptr<Field> value_; // nullable when shredded
std::shared_ptr<Field> typed_value_; // nullptr if unshredded
};

/// \brief Return a VariantExtensionType instance.
Expand Down
Loading