Skip to content

Commit a30890e

Browse files
[Variant] Support shred_variant for Uuids (#8666)
# Which issue does this PR close?

- Closes #8665

# Rationale for this change

`shred_variant` currently panics when attempting to shred Variants containing values of the `FixedSizeBinary(16)` data type.
1 parent b8a1926 commit a30890e

File tree

2 files changed

+409
-94
lines changed

2 files changed

+409
-94
lines changed

parquet-variant-compute/src/shred_variant.rs

Lines changed: 250 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,10 +326,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
326326
mod tests {
327327
use super::*;
328328
use crate::VariantArrayBuilder;
329-
use arrow::array::{Array, Float64Array, Int64Array};
329+
use arrow::array::{Array, FixedSizeBinaryArray, Float64Array, Int64Array};
330330
use arrow::datatypes::{DataType, Field, Fields};
331331
use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant, VariantBuilder};
332332
use std::sync::Arc;
333+
use uuid::Uuid;
333334

334335
#[test]
335336
fn test_already_shredded_input_error() {
@@ -369,6 +370,73 @@ mod tests {
369370
shred_variant(&input, &list_schema).expect_err("unsupported");
370371
}
371372

373+
#[test]
fn test_invalid_fixed_size_binary_shredding() {
    // One UUID row followed by a null row.
    let uuid = Uuid::new_v4();
    let rows = [Some(Variant::from(uuid)), None];
    let input = VariantArray::from_iter(rows);

    // Only a 16-byte FixedSizeBinary is accepted as a shredding target; a
    // 17-byte width must be rejected with a descriptive error.
    let unsupported_type = DataType::FixedSizeBinary(17);
    let message = shred_variant(&input, &unsupported_type)
        .unwrap_err()
        .to_string();

    assert_eq!(
        message,
        "Invalid argument error: FixedSizeBinary(17) is not a valid variant shredding type. Only FixedSizeBinary(16) for UUID is supported."
    );
}
387+
388+
#[test]
fn test_uuid_shredding() {
    let first_uuid = Uuid::new_v4();
    let second_uuid = Uuid::new_v4();

    // Row layout: UUID, null, boolean (not a UUID), UUID.
    let input = VariantArray::from_iter([
        Some(Variant::from(first_uuid)),
        None,
        Some(Variant::from(false)),
        Some(Variant::from(second_uuid)),
    ]);

    let shredded = shred_variant(&input, &DataType::FixedSizeBinary(16)).unwrap();

    // NOTE(review): a check that the `typed_value` field carries the canonical
    // Uuid extension type was sketched here but left disabled -- confirm
    // whether it should be enabled.

    // Downcast the typed_value column so the raw UUID bytes can be inspected.
    let typed_value = shredded.typed_value_field().unwrap();
    let uuids = typed_value
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();

    assert_eq!(uuids.len(), 4);

    // Asserts that `row` is non-null and holds exactly the bytes of `expected`.
    let assert_uuid_at = |row: usize, expected: &Uuid| {
        assert!(!uuids.is_null(row));

        let actual: &[u8] = uuids.value(row);
        assert_eq!(actual, expected.as_bytes());
    };

    // Row 0 holds the first UUID.
    assert_uuid_at(0, &first_uuid);

    // Row 1 (null input) and row 2 (boolean input) have no typed value.
    assert!(uuids.is_null(1));
    assert!(uuids.is_null(2));

    // Row 3 holds the second UUID.
    assert_uuid_at(3, &second_uuid);
}
439+
372440
#[test]
373441
fn test_primitive_shredding_comprehensive() {
374442
// Test mixed scenarios in a single array
@@ -869,6 +937,187 @@ mod tests {
869937
assert!(value_field3.is_null(0)); // fully shredded, no remaining fields
870938
}
871939

940+
#[test]
fn test_uuid_shredding_in_objects() {
    let uuid_a = Uuid::new_v4();
    let uuid_b = Uuid::new_v4();
    let uuid_c = Uuid::new_v4();

    let mut rows = VariantArrayBuilder::new(6);

    // Row 0: object holding exactly the two UUID fields
    rows.new_object()
        .with_field("id", uuid_a)
        .with_field("session_id", uuid_b)
        .finish();

    // Row 1: both UUID fields plus an extra "name" field outside the schema
    rows.new_object()
        .with_field("id", uuid_b)
        .with_field("session_id", uuid_c)
        .with_field("name", "test_user")
        .finish();

    // Row 2: only "id" is present, "session_id" is absent
    rows.new_object().with_field("id", uuid_a).finish();

    // Row 3: "id" is a UUID, "session_id" is a string
    rows.new_object()
        .with_field("id", uuid_c)
        .with_field("session_id", "not-a-uuid")
        .finish();

    // Row 4: "id" is an int64, "session_id" is a UUID
    rows.new_object()
        .with_field("id", 12345i64)
        .with_field("session_id", uuid_a)
        .finish();

    // Row 5: null row
    rows.append_null();

    let input = rows.build();

    // Target schema: a struct with two nullable 16-byte binary fields.
    let target_schema = DataType::Struct(Fields::from(vec![
        Field::new("id", DataType::FixedSizeBinary(16), true),
        Field::new("session_id", DataType::FixedSizeBinary(16), true),
    ]));

    let result = shred_variant(&input, &target_schema).unwrap();

    assert!(result.value_field().is_some());
    assert!(result.typed_value_field().is_some());
    assert_eq!(result.len(), 6);

    let metadata = result.metadata_field();
    let top_value = result.value_field().unwrap();
    let top_typed_column = result.typed_value_field().unwrap();
    let top_typed = top_typed_column
        .as_any()
        .downcast_ref::<arrow::array::StructArray>()
        .unwrap();

    // Wrap each child column of the typed_value struct as a shredded field
    let id_column = top_typed.column_by_name("id").unwrap();
    let id_field = ShreddedVariantFieldArray::try_new(id_column).unwrap();
    let session_id_column = top_typed.column_by_name("session_id").unwrap();
    let session_id_field = ShreddedVariantFieldArray::try_new(session_id_column).unwrap();

    // Per-field fallback (`value`) and shredded (`typed_value`) columns
    let id_fallback = id_field
        .value_field()
        .unwrap()
        .as_any()
        .downcast_ref::<BinaryViewArray>()
        .unwrap();
    let id_shredded = id_field
        .typed_value_field()
        .unwrap()
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();
    let session_fallback = session_id_field
        .value_field()
        .unwrap()
        .as_any()
        .downcast_ref::<BinaryViewArray>()
        .unwrap();
    let session_shredded = session_id_field
        .typed_value_field()
        .unwrap()
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();

    // Row 0: both UUID fields land in typed_value, nothing falls back
    assert!(result.is_valid(0));

    assert!(top_value.is_null(0)); // nothing left unshredded
    assert!(id_fallback.is_null(0));
    assert!(session_fallback.is_null(0));

    assert!(top_typed.is_valid(0));
    assert!(id_shredded.is_valid(0));
    assert!(session_shredded.is_valid(0));

    assert_eq!(id_shredded.value(0), uuid_a.as_bytes());
    assert_eq!(session_shredded.value(0), uuid_b.as_bytes());

    // Row 1: UUID fields are shredded, the extra field stays in `value`
    assert!(result.is_valid(1));

    assert!(top_value.is_valid(1)); // carries the unshredded "name" field
    assert!(top_typed.is_valid(1));

    assert!(id_fallback.is_null(1));
    assert!(id_shredded.is_valid(1));
    assert_eq!(id_shredded.value(1), uuid_b.as_bytes());

    assert!(session_fallback.is_null(1));
    assert!(session_shredded.is_valid(1));
    assert_eq!(session_shredded.value(1), uuid_c.as_bytes());

    // The residual value must be an object exposing the "name" field
    let residual = match Variant::new(metadata.value(1), top_value.value(1)) {
        Variant::Object(obj) => obj,
        _ => panic!("Expected object"),
    };

    assert_eq!(residual.get("name"), Some(Variant::from("test_user")));

    // Row 2: "session_id" is absent from the input object
    assert!(result.is_valid(2));

    assert!(top_value.is_null(2)); // no extra fields to carry
    assert!(top_typed.is_valid(2));

    assert!(id_fallback.is_null(2));
    assert!(id_shredded.is_valid(2));
    assert_eq!(id_shredded.value(2), uuid_a.as_bytes());

    assert!(session_fallback.is_null(2));
    assert!(session_shredded.is_null(2)); // field was missing

    // Row 3: "session_id" is a string rather than a UUID
    assert!(result.is_valid(3));

    assert!(top_value.is_null(3)); // no extra fields to carry
    assert!(top_typed.is_valid(3));

    assert!(id_fallback.is_null(3));
    assert!(id_shredded.is_valid(3));
    assert_eq!(id_shredded.value(3), uuid_c.as_bytes());

    assert!(session_fallback.is_valid(3)); // mismatched type is kept in `value`
    assert!(session_shredded.is_null(3));
    assert_eq!(
        Variant::new(metadata.value(3), session_fallback.value(3)),
        Variant::from("not-a-uuid")
    );

    // Row 4: "id" is an int64 rather than a UUID
    assert!(result.is_valid(4));

    assert!(top_value.is_null(4)); // no extra fields to carry
    assert!(top_typed.is_valid(4));

    assert!(id_fallback.is_valid(4)); // mismatched type is kept in `value`
    assert!(id_shredded.is_null(4));
    assert_eq!(
        Variant::new(metadata.value(4), id_fallback.value(4)),
        Variant::from(12345i64)
    );

    assert!(session_fallback.is_null(4));
    assert!(session_shredded.is_valid(4));
    assert_eq!(session_shredded.value(4), uuid_a.as_bytes());

    // Row 5: the null input row stays null
    assert!(result.is_null(5));
}
1120+
8721121
#[test]
8731122
fn test_spec_compliance() {
8741123
let input = VariantArray::from_iter(vec![Variant::from(42i64), Variant::from("hello")]);

0 commit comments

Comments
 (0)