|
17 | 17 |
|
18 | 18 | use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
|
19 | 19 | use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
|
20 |
| -use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; |
| 20 | +use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray}; |
21 | 21 | use arrow_buffer::ArrowNativeType;
|
22 | 22 | use arrow_schema::{ArrowError, DataType};
|
23 | 23 | use hashbrown::HashTable;
|
@@ -305,6 +305,63 @@ where
|
305 | 305 | };
|
306 | 306 | }
|
307 | 307 |
|
| 308 | + /// Extends builder with an existing dictionary array. |
| 309 | + /// |
| 310 | + /// This is the same as [`Self::extend`] but is faster as it translates |
| 311 | + /// the dictionary values once rather than doing a lookup for each item in the iterator |
| 312 | + /// |
| 313 | + /// when dictionary values are null (the actual mapped values) the keys are null |
| 314 | + /// |
| 315 | + pub fn extend_dictionary( |
| 316 | + &mut self, |
| 317 | + dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>, |
| 318 | + ) -> Result<(), ArrowError> { |
| 319 | + let values = dictionary.values(); |
| 320 | + |
| 321 | + let v_len = values.len(); |
| 322 | + let k_len = dictionary.keys().len(); |
| 323 | + if v_len == 0 && k_len == 0 { |
| 324 | + return Ok(()); |
| 325 | + } |
| 326 | + |
| 327 | + // All nulls |
| 328 | + if v_len == 0 { |
| 329 | + self.append_nulls(k_len); |
| 330 | + return Ok(()); |
| 331 | + } |
| 332 | + |
| 333 | + if k_len == 0 { |
| 334 | + return Err(ArrowError::InvalidArgumentError( |
| 335 | + "Dictionary keys should not be empty when values are not empty".to_string(), |
| 336 | + )); |
| 337 | + } |
| 338 | + |
| 339 | + // Orphan values will be carried over to the new dictionary |
| 340 | + let mapped_values = values |
| 341 | + .iter() |
| 342 | + // Dictionary values can technically be null, so we need to handle that |
| 343 | + .map(|dict_value| { |
| 344 | + dict_value |
| 345 | + .map(|dict_value| self.get_or_insert_key(dict_value)) |
| 346 | + .transpose() |
| 347 | + }) |
| 348 | + .collect::<Result<Vec<_>, _>>()?; |
| 349 | + |
| 350 | + // Just insert the keys without additional lookups |
| 351 | + dictionary.keys().iter().for_each(|key| match key { |
| 352 | + None => self.append_null(), |
| 353 | + Some(original_dict_index) => { |
| 354 | + let index = original_dict_index.as_usize().min(v_len - 1); |
| 355 | + match mapped_values[index] { |
| 356 | + None => self.append_null(), |
| 357 | + Some(mapped_value) => self.keys_builder.append_value(mapped_value), |
| 358 | + } |
| 359 | + } |
| 360 | + }); |
| 361 | + |
| 362 | + Ok(()) |
| 363 | + } |
| 364 | + |
308 | 365 | /// Builds the `DictionaryArray` and reset this builder.
|
309 | 366 | pub fn finish(&mut self) -> DictionaryArray<K> {
|
310 | 367 | self.dedup.clear();
|
@@ -445,8 +502,9 @@ mod tests {
|
445 | 502 | use super::*;
|
446 | 503 |
|
447 | 504 | use crate::array::Int8Array;
|
| 505 | + use crate::cast::AsArray; |
448 | 506 | use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type};
|
449 |
| - use crate::{BinaryArray, StringArray}; |
| 507 | + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; |
450 | 508 |
|
451 | 509 | fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
|
452 | 510 | where
|
@@ -664,4 +722,129 @@ mod tests {
|
664 | 722 | assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
|
665 | 723 | assert_eq!(dict.values().len(), 4);
|
666 | 724 | }
|
| 725 | + |
| 726 | + #[test] |
| 727 | + fn test_extend_dictionary() { |
| 728 | + let some_dict = { |
| 729 | + let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new(); |
| 730 | + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); |
| 731 | + builder.extend([None::<&str>]); |
| 732 | + builder.extend(["c", "d", "a"].into_iter().map(Some)); |
| 733 | + builder.append_null(); |
| 734 | + builder.finish() |
| 735 | + }; |
| 736 | + |
| 737 | + let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new(); |
| 738 | + builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); |
| 739 | + builder |
| 740 | + .extend_dictionary(&some_dict.downcast_dict().unwrap()) |
| 741 | + .unwrap(); |
| 742 | + let dict = builder.finish(); |
| 743 | + |
| 744 | + assert_eq!(dict.values().len(), 6); |
| 745 | + |
| 746 | + let values = dict |
| 747 | + .downcast_dict::<GenericByteArray<Utf8Type>>() |
| 748 | + .unwrap() |
| 749 | + .into_iter() |
| 750 | + .collect::<Vec<_>>(); |
| 751 | + |
| 752 | + assert_eq!( |
| 753 | + values, |
| 754 | + [ |
| 755 | + Some("e"), |
| 756 | + Some("e"), |
| 757 | + Some("f"), |
| 758 | + Some("e"), |
| 759 | + Some("d"), |
| 760 | + Some("a"), |
| 761 | + Some("b"), |
| 762 | + Some("c"), |
| 763 | + Some("a"), |
| 764 | + Some("b"), |
| 765 | + Some("c"), |
| 766 | + None, |
| 767 | + Some("c"), |
| 768 | + Some("d"), |
| 769 | + Some("a"), |
| 770 | + None |
| 771 | + ] |
| 772 | + ); |
| 773 | + } |
| 774 | + #[test] |
| 775 | + fn test_extend_dictionary_with_null_in_mapped_value() { |
| 776 | + let some_dict = { |
| 777 | + let mut values_builder = GenericByteBuilder::<Utf8Type>::new(); |
| 778 | + let mut keys_builder = PrimitiveBuilder::<Int32Type>::new(); |
| 779 | + |
| 780 | + // Manually build a dictionary values that the mapped values have null |
| 781 | + values_builder.append_null(); |
| 782 | + keys_builder.append_value(0); |
| 783 | + values_builder.append_value("I like worm hugs"); |
| 784 | + keys_builder.append_value(1); |
| 785 | + |
| 786 | + let values = values_builder.finish(); |
| 787 | + let keys = keys_builder.finish(); |
| 788 | + |
| 789 | + let data_type = DataType::Dictionary( |
| 790 | + Box::new(Int32Type::DATA_TYPE), |
| 791 | + Box::new(Utf8Type::DATA_TYPE), |
| 792 | + ); |
| 793 | + |
| 794 | + let builder = keys |
| 795 | + .into_data() |
| 796 | + .into_builder() |
| 797 | + .data_type(data_type) |
| 798 | + .child_data(vec![values.into_data()]); |
| 799 | + |
| 800 | + DictionaryArray::from(unsafe { builder.build_unchecked() }) |
| 801 | + }; |
| 802 | + |
| 803 | + let some_dict_values = some_dict.values().as_string::<i32>(); |
| 804 | + assert_eq!( |
| 805 | + some_dict_values.into_iter().collect::<Vec<_>>(), |
| 806 | + &[None, Some("I like worm hugs")] |
| 807 | + ); |
| 808 | + |
| 809 | + let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new(); |
| 810 | + builder |
| 811 | + .extend_dictionary(&some_dict.downcast_dict().unwrap()) |
| 812 | + .unwrap(); |
| 813 | + let dict = builder.finish(); |
| 814 | + |
| 815 | + assert_eq!(dict.values().len(), 1); |
| 816 | + |
| 817 | + let values = dict |
| 818 | + .downcast_dict::<GenericByteArray<Utf8Type>>() |
| 819 | + .unwrap() |
| 820 | + .into_iter() |
| 821 | + .collect::<Vec<_>>(); |
| 822 | + |
| 823 | + assert_eq!(values, [None, Some("I like worm hugs")]); |
| 824 | + } |
| 825 | + |
| 826 | + #[test] |
| 827 | + fn test_extend_all_null_dictionary() { |
| 828 | + let some_dict = { |
| 829 | + let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new(); |
| 830 | + builder.append_nulls(2); |
| 831 | + builder.finish() |
| 832 | + }; |
| 833 | + |
| 834 | + let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new(); |
| 835 | + builder |
| 836 | + .extend_dictionary(&some_dict.downcast_dict().unwrap()) |
| 837 | + .unwrap(); |
| 838 | + let dict = builder.finish(); |
| 839 | + |
| 840 | + assert_eq!(dict.values().len(), 0); |
| 841 | + |
| 842 | + let values = dict |
| 843 | + .downcast_dict::<GenericByteArray<Utf8Type>>() |
| 844 | + .unwrap() |
| 845 | + .into_iter() |
| 846 | + .collect::<Vec<_>>(); |
| 847 | + |
| 848 | + assert_eq!(values, [None, None]); |
| 849 | + } |
667 | 850 | }
|
0 commit comments