Skip to content

Commit 2908a80

Browse files
rluvatonalamb
andauthored
add extend_dictionary in dictionary builder for improved performance (apache#6875)
* add `extend_dictionary` in dictionary builder for improved performance * fix extends all nulls * support null in mapped value * adding comment * run `clippy` and `fmt` * fix ci * Apply suggestions from code review Co-authored-by: Andrew Lamb <[email protected]> --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 1dbe523 commit 2908a80

File tree

2 files changed

+379
-6
lines changed

2 files changed

+379
-6
lines changed

arrow-array/src/builder/generic_bytes_dictionary_builder.rs

Lines changed: 185 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
1919
use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
20-
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
20+
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray};
2121
use arrow_buffer::ArrowNativeType;
2222
use arrow_schema::{ArrowError, DataType};
2323
use hashbrown::HashTable;
@@ -305,6 +305,63 @@ where
305305
};
306306
}
307307

308+
/// Extends builder with an existing dictionary array.
309+
///
310+
/// This is the same as [`Self::extend`] but is faster as it translates
311+
/// the dictionary values once rather than doing a lookup for each item in the iterator
312+
///
313+
/// when dictionary values are null (the actual mapped values) the keys are null
314+
///
315+
pub fn extend_dictionary(
316+
&mut self,
317+
dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
318+
) -> Result<(), ArrowError> {
319+
let values = dictionary.values();
320+
321+
let v_len = values.len();
322+
let k_len = dictionary.keys().len();
323+
if v_len == 0 && k_len == 0 {
324+
return Ok(());
325+
}
326+
327+
// All nulls
328+
if v_len == 0 {
329+
self.append_nulls(k_len);
330+
return Ok(());
331+
}
332+
333+
if k_len == 0 {
334+
return Err(ArrowError::InvalidArgumentError(
335+
"Dictionary keys should not be empty when values are not empty".to_string(),
336+
));
337+
}
338+
339+
// Orphan values will be carried over to the new dictionary
340+
let mapped_values = values
341+
.iter()
342+
// Dictionary values can technically be null, so we need to handle that
343+
.map(|dict_value| {
344+
dict_value
345+
.map(|dict_value| self.get_or_insert_key(dict_value))
346+
.transpose()
347+
})
348+
.collect::<Result<Vec<_>, _>>()?;
349+
350+
// Just insert the keys without additional lookups
351+
dictionary.keys().iter().for_each(|key| match key {
352+
None => self.append_null(),
353+
Some(original_dict_index) => {
354+
let index = original_dict_index.as_usize().min(v_len - 1);
355+
match mapped_values[index] {
356+
None => self.append_null(),
357+
Some(mapped_value) => self.keys_builder.append_value(mapped_value),
358+
}
359+
}
360+
});
361+
362+
Ok(())
363+
}
364+
308365
/// Builds the `DictionaryArray` and reset this builder.
309366
pub fn finish(&mut self) -> DictionaryArray<K> {
310367
self.dedup.clear();
@@ -445,8 +502,9 @@ mod tests {
445502
use super::*;
446503

447504
use crate::array::Int8Array;
505+
use crate::cast::AsArray;
448506
use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type};
449-
use crate::{BinaryArray, StringArray};
507+
use crate::{ArrowPrimitiveType, BinaryArray, StringArray};
450508

451509
fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
452510
where
@@ -664,4 +722,129 @@ mod tests {
664722
assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
665723
assert_eq!(dict.values().len(), 4);
666724
}
725+
726+
#[test]
727+
fn test_extend_dictionary() {
728+
let some_dict = {
729+
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
730+
builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
731+
builder.extend([None::<&str>]);
732+
builder.extend(["c", "d", "a"].into_iter().map(Some));
733+
builder.append_null();
734+
builder.finish()
735+
};
736+
737+
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
738+
builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
739+
builder
740+
.extend_dictionary(&some_dict.downcast_dict().unwrap())
741+
.unwrap();
742+
let dict = builder.finish();
743+
744+
assert_eq!(dict.values().len(), 6);
745+
746+
let values = dict
747+
.downcast_dict::<GenericByteArray<Utf8Type>>()
748+
.unwrap()
749+
.into_iter()
750+
.collect::<Vec<_>>();
751+
752+
assert_eq!(
753+
values,
754+
[
755+
Some("e"),
756+
Some("e"),
757+
Some("f"),
758+
Some("e"),
759+
Some("d"),
760+
Some("a"),
761+
Some("b"),
762+
Some("c"),
763+
Some("a"),
764+
Some("b"),
765+
Some("c"),
766+
None,
767+
Some("c"),
768+
Some("d"),
769+
Some("a"),
770+
None
771+
]
772+
);
773+
}
774+
#[test]
775+
fn test_extend_dictionary_with_null_in_mapped_value() {
776+
let some_dict = {
777+
let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
778+
let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();
779+
780+
// Manually build a dictionary values that the mapped values have null
781+
values_builder.append_null();
782+
keys_builder.append_value(0);
783+
values_builder.append_value("I like worm hugs");
784+
keys_builder.append_value(1);
785+
786+
let values = values_builder.finish();
787+
let keys = keys_builder.finish();
788+
789+
let data_type = DataType::Dictionary(
790+
Box::new(Int32Type::DATA_TYPE),
791+
Box::new(Utf8Type::DATA_TYPE),
792+
);
793+
794+
let builder = keys
795+
.into_data()
796+
.into_builder()
797+
.data_type(data_type)
798+
.child_data(vec![values.into_data()]);
799+
800+
DictionaryArray::from(unsafe { builder.build_unchecked() })
801+
};
802+
803+
let some_dict_values = some_dict.values().as_string::<i32>();
804+
assert_eq!(
805+
some_dict_values.into_iter().collect::<Vec<_>>(),
806+
&[None, Some("I like worm hugs")]
807+
);
808+
809+
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
810+
builder
811+
.extend_dictionary(&some_dict.downcast_dict().unwrap())
812+
.unwrap();
813+
let dict = builder.finish();
814+
815+
assert_eq!(dict.values().len(), 1);
816+
817+
let values = dict
818+
.downcast_dict::<GenericByteArray<Utf8Type>>()
819+
.unwrap()
820+
.into_iter()
821+
.collect::<Vec<_>>();
822+
823+
assert_eq!(values, [None, Some("I like worm hugs")]);
824+
}
825+
826+
#[test]
827+
fn test_extend_all_null_dictionary() {
828+
let some_dict = {
829+
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
830+
builder.append_nulls(2);
831+
builder.finish()
832+
};
833+
834+
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
835+
builder
836+
.extend_dictionary(&some_dict.downcast_dict().unwrap())
837+
.unwrap();
838+
let dict = builder.finish();
839+
840+
assert_eq!(dict.values().len(), 0);
841+
842+
let values = dict
843+
.downcast_dict::<GenericByteArray<Utf8Type>>()
844+
.unwrap()
845+
.into_iter()
846+
.collect::<Vec<_>>();
847+
848+
assert_eq!(values, [None, None]);
849+
}
667850
}

0 commit comments

Comments
 (0)