@@ -360,6 +360,8 @@ pub fn create_hashes<'a>(
360
360
random_state : & RandomState ,
361
361
hashes_buffer : & ' a mut Vec < u64 > ,
362
362
) -> Result < & ' a mut Vec < u64 > > {
363
+ use crate :: cast:: { as_binary_view_array, as_string_view_array} ;
364
+
363
365
for ( i, col) in arrays. iter ( ) . enumerate ( ) {
364
366
let array = col. as_ref ( ) ;
365
367
// combine hashes with `combine_hashes` for all columns besides the first
@@ -370,8 +372,10 @@ pub fn create_hashes<'a>(
370
372
DataType :: Boolean => hash_array( as_boolean_array( array) ?, random_state, hashes_buffer, rehash) ,
371
373
DataType :: Utf8 => hash_array( as_string_array( array) ?, random_state, hashes_buffer, rehash) ,
372
374
DataType :: LargeUtf8 => hash_array( as_largestring_array( array) , random_state, hashes_buffer, rehash) ,
375
+ DataType :: Utf8View => hash_array( as_string_view_array( array) ?, random_state, hashes_buffer, rehash) ,
373
376
DataType :: Binary => hash_array( as_generic_binary_array:: <i32 >( array) ?, random_state, hashes_buffer, rehash) ,
374
377
DataType :: LargeBinary => hash_array( as_generic_binary_array:: <i64 >( array) ?, random_state, hashes_buffer, rehash) ,
378
+ DataType :: BinaryView => hash_array( as_binary_view_array( array) ?, random_state, hashes_buffer, rehash) ,
375
379
DataType :: FixedSizeBinary ( _) => {
376
380
let array: & FixedSizeBinaryArray = array. as_any( ) . downcast_ref( ) . unwrap( ) ;
377
381
hash_array( array, random_state, hashes_buffer, rehash)
@@ -486,22 +490,57 @@ mod tests {
486
490
Ok ( ( ) )
487
491
}
488
492
489
- #[ test]
490
- fn create_hashes_binary ( ) -> Result < ( ) > {
491
- let byte_array = Arc :: new ( BinaryArray :: from_vec ( vec ! [
492
- & [ 4 , 3 , 2 ] ,
493
- & [ 4 , 3 , 2 ] ,
494
- & [ 1 , 2 , 3 ] ,
495
- ] ) ) ;
493
/// Generates a `#[test]` named `$NAME` that hashes the same binary fixture
/// twice: once collected into `$ARRAY` and once into a reference
/// `BinaryArray`, then checks that
///
/// * null slots hash to zero and non-null slots do not,
/// * both array representations produce identical hashes, and
/// * equal inputs yield equal hashes while distinct inputs yield distinct
///   hashes.
///
/// The generated test is compiled out when the `force_hash_collisions`
/// feature is enabled, because the assertions depend on real hash values.
macro_rules! create_hash_binary {
    ($NAME:ident, $ARRAY:ty) => {
        #[cfg(not(feature = "force_hash_collisions"))]
        #[test]
        fn $NAME() {
            // Mix of values at most and longer than 12 bytes, one null, and
            // two pairs of duplicates (indices 0/5 and 4/6).
            let binary = [
                Some(b"short".to_byte_slice()),
                None,
                Some(b"long but different 12 bytes string"),
                Some(b"short2"),
                Some(b"Longer than 12 bytes string"),
                Some(b"short"),
                Some(b"Longer than 12 bytes string"),
            ];

            let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
            let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());

            // Fixed seeds keep the hash values deterministic across runs.
            let random_state = RandomState::with_seeds(0, 0, 0, 0);

            let mut binary_hashes = vec![0; binary.len()];
            create_hashes(&[binary_array], &random_state, &mut binary_hashes)
                .unwrap();

            let mut ref_hashes = vec![0; binary.len()];
            create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();

            // Null values result in a zero hash,
            for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
                match val {
                    Some(_) => assert_ne!(*hash, 0),
                    None => assert_eq!(*hash, 0),
                }
            }

            // same logical values should hash to the same hash value
            assert_eq!(binary_hashes, ref_hashes);

            // Sanity-check the fixture itself before relying on it below.
            assert_eq!(binary[0], binary[5]);
            assert_eq!(binary[4], binary[6]);
            assert_ne!(binary[0], binary[2]);

            // Same values should map to same hash values
            assert_eq!(binary_hashes[0], binary_hashes[5]);
            assert_eq!(binary_hashes[4], binary_hashes[6]);

            // different binary should map to different hash values
            assert_ne!(binary_hashes[0], binary_hashes[2]);
        }
    };
}

create_hash_binary!(binary_array, BinaryArray);
create_hash_binary!(binary_view_array, BinaryViewArray);
543
+
505
544
#[ test]
506
545
fn create_hashes_fixed_size_binary ( ) -> Result < ( ) > {
507
546
let input_arg = vec ! [ vec![ 1 , 2 ] , vec![ 5 , 6 ] , vec![ 5 , 6 ] ] ;
@@ -517,6 +556,64 @@ mod tests {
517
556
Ok ( ( ) )
518
557
}
519
558
559
/// Generates a `#[test]` named `$NAME` that hashes the same string fixture
/// twice: once collected into `$ARRAY` and once into a
/// `DictionaryArray<Int8Type>`, then checks that
///
/// * null slots hash to zero and non-null slots do not,
/// * the plain and dictionary-encoded representations produce identical
///   hashes, and
/// * equal inputs yield equal hashes while distinct inputs yield distinct
///   hashes.
///
/// The generated test is compiled out when the `force_hash_collisions`
/// feature is enabled, because the assertions depend on real hash values.
macro_rules! create_hash_string {
    ($NAME:ident, $ARRAY:ty) => {
        #[cfg(not(feature = "force_hash_collisions"))]
        #[test]
        fn $NAME() {
            // Mix of values at most and longer than 12 bytes, one null, and
            // two pairs of duplicates (indices 0/5 and 4/6).
            let strings = [
                Some("short"),
                None,
                Some("long but different 12 bytes string"),
                Some("short2"),
                Some("Longer than 12 bytes string"),
                Some("short"),
                Some("Longer than 12 bytes string"),
            ];

            let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
            let dict_array = Arc::new(
                strings
                    .iter()
                    .cloned()
                    .collect::<DictionaryArray<Int8Type>>(),
            );

            // Fixed seeds keep the hash values deterministic across runs.
            let random_state = RandomState::with_seeds(0, 0, 0, 0);

            let mut string_hashes = vec![0; strings.len()];
            create_hashes(&[string_array], &random_state, &mut string_hashes)
                .unwrap();

            let mut dict_hashes = vec![0; strings.len()];
            create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();

            // Null values result in a zero hash,
            for (val, hash) in strings.iter().zip(string_hashes.iter()) {
                match val {
                    Some(_) => assert_ne!(*hash, 0),
                    None => assert_eq!(*hash, 0),
                }
            }

            // same logical values should hash to the same hash value
            assert_eq!(string_hashes, dict_hashes);

            // Sanity-check the fixture itself before relying on it below.
            assert_eq!(strings[0], strings[5]);
            assert_eq!(strings[4], strings[6]);
            assert_ne!(strings[0], strings[2]);

            // Same values should map to same hash values
            assert_eq!(string_hashes[0], string_hashes[5]);
            assert_eq!(string_hashes[4], string_hashes[6]);

            // different strings should map to different hash values
            assert_ne!(string_hashes[0], string_hashes[2]);
        }
    };
}
611
+
612
+ create_hash_string ! ( string_array, StringArray ) ;
613
+ create_hash_string ! ( large_string_array, LargeStringArray ) ;
614
+ create_hash_string ! ( string_view_array, StringArray ) ;
615
+ create_hash_string ! ( dict_string_array, DictionaryArray <Int8Type >) ;
616
+
520
617
#[ test]
521
618
// Tests actual values of hashes, which are different if forcing collisions
522
619
#[ cfg( not( feature = "force_hash_collisions" ) ) ]
0 commit comments