@@ -580,7 +580,12 @@ fn append_variadic_buffer_counts(counts: &mut Vec<i64>, array: &ArrayData) {
580
580
DataType :: BinaryView | DataType :: Utf8View => {
581
581
// The spec documents the counts only includes the variadic buffers, not the view/null buffers.
582
582
// https://arrow.apache.org/docs/format/Columnar.html#variadic-buffers
583
- counts. push ( array. buffers ( ) . len ( ) as i64 - 1 ) ;
583
+ let views = array. buffers ( ) [ 0 ] . typed_data :: < u128 > ( ) ;
584
+ if views. iter ( ) . any ( |view| * view as u32 > 12 ) {
585
+ counts. push ( array. buffers ( ) . len ( ) as i64 - 1 ) ;
586
+ } else {
587
+ counts. push ( 0 ) ;
588
+ }
584
589
}
585
590
DataType :: Dictionary ( _, _) => {
586
591
// Do nothing
@@ -1245,6 +1250,63 @@ fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Arra
1245
1250
( offsets, child_data)
1246
1251
}
1247
1252
1253
/// Overwrites the buffer-index field of a packed 128-bit view in place.
///
/// The buffer index occupies bits 64..96 of the view. The remaining bits
/// (length, prefix and offset; see [`ByteView`] for the layout) are left
/// untouched.
///
/// * `value` - the packed view to modify.
/// * `new_buffer_index` - the index to store in the buffer-index field.
fn update_buffer_index(value: &mut u128, new_buffer_index: u32) {
    /// Bit position at which the 32-bit buffer index starts.
    const BUFFER_INDEX_SHIFT: u32 = 64;
    /// Selects exactly the 32 bits that hold the buffer index.
    const BUFFER_INDEX_MASK: u128 = 0xFFFF_FFFF_u128 << BUFFER_INDEX_SHIFT;

    // Clear the old buffer index, then OR the new one into the same field.
    let cleared = *value & !BUFFER_INDEX_MASK;
    *value = cleared | (u128::from(new_buffer_index) << BUFFER_INDEX_SHIFT);
}
1264
+
1265
+ fn select_data_buffers ( mut views_slice : Vec < u128 > , data : & ArrayData ) -> Vec < Buffer > {
1266
+ let first_buffer = views_slice. iter ( ) . find ( |view| ( * * view) as u32 > 12 ) ;
1267
+ // all values shorter than 12 bytes.
1268
+ if first_buffer. is_none ( ) {
1269
+ return vec ! [ Buffer :: from_vec( views_slice) ] ;
1270
+ }
1271
+ let first_buffer_index = ( ( * first_buffer. unwrap ( ) ) >> 64 ) as u32 as usize ;
1272
+
1273
+ let last_buffer = views_slice
1274
+ . iter ( )
1275
+ . rfind ( |view| ( * * view) as u32 > 12 )
1276
+ . unwrap ( ) ;
1277
+ let last_buffer_index = ( ( * last_buffer) >> 64 ) as u32 as usize ;
1278
+
1279
+ let data_buffers = & data. buffers ( ) [ 1 ..] ;
1280
+ let sliced_data_buffers = & data_buffers[ first_buffer_index..last_buffer_index + 1 ] ;
1281
+
1282
+ // if first buffer index not 0, we need re-mapping view's buffer index to sliced data buffers
1283
+ if first_buffer_index != 0 {
1284
+ views_slice
1285
+ . iter_mut ( )
1286
+ . filter ( |view| ( * * view) as u32 > 12 )
1287
+ . for_each ( |view| {
1288
+ // new buffer index = original buffer index - offset
1289
+ let new_buffer_index = ( ( * view >> 64 ) as u32 ) - first_buffer_index as u32 ;
1290
+ update_buffer_index ( view, new_buffer_index) ;
1291
+ } ) ;
1292
+ }
1293
+
1294
+ let mut buffers = Vec :: with_capacity ( sliced_data_buffers. len ( ) + 1 ) ;
1295
+ buffers. push ( views_slice. iter ( ) . copied ( ) . collect ( ) ) ;
1296
+ buffers. extend_from_slice ( sliced_data_buffers) ;
1297
+ buffers
1298
+ }
1299
+
1300
+ fn get_byte_view_buffers ( data : & ArrayData ) -> Vec < Buffer > {
1301
+ if data. is_empty ( ) {
1302
+ return Vec :: with_capacity ( 0 ) ;
1303
+ }
1304
+
1305
+ let views_slice = data. buffers ( ) [ 0 ] . typed_data :: < u128 > ( ) ;
1306
+ let views_slice = & views_slice[ data. offset ( ) ..data. offset ( ) + data. len ( ) ] ;
1307
+ select_data_buffers ( views_slice. to_vec ( ) , data)
1308
+ }
1309
+
1248
1310
/// Write array data to a vector of bytes
1249
1311
#[ allow( clippy:: too_many_arguments) ]
1250
1312
fn write_array_data (
@@ -1303,13 +1365,8 @@ fn write_array_data(
1303
1365
) ?;
1304
1366
}
1305
1367
} else if matches ! ( data_type, DataType :: BinaryView | DataType :: Utf8View ) {
1306
- // Slicing the views buffer is safe and easy,
1307
- // but pruning unneeded data buffers is much more nuanced since it's complicated to prove that no views reference the pruned buffers
1308
- //
1309
- // Current implementation just serialize the raw arrays as given and not try to optimize anything.
1310
- // If users wants to "compact" the arrays prior to sending them over IPC,
1311
- // they should consider the gc API suggested in #5513
1312
- for buffer in array_data. buffers ( ) {
1368
+ let view_buffers = get_byte_view_buffers ( array_data) ;
1369
+ for buffer in view_buffers {
1313
1370
offset = write_buffer (
1314
1371
buffer. as_slice ( ) ,
1315
1372
buffers,
0 commit comments