Skip to content

Commit 3a39853

Browse files
committed
support slice view data on writing ipc
1 parent 51c1b4b commit 3a39853

File tree

1 file changed

+65
-8
lines changed

1 file changed

+65
-8
lines changed

arrow-ipc/src/writer.rs

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -580,7 +580,12 @@ fn append_variadic_buffer_counts(counts: &mut Vec<i64>, array: &ArrayData) {
580580
DataType::BinaryView | DataType::Utf8View => {
581581
// The spec documents that the counts only include the variadic buffers, not the view/null buffers.
582582
// https://arrow.apache.org/docs/format/Columnar.html#variadic-buffers
583-
counts.push(array.buffers().len() as i64 - 1);
583+
let views = array.buffers()[0].typed_data::<u128>();
584+
if views.iter().any(|view| *view as u32 > 12) {
585+
counts.push(array.buffers().len() as i64 - 1);
586+
} else {
587+
counts.push(0);
588+
}
584589
}
585590
DataType::Dictionary(_, _) => {
586591
// Do nothing
@@ -1245,6 +1250,63 @@ fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Arra
12451250
(offsets, child_data)
12461251
}
12471252

1253+
/// Overwrites the buffer-index field of a packed view in place.
///
/// The buffer index occupies bits 64..96 of the 128-bit view. The remaining
/// bits (length, prefix and offset, see [`ByteView`]) are left untouched.
fn update_buffer_index(value: &mut u128, new_buffer_index: u32) {
    /// Bit position at which the 32-bit buffer index starts.
    const BUFFER_INDEX_SHIFT: u32 = 64;
    /// Mask selecting exactly the 32 buffer-index bits.
    const BUFFER_INDEX_FIELD: u128 = 0xFFFF_FFFF << BUFFER_INDEX_SHIFT;

    // Drop the old index, then splice the new one into the vacated bits.
    let without_index = *value & !BUFFER_INDEX_FIELD;
    let shifted_index = u128::from(new_buffer_index) << BUFFER_INDEX_SHIFT;
    *value = without_index | shifted_index;
}
1264+
1265+
fn select_data_buffers(mut views_slice: Vec<u128>, data: &ArrayData) -> Vec<Buffer> {
1266+
let first_buffer = views_slice.iter().find(|view| (**view) as u32 > 12);
1267+
// all values shorter than 12 bytes.
1268+
if first_buffer.is_none() {
1269+
return vec![Buffer::from_vec(views_slice)];
1270+
}
1271+
let first_buffer_index = ((*first_buffer.unwrap()) >> 64) as u32 as usize;
1272+
1273+
let last_buffer = views_slice
1274+
.iter()
1275+
.rfind(|view| (**view) as u32 > 12)
1276+
.unwrap();
1277+
let last_buffer_index = ((*last_buffer) >> 64) as u32 as usize;
1278+
1279+
let data_buffers = &data.buffers()[1..];
1280+
let sliced_data_buffers = &data_buffers[first_buffer_index..last_buffer_index + 1];
1281+
1282+
// if first buffer index not 0, we need re-mapping view's buffer index to sliced data buffers
1283+
if first_buffer_index != 0 {
1284+
views_slice
1285+
.iter_mut()
1286+
.filter(|view| (**view) as u32 > 12)
1287+
.for_each(|view| {
1288+
// new buffer index = original buffer index - offset
1289+
let new_buffer_index = ((*view >> 64) as u32) - first_buffer_index as u32;
1290+
update_buffer_index(view, new_buffer_index);
1291+
});
1292+
}
1293+
1294+
let mut buffers = Vec::with_capacity(sliced_data_buffers.len() + 1);
1295+
buffers.push(views_slice.iter().copied().collect());
1296+
buffers.extend_from_slice(sliced_data_buffers);
1297+
buffers
1298+
}
1299+
1300+
fn get_byte_view_buffers(data: &ArrayData) -> Vec<Buffer> {
1301+
if data.is_empty() {
1302+
return Vec::with_capacity(0);
1303+
}
1304+
1305+
let views_slice = data.buffers()[0].typed_data::<u128>();
1306+
let views_slice = &views_slice[data.offset()..data.offset() + data.len()];
1307+
select_data_buffers(views_slice.to_vec(), data)
1308+
}
1309+
12481310
/// Write array data to a vector of bytes
12491311
#[allow(clippy::too_many_arguments)]
12501312
fn write_array_data(
@@ -1303,13 +1365,8 @@ fn write_array_data(
13031365
)?;
13041366
}
13051367
} else if matches!(data_type, DataType::BinaryView | DataType::Utf8View) {
1306-
// Slicing the views buffer is safe and easy,
1307-
// but pruning unneeded data buffers is much more nuanced since it's complicated to prove that no views reference the pruned buffers
1308-
//
1309-
// Current implementation just serialize the raw arrays as given and not try to optimize anything.
1310-
// If users wants to "compact" the arrays prior to sending them over IPC,
1311-
// they should consider the gc API suggested in #5513
1312-
for buffer in array_data.buffers() {
1368+
let view_buffers = get_byte_view_buffers(array_data);
1369+
for buffer in view_buffers {
13131370
offset = write_buffer(
13141371
buffer.as_slice(),
13151372
buffers,

0 commit comments

Comments
 (0)