
Commit 6070619

Write null counts in parquet files when they are present
1 parent 69b17ad commit 6070619

2 files changed: +148 -47 lines

parquet/src/file/statistics.rs

Lines changed: 122 additions & 21 deletions
@@ -125,22 +125,35 @@ pub fn from_thrift(
 ) -> Result<Option<Statistics>> {
     Ok(match thrift_stats {
         Some(stats) => {
-            // Number of nulls recorded, when it is not available, we just mark it as 0.
-            // TODO this should be `None` if there is no information about NULLS.
-            // see https://github.com/apache/arrow-rs/pull/6216/files
-            let null_count = stats.null_count.unwrap_or(0);
-
-            if null_count < 0 {
-                return Err(ParquetError::General(format!(
-                    "Statistics null count is negative {}",
-                    null_count
-                )));
-            }
+            // transform null count to u64
+            let null_count = stats
+                .null_count
+                .map(|null_count| {
+                    if null_count < 0 {
+                        return Err(ParquetError::General(format!(
+                            "Statistics null count is negative {}",
+                            null_count
+                        )));
+                    }
+                    Ok(null_count as u64)
+                })
+                .transpose()?;

-            // Generic null count.
-            let null_count = Some(null_count as u64);
             // Generic distinct count (count of distinct values occurring)
-            let distinct_count = stats.distinct_count.map(|value| value as u64);
+            let distinct_count = stats
+                .distinct_count
+                .map(|distinct_count| {
+                    if distinct_count < 0 {
+                        return Err(ParquetError::General(format!(
+                            "Statistics distinct count is negative {}",
+                            distinct_count
+                        )));
+                    }
+
+                    Ok(distinct_count as u64)
+                })
+                .transpose()?;
+
             // Whether or not statistics use deprecated min/max fields.
             let old_format = stats.min_value.is_none() && stats.max_value.is_none();
             // Generic min value as bytes.
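The change above means an absent null count is now preserved as None instead of being coerced to 0, while a negative count is rejected. A minimal standalone sketch (hypothetical helper name, String error for brevity, not crate code) of the Option::map + transpose pattern used here:

    fn count_from_thrift(count: Option<i64>) -> Result<Option<u64>, String> {
        count
            .map(|c| {
                // Negative counts are invalid in the thrift metadata.
                if c < 0 {
                    return Err(format!("Statistics count is negative {}", c));
                }
                Ok(c as u64)
            })
            // Option<Result<u64, _>> -> Result<Option<u64>, _>
            .transpose()
    }

    fn main() {
        assert_eq!(count_from_thrift(None), Ok(None)); // absent stays absent
        assert_eq!(count_from_thrift(Some(3)), Ok(Some(3))); // valid values widen to u64
        assert!(count_from_thrift(Some(-1)).is_err()); // negative counts error
    }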
@@ -244,20 +257,21 @@ pub fn from_thrift(
 pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
     let stats = stats?;

-    // record null counts if greater than zero.
-    //
-    // TODO: This should be Some(0) if there are no nulls.
-    // see https://github.com/apache/arrow-rs/pull/6216/files
+    // record null count if it can fit in i64
     let null_count = stats
         .null_count_opt()
-        .map(|value| value as i64)
-        .filter(|&x| x > 0);
+        .and_then(|value| i64::try_from(value).ok());
+
+    // record distinct count if it can fit in i64
+    let distinct_count = stats
+        .distinct_count()
+        .and_then(|value| i64::try_from(value).ok());

     let mut thrift_stats = TStatistics {
         max: None,
         min: None,
         null_count,
-        distinct_count: stats.distinct_count().map(|value| value as i64),
+        distinct_count,
         max_value: None,
         min_value: None,
         is_max_value_exact: None,
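On the write side, the counts are u64 in memory but i64 in thrift, so a value that does not fit is dropped rather than wrapped, and a present zero is now written instead of being filtered out. A standalone illustration (hypothetical helper, not crate code) of the i64::try_from(..).ok() conversion:

    fn count_to_thrift(count: Option<u64>) -> Option<i64> {
        // Keep the count only if it fits the signed thrift representation.
        count.and_then(|value| i64::try_from(value).ok())
    }

    fn main() {
        assert_eq!(count_to_thrift(None), None); // no statistic recorded
        assert_eq!(count_to_thrift(Some(0)), Some(0)); // zero is now written, not dropped
        assert_eq!(count_to_thrift(Some(u64::MAX)), None); // too large for i64, omitted
    }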
@@ -1041,4 +1055,91 @@ mod tests {
             true,
         ));
     }
+
+    #[test]
+    fn test_count_encoding() {
+        statistics_count_test(None, None);
+        statistics_count_test(Some(0), Some(0));
+        statistics_count_test(Some(100), Some(2000));
+        statistics_count_test(Some(1), None);
+        statistics_count_test(None, Some(1));
+    }
+
+    #[test]
+    fn test_count_encoding_distinct_too_large() {
+        // statistics are stored using i64, so test trying to store larger values
+        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
+        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
+        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
+        assert_eq!(thrift_stats.null_count, Some(100));
+    }
+
+    #[test]
+    fn test_count_encoding_null_too_large() {
+        // statistics are stored using i64, so test trying to store larger values
+        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
+        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
+        assert_eq!(thrift_stats.distinct_count, Some(100));
+        assert_eq!(thrift_stats.null_count, None); // can't store u64 max --> null
+    }
+
+    #[test]
+    fn test_count_decoding_distinct_invalid() {
+        let tstatistics = TStatistics {
+            distinct_count: Some(-42),
+            ..Default::default()
+        };
+        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Parquet error: Statistics distinct count is negative -42"
+        );
+    }
+
+    #[test]
+    fn test_count_decoding_null_invalid() {
+        let tstatistics = TStatistics {
+            null_count: Some(-42),
+            ..Default::default()
+        };
+        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Parquet error: Statistics null count is negative -42"
+        );
+    }
+
+    /// Writes statistics to thrift, reads them back, and ensures:
+    /// - the round-tripped statistics equal the original statistics
+    /// - the counts written to thrift match the original counts
+    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
+        let statistics = make_bool_stats(distinct_count, null_count);
+
+        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
+        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
+        assert_eq!(
+            thrift_stats.distinct_count.map(|c| c as u64),
+            distinct_count
+        );
+
+        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
+            .unwrap()
+            .unwrap();
+        assert_eq!(round_tripped, statistics);
+    }
+
+    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
+        let min = Some(true);
+        let max = Some(false);
+        let is_min_max_deprecated = false;
+
+        // test is about the counts, so we aren't really testing the min/max values
+        Statistics::Boolean(ValueStatistics::new(
+            min,
+            max,
+            distinct_count,
+            null_count,
+            is_min_max_deprecated,
+        ))
+    }
 }
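For context, a hedged round-trip sketch of the behavior the new tests lock in, written from a caller's point of view. It assumes the items are publicly reachable at the paths shown (parquet::basic::Type and parquet::file::statistics::{from_thrift, to_thrift, Statistics, ValueStatistics}), mirroring how the test module constructs statistics; it is an illustration, not part of the commit:

    use parquet::basic::Type;
    use parquet::file::statistics::{from_thrift, to_thrift, Statistics, ValueStatistics};

    fn main() {
        // Boolean statistics with 7 nulls and no distinct count.
        let stats = Statistics::Boolean(ValueStatistics::new(
            Some(true),
            Some(false),
            None,    // distinct count unknown
            Some(7), // null count known
            false,
        ));

        // Encode to the thrift form written into the file metadata.
        let thrift = to_thrift(Some(&stats)).unwrap();
        assert_eq!(thrift.null_count, Some(7)); // present counts are written as-is
        assert_eq!(thrift.distinct_count, None);

        // Decode again; the counts survive the round trip unchanged.
        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift)).unwrap().unwrap();
        assert_eq!(round_tripped.null_count_opt(), Some(7));
    }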

parquet/tests/arrow_writer_layout.rs

Lines changed: 26 additions & 26 deletions

The expected page header sizes in this layout test each grow by two bytes because data page headers now carry a null count field that was previously omitted when the count was zero (see the sketch after this file's diff).
@@ -189,7 +189,7 @@ fn test_primitive()
             pages: (0..8)
                 .map(|_| Page {
                     rows: 250,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 1000,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
@@ -218,22 +218,22 @@ fn test_primitive()
             pages: vec![
                 Page {
                     rows: 250,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 258,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 1750,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 7000,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
                 },
             ],
             dictionary_page: Some(Page {
                 rows: 250,
-                page_header_size: 36,
+                page_header_size: 38,
                 compressed_size: 1000,
                 encoding: Encoding::PLAIN,
                 page_type: PageType::DICTIONARY_PAGE,
@@ -260,50 +260,50 @@ fn test_primitive()
             pages: vec![
                 Page {
                     rows: 400,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 452,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 370,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 472,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 330,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 464,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 330,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 464,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 330,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 464,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 240,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 332,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
             ],
             dictionary_page: Some(Page {
                 rows: 2000,
-                page_header_size: 36,
+                page_header_size: 38,
                 compressed_size: 8000,
                 encoding: Encoding::PLAIN,
                 page_type: PageType::DICTIONARY_PAGE,
@@ -329,7 +329,7 @@ fn test_primitive()
             pages: (0..20)
                 .map(|_| Page {
                     rows: 100,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 400,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
@@ -364,14 +364,14 @@ fn test_string()
             pages: (0..15)
                 .map(|_| Page {
                     rows: 130,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 1040,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
                 })
                 .chain(std::iter::once(Page {
                     rows: 50,
-                    page_header_size: 35,
+                    page_header_size: 37,
                     compressed_size: 400,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
@@ -400,29 +400,29 @@ fn test_string()
             pages: vec![
                 Page {
                     rows: 130,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 138,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 1250,
-                    page_header_size: 38,
+                    page_header_size: 40,
                     compressed_size: 10000,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 620,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 4960,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
                 },
             ],
             dictionary_page: Some(Page {
                 rows: 130,
-                page_header_size: 36,
+                page_header_size: 38,
                 compressed_size: 1040,
                 encoding: Encoding::PLAIN,
                 page_type: PageType::DICTIONARY_PAGE,
@@ -449,50 +449,50 @@ fn test_string()
             pages: vec![
                 Page {
                     rows: 400,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 452,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 370,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 472,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 330,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 464,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 330,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 464,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 330,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 464,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
                 Page {
                     rows: 240,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 332,
                     encoding: Encoding::RLE_DICTIONARY,
                     page_type: PageType::DATA_PAGE,
                 },
             ],
             dictionary_page: Some(Page {
                 rows: 2000,
-                page_header_size: 36,
+                page_header_size: 38,
                 compressed_size: 16000,
                 encoding: Encoding::PLAIN,
                 page_type: PageType::DICTIONARY_PAGE,
@@ -532,7 +532,7 @@ fn test_list()
             pages: (0..10)
                 .map(|_| Page {
                     rows: 20,
-                    page_header_size: 36,
+                    page_header_size: 38,
                     compressed_size: 672,
                     encoding: Encoding::PLAIN,
                     page_type: PageType::DATA_PAGE,
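As a back-of-the-envelope check on the page_header_size bumps above (hypothetical arithmetic, not the writer's code): in the thrift compact protocol a small i64 field costs one byte of field header plus a zigzag varint payload, and zigzag(0) encodes in one byte, which matches the uniform two-byte growth (36 -> 38, 35 -> 37, 38 -> 40) once null_count = 0 is always written:

    // Length in bytes of a zigzag-encoded varint, as used by the thrift compact protocol.
    fn zigzag_varint_len(v: i64) -> usize {
        let mut z = (v.wrapping_shl(1) ^ (v >> 63)) as u64;
        let mut len = 1;
        while z >= 0x80 {
            z >>= 7;
            len += 1;
        }
        len
    }

    fn main() {
        let field_header = 1; // short-form compact field header (id delta + type)
        // Writing null_count = 0 adds two bytes to each page header.
        assert_eq!(field_header + zigzag_varint_len(0), 2);
    }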
