Skip to content

Commit e8a9168

Browse files
authored
Fix truncated string statistics handling (#79)
fix string statistics comparison
1 parent 9b0aaba commit e8a9168

File tree

5 files changed

+259
-37
lines changed

5 files changed

+259
-37
lines changed

src/bin/orc/common.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,17 @@ pub fn format_stats(stats: &ColumnStatistics) -> String {
8686
parts.push(format!("min={min}"));
8787
parts.push(format!("max={max}"));
8888
}
89-
TypeStatistics::String { min, max, .. } => {
90-
parts.push(format!("min={min}"));
91-
parts.push(format!("max={max}"));
89+
TypeStatistics::String {
90+
lower_bound,
91+
upper_bound,
92+
sum: _,
93+
is_exact_min,
94+
is_exact_max,
95+
} => {
96+
parts.push(format!("min={lower_bound}"));
97+
parts.push(format!("max={upper_bound}"));
98+
parts.push(format!("is_exact_min={is_exact_min}"));
99+
parts.push(format!("is_exact_max={is_exact_max}"));
92100
}
93101
TypeStatistics::Bucket { true_count } => {
94102
parts.push(format!("true_count={true_count}"));

src/bin/orc/stats.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,19 @@ fn print_column_stats(col_stats: &ColumnStatistics) {
5353
println!("* Sum: {sum}");
5454
}
5555
}
56-
orc_rust::statistics::TypeStatistics::String { min, max, sum } => {
56+
orc_rust::statistics::TypeStatistics::String {
57+
lower_bound,
58+
upper_bound,
59+
sum,
60+
is_exact_min,
61+
is_exact_max,
62+
} => {
5763
println!("* Data type String");
58-
println!("* Minimum: {min}");
59-
println!("* Maximum: {max}");
64+
println!("* Minimum: {lower_bound}");
65+
println!("* Maximum: {upper_bound}");
6066
println!("* Sum: {sum}");
67+
println!("* IsExactMin: {is_exact_min}");
68+
println!("* IsExactMax: {is_exact_max}");
6169
}
6270
orc_rust::statistics::TypeStatistics::Bucket { true_count } => {
6371
println!("* Data type Bucket");

src/row_group_filter.rs

Lines changed: 190 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,21 @@ fn evaluate_comparison_with_stats(
249249
}
250250

251251
// String comparisons
252-
TypeStatistics::String { min, max, .. } => match value {
253-
PredicateValue::Utf8(Some(v)) => evaluate_string_comparison(min, max, op, v),
252+
TypeStatistics::String {
253+
lower_bound,
254+
upper_bound,
255+
is_exact_min,
256+
is_exact_max,
257+
..
258+
} => match value {
259+
PredicateValue::Utf8(Some(v)) => evaluate_string_comparison(
260+
lower_bound,
261+
upper_bound,
262+
op,
263+
v,
264+
*is_exact_min,
265+
*is_exact_max,
266+
),
254267
_ => {
255268
return Err(UnexpectedSnafu {
256269
msg: "Type mismatch: expected string value".to_string(),
@@ -295,7 +308,7 @@ fn evaluate_comparison_with_stats(
295308
PredicateValue::Utf8(Some(v)) => {
296309
// For decimal, we need to compare strings
297310
// This is a simplified implementation
298-
evaluate_string_comparison(min, max, op, v)
311+
evaluate_string_comparison(min, max, op, v, true, true)
299312
}
300313
_ => {
301314
return Err(UnexpectedSnafu {
@@ -461,31 +474,41 @@ fn evaluate_float_comparison(min: f64, max: f64, op: ComparisonOp, value: f64) -
461474
}
462475
}
463476

464-
fn evaluate_string_comparison(min: &str, max: &str, op: ComparisonOp, value: &str) -> bool {
477+
fn evaluate_string_comparison(
478+
lower_bound: &str,
479+
upper_bound: &str,
480+
op: ComparisonOp,
481+
value: &str,
482+
is_exact_min: bool,
483+
is_exact_max: bool,
484+
) -> bool {
485+
// Check if the column's minimum is <= value
486+
let min_le_value = lower_bound < value || (lower_bound == value && is_exact_min);
487+
488+
// Check if the column's maximum is >= value
489+
let max_ge_value = upper_bound > value || (upper_bound == value && is_exact_max);
490+
465491
match op {
466-
ComparisonOp::Equal => {
467-
// col = value: keep if value is within [min, max] lexicographically
468-
min <= value && value <= max
469-
}
492+
// Range intersection: The value must be reachable from both sides.
493+
ComparisonOp::Equal => min_le_value && max_ge_value,
494+
495+
// One-sided inclusive checks reuse the logic above.
496+
ComparisonOp::LessThanOrEqual => min_le_value,
497+
ComparisonOp::GreaterThanOrEqual => max_ge_value,
498+
499+
// Strict checks are simple.
500+
// Note: We don't need to check exactness here.
501+
// e.g., for LessThan: if lower_bound == value, then actual_min >= value,
502+
// so NO row can be strictly less than value.
503+
ComparisonOp::LessThan => lower_bound < value,
504+
ComparisonOp::GreaterThan => upper_bound > value,
505+
506+
// Special case: Only prune != if we are certain the column contains ONLY `value`.
470507
ComparisonOp::NotEqual => {
471-
// col != value: keep if value is not the only value
472-
!(min == value && max == value)
473-
}
474-
ComparisonOp::LessThan => {
475-
// col < value: keep if min < value
476-
min < value
477-
}
478-
ComparisonOp::LessThanOrEqual => {
479-
// col <= value: keep if min <= value
480-
min <= value
481-
}
482-
ComparisonOp::GreaterThan => {
483-
// col > value: keep if max > value
484-
max > value
485-
}
486-
ComparisonOp::GreaterThanOrEqual => {
487-
// col >= value: keep if max >= value
488-
max >= value
508+
let is_single_value_col = lower_bound == upper_bound && is_exact_min && is_exact_max;
509+
510+
// Keep unless it's a single-value column exactly matching the target
511+
!(is_single_value_col && lower_bound == value)
489512
}
490513
}
491514
}
@@ -1324,4 +1347,145 @@ mod tests {
13241347
assert_eq!(result.len(), 1);
13251348
assert!(result[0]);
13261349
}
1350+
1351+
#[test]
1352+
fn test_evaluate_string_comparison() {
1353+
use crate::predicate::ComparisonOp;
1354+
1355+
// Helper to make the call shorter
1356+
let eval = |lower: &str,
1357+
upper: &str,
1358+
op: ComparisonOp,
1359+
val: &str,
1360+
exact_min: bool,
1361+
exact_max: bool| {
1362+
super::evaluate_string_comparison(lower, upper, op, val, exact_min, exact_max)
1363+
};
1364+
1365+
// 1. EQUAL
1366+
// Range ["a", "c"], value "b" -> Keep
1367+
assert!(eval("a", "c", ComparisonOp::Equal, "b", true, true));
1368+
// Range ["a", "c"], value "d" -> Skip
1369+
assert!(!eval("a", "c", ComparisonOp::Equal, "d", true, true));
1370+
// Range ["a", "c"], value "a" -> Keep
1371+
assert!(eval("a", "c", ComparisonOp::Equal, "a", true, true));
1372+
// Range ["a", "c"], value "c" -> Keep
1373+
assert!(eval("a", "c", ComparisonOp::Equal, "c", true, true));
1374+
1375+
// Truncated stats (inexact)
1376+
// Range ["a", "c"] (min inexact), value "a" -> Skip (actual min > "a")
1377+
assert!(!eval("a", "c", ComparisonOp::Equal, "a", false, true));
1378+
// Range ["a", "c"] (max inexact), value "c" -> Skip (actual max < "c" max is rounded up)
1379+
assert!(!eval("a", "c", ComparisonOp::Equal, "c", true, false));
1380+
1381+
// 2. LESS THAN (< value)
1382+
// Range ["a", "c"], value "b". "a" < "b" -> Keep.
1383+
assert!(eval("a", "c", ComparisonOp::LessThan, "b", true, true));
1384+
// Range ["d", "e"], value "b". "d" >= "b" -> Skip.
1385+
assert!(!eval("d", "e", ComparisonOp::LessThan, "b", true, true));
1386+
// Range ["a", "c"], value "a". "a" < "a" is false.
1387+
// If exact, min="a", so no value < "a". Skip.
1388+
assert!(!eval("a", "c", ComparisonOp::LessThan, "a", true, true));
1389+
// If not exact, min="a" (truncated). Actual min >= "a".
1390+
// So actual min >= value. No value < "a". Skip.
1391+
assert!(!eval("a", "c", ComparisonOp::LessThan, "a", false, true));
1392+
1393+
// 3. GREATER THAN (> value)
1394+
// Range ["a", "c"], value "b". "c" > "b" -> Keep.
1395+
assert!(eval("a", "c", ComparisonOp::GreaterThan, "b", true, true));
1396+
// Range ["a", "b"], value "c". "b" <= "c" -> Skip.
1397+
assert!(!eval("a", "b", ComparisonOp::GreaterThan, "c", true, true));
1398+
// Range ["a", "c"], value "c". "c" > "c" is false.
1399+
// If exact, max="c", so no value > "c". Skip.
1400+
assert!(!eval("a", "c", ComparisonOp::GreaterThan, "c", true, true));
1401+
// If not exact, actual max < "c". Skip.
1402+
assert!(!eval("a", "c", ComparisonOp::GreaterThan, "c", true, false));
1403+
1404+
// 4. NOT EQUAL
1405+
// Range ["a", "c"], value "b". Keep.
1406+
assert!(eval("a", "c", ComparisonOp::NotEqual, "b", true, true));
1407+
// Range ["a", "a"], value "a".
1408+
// Exact: Skip.
1409+
assert!(!eval("a", "a", ComparisonOp::NotEqual, "a", true, true));
1410+
1411+
// 5. LESS THAN OR EQUAL (<= value)
1412+
// Range ["a", "c"], value "b". Keep.
1413+
assert!(eval(
1414+
"a",
1415+
"c",
1416+
ComparisonOp::LessThanOrEqual,
1417+
"b",
1418+
true,
1419+
true
1420+
));
1421+
// Range ["a", "c"], value "a". Keep.
1422+
assert!(eval(
1423+
"a",
1424+
"c",
1425+
ComparisonOp::LessThanOrEqual,
1426+
"a",
1427+
true,
1428+
true
1429+
));
1430+
// Range ["b", "c"], value "a". "b" > "a". Skip.
1431+
assert!(!eval(
1432+
"b",
1433+
"c",
1434+
ComparisonOp::LessThanOrEqual,
1435+
"a",
1436+
true,
1437+
true
1438+
));
1439+
// Inexact min:
1440+
// Range ["a", "c"] (min inexact), value "a".
1441+
// Actual_min > "a". Skip.
1442+
assert!(!eval(
1443+
"a",
1444+
"c",
1445+
ComparisonOp::LessThanOrEqual,
1446+
"a",
1447+
false,
1448+
true
1449+
));
1450+
1451+
// 6. GREATER THAN OR EQUAL (>= value)
1452+
// Range ["a", "c"], value "b". Keep.
1453+
assert!(eval(
1454+
"a",
1455+
"c",
1456+
ComparisonOp::GreaterThanOrEqual,
1457+
"b",
1458+
true,
1459+
true
1460+
));
1461+
// Range ["a", "c"], value "c". Keep.
1462+
assert!(eval(
1463+
"a",
1464+
"c",
1465+
ComparisonOp::GreaterThanOrEqual,
1466+
"c",
1467+
true,
1468+
true
1469+
));
1470+
// Range ["a", "b"], value "c". "b" < "c". Skip.
1471+
assert!(!eval(
1472+
"a",
1473+
"b",
1474+
ComparisonOp::GreaterThanOrEqual,
1475+
"c",
1476+
true,
1477+
true
1478+
));
1479+
// Inexact max:
1480+
// Range ["a", "b"] (max inexact), value "b".
1481+
// Actual_max < "b". Skip.
1482+
assert!(!eval(
1483+
"a",
1484+
"b",
1485+
ComparisonOp::GreaterThanOrEqual,
1486+
"b",
1487+
true,
1488+
false
1489+
));
1490+
}
13271491
}

src/statistics.rs

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,14 @@ pub enum TypeStatistics {
5858
sum: Option<f64>,
5959
},
6060
String {
61-
min: String,
62-
max: String,
61+
lower_bound: String,
62+
upper_bound: String,
6363
/// Total length of all strings
6464
sum: i64,
65+
/// If true, 'min' is an exact minimum. If false, it is a lower bound.
66+
is_exact_min: bool,
67+
/// If true, 'max' is an exact maximum. If false, it is an upper bound.
68+
is_exact_max: bool,
6569
},
6670
/// For Boolean
6771
Bucket { true_count: u64 },
@@ -101,7 +105,9 @@ impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics {
101105
type Error = error::OrcError;
102106

103107
fn try_from(value: &proto::ColumnStatistics) -> Result<Self, Self::Error> {
104-
let type_statistics = if let Some(stats) = &value.int_statistics {
108+
let type_statistics = if value.number_of_values() == 0 {
109+
None
110+
} else if let Some(stats) = &value.int_statistics {
105111
Some(TypeStatistics::Integer {
106112
min: stats.minimum(),
107113
max: stats.maximum(),
@@ -114,10 +120,22 @@ impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics {
114120
sum: stats.sum,
115121
})
116122
} else if let Some(stats) = &value.string_statistics {
123+
let (lower_bound, is_exact_min) = stats
124+
.minimum
125+
.as_deref()
126+
.map(|s| (s, true))
127+
.unwrap_or_else(|| (stats.lower_bound(), false));
128+
let (upper_bound, is_exact_max) = stats
129+
.maximum
130+
.as_deref()
131+
.map(|s| (s, true))
132+
.unwrap_or_else(|| (stats.upper_bound(), false));
117133
Some(TypeStatistics::String {
118-
min: stats.minimum().to_owned(),
119-
max: stats.maximum().to_owned(),
134+
lower_bound: lower_bound.to_owned(),
135+
upper_bound: upper_bound.to_owned(),
120136
sum: stats.sum(),
137+
is_exact_min,
138+
is_exact_max,
121139
})
122140
} else if let Some(stats) = &value.bucket_statistics {
123141
// TODO: false count?

0 commit comments

Comments
 (0)