From 507e2a9e39f93356bdc0dae1521bbdc46629681c Mon Sep 17 00:00:00 2001 From: SongTao Zhuang <51652084+MichaelDeSteven@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:13:31 +0800 Subject: [PATCH] Spark: Make delete file ratio configurable (#12148) --- .../actions/SizeBasedDataRewriter.java | 33 +++++++++++- docs/docs/spark-procedures.md | 1 + .../spark/actions/TestSparkFileRewriter.java | 49 +++++++++++++++++- .../spark/actions/TestSparkFileRewriter.java | 51 ++++++++++++++++++- .../spark/actions/TestSparkFileRewriter.java | 51 ++++++++++++++++++- 5 files changed, 180 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java index 61b90d9fc6e3..0c55b2892add 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java @@ -47,10 +47,26 @@ public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriterDefaults to 0.3, which means that if the deletion ratio of a file reaches or exceeds 30%, it + * may trigger the rewriting operation. 
+ */ + public static final String DELETE_RATIO_THRESHOLD = "delete-ratio-threshold"; + + public static final double DELETE_RATIO_THRESHOLD_DEFAULT = 0.3; private int deleteFileThreshold; + private double deleteRatioThreshold; + protected SizeBasedDataRewriter(Table table) { super(table); } @@ -60,6 +76,7 @@ public Set validOptions() { return ImmutableSet.builder() .addAll(super.validOptions()) .add(DELETE_FILE_THRESHOLD) + .add(DELETE_RATIO_THRESHOLD) .build(); } @@ -67,6 +84,18 @@ public Set validOptions() { public void init(Map options) { super.init(options); this.deleteFileThreshold = deleteFileThreshold(options); + this.deleteRatioThreshold = deleteRatioThreshold(options); + } + + private double deleteRatioThreshold(Map options) { + double value = + PropertyUtil.propertyAsDouble( + options, DELETE_RATIO_THRESHOLD, DELETE_RATIO_THRESHOLD_DEFAULT); + Preconditions.checkArgument( + value > 0, "'%s' is set to %s but must be > 0", DELETE_RATIO_THRESHOLD, value); + Preconditions.checkArgument( + value <= 1, "'%s' is set to %s but must be <= 1", DELETE_RATIO_THRESHOLD, value); + return value; } @Override @@ -116,7 +145,7 @@ private boolean tooHighDeleteRatio(FileScanTask task) { double deletedRecords = (double) Math.min(knownDeletedRecordCount, task.file().recordCount()); double deleteRatio = deletedRecords / task.file().recordCount(); - return deleteRatio >= DELETE_RATIO_THRESHOLD; + return deleteRatio >= deleteRatioThreshold; } @Override diff --git a/docs/docs/spark-procedures.md b/docs/docs/spark-procedures.md index c5f307a54c98..aa8c22186973 100644 --- a/docs/docs/spark-procedures.md +++ b/docs/docs/spark-procedures.md @@ -403,6 +403,7 @@ Iceberg can compact data files in parallel using Spark with the `rewriteDataFile | `rewrite-all` | false | Force rewriting of all provided files overriding other options | | `max-file-group-size-bytes` | 107374182400 (100GB) | Largest amount of data that should be rewritten in a single file group. 
The entire rewrite operation is broken down into pieces based on partitioning and within partitions based on size into file-groups. This helps with breaking down the rewriting of very large partitions which may not be rewritable otherwise due to the resource constraints of the cluster. | | `delete-file-threshold` | 2147483647 | Minimum number of deletes that needs to be associated with a data file for it to be considered for rewriting | +| `delete-ratio-threshold` | 0.3 | Minimum ratio of deleted records to total records in a data file for it to be considered for rewriting; must be greater than 0 and at most 1 | | `output-spec-id` | current partition spec id | Identifier of the output partition spec. Data will be reorganized during the rewrite to align with the output partitioning. | | `remove-dangling-deletes` | false | Remove dangling position and equality deletes after rewriting. A delete file is considered dangling if it does not apply to any live data files. Enabling this will generate an additional commit for the removal. 
| diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java index eef8fb43468f..1067b4ab7322 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java @@ -246,7 +246,8 @@ public void testBinPackDataValidOptions() { SparkBinPackDataRewriter.MIN_INPUT_FILES, SparkBinPackDataRewriter.REWRITE_ALL, SparkBinPackDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, - SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD), + SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD, + SparkBinPackDataRewriter.DELETE_RATIO_THRESHOLD), rewriter.validOptions()); } @@ -265,6 +266,7 @@ public void testSortDataValidOptions() { SparkSortDataRewriter.REWRITE_ALL, SparkSortDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, SparkSortDataRewriter.DELETE_FILE_THRESHOLD, + SparkSortDataRewriter.DELETE_RATIO_THRESHOLD, SparkSortDataRewriter.COMPRESSION_FACTOR), rewriter.validOptions()); } @@ -285,6 +287,7 @@ public void testZOrderDataValidOptions() { SparkZOrderDataRewriter.REWRITE_ALL, SparkZOrderDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, SparkZOrderDataRewriter.DELETE_FILE_THRESHOLD, + SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, SparkZOrderDataRewriter.COMPRESSION_FACTOR, SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION), @@ -301,7 +304,20 @@ public void testInvalidValuesForBinPackDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SizeBasedDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + 
assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SizeBasedDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } @Test @@ -314,12 +330,27 @@ public void testInvalidValuesForSortDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SparkShufflingDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SparkShufflingDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } @Test @@ -333,24 +364,40 @@ public void testInvalidValuesForZOrderDataOptions() { Map 
invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); Map invalidMaxOutputOptions = ImmutableMap.of(SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, "0"); assertThatThrownBy(() -> rewriter.init(invalidMaxOutputOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Cannot have the interleaved ZOrder value use less than 1 byte") .hasMessageContaining("'max-output-size' was set to 0"); Map invalidVarLengthContributionOptions = ImmutableMap.of(SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION, "0"); assertThatThrownBy(() -> rewriter.init(invalidVarLengthContributionOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Cannot use less than 1 byte for variable length types with ZOrder") .hasMessageContaining("'var-length-contribution' was set to 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must 
be <= 1"); } private void validateSizeBasedRewriterOptions(SizeBasedFileRewriter rewriter) { diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java index 9722b40f2c45..7d728a912214 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java @@ -246,7 +246,8 @@ public void testBinPackDataValidOptions() { SparkBinPackDataRewriter.MIN_INPUT_FILES, SparkBinPackDataRewriter.REWRITE_ALL, SparkBinPackDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, - SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD), + SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD, + SparkBinPackDataRewriter.DELETE_RATIO_THRESHOLD), rewriter.validOptions()); } @@ -266,6 +267,7 @@ public void testSortDataValidOptions() { SparkSortDataRewriter.REWRITE_ALL, SparkSortDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, SparkSortDataRewriter.DELETE_FILE_THRESHOLD, + SparkSortDataRewriter.DELETE_RATIO_THRESHOLD, SparkSortDataRewriter.COMPRESSION_FACTOR), rewriter.validOptions()); } @@ -287,6 +289,7 @@ public void testZOrderDataValidOptions() { SparkZOrderDataRewriter.REWRITE_ALL, SparkZOrderDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, SparkZOrderDataRewriter.DELETE_FILE_THRESHOLD, + SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, SparkZOrderDataRewriter.COMPRESSION_FACTOR, SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION), @@ -303,7 +306,21 @@ public void testInvalidValuesForBinPackDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); + + Map 
negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SizeBasedDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SizeBasedDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } @Test @@ -316,12 +333,27 @@ public void testInvalidValuesForSortDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SparkShufflingDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SparkShufflingDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must 
be <= 1"); } @Test @@ -335,24 +367,41 @@ public void testInvalidValuesForZOrderDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); Map invalidMaxOutputOptions = ImmutableMap.of(SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, "0"); assertThatThrownBy(() -> rewriter.init(invalidMaxOutputOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Cannot have the interleaved ZOrder value use less than 1 byte") .hasMessageContaining("'max-output-size' was set to 0"); Map invalidVarLengthContributionOptions = ImmutableMap.of(SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION, "0"); assertThatThrownBy(() -> rewriter.init(invalidVarLengthContributionOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Cannot use less than 1 byte for variable length types with ZOrder") .hasMessageContaining("'var-length-contribution' was set to 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } private void validateSizeBasedRewriterOptions(SizeBasedFileRewriter rewriter) { diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java index 3ffa53c3e6aa..42e008ef21d3 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java @@ -269,7 +269,8 @@ public void testBinPackDataValidOptions() { SparkBinPackDataRewriter.MIN_INPUT_FILES, SparkBinPackDataRewriter.REWRITE_ALL, SparkBinPackDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, - SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD)); + SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD, + SparkBinPackDataRewriter.DELETE_RATIO_THRESHOLD)); } @Test @@ -289,6 +290,7 @@ public void testSortDataValidOptions() { SparkSortDataRewriter.REWRITE_ALL, SparkSortDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, SparkSortDataRewriter.DELETE_FILE_THRESHOLD, + SparkSortDataRewriter.DELETE_RATIO_THRESHOLD, SparkSortDataRewriter.COMPRESSION_FACTOR)); } @@ -310,6 +312,7 @@ public void testZOrderDataValidOptions() { SparkZOrderDataRewriter.REWRITE_ALL, SparkZOrderDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, SparkZOrderDataRewriter.DELETE_FILE_THRESHOLD, + SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, SparkZOrderDataRewriter.COMPRESSION_FACTOR, SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION)); @@ -325,7 +328,21 @@ public void testInvalidValuesForBinPackDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) 
.hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SizeBasedDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SizeBasedDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } @Test @@ -338,12 +355,27 @@ public void testInvalidValuesForSortDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SparkShufflingDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + ImmutableMap.of(SparkShufflingDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } @Test @@ -357,24 +389,41 @@ public void testInvalidValuesForZOrderDataOptions() { Map invalidDeleteThresholdOptions = ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); Map invalidMaxOutputOptions = ImmutableMap.of(SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, "0"); assertThatThrownBy(() -> rewriter.init(invalidMaxOutputOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Cannot have the interleaved ZOrder value use less than 1 byte") .hasMessageContaining("'max-output-size' was set to 0"); Map invalidVarLengthContributionOptions = ImmutableMap.of(SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION, "0"); assertThatThrownBy(() -> rewriter.init(invalidVarLengthContributionOptions)) + .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Cannot use less than 1 byte for variable length types with ZOrder") .hasMessageContaining("'var-length-contribution' was set to 0"); + + Map negativeDeleteRatioThresholdOptions = + ImmutableMap.of(SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, "-1"); + assertThatThrownBy(() -> rewriter.init(negativeDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to -1.0 but must be > 0"); + + Map invalidDeleteRatioThresholdOptions = + 
ImmutableMap.of(SparkZOrderDataRewriter.DELETE_RATIO_THRESHOLD, "127"); + + assertThatThrownBy(() -> rewriter.init(invalidDeleteRatioThresholdOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("'delete-ratio-threshold' is set to 127.0 but must be <= 1"); } private void validateSizeBasedRewriterOptions(SizeBasedFileRewriter rewriter) {