diff --git a/docs/content/append-table/query-performance.md b/docs/content/append-table/query-performance.md index d150c95bbc45..4a7b25e6a6a6 100644 --- a/docs/content/append-table/query-performance.md +++ b/docs/content/append-table/query-performance.md @@ -64,6 +64,7 @@ scenario. Using a bitmap may consume more space but can result in greater accura `Bitmap`: * `file-index.bitmap.columns`: specify the columns that need bitmap index. +* `file-index.bitmap.<column_name>.index-block-size`: configures the secondary index block size; the default value is 16kb. `Bit-Slice Index Bitmap` * `file-index.bsi.columns`: specify the columns that need bsi index. diff --git a/docs/content/concepts/spec/fileindex.md b/docs/content/concepts/spec/fileindex.md index f8cdd97505df..757657bd4c1e 100644 --- a/docs/content/concepts/spec/fileindex.md +++ b/docs/content/concepts/spec/fileindex.md @@ -98,9 +98,71 @@ This class use (64-bits) long hash. Store the num hash function (one integer) an Define `'file-index.bitmap.columns'`. +Bitmap file index format (V2): + +
+
+Bitmap file index format (V2)
++-------------------------------------------------+-----------------
+| version (1 byte) = 2                           |
++-------------------------------------------------+
+| row count (4 bytes int)                        |
++-------------------------------------------------+
+| non-null value bitmap number (4 bytes int)     |
++-------------------------------------------------+
+| has null value (1 byte)                        |
++-------------------------------------------------+
+| null value offset (4 bytes if has null value)  |       HEAD
++-------------------------------------------------+
+| null bitmap length (4 bytes if has null value) |
++-------------------------------------------------+
+| bitmap index block number (4 bytes int)        |
++-------------------------------------------------+
+| value 1 | offset 1                             |
++-------------------------------------------------+
+| value 2 | offset 2                             |
++-------------------------------------------------+
+| ...                                            |
++-------------------------------------------------+
+| bitmap body offset (4 bytes int)               |
++-------------------------------------------------+-----------------
+| bitmap index block 1                           |
++-------------------------------------------------+
+| bitmap index block 2                           |  INDEX BLOCKS
++-------------------------------------------------+
+| ...                                            |
++-------------------------------------------------+-----------------
+| serialized bitmap 1                            |
++-------------------------------------------------+
+| serialized bitmap 2                            |
++-------------------------------------------------+  BITMAP BLOCKS
+| serialized bitmap 3                            |
++-------------------------------------------------+
+| ...                                            |
++-------------------------------------------------+-----------------
+
+index block format:
++-------------------------------------------------+
+| entry number (4 bytes int)                     |
++-------------------------------------------------+
+| value 1 | offset 1 | length 1                  |
++-------------------------------------------------+
+| value 2 | offset 2 | length 2                  |
++-------------------------------------------------+
+| ...                                            |
++-------------------------------------------------+
+
+value x:                       var bytes for any data type (as bitmap identifier)
+offset:                        4 bytes int (when it is negative, the bitmap contains only one row
+                                 and the position of that row is -1 - offset)
+length:                        4 bytes int
+  
+
+ Bitmap file index format (V1):
+
 Bitmap file index format (V1)
 +-------------------------------------------------+-----------------
 | version (1 byte)                               |
@@ -135,7 +197,97 @@ offset:                        4 bytes int (when it is negative, it represents t
                                  and its position is the inverse of the negative value)
 
-Integer are all BIG_ENDIAN. +Integer are all BIG_ENDIAN. In the paimon version that supports v2, the bitmap index version defaults to v2. + +Bitmap only support the following data type: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Paimon Data TypeSupported
TinyIntTypetrue
SmallIntTypetrue
IntTypetrue
BigIntTypetrue
DateTypetrue
TimeTypetrue
LocalZonedTimestampTypetrue
TimestampTypetrue
CharTypetrue
VarCharTypetrue
StringTypetrue
BooleanTypetrue
DecimalType(precision, scale)false
FloatTypeNot recommended
DoubleTypeNot recommended
VarBinaryType, BinaryTypefalse
RowTypefalse
MapTypefalse
ArrayTypefalse
+ ## Index: Bit-Slice Index Bitmap diff --git a/docs/content/primary-key-table/query-performance.md b/docs/content/primary-key-table/query-performance.md index c32f1c5e662f..681fc0bca276 100644 --- a/docs/content/primary-key-table/query-performance.md +++ b/docs/content/primary-key-table/query-performance.md @@ -62,6 +62,7 @@ Supported filter types: `Bitmap`: * `file-index.bitmap.columns`: specify the columns that need bitmap index. +* `file-index.bitmap.<column_name>.index-block-size`: configures the secondary index block size; the default value is 16kb. `Bit-Slice Index Bitmap` * `file-index.bsi.columns`: specify the columns that need bsi index. diff --git a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/BitmapIndexBenchmark.java b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/BitmapIndexBenchmark.java new file mode 100644 index 000000000000..baecc6381969 --- /dev/null +++ b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/BitmapIndexBenchmark.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.benchmark.bitmap; + +import org.apache.paimon.benchmark.Benchmark; +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.fileindex.FileIndexReader; +import org.apache.paimon.fileindex.FileIndexResult; +import org.apache.paimon.fileindex.FileIndexWriter; +import org.apache.paimon.fileindex.bitmap.BitmapFileIndex; +import org.apache.paimon.fileindex.bitmap.BitmapIndexResult; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.utils.RoaringBitmap32; + +import org.apache.commons.io.FileUtils; +import org.junit.Rule; +import org.junit.jupiter.api.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.FileNotFoundException; + +/** Benchmark for {@link BitmapFileIndex}. */ +public class BitmapIndexBenchmark { + + public static final int ROW_COUNT = 1000000; + + private static final String prefix = "asdfghjkl"; + + @Rule public TemporaryFolder folder = new TemporaryFolder(); + + @Test + public void testQuery10() throws Exception { + testQuery(10); + } + + @Test + public void testQuery100() throws Exception { + testQuery(100); + } + + @Test + public void testQuery1000() throws Exception { + testQuery(1000); + } + + @Test + public void testQuery10000() throws Exception { + testQuery(10000); + } + + @Test + public void testQuery30000() throws Exception { + testQuery(30000); + } + + @Test + public void testQuery50000() throws Exception { + testQuery(50000); + } + + @Test + public void testQuery80000() throws Exception { + testQuery(80000); + } + + @Test + public void testQuery100000() throws Exception { + testQuery(100000); + } + + private void testQuery(int approxCardinality) throws Exception { + + RoaringBitmap32 middleBm = new RoaringBitmap32(); + + Options writeOptions1 = new Options(); + writeOptions1.setInteger(BitmapFileIndex.VERSION, 
BitmapFileIndex.VERSION_1); + FileIndexWriter writer1 = + new BitmapFileIndex(DataTypes.STRING(), writeOptions1).createWriter(); + + Options writeOptions2 = new Options(); + writeOptions1.setInteger(BitmapFileIndex.VERSION, BitmapFileIndex.VERSION_2); + FileIndexWriter writer2 = + new BitmapFileIndex(DataTypes.STRING(), writeOptions2).createWriter(); + + for (int i = 0; i < ROW_COUNT; i++) { + int sid = (int) (Math.random() * approxCardinality); + if (sid == approxCardinality / 2) { + middleBm.add(i); + } + writer1.write(BinaryString.fromString(prefix + sid)); + writer2.write(BinaryString.fromString(prefix + sid)); + } + + folder.create(); + + File file1 = folder.newFile("bitmap-index-v1"); + File file2 = folder.newFile("bitmap-index-v2"); + FileUtils.writeByteArrayToFile(file1, writer1.serializedBytes()); + FileUtils.writeByteArrayToFile(file2, writer2.serializedBytes()); + + Benchmark benchmark = + new Benchmark( + String.format("bitmap-index-query-benchmark-%d", approxCardinality), + 100) + .setNumWarmupIters(1) + .setOutputPerIteration(true); + + benchmark.addCase("formatV1", 10, () -> query(approxCardinality, file1, "false", "false")); + + benchmark.addCase( + "formatV1-bitmapByteBuffer", + 10, + () -> query(approxCardinality, file1, "false", "true")); + + benchmark.addCase( + "formatV1-bufferedInput", + 10, + () -> query(approxCardinality, file1, "true", "false")); + + benchmark.addCase( + "formatV1-bufferedInput-bitmapByteBuffer", + 10, + () -> query(approxCardinality, file1, "true", "true")); + + benchmark.addCase("format-v2", 10, () -> query(approxCardinality, file2, "false", "false")); + + benchmark.addCase( + "format-v2-bitmapByteBuffer", + 10, + () -> query(approxCardinality, file2, "false", "true")); + + benchmark.addCase( + "format-v2-bufferedInput", + 10, + () -> query(approxCardinality, file2, "true", "false")); + + benchmark.addCase( + "format-v2-bufferedInput-bitmapByteBuffer", + 10, + () -> query(approxCardinality, file2, "true", "true")); + + 
benchmark.run(); + } + + private static void query( + int approxCardinality, + File file1, + String enableBufferedInput, + String enableNextOffsetToSize) { + try { + FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING()); + Options options = new Options(); + options.set(BitmapFileIndex.ENABLE_BUFFERED_INPUT, enableBufferedInput); + options.set(BitmapFileIndex.ENABLE_NEXT_OFFSET_TO_SIZE, enableNextOffsetToSize); + LocalFileIO.LocalSeekableInputStream localSeekableInputStream = + new LocalFileIO.LocalSeekableInputStream(file1); + FileIndexReader reader = + new BitmapFileIndex(DataTypes.STRING(), options) + .createReader(localSeekableInputStream, 0, 0); + FileIndexResult result = + reader.visitEqual( + fieldRef, BinaryString.fromString(prefix + (approxCardinality / 2))); + ((BitmapIndexResult) result).get(); + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } +} diff --git a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/RoaringBitmapBenchmark.java b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/RoaringBitmapBenchmark.java index 4b989e96e5ee..a6e4b59e34a5 100644 --- a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/RoaringBitmapBenchmark.java +++ b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/bitmap/RoaringBitmapBenchmark.java @@ -25,6 +25,7 @@ import org.junit.jupiter.api.io.TempDir; import org.roaringbitmap.RoaringBitmap; +import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; @@ -78,6 +79,21 @@ public void testDeserialize() throws Exception { } }); + benchmark.addCase( + "deserialize(DataInputStream(BufferedInputStream))", + 10, + () -> { + try { + LocalFileIO.LocalSeekableInputStream seekableStream = + new LocalFileIO.LocalSeekableInputStream(file); + DataInputStream input = + new DataInputStream(new 
BufferedInputStream(seekableStream)); + new RoaringBitmap().deserialize(input); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + benchmark.addCase( "deserialize(DataInput, byte[])", 10, diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndex.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndex.java index 4020302c565c..dae835370b06 100644 --- a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndex.java +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndex.java @@ -33,9 +33,9 @@ import org.apache.paimon.utils.RoaringBitmap32; import java.io.ByteArrayOutputStream; -import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutputStream; +import java.nio.ByteBuffer; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; @@ -49,23 +49,31 @@ public class BitmapFileIndex implements FileIndexer { public static final int VERSION_1 = 1; + public static final int VERSION_2 = 2; + + public static final String VERSION = "version"; + public static final String INDEX_BLOCK_SIZE = "index-block-size"; + public static final String ENABLE_BUFFERED_INPUT = "enable-buffered-input"; + public static final String ENABLE_NEXT_OFFSET_TO_SIZE = "enable-next-offset-to-size"; private final DataType dataType; + private final Options options; public BitmapFileIndex(DataType dataType, Options options) { this.dataType = dataType; + this.options = options; } @Override public FileIndexWriter createWriter() { - return new Writer(dataType); + return new Writer(dataType, options); } @Override public FileIndexReader createReader( SeekableInputStream seekableInputStream, int start, int length) { try { - return new Reader(seekableInputStream, start, length); + return new Reader(seekableInputStream, start, options); } catch (Exception e) { throw new RuntimeException(e); } @@ -73,15 +81,19 @@ public 
FileIndexReader createReader( private static class Writer extends FileIndexWriter { + private final int version; private final DataType dataType; private final Function valueMapper; private final Map id2bitmap = new HashMap<>(); private final RoaringBitmap32 nullBitmap = new RoaringBitmap32(); private int rowNumber; + private final Options options; - public Writer(DataType dataType) { + public Writer(DataType dataType, Options options) { + this.version = options.getInteger(VERSION, VERSION_2); this.dataType = dataType; this.valueMapper = getValueMapper(dataType); + this.options = options; } @Override @@ -103,7 +115,7 @@ public byte[] serializedBytes() { ByteArrayOutputStream output = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(output); - dos.writeByte(VERSION_1); + dos.writeByte(version); // 1.serialize bitmaps to bytes byte[] nullBitmapBytes = nullBitmap.serialize(); @@ -111,7 +123,7 @@ public byte[] serializedBytes() { id2bitmap.entrySet().stream() .collect( Collectors.toMap( - e -> e.getKey(), e -> e.getValue().serialize())); + Map.Entry::getKey, e -> e.getValue().serialize())); // 2.build bitmap file index meta LinkedHashMap bitmapOffsets = new LinkedHashMap<>(); @@ -132,16 +144,36 @@ public byte[] serializedBytes() { offsetRef[0] += bytes.length; } }); - BitmapFileIndexMeta bitmapFileIndexMeta = - new BitmapFileIndexMeta( - dataType, - rowNumber, - id2bitmap.size(), - !nullBitmap.isEmpty(), - nullBitmap.getCardinality() == 1 - ? -1 - nullBitmap.iterator().next() - : 0, - bitmapOffsets); + BitmapFileIndexMeta bitmapFileIndexMeta; + if (version == VERSION_1) { + bitmapFileIndexMeta = + new BitmapFileIndexMeta( + dataType, + options, + rowNumber, + id2bitmap.size(), + !nullBitmap.isEmpty(), + nullBitmap.getCardinality() == 1 + ? 
-1 - nullBitmap.iterator().next() + : 0, + bitmapOffsets); + } else if (version == VERSION_2) { + bitmapFileIndexMeta = + new BitmapFileIndexMetaV2( + dataType, + options, + rowNumber, + id2bitmap.size(), + !nullBitmap.isEmpty(), + nullBitmap.getCardinality() == 1 + ? -1 - nullBitmap.iterator().next() + : 0, + nullBitmapBytes.length, + bitmapOffsets, + offsetRef[0]); + } else { + throw new RuntimeException("invalid version: " + version); + } // 3.serialize meta bitmapFileIndexMeta.serialize(dos); @@ -164,16 +196,20 @@ private static class Reader extends FileIndexReader { private final SeekableInputStream seekableInputStream; private final int headStart; - private int bodyStart; private final Map bitmaps = new LinkedHashMap<>(); + private final boolean enableNextOffsetToSize; - private int version; private BitmapFileIndexMeta bitmapFileIndexMeta; private Function valueMapper; - public Reader(SeekableInputStream seekableInputStream, int start, int length) { + private final Options options; + + public Reader(SeekableInputStream seekableInputStream, int start, Options options) { this.seekableInputStream = seekableInputStream; this.headStart = start; + this.options = options; + enableNextOffsetToSize = + options.getBoolean(BitmapFileIndex.ENABLE_NEXT_OFFSET_TO_SIZE, true); } @Override @@ -222,7 +258,7 @@ private RoaringBitmap32 getInListResultBitmap(List literals) { .map( it -> bitmaps.computeIfAbsent( - valueMapper.apply(it), k -> readBitmap(k))) + valueMapper.apply(it), this::readBitmap)) .iterator()); } @@ -235,8 +271,18 @@ private RoaringBitmap32 readBitmap(Object bitmapId) { if (offset < 0) { return RoaringBitmap32.bitmapOf(-1 - offset); } else { - seekableInputStream.seek(bodyStart + offset); + seekableInputStream.seek(bitmapFileIndexMeta.getBodyStart() + offset); RoaringBitmap32 bitmap = new RoaringBitmap32(); + if (enableNextOffsetToSize) { + int length = bitmapFileIndexMeta.getLength(bitmapId); + if (length != -1) { + DataInputStream input = new 
DataInputStream(seekableInputStream); + byte[] bytes = new byte[length]; + input.readFully(bytes); + bitmap.deserialize(ByteBuffer.wrap(bytes)); + return bitmap; + } + } bitmap.deserialize(new DataInputStream(seekableInputStream)); return bitmap; } @@ -251,18 +297,20 @@ private void readInternalMeta(DataType dataType) { this.valueMapper = getValueMapper(dataType); try { seekableInputStream.seek(headStart); - this.version = seekableInputStream.read(); - if (this.version > VERSION_1) { + int version = seekableInputStream.read(); + if (version == VERSION_1) { + this.bitmapFileIndexMeta = new BitmapFileIndexMeta(dataType, options); + this.bitmapFileIndexMeta.deserialize(seekableInputStream); + } else if (version == VERSION_2) { + this.bitmapFileIndexMeta = new BitmapFileIndexMetaV2(dataType, options); + this.bitmapFileIndexMeta.deserialize(seekableInputStream); + } else if (version > VERSION_2) { throw new RuntimeException( String.format( "read index file fail, " + "your plugin version is lower than %d", - this.version)); + version)); } - DataInput input = new DataInputStream(seekableInputStream); - this.bitmapFileIndexMeta = new BitmapFileIndexMeta(dataType); - this.bitmapFileIndexMeta.deserialize(input); - bodyStart = (int) seekableInputStream.getPos(); } catch (Exception e) { throw new RuntimeException(e); } diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMeta.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMeta.java index e18e4169ece6..3bdf1d277675 100644 --- a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMeta.java +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMeta.java @@ -19,6 +19,8 @@ package org.apache.paimon.fileindex.bitmap; import org.apache.paimon.data.BinaryString; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.options.Options; import org.apache.paimon.types.ArrayType; import 
org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; @@ -43,10 +45,15 @@ import org.apache.paimon.types.VarCharType; import org.apache.paimon.types.VariantType; +import java.io.BufferedInputStream; import java.io.DataInput; +import java.io.DataInputStream; import java.io.DataOutput; +import java.io.InputStream; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; +import java.util.function.Function; /** * @@ -88,26 +95,33 @@ */ public class BitmapFileIndexMeta { - private final DataType dataType; - - private int rowCount; - private int nonNullBitmapNumber; - private boolean hasNullValue; - private int nullValueOffset; - private LinkedHashMap bitmapOffsets; - - public BitmapFileIndexMeta(DataType dataType) { + protected final DataType dataType; + protected final Options options; + protected int rowCount; + protected int nonNullBitmapNumber; + protected boolean hasNullValue; + protected int nullValueOffset; + protected LinkedHashMap bitmapOffsets; + protected Map bitmapLengths; + protected long bodyStart; + protected boolean enableNextOffsetToSize; + + public BitmapFileIndexMeta(DataType dataType, Options options) { this.dataType = dataType; + this.options = options; + enableNextOffsetToSize = + options.getBoolean(BitmapFileIndex.ENABLE_NEXT_OFFSET_TO_SIZE, true); } public BitmapFileIndexMeta( DataType dataType, + Options options, int rowCount, int nonNullBitmapNumber, boolean hasNullValue, int nullValueOffset, LinkedHashMap bitmapOffsets) { - this(dataType); + this(dataType, options); this.rowCount = rowCount; this.nonNullBitmapNumber = nonNullBitmapNumber; this.hasNullValue = hasNullValue; @@ -119,6 +133,10 @@ public int getRowCount() { return rowCount; } + public long getBodyStart() { + return bodyStart; + } + public boolean contains(Object bitmapId) { if (bitmapId == null) { return hasNullValue; @@ -133,8 +151,118 @@ public int getOffset(Object bitmapId) { return bitmapOffsets.get(bitmapId); } + public int 
getLength(Object bitmapId) { + if (bitmapLengths == null) { + return -1; + } + return bitmapLengths.getOrDefault(bitmapId, -1); + } + public void serialize(DataOutput out) throws Exception { + ThrowableConsumer valueWriter = getValueWriter(out); + + out.writeInt(rowCount); + out.writeInt(nonNullBitmapNumber); + out.writeBoolean(hasNullValue); + if (hasNullValue) { + out.writeInt(nullValueOffset); + } + for (Map.Entry entry : bitmapOffsets.entrySet()) { + valueWriter.accept(entry.getKey()); + out.writeInt(entry.getValue()); + } + } + + public void deserialize(SeekableInputStream seekableInputStream) throws Exception { + bodyStart = seekableInputStream.getPos(); + InputStream inputStream = seekableInputStream; + if (options.getBoolean(BitmapFileIndex.ENABLE_BUFFERED_INPUT, true)) { + inputStream = new BufferedInputStream(inputStream); + } + if (enableNextOffsetToSize) { + this.bitmapLengths = new HashMap<>(); + } + DataInput in = new DataInputStream(inputStream); + ThrowableSupplier valueReader = getValueReader(in); + Function measure = getSerializeSizeMeasure(); + + rowCount = in.readInt(); + bodyStart += Integer.BYTES; + + nonNullBitmapNumber = in.readInt(); + bodyStart += Integer.BYTES; + + hasNullValue = in.readBoolean(); + bodyStart++; + + if (hasNullValue) { + nullValueOffset = in.readInt(); + bodyStart += Integer.BYTES; + } + + bitmapOffsets = new LinkedHashMap<>(); + Object lastValue = null; + int lastOffset = nullValueOffset; + for (int i = 0; i < nonNullBitmapNumber; i++) { + Object value = valueReader.get(); + int offset = in.readInt(); + bitmapOffsets.put(value, offset); + bodyStart += measure.apply(value) + Integer.BYTES; + if (enableNextOffsetToSize) { + if (offset >= 0) { + if (lastOffset >= 0) { + int length = offset - lastOffset; + bitmapLengths.put(lastValue, length); + } + lastValue = value; + lastOffset = offset; + } + } + } + } + + protected Function getSerializeSizeMeasure() { + return dataType.accept( + new DataTypeVisitorAdapter>() { + 
@Override + public Function visitBinaryString() { + return o -> Integer.BYTES + ((BinaryString) o).getSizeInBytes(); + } + + @Override + public Function visitByte() { + return o -> Byte.BYTES; + } + + @Override + public Function visitShort() { + return o -> Short.BYTES; + } + + @Override + public Function visitInt() { + return o -> Integer.BYTES; + } + + @Override + public Function visitLong() { + return o -> Long.BYTES; + } + + @Override + public Function visitFloat() { + return o -> Float.BYTES; + } + + @Override + public Function visitDouble() { + return o -> Double.BYTES; + } + }); + } + + protected ThrowableConsumer getValueWriter(DataOutput out) { ThrowableConsumer valueWriter = dataType.accept( new DataTypeVisitorAdapter() { @@ -177,21 +305,10 @@ public ThrowableConsumer visitDouble() { return o -> out.writeDouble((double) o); } }); - - out.writeInt(rowCount); - out.writeInt(nonNullBitmapNumber); - out.writeBoolean(hasNullValue); - if (hasNullValue) { - out.writeInt(nullValueOffset); - } - for (Map.Entry entry : bitmapOffsets.entrySet()) { - valueWriter.accept(entry.getKey()); - out.writeInt(entry.getValue()); - } + return valueWriter; } - public void deserialize(DataInput in) throws Exception { - + protected ThrowableSupplier getValueReader(DataInput in) { ThrowableSupplier valueReader = dataType.accept( new DataTypeVisitorAdapter() { @@ -235,17 +352,7 @@ public ThrowableSupplier visitDouble() { return in::readDouble; } }); - - rowCount = in.readInt(); - nonNullBitmapNumber = in.readInt(); - hasNullValue = in.readBoolean(); - if (hasNullValue) { - nullValueOffset = in.readInt(); - } - bitmapOffsets = new LinkedHashMap<>(); - for (int i = 0; i < nonNullBitmapNumber; i++) { - bitmapOffsets.put(valueReader.get(), in.readInt()); - } + return valueReader; } /** functional interface. 
*/ diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMetaV2.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMetaV2.java new file mode 100644 index 000000000000..0bfc7062a7c8 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapFileIndexMetaV2.java @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.fileindex.bitmap; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataType; + +import java.io.BufferedInputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +/** + * When the bitmap-indexed column cardinality is high, using the first version of the bitmap index + * format will take a lot of time to read the entire dictionary. But in fact we don't need a full + * dictionary when dealing with a small number of predicates, the performance of predicate hits on + * the bitmap can be improved by creating a secondary index on the dictionary. + * + *
+ * Bitmap file index format (V2)
+ * +-------------------------------------------------+-----------------
+ * | version (1 byte) = 2                           |
+ * +-------------------------------------------------+
+ * | row count (4 bytes int)                        |
+ * +-------------------------------------------------+
+ * | non-null value bitmap number (4 bytes int)     |
+ * +-------------------------------------------------+
+ * | has null value (1 byte)                        |
+ * +-------------------------------------------------+
+ * | null value offset (4 bytes if has null value)  |       HEAD
+ * +-------------------------------------------------+
+ * | null bitmap length (4 bytes if has null value) |
+ * +-------------------------------------------------+
+ * | bitmap index block number (4 bytes int)        |
+ * +-------------------------------------------------+
+ * | value 1 | offset 1                             |
+ * +-------------------------------------------------+
+ * | value 2 | offset 2                             |
+ * +-------------------------------------------------+
+ * | ...                                            |
+ * +-------------------------------------------------+
+ * | bitmap body offset (4 bytes int)               |
+ * +-------------------------------------------------+-----------------
+ * | bitmap index block 1                           |
+ * +-------------------------------------------------+
+ * | bitmap index block 2                           |  INDEX BLOCKS
+ * +-------------------------------------------------+
+ * | ...                                            |
+ * +-------------------------------------------------+-----------------
+ * | serialized bitmap 1                            |
+ * +-------------------------------------------------+
+ * | serialized bitmap 2                            |
+ * +-------------------------------------------------+  BITMAP BLOCKS
+ * | serialized bitmap 3                            |
+ * +-------------------------------------------------+
+ * | ...                                            |
+ * +-------------------------------------------------+-----------------
+ *
+ * index block format:
+ * +-------------------------------------------------+
+ * | entry number (4 bytes int)                     |
+ * +-------------------------------------------------+
+ * | value 1 | offset 1 | length 1                  |
+ * +-------------------------------------------------+
+ * | value 2 | offset 2 | length 2                  |
+ * +-------------------------------------------------+
+ * | ...                                            |
+ * +-------------------------------------------------+
+ * 
+ */
+public class BitmapFileIndexMetaV2 extends BitmapFileIndexMeta {
+
+    /** Upper bound in bytes for one serialized index block; only set by the writer constructor. */
+    private long blockSizeLimit;
+
+    /** Secondary index: the index blocks, in ascending order of their first key. */
+    private List<BitmapIndexBlock> indexBlocks;
+
+    /** Absolute stream position at which the serialized index blocks begin. */
+    private long indexBlockStart;
+
+    /** Serialized length of the null-value bitmap; -1 until supplied or read. */
+    private int nullBitmapLength;
+
+    /**
+     * Creates a meta with no block size limit and an unknown null-bitmap length; {@link
+     * #deserialize(SeekableInputStream)} fills in the fields it reads.
+     */
+    public BitmapFileIndexMetaV2(DataType dataType, Options options) {
+        super(dataType, options);
+        this.nullBitmapLength = -1;
+    }
+
+    /**
+     * Creates a meta from already computed offsets, ready to be serialized.
+     *
+     * @param nullBitmapLength serialized length of the null-value bitmap
+     * @param bitmapOffsets offset of each non-null value's bitmap
+     * @param finalOffset offset just past the last bitmap, used to derive the last bitmap's length
+     */
+    public BitmapFileIndexMetaV2(
+            DataType dataType,
+            Options options,
+            int rowCount,
+            int nonNullBitmapNumber,
+            boolean hasNullValue,
+            int nullValueOffset,
+            int nullBitmapLength,
+            LinkedHashMap<Object, Integer> bitmapOffsets,
+            int finalOffset) {
+        super(
+                dataType,
+                options,
+                rowCount,
+                nonNullBitmapNumber,
+                hasNullValue,
+                nullValueOffset,
+                bitmapOffsets);
+        this.nullBitmapLength = nullBitmapLength;
+        blockSizeLimit =
+                MemorySize.parse(options.getString(BitmapFileIndex.INDEX_BLOCK_SIZE, "16kb"))
+                        .getBytes();
+        if (enableNextOffsetToSize) {
+            // Derive each bitmap's length as the distance to the next non-negative offset.
+            // The walk starts at the null bitmap, whose length is stored under the null key.
+            bitmapLengths = new HashMap<>();
+            Object lastValue = null;
+            int lastOffset = nullValueOffset;
+            for (Map.Entry<Object, Integer> entry : bitmapOffsets.entrySet()) {
+                Object value = entry.getKey();
+                Integer offset = entry.getValue();
+                // NOTE(review): negative offsets are skipped; presumably they mark values that
+                // have no body in the bitmap section - confirm against the writer.
+                if (offset >= 0) {
+                    if (lastOffset >= 0) {
+                        bitmapLengths.put(lastValue, offset - lastOffset);
+                    }
+                    lastValue = value;
+                    lastOffset = offset;
+                }
+            }
+            bitmapLengths.put(lastValue, finalOffset - lastOffset);
+        }
+    }
+
+    /**
+     * Returns a natural-order comparator for values of the given type. The visitor overrides
+     * cover BinaryString, Byte, Short, Integer, Long, Float and Double; any other type falls
+     * through to the default of {@code DataTypeVisitorAdapter}, which is not visible here.
+     */
+    public static Comparator<Object> getComparator(DataType dataType) {
+        return dataType.accept(
+                new DataTypeVisitorAdapter<Comparator<Object>>() {
+                    @Override
+                    public Comparator<Object> visitBinaryString() {
+                        return Comparator.comparing(o -> ((BinaryString) o));
+                    }
+
+                    @Override
+                    public Comparator<Object> visitByte() {
+                        return Comparator.comparing(o -> ((Byte) o));
+                    }
+
+                    @Override
+                    public Comparator<Object> visitShort() {
+                        return Comparator.comparing(o -> ((Short) o));
+                    }
+
+                    @Override
+                    public Comparator<Object> visitInt() {
+                        return Comparator.comparing(o -> ((Integer) o));
+                    }
+
+                    @Override
+                    public Comparator<Object> visitLong() {
+                        return Comparator.comparing(o -> ((Long) o));
+                    }
+
+                    @Override
+                    public Comparator<Object> visitFloat() {
+                        return Comparator.comparing(o -> ((Float) o));
+                    }
+
+                    @Override
+                    public Comparator<Object> visitDouble() {
+                        return Comparator.comparing(o -> ((Double) o));
+                    }
+                });
+    }
+
+    /** Returns whether a bitmap exists for the id; {@code null} asks for the null-value bitmap. */
+    @Override
+    public boolean contains(Object bitmapId) {
+        if (bitmapId == null) {
+            return hasNullValue;
+        }
+        BitmapIndexBlock block = findBlock(bitmapId);
+        return block != null && block.contains(bitmapId);
+    }
+
+    /**
+     * Returns the offset recorded for the id; {@code null} asks for the null-value bitmap.
+     *
+     * @throws IllegalArgumentException if a non-null id is not present in the index
+     */
+    @Override
+    public int getOffset(Object bitmapId) {
+        if (bitmapId == null) {
+            return nullValueOffset;
+        }
+        return requireBlock(bitmapId).getOffset(bitmapId);
+    }
+
+    /**
+     * Returns the length recorded for the id; {@code null} asks for the null-value bitmap.
+     *
+     * @throws IllegalArgumentException if a non-null id is not present in the index
+     */
+    @Override
+    public int getLength(Object bitmapId) {
+        if (bitmapId == null) {
+            return nullBitmapLength;
+        }
+        return requireBlock(bitmapId).getLength(bitmapId);
+    }
+
+    /** Returns the last block whose key is not greater than the id, or null if there is none. */
+    private BitmapIndexBlock findBlock(Object bitmapId) {
+        Comparator<Object> comparator = getComparator(dataType);
+        BitmapIndexBlock prev = null;
+        for (BitmapIndexBlock block : indexBlocks) {
+            int cmp = comparator.compare(bitmapId, block.key);
+            if (cmp < 0) {
+                return prev;
+            }
+            prev = block;
+        }
+        return prev;
+    }
+
+    /** Like {@link #findBlock(Object)}, but fails with a descriptive error instead of null. */
+    private BitmapIndexBlock requireBlock(Object bitmapId) {
+        BitmapIndexBlock block = findBlock(bitmapId);
+        if (block == null) {
+            throw new IllegalArgumentException(
+                    "bitmap id is not present in the index: " + bitmapId);
+        }
+        return block;
+    }
+
+    /**
+     * Writes the head, the secondary index and the index blocks, in that order. Entries are
+     * sorted by key and packed into blocks of at most the configured block size. This method
+     * does not write a version byte.
+     */
+    @Override
+    public void serialize(DataOutput out) throws Exception {
+
+        ThrowableConsumer valueWriter = getValueWriter(out);
+
+        out.writeInt(rowCount);
+        out.writeInt(nonNullBitmapNumber);
+        out.writeBoolean(hasNullValue);
+        if (hasNullValue) {
+            out.writeInt(nullValueOffset);
+            out.writeInt(nullBitmapLength);
+        }
+
+        // Blocks are created on demand, so an index without non-null values writes zero blocks
+        // instead of one empty block whose key is null.
+        LinkedList<BitmapIndexBlock> indexBlocks = new LinkedList<>();
+        this.indexBlocks = indexBlocks;
+        Comparator<Object> comparator = getComparator(dataType);
+        bitmapOffsets.entrySet().stream()
+                .map(
+                        it ->
+                                new Entry(
+                                        it.getKey(),
+                                        it.getValue(),
+                                        bitmapLengths == null
+                                                ? -1
+                                                : bitmapLengths.getOrDefault(it.getKey(), -1)))
+                .sorted((e1, e2) -> comparator.compare(e1.key, e2.key))
+                .forEach(
+                        e -> {
+                            BitmapIndexBlock last = indexBlocks.peekLast();
+                            if (last == null || !last.tryAdd(e)) {
+                                // The next block starts where the previous one ends.
+                                int nextOffset =
+                                        last == null ? 0 : last.offset + last.serializedBytes;
+                                BitmapIndexBlock next = new BitmapIndexBlock(nextOffset);
+                                indexBlocks.add(next);
+                                if (!next.tryAdd(e)) {
+                                    throw new RuntimeException("index fail");
+                                }
+                            }
+                        });
+
+        out.writeInt(indexBlocks.size());
+
+        int bitmapBodyOffset = 0;
+        for (BitmapIndexBlock e : indexBlocks) {
+            // secondary entry
+            valueWriter.accept(e.key);
+            out.writeInt(e.offset);
+            bitmapBodyOffset += e.serializedBytes;
+        }
+
+        // bitmap body offset: total size of all index blocks
+        out.writeInt(bitmapBodyOffset);
+
+        // bitmap index blocks
+        for (BitmapIndexBlock indexBlock : indexBlocks) {
+            out.writeInt(indexBlock.entryList.size());
+            for (Entry e : indexBlock.entryList) {
+                valueWriter.accept(e.key);
+                out.writeInt(e.offset);
+                out.writeInt(e.length);
+            }
+        }
+    }
+
+    /**
+     * Reads the head and the secondary index. The entries of each index block are not read
+     * here; a block loads them from the stream the first time it is searched.
+     */
+    @Override
+    public void deserialize(SeekableInputStream seekableInputStream) throws Exception {
+
+        // Bytes consumed are added up by hand rather than re-reading getPos(); presumably
+        // because the buffered wrapper may read ahead of what has been parsed.
+        indexBlockStart = seekableInputStream.getPos();
+
+        InputStream inputStream = seekableInputStream;
+        if (options.getBoolean(BitmapFileIndex.ENABLE_BUFFERED_INPUT, true)) {
+            inputStream = new BufferedInputStream(inputStream);
+        }
+        DataInput in = new DataInputStream(inputStream);
+        ThrowableSupplier valueReader = getValueReader(in);
+        Function<Object, Integer> measure = getSerializeSizeMeasure();
+
+        rowCount = in.readInt();
+        indexBlockStart += Integer.BYTES;
+
+        nonNullBitmapNumber = in.readInt();
+        indexBlockStart += Integer.BYTES;
+
+        hasNullValue = in.readBoolean();
+        indexBlockStart++;
+
+        if (hasNullValue) {
+            nullValueOffset = in.readInt();
+            nullBitmapLength = in.readInt();
+            indexBlockStart += 2 * Integer.BYTES;
+        }
+
+        bitmapOffsets = new LinkedHashMap<>();
+
+        int bitmapBlockNumber = in.readInt();
+        indexBlockStart += Integer.BYTES;
+
+        indexBlocks = new ArrayList<>(bitmapBlockNumber);
+        for (int i = 0; i < bitmapBlockNumber; i++) {
+            Object key = valueReader.get();
+            int offset = in.readInt();
+            indexBlocks.add(
+                    new BitmapIndexBlock(dataType, options, key, offset, seekableInputStream));
+            indexBlockStart += measure.apply(key) + Integer.BYTES;
+        }
+
+        // bitmap body offset
+        int bitmapBodyOffset = in.readInt();
+        indexBlockStart += Integer.BYTES;
+
+        bodyStart = indexBlockStart + bitmapBodyOffset;
+    }
+
+    /** Split of all bitmap entries. */
+    class BitmapIndexBlock {
+
+        /** Key of the first entry in this block. */
+        Object key;
+
+        /** Position of this block relative to the start of the index blocks. */
+        int offset;
+
+        /** Serialized size of this block; starts at the size of the entry-count int. */
+        int serializedBytes = Integer.BYTES;
+
+        /** Entries sorted by key; null on a deserialized block until first searched. */
+        List<Entry> entryList;
+
+        Function<Object, Integer> keyBytesMapper;
+        DataType dataType;
+        SeekableInputStream seekableInputStream;
+        Options options;
+
+        /** Loads the entries of this block from the stream if they are not in memory yet. */
+        void tryDeserialize() {
+            if (entryList == null) {
+                try {
+                    seekableInputStream.seek(indexBlockStart + offset);
+                    InputStream inputStream = seekableInputStream;
+                    if (options.getBoolean(BitmapFileIndex.ENABLE_BUFFERED_INPUT, true)) {
+                        inputStream = new BufferedInputStream(inputStream);
+                    }
+                    DataInputStream in = new DataInputStream(inputStream);
+                    ThrowableSupplier valueReader = getValueReader(in);
+                    int entryNum = in.readInt();
+                    entryList = new ArrayList<>(entryNum);
+                    for (int i = 0; i < entryNum; i++) {
+                        entryList.add(new Entry(valueReader.get(), in.readInt(), in.readInt()));
+                    }
+                } catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+
+        /** Returns the entry stored for the id in this block, or null if there is none. */
+        Entry find(Object bitmapId) {
+            tryDeserialize();
+            // Uses the enclosing meta's data type: blocks created while serializing never get
+            // their own dataType field assigned.
+            Comparator<Object> comparator = getComparator(BitmapFileIndexMetaV2.this.dataType);
+            int idx =
+                    Collections.binarySearch(
+                            entryList,
+                            new Entry(bitmapId, 0, 0),
+                            (e1, e2) -> comparator.compare(e1.key, e2.key));
+            return idx >= 0 ? entryList.get(idx) : null;
+        }
+
+        /** Like {@link #find(Object)}, but fails with a descriptive error instead of null. */
+        private Entry require(Object bitmapId) {
+            Entry entry = find(bitmapId);
+            if (entry == null) {
+                throw new IllegalArgumentException(
+                        "bitmap id is not present in the index: " + bitmapId);
+            }
+            return entry;
+        }
+
+        boolean contains(Object bitmapId) {
+            // The block key is the key of its first entry, so a hit needs no entry lookup.
+            if (key.equals(bitmapId)) {
+                return true;
+            }
+            return find(bitmapId) != null;
+        }
+
+        /** @throws IllegalArgumentException if the id has no entry in this block */
+        int getOffset(Object bitmapId) {
+            return require(bitmapId).offset;
+        }
+
+        /** @throws IllegalArgumentException if the id has no entry in this block */
+        int getLength(Object bitmapId) {
+            return require(bitmapId).length;
+        }
+
+        /**
+         * Appends the entry unless that would push a non-empty block past the size limit.
+         *
+         * @return false if the block is full and the entry belongs in a new block
+         */
+        boolean tryAdd(Entry entry) {
+            // Per entry: the key bytes plus two ints (offset and length).
+            int entryBytes = 2 * Integer.BYTES + keyBytesMapper.apply(entry.key);
+            // An empty block always takes its first entry; otherwise an entry larger than the
+            // limit could never be stored and serialization would always fail.
+            if (!entryList.isEmpty() && serializedBytes + entryBytes > blockSizeLimit) {
+                return false;
+            }
+            if (key == null) {
+                key = entry.key;
+            }
+            serializedBytes += entryBytes;
+            entryList.add(entry);
+            return true;
+        }
+
+        // for build and serialize
+        public BitmapIndexBlock(int offset) {
+            this.offset = offset;
+            this.entryList = new LinkedList<>();
+            keyBytesMapper = getSerializeSizeMeasure();
+        }
+
+        // for deserialize
+        public BitmapIndexBlock(
+                DataType dataType,
+                Options options,
+                Object key,
+                int offset,
+                SeekableInputStream seekableInputStream) {
+            this.dataType = dataType;
+            this.options = options;
+            this.key = key;
+            this.offset = offset;
+            this.seekableInputStream = seekableInputStream;
+        }
+    }
+
+    /** Bitmap entry. */
+    static class Entry {
+
+        Object key;
+        int offset;
+        int length;
+
+        public Entry(Object key, int offset, int length) {
+            this.key = key;
+            this.offset = offset;
+            this.length = length;
+        }
+    }
+}
diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/bitmapindex/TestBitmapFileIndex.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/bitmapindex/TestBitmapFileIndex.java index ba94f3c077dd..bc0995b5931b 100644 --- a/paimon-common/src/test/java/org/apache/paimon/fileindex/bitmapindex/TestBitmapFileIndex.java +++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/bitmapindex/TestBitmapFileIndex.java @@ -20,22 +20,35 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.fileindex.FileIndexReader; +import org.apache.paimon.fileindex.FileIndexResult; import org.apache.paimon.fileindex.FileIndexWriter; import org.apache.paimon.fileindex.bitmap.BitmapFileIndex; +import org.apache.paimon.fileindex.bitmap.BitmapFileIndexMetaV2; import org.apache.paimon.fileindex.bitmap.BitmapIndexResult; -import org.apache.paimon.fs.ByteArraySeekableStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.options.Options; import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypes; import org.apache.paimon.types.IntType; import org.apache.paimon.types.VarCharType; import org.apache.paimon.utils.RoaringBitmap32; +import org.apache.commons.io.FileUtils; +import org.junit.Rule; import org.junit.jupiter.api.Test; +import org.junit.rules.TemporaryFolder; +import java.io.File; import java.util.Arrays; +import java.util.function.Consumer; /** test for {@link BitmapFileIndex}. 
*/ public class TestBitmapFileIndex { + @Rule public TemporaryFolder folder = new TemporaryFolder(); + @Test public void testFlip() { RoaringBitmap32 bitmap = RoaringBitmap32.bitmapOf(1, 3, 5); @@ -44,98 +57,162 @@ public void testFlip() { } @Test - public void testBitmapIndex1() { - VarCharType dataType = new VarCharType(); - FieldRef fieldRef = new FieldRef(0, "", dataType); - BitmapFileIndex bitmapFileIndex = new BitmapFileIndex(dataType, null); - FileIndexWriter writer = bitmapFileIndex.createWriter(); - Object[] arr = { - BinaryString.fromString("a"), - null, - BinaryString.fromString("b"), - null, - BinaryString.fromString("a"), - }; - for (Object o : arr) { - writer.write(o); - } - byte[] bytes = writer.serializedBytes(); - ByteArraySeekableStream seekableStream = new ByteArraySeekableStream(bytes); - FileIndexReader reader = bitmapFileIndex.createReader(seekableStream, 0, bytes.length); - - BitmapIndexResult result1 = - (BitmapIndexResult) reader.visitEqual(fieldRef, BinaryString.fromString("a")); - assert result1.get().equals(RoaringBitmap32.bitmapOf(0, 4)); - - BitmapIndexResult result2 = - (BitmapIndexResult) reader.visitEqual(fieldRef, BinaryString.fromString("b")); - assert result2.get().equals(RoaringBitmap32.bitmapOf(2)); - - BitmapIndexResult result3 = (BitmapIndexResult) reader.visitIsNull(fieldRef); - assert result3.get().equals(RoaringBitmap32.bitmapOf(1, 3)); - - BitmapIndexResult result4 = (BitmapIndexResult) result1.and(result2); - assert result4.get().equals(RoaringBitmap32.bitmapOf()); - - BitmapIndexResult result5 = (BitmapIndexResult) result1.or(result2); - assert result5.get().equals(RoaringBitmap32.bitmapOf(0, 2, 4)); + public void testComparator() { + assert BitmapFileIndexMetaV2.getComparator(new VarCharType()) + .compare(BinaryString.fromString("a"), BinaryString.fromString("b")) + < 0; + assert BitmapFileIndexMetaV2.getComparator(new VarCharType()) + .compare(BinaryString.fromString("a"), BinaryString.fromString("a")) + == 0; + 
assert BitmapFileIndexMetaV2.getComparator(new VarCharType()) + .compare(BinaryString.fromString("c"), BinaryString.fromString("b")) + > 0; + assert BitmapFileIndexMetaV2.getComparator(new IntType()).compare(1, 2) < 0; + assert BitmapFileIndexMetaV2.getComparator(new IntType()).compare(2, 1) > 0; } @Test - public void testBitmapIndex2() { - IntType dataType = new IntType(); - FieldRef fieldRef = new FieldRef(0, "", dataType); - BitmapFileIndex bitmapFileIndex = new BitmapFileIndex(dataType, null); - FileIndexWriter writer = bitmapFileIndex.createWriter(); - Object[] arr = {0, 1, null}; - for (Object o : arr) { - writer.write(o); - } - byte[] bytes = writer.serializedBytes(); - ByteArraySeekableStream seekableStream = new ByteArraySeekableStream(bytes); - FileIndexReader reader = bitmapFileIndex.createReader(seekableStream, 0, bytes.length); - - BitmapIndexResult result1 = (BitmapIndexResult) reader.visitEqual(fieldRef, 1); - assert result1.get().equals(RoaringBitmap32.bitmapOf(1)); - - BitmapIndexResult result2 = (BitmapIndexResult) reader.visitIsNull(fieldRef); - assert result2.get().equals(RoaringBitmap32.bitmapOf(2)); - - BitmapIndexResult result3 = (BitmapIndexResult) reader.visitIsNotNull(fieldRef); - assert result3.get().equals(RoaringBitmap32.bitmapOf(0, 1)); - - BitmapIndexResult result4 = - (BitmapIndexResult) reader.visitNotIn(fieldRef, Arrays.asList(1, 2)); - assert result4.get().equals(RoaringBitmap32.bitmapOf(0, 2)); - - BitmapIndexResult result5 = - (BitmapIndexResult) reader.visitNotIn(fieldRef, Arrays.asList(1, 0)); - assert result5.get().equals(RoaringBitmap32.bitmapOf(2)); + public void testMemorySize() { + assert MemorySize.parse("16kb").getBytes() == 16 * 1024; + assert MemorySize.parse("16KB").getBytes() == 16 * 1024; + assert MemorySize.parse("16 kb").getBytes() == 16 * 1024; + assert MemorySize.parse("16 KB").getBytes() == 16 * 1024; + assert MemorySize.parse("16384").getBytes() == 16 * 1024; } @Test - public void testBitmapIndex3() { - - 
IntType intType = new IntType(); - FieldRef fieldRef = new FieldRef(0, "", intType); - BitmapFileIndex bitmapFileIndex = new BitmapFileIndex(intType, null); - FileIndexWriter writer = bitmapFileIndex.createWriter(); + public void testV1() throws Exception { + testIntType(BitmapFileIndex.VERSION_1); + testStringType(BitmapFileIndex.VERSION_1); + testHighCardinality(BitmapFileIndex.VERSION_1, 1000000, 100000, null); + } - // test only one null-value - Object[] arr = {1, 2, 1, 2, 1, 3, null}; + @Test + public void testV2() throws Exception { + testIntType(BitmapFileIndex.VERSION_2); + testStringType(BitmapFileIndex.VERSION_2); + testHighCardinality(BitmapFileIndex.VERSION_2, 1000000, 100000, null); + } - for (Object o : arr) { - writer.write(o); + private FileIndexReader createTestReaderOnWriter( + int writerVersion, + Integer indexBlockSize, + DataType dataType, + Consumer consumer) + throws Exception { + Options options = new Options(); + options.setInteger(BitmapFileIndex.VERSION, writerVersion); + if (indexBlockSize != null) { + options.setInteger(BitmapFileIndex.INDEX_BLOCK_SIZE, indexBlockSize); } - byte[] bytes = writer.serializedBytes(); - ByteArraySeekableStream seekableStream = new ByteArraySeekableStream(bytes); - FileIndexReader reader = bitmapFileIndex.createReader(seekableStream, 0, bytes.length); + BitmapFileIndex bitmapFileIndex = new BitmapFileIndex(dataType, options); + FileIndexWriter writer; + writer = bitmapFileIndex.createWriter(); + consumer.accept(writer); + folder.create(); + File file = folder.newFile("f1"); + byte[] data = writer.serializedBytes(); + FileUtils.writeByteArrayToFile(file, data); + LocalFileIO.LocalSeekableInputStream localSeekableInputStream = + new LocalFileIO.LocalSeekableInputStream(file); + return bitmapFileIndex.createReader(localSeekableInputStream, 0, 0); + } - BitmapIndexResult result1 = (BitmapIndexResult) reader.visitEqual(fieldRef, 1); - assert result1.get().equals(RoaringBitmap32.bitmapOf(0, 2, 4)); + private void 
testStringType(int version) throws Exception { + FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING()); + BinaryString a = BinaryString.fromString("a"); + BinaryString b = BinaryString.fromString("b"); + Object[] dataColumn = {a, null, b, null, a}; + FileIndexReader reader = + createTestReaderOnWriter( + version, + null, + DataTypes.STRING(), + writer -> { + for (Object o : dataColumn) { + writer.write(o); + } + }); + assert ((BitmapIndexResult) reader.visitEqual(fieldRef, a)) + .get() + .equals(RoaringBitmap32.bitmapOf(0, 4)); + assert ((BitmapIndexResult) reader.visitEqual(fieldRef, b)) + .get() + .equals(RoaringBitmap32.bitmapOf(2)); + assert ((BitmapIndexResult) reader.visitIsNull(fieldRef)) + .get() + .equals(RoaringBitmap32.bitmapOf(1, 3)); + assert ((BitmapIndexResult) reader.visitIn(fieldRef, Arrays.asList(a, b))) + .get() + .equals(RoaringBitmap32.bitmapOf(0, 2, 4)); + } + + private void testIntType(int version) throws Exception { + FieldRef fieldRef = new FieldRef(0, "", DataTypes.INT()); + Object[] dataColumn = {0, 1, null}; + FileIndexReader reader = + createTestReaderOnWriter( + version, + null, + DataTypes.INT(), + writer -> { + for (Object o : dataColumn) { + writer.write(o); + } + }); + assert ((BitmapIndexResult) reader.visitEqual(fieldRef, 0)) + .get() + .equals(RoaringBitmap32.bitmapOf(0)); + assert ((BitmapIndexResult) reader.visitEqual(fieldRef, 1)) + .get() + .equals(RoaringBitmap32.bitmapOf(1)); + assert ((BitmapIndexResult) reader.visitIsNull(fieldRef)) + .get() + .equals(RoaringBitmap32.bitmapOf(2)); + assert ((BitmapIndexResult) reader.visitIn(fieldRef, Arrays.asList(0, 1, 2))) + .get() + .equals(RoaringBitmap32.bitmapOf(0, 1)); + } - // test read singleton bitmap - BitmapIndexResult result2 = (BitmapIndexResult) reader.visitIsNull(fieldRef); - assert result2.get().equals(RoaringBitmap32.bitmapOf(6)); + private void testHighCardinality( + int version, int rowCount, int approxCardinality, Integer secondaryBlockSize) + throws Exception 
{ + FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING()); + RoaringBitmap32 middleBm = new RoaringBitmap32(); + RoaringBitmap32 nullBm = new RoaringBitmap32(); + long time1 = System.currentTimeMillis(); + String prefix = "ssssssssss"; + FileIndexReader reader = + createTestReaderOnWriter( + version, + secondaryBlockSize, + DataTypes.STRING(), + writer -> { + for (int i = 0; i < rowCount; i++) { + + int sid = (int) (Math.random() * approxCardinality); + if (sid == approxCardinality / 2) { + middleBm.add(i); + } else if (Math.random() < 0.01) { + nullBm.add(i); + writer.write(null); + continue; + } + writer.write(BinaryString.fromString(prefix + sid)); + } + }); + System.out.println("write time: " + (System.currentTimeMillis() - time1)); + long time2 = System.currentTimeMillis(); + FileIndexResult result = + reader.visitEqual( + fieldRef, BinaryString.fromString(prefix + (approxCardinality / 2))); + RoaringBitmap32 resultBm = ((BitmapIndexResult) result).get(); + System.out.println("read time: " + (System.currentTimeMillis() - time2)); + assert resultBm.equals(middleBm); + long time3 = System.currentTimeMillis(); + FileIndexResult resultNull = reader.visitIsNull(fieldRef); + RoaringBitmap32 resultNullBm = ((BitmapIndexResult) resultNull).get(); + System.out.println("read null bitmap time: " + (System.currentTimeMillis() - time3)); + assert resultNullBm.equals(nullBm); } } diff --git a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkFileIndexITCase.java b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkFileIndexITCase.java index cb35fa507016..198cdd8cf48a 100644 --- a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkFileIndexITCase.java +++ b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkFileIndexITCase.java @@ -44,6 +44,7 @@ import org.apache.paimon.table.source.Split; import org.apache.paimon.types.IntType; import org.apache.paimon.types.RowType; +import 
org.apache.paimon.types.VarCharType; import org.apache.paimon.utils.FileStorePathFactory; import org.apache.paimon.utils.RoaringBitmap32; @@ -100,6 +101,7 @@ public void testReadWriteTableWithBitmapIndex() throws Catalog.TableNotExistExce spark.sql( "CREATE TABLE T(a int) TBLPROPERTIES (" + "'file-index.bitmap.columns'='a'," + + "'file-index.bitmap.a.index-block-size'='32kb'," + "'file-index.in-manifest-threshold'='1B');"); spark.sql("INSERT INTO T VALUES (0),(1),(2),(3),(4),(5);"); @@ -112,6 +114,7 @@ public void testReadWriteTableWithBitmapIndex() throws Catalog.TableNotExistExce // check index reader foreachIndexReader( + "T", fileIndexReader -> { FileIndexResult fileIndexResult = fileIndexReader.visitEqual(new FieldRef(0, "", new IntType()), 3); @@ -119,6 +122,23 @@ public void testReadWriteTableWithBitmapIndex() throws Catalog.TableNotExistExce RoaringBitmap32 roaringBitmap32 = ((BitmapIndexResult) fileIndexResult).get(); assert roaringBitmap32.equals(RoaringBitmap32.bitmapOf(3)); }); + + // test string type with null bitmap + spark.sql( + "CREATE TABLE T2(a string) TBLPROPERTIES (" + + "'file-index.bitmap.columns'='a'," + + "'file-index.in-manifest-threshold'='1B');"); + spark.sql("INSERT INTO T2 VALUES ('0'),('1'),('1'),(null),('0'),('1');"); + foreachIndexReader( + "T2", + fileIndexReader -> { + FileIndexResult fileIndexResult = + fileIndexReader.visitEqual( + new FieldRef(0, "", new VarCharType()), null); + assert fileIndexResult instanceof BitmapIndexResult; + RoaringBitmap32 roaringBitmap32 = ((BitmapIndexResult) fileIndexResult).get(); + assert roaringBitmap32.equals(RoaringBitmap32.bitmapOf(3)); + }); } @Test @@ -136,6 +156,7 @@ public void testReadWriteTableWithBitSliceIndex() throws Catalog.TableNotExistEx // check index reader foreachIndexReader( + "T", fileIndexReader -> { FileIndexResult fileIndexResult = fileIndexReader.visitGreaterOrEqual( @@ -146,9 +167,9 @@ public void testReadWriteTableWithBitSliceIndex() throws Catalog.TableNotExistEx }); } 
- protected void foreachIndexReader(Consumer consumer) + protected void foreachIndexReader(String tableName, Consumer consumer) throws Catalog.TableNotExistException { - Path tableRoot = fileSystemCatalog.getTableLocation(Identifier.create("db", "T")); + Path tableRoot = fileSystemCatalog.getTableLocation(Identifier.create("db", tableName)); SchemaManager schemaManager = new SchemaManager(fileIO, tableRoot); FileStorePathFactory pathFactory = new FileStorePathFactory( @@ -164,7 +185,7 @@ protected void foreachIndexReader(Consumer consumer) null, null); - Table table = fileSystemCatalog.getTable(Identifier.create("db", "T")); + Table table = fileSystemCatalog.getTable(Identifier.create("db", tableName)); ReadBuilder readBuilder = table.newReadBuilder(); List splits = readBuilder.newScan().plan().splits(); for (Split split : splits) {