From 53a2a0495745da545ab89ae3bd85762744e8225a Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 18 Jun 2026 13:42:56 +0800 Subject: [PATCH 01/10] [core][python] Add global index fast search option --- docs/docs/multimodal-table/global-index.mdx | 14 +- docs/generated/core_configuration.html | 6 + .../java/org/apache/paimon/CoreOptions.java | 13 + .../globalindex/VectorGlobalIndexer.java | 26 ++ .../testvector/TestVectorGlobalIndexer.java | 21 +- .../globalindex/DataEvolutionBatchScan.java | 98 +++++- .../globalindex/GlobalIndexScanner.java | 119 ++++++- .../table/source/AbstractVectorRead.java | 224 +++++++++++- .../table/source/BatchVectorReadImpl.java | 38 ++- .../source/BatchVectorSearchBuilderImpl.java | 3 +- .../paimon/table/source/VectorReadImpl.java | 29 +- .../table/source/VectorSearchBuilderImpl.java | 2 +- .../table/BtreeGlobalIndexTableTest.java | 54 +++ .../table/source/VectorSearchBuilderTest.java | 103 ++++++ .../index/LuminaVectorGlobalIndexer.java | 9 +- .../pypaimon/common/options/core_options.py | 14 + .../globalindex/global_index_reader.py | 3 + .../globalindex/global_index_scanner.py | 108 +++++- .../lumina_vector_global_index_reader.py | 4 + paimon-python/pypaimon/read/read_builder.py | 19 +- .../pypaimon/read/scanner/file_scanner.py | 69 ++++ paimon-python/pypaimon/read/table_scan.py | 26 +- .../source/batch_vector_search_builder.py | 1 + .../table/source/vector_search_builder.py | 1 + .../table/source/vector_search_read.py | 321 +++++++++++++++--- .../table/source/vector_search_scan.py | 7 +- .../pypaimon/tests/global_index_test.py | 34 ++ .../tests/vector_search_filter_test.py | 168 ++++++++- .../spark/read/SparkVectorReadImpl.java | 21 +- .../read/SparkVectorSearchBuilderImpl.java | 3 +- .../vector/index/VectorGlobalIndexer.java | 8 +- 31 files changed, 1463 insertions(+), 103 deletions(-) create mode 100644 paimon-common/src/main/java/org/apache/paimon/globalindex/VectorGlobalIndexer.java diff --git a/docs/docs/multimodal-table/global-index.mdx b/docs/docs/multimodal-table/global-index.mdx index 90a8ac33bcc0..001175278c87 100644 --- a/docs/docs/multimodal-table/global-index.mdx +++ b/docs/docs/multimodal-table/global-index.mdx @@ -129,8 +129,17 @@ ALTER TABLE my_table SET ( Global index files cover row-id ranges. If more rows are appended after an index is built, those new rows are not automatically covered by the existing index files. Run `create_global_index` again -to build index files for newly uncovered data. A query that can be answered by a matching global -index reads indexed row ranges; rows in uncovered ranges are not returned for that indexed query. +to build index files for newly uncovered data. By default, queries use fast search and only read +indexed row ranges; rows in uncovered ranges are not returned for that indexed query. + +To improve freshness for query types that support slow search, set: + +```sql +ALTER TABLE my_table SET ('global-index.fast-search' = 'false'); +``` + +With fast search disabled, supported global-index queries merge indexed results with a scan over +files not covered by global indexes. To temporarily disable global-index scan acceleration while keeping the index files, set: @@ -147,6 +156,7 @@ These table options affect global index build and read behavior: | Option | Default | Description | |---|---|---| | `global-index.enabled` | `true` | Whether scans can use global indexes. | +| `global-index.fast-search` | `true` | Whether global index queries only search indexed files. Set to `false` to also scan files not covered by global indexes when supported. | | `global-index.external-path` | Not set | Root directory for global index files. If not set, files are stored under the table index directory. | | `sorted-index.records-per-range` | `10000000` | Expected number of records per sorted global index file for BTree and Bitmap builds. | | `sorted-index.build.max-parallelism` | `4096` | Maximum Flink or Spark parallelism for building sorted global indexes. | diff --git a/docs/generated/core_configuration.html b/docs/generated/core_configuration.html index 575c40e21f03..8b395657ae05 100644 --- a/docs/generated/core_configuration.html +++ b/docs/generated/core_configuration.html @@ -734,6 +734,12 @@ Boolean Whether to enable global index for scan. + +
global-index.fast-search
+ true + Boolean + Whether global index queries only search indexed files. Set to false to also scan files not covered by global indexes when supported. +
global-index.external-path
(none) diff --git a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java index f2a0d32a4d13..400571509f2f 100644 --- a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java +++ b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java @@ -2543,6 +2543,15 @@ public InlineElement getDescription() { .defaultValue(true) .withDescription("Whether to enable global index for scan."); + public static final ConfigOption GLOBAL_INDEX_FAST_SEARCH = + key("global-index.fast-search") + .booleanType() + .defaultValue(true) + .withDescription( + "Whether global index queries only search indexed files. " + + "Set to false to also scan files not covered by global indexes " + + "when supported."); + public static final ConfigOption GLOBAL_INDEX_THREAD_NUM = key("global-index.thread-num") .intType() @@ -4049,6 +4058,10 @@ public boolean globalIndexEnabled() { return options.get(GLOBAL_INDEX_ENABLED); } + public boolean globalIndexFastSearch() { + return options.get(GLOBAL_INDEX_FAST_SEARCH); + } + public Integer globalIndexThreadNum() { return options.get(GLOBAL_INDEX_THREAD_NUM); } diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/VectorGlobalIndexer.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/VectorGlobalIndexer.java new file mode 100644 index 000000000000..63166ef7c1f1 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/VectorGlobalIndexer.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.globalindex; + +/** A {@link GlobalIndexer} that supports vector similarity search. */ +public interface VectorGlobalIndexer extends GlobalIndexer { + + /** Returns the metric name used to convert vector distances to comparable scores. */ + String metric(); +} diff --git a/paimon-common/src/test/java/org/apache/paimon/globalindex/testvector/TestVectorGlobalIndexer.java b/paimon-common/src/test/java/org/apache/paimon/globalindex/testvector/TestVectorGlobalIndexer.java index cd3a14068112..f652719834ab 100644 --- a/paimon-common/src/test/java/org/apache/paimon/globalindex/testvector/TestVectorGlobalIndexer.java +++ b/paimon-common/src/test/java/org/apache/paimon/globalindex/testvector/TestVectorGlobalIndexer.java @@ -21,7 +21,7 @@ import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReader; import org.apache.paimon.globalindex.GlobalIndexWriter; -import org.apache.paimon.globalindex.GlobalIndexer; +import org.apache.paimon.globalindex.VectorGlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; import org.apache.paimon.options.Options; @@ -32,12 +32,13 @@ import java.io.IOException; import java.util.List; import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicInteger; import static org.apache.paimon.utils.Preconditions.checkArgument; /** - * A test-only {@link GlobalIndexer} for vector similarity search. Uses brute-force linear scan for - * ANN queries. No native library dependency required. + * A test-only {@link VectorGlobalIndexer} for vector similarity search. Uses brute-force linear + * scan for ANN queries. No native library dependency required. * *

Supported distance metrics (configured via option {@code test.vector.metric}): * @@ -47,7 +48,7 @@ *

  • {@code inner_product} - Inner product similarity (directly used as score) * */ -public class TestVectorGlobalIndexer implements GlobalIndexer { +public class TestVectorGlobalIndexer implements VectorGlobalIndexer { /** Option key for vector dimension. */ public static final String OPT_DIMENSION = "test.vector.dimension"; @@ -59,6 +60,8 @@ public class TestVectorGlobalIndexer implements GlobalIndexer { public static final String OPT_REQUIRED_OPTION_VALUE = "test.vector.required-option.value"; + private static final AtomicInteger METRIC_CALLS = new AtomicInteger(); + private final DataType fieldType; private final int dimension; private final String metric; @@ -96,7 +99,17 @@ public int dimension() { return dimension; } + @Override public String metric() { + METRIC_CALLS.incrementAndGet(); return metric; } + + public static void resetMetricCalls() { + METRIC_CALLS.set(0); + } + + public static int metricCalls() { + return METRIC_CALLS.get(); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java index 8d6038ccc492..741b281ff607 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java @@ -21,6 +21,7 @@ import org.apache.paimon.CoreOptions; import org.apache.paimon.annotation.VisibleForTesting; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.InternalRow; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.manifest.PartitionEntry; import org.apache.paimon.metrics.MetricRegistry; @@ -35,10 +36,15 @@ import org.apache.paimon.table.source.DataTableBatchScan; import org.apache.paimon.table.source.DataTableScan; import org.apache.paimon.table.source.InnerTableScan; +import org.apache.paimon.table.source.ReadBuilder; import org.apache.paimon.table.source.Split; +import org.apache.paimon.table.source.TableRead; +import org.apache.paimon.table.source.TableScan; +import org.apache.paimon.table.source.snapshot.SnapshotReader; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.Filter; import org.apache.paimon.utils.Range; +import org.apache.paimon.utils.RoaringNavigableMap64; import org.apache.paimon.utils.RowRangeIndex; import org.slf4j.Logger; @@ -55,6 +61,7 @@ import java.util.function.Function; import static org.apache.paimon.table.SpecialFields.ROW_ID; +import static org.apache.paimon.table.SpecialFields.rowTypeWithRowId; import static org.apache.paimon.utils.ManifestReadThreadPool.randomlyExecuteSequentialReturn; /** Scan for data evolution table. */ @@ -244,7 +251,17 @@ public Plan plan() { Optional indexResult = evalGlobalIndex(); if (indexResult.isPresent()) { GlobalIndexResult result = indexResult.get(); - rowRangeIndex = RowRangeIndex.create(result.results().toRangeList()); + RoaringNavigableMap64 rowIds = result.results(); + List rowRanges = rowIds.toRangeList(); + boolean scanUnindexedRanges = + globalIndexResult == null + && !(result instanceof ScoredGlobalIndexResult) + && !table.coreOptions().globalIndexFastSearch(); + if (scanUnindexedRanges) { + rowIds = withUnindexedRows(rowIds); + rowRanges = rowIds.toRangeList(); + } + rowRangeIndex = RowRangeIndex.create(rowRanges); if (result instanceof ScoredGlobalIndexResult) { scoreGetter = ((ScoredGlobalIndexResult) result).scoreGetter(); } @@ -259,6 +276,81 @@ public Plan plan() { return wrapToIndexSplits(splits, rowRangeIndex, scoreGetter); } + private RoaringNavigableMap64 withUnindexedRows(RoaringNavigableMap64 indexedResultRows) { + TableScan.Plan allDataPlan = allDataPlan(); + List dataRanges = new ArrayList<>(); + for (Split split : allDataPlan.splits()) { + if (!(split instanceof DataSplit)) { + continue; + } + for (DataFileMeta file : ((DataSplit) split).dataFiles()) { + if (file.firstRowId() != null) { + dataRanges.add(file.nonNullRowIdRange()); + } + } + } + + List predicateIndexedRanges = + GlobalIndexScanner.indexedRanges( + table, + batchScan.snapshotReader().manifestsReader().partitionFilter(), + filter, + batchScan.snapshotReader().snapshotManager().snapshot(snapshotId(allDataPlan))); + predicateIndexedRanges = Range.sortAndMergeOverlap(predicateIndexedRanges, true); + + List unindexedRanges = new ArrayList<>(); + for (Range dataRange : Range.sortAndMergeOverlap(dataRanges, true)) { + unindexedRanges.addAll(dataRange.exclude(predicateIndexedRanges)); + } + unindexedRanges = Range.sortAndMergeOverlap(unindexedRanges, true); + + RoaringNavigableMap64 rows = new RoaringNavigableMap64(); + rows.or(indexedResultRows); + rows.or(matchingRows(unindexedRanges)); + return rows; + } + + private RoaringNavigableMap64 matchingRows(List ranges) { + RoaringNavigableMap64 rows = new RoaringNavigableMap64(); + if (ranges.isEmpty()) { + return rows; + } + + RowType readType = rowTypeWithRowId(table.rowType()); + RowRangeIndex rowRangeIndex = RowRangeIndex.create(ranges); + ReadBuilder readBuilder = table.newReadBuilder().withReadType(readType).withFilter(filter); + readBuilder.withPartitionFilter(batchScan.snapshotReader().manifestsReader().partitionFilter()); + List splits = readBuilder.withRowRangeIndex(rowRangeIndex).newScan().plan().splits(); + int rowIdIndex = readType.getFieldIndex(ROW_ID.name()); + try { + TableRead read = readBuilder.newRead(); + try (org.apache.paimon.reader.RecordReader reader = + read.executeFilter().createReader(splits)) { + reader.forEachRemaining(row -> rows.add(row.getLong(rowIdIndex))); + } + } catch (IOException e) { + throw new RuntimeException( + "Failed to scan unindexed data for global index slow search.", e); + } + return rows; + } + + private long snapshotId(TableScan.Plan plan) { + if (plan instanceof SnapshotReader.Plan) { + Long snapshotId = ((SnapshotReader.Plan) plan).snapshotId(); + if (snapshotId != null) { + return snapshotId; + } + } + throw new IllegalStateException("Cannot read global index coverage without a snapshot."); + } + + private TableScan.Plan allDataPlan() { + ReadBuilder readBuilder = table.newReadBuilder(); + readBuilder.withPartitionFilter(batchScan.snapshotReader().manifestsReader().partitionFilter()); + return readBuilder.newScan().plan(); + } + private Optional evalGlobalIndex() { if (this.globalIndexResult != null) { return Optional.of(globalIndexResult); @@ -296,7 +388,9 @@ public static Plan wrapToIndexSplits( Function> process = split -> Collections.singletonList( - wrap((DataSplit) split, rowRangeIndex, scoreGetter)); + split instanceof IndexedSplit + ? (IndexedSplit) split + : wrap((DataSplit) split, rowRangeIndex, scoreGetter)); randomlyExecuteSequentialReturn(process, splits, null).forEachRemaining(indexedSplits::add); return () -> indexedSplits; } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 1c8bbd3dd765..21e36fffd94a 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -18,6 +18,7 @@ package org.apache.paimon.globalindex; +import org.apache.paimon.Snapshot; import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; @@ -27,6 +28,10 @@ import org.apache.paimon.manifest.IndexManifestEntry; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; +import org.apache.paimon.predicate.CompoundPredicate; +import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.predicate.LeafPredicate; +import org.apache.paimon.predicate.Or; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; @@ -179,8 +184,64 @@ public static Optional create( indexFiles)); } + public static List indexedRanges(FileStoreTable table, Snapshot snapshot) { + return table.store().newIndexFileHandler().scan(snapshot, entry -> true).stream() + .map(IndexManifestEntry::indexFile) + .map(IndexFileMeta::globalIndexMeta) + .filter(meta -> meta != null) + .map(meta -> new Range(meta.rowRangeStart(), meta.rowRangeEnd())) + .collect(Collectors.toList()); + } + + public static List indexedRanges( + FileStoreTable table, + PartitionPredicate partitionFilter, + Predicate filter, + Snapshot snapshot) { + if (filter == null) { + return Collections.emptyList(); + } + Map> coverageByField = new HashMap<>(); + for (IndexManifestEntry entry : + table.store() + .newIndexFileHandler() + .scan(snapshot, indexFileFilter(table, partitionFilter, filter))) { + GlobalIndexMeta globalIndex = entry.indexFile().globalIndexMeta(); + if (globalIndex == null) { + continue; + } + Range range = new Range(globalIndex.rowRangeStart(), globalIndex.rowRangeEnd()); + coverageByField.computeIfAbsent(globalIndex.indexFieldId(), k -> new ArrayList<>()) + .add(range); + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + coverageByField.computeIfAbsent(id, k -> new ArrayList<>()).add(range); + } + } + } + Optional> coverage = indexedRanges(filter, table.rowType(), coverageByField); + return coverage.orElse(Collections.emptyList()); + } + public static Optional create( FileStoreTable table, PartitionPredicate partitionFilter, Predicate filter) { + List indexFiles = + table.store() + .newIndexFileHandler() + .scan( + tryTravelOrLatest(table), + indexFileFilter(table, partitionFilter, filter)) + .stream() + .map(IndexManifestEntry::indexFile) + .collect(Collectors.toList()); + return create(table, indexFiles); + } + + private static Filter indexFileFilter( + FileStoreTable table, PartitionPredicate partitionFilter, Predicate filter) { + if (filter == null) { + return entry -> false; + } Set filterFieldIds = collectFieldNames(filter).stream() .filter(name -> table.rowType().containsField(name)) @@ -209,13 +270,61 @@ public static Optional create( } return false; }; + return indexFileFilter; + } - List indexFiles = - table.store().newIndexFileHandler().scan(tryTravelOrLatest(table), indexFileFilter) - .stream() - .map(IndexManifestEntry::indexFile) + private static Optional> indexedRanges( + Predicate predicate, RowType rowType, Map> coverageByField) { + if (predicate == null) { + return Optional.empty(); + } + if (predicate instanceof LeafPredicate) { + Optional fieldRef = ((LeafPredicate) predicate).fieldRefOptional(); + if (!fieldRef.isPresent() || !rowType.containsField(fieldRef.get().name())) { + return Optional.empty(); + } + List coverage = coverageByField.get(rowType.getField(fieldRef.get().name()).id()); + if (coverage == null || coverage.isEmpty()) { + return Optional.empty(); + } + return Optional.of(Range.sortAndMergeOverlap(coverage, true)); + } + + if (!(predicate instanceof CompoundPredicate)) { + return Optional.empty(); + } + + CompoundPredicate compoundPredicate = (CompoundPredicate) predicate; + List>> childCoverages = + compoundPredicate.children().stream() + .map(child -> indexedRanges(child, rowType, coverageByField)) .collect(Collectors.toList()); - return create(table, indexFiles); + + if (compoundPredicate.function() instanceof Or) { + Optional> result = Optional.empty(); + for (Optional> childCoverage : childCoverages) { + if (!childCoverage.isPresent()) { + return Optional.empty(); + } + result = + result.isPresent() + ? Optional.of(Range.and(result.get(), childCoverage.get())) + : childCoverage; + } + return result; + } + + Optional> result = Optional.empty(); + for (Optional> childCoverage : childCoverages) { + if (!childCoverage.isPresent()) { + continue; + } + result = + result.isPresent() + ? Optional.of(Range.and(result.get(), childCoverage.get())) + : childCoverage; + } + return result; } public Optional scan(Predicate predicate) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java index 12bec4fc2f51..74e7623bd436 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java @@ -18,6 +18,9 @@ package org.apache.paimon.table.source; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalVector; import org.apache.paimon.fs.FileIO; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -25,18 +28,27 @@ import org.apache.paimon.globalindex.GlobalIndexScanner; import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; +import org.apache.paimon.globalindex.IndexedSplit; import org.apache.paimon.globalindex.OffsetGlobalIndexReader; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; +import org.apache.paimon.globalindex.VectorGlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.io.DataFileMeta; +import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.predicate.BatchVectorSearch; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.predicate.VectorSearch; +import org.apache.paimon.reader.RecordReader; import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.SpecialFields; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; +import org.apache.paimon.utils.Range; import org.apache.paimon.utils.RoaringNavigableMap64; import javax.annotation.Nullable; @@ -63,6 +75,7 @@ public abstract class AbstractVectorRead implements Serializable { private static final long serialVersionUID = 1L; protected final FileStoreTable table; + private final PartitionPredicate partitionFilter; private final Predicate filter; protected final int limit; protected final DataField vectorColumn; @@ -74,7 +87,18 @@ protected AbstractVectorRead( int limit, DataField vectorColumn, Map options) { + this(table, null, filter, limit, vectorColumn, options); + } + + protected AbstractVectorRead( + FileStoreTable table, + PartitionPredicate partitionFilter, + Predicate filter, + int limit, + DataField vectorColumn, + Map options) { this.table = table; + this.partitionFilter = partitionFilter; this.filter = filter; this.limit = limit; this.vectorColumn = vectorColumn; @@ -85,7 +109,7 @@ protected AbstractVectorRead( } protected GlobalIndexer createGlobalIndexer(List splits) { - IndexFileMeta firstFile = splits.get(0).vectorIndexFiles().get(0); + IndexFileMeta firstFile = firstVectorIndexFile(splits); String indexType = firstFile.indexType(); GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); if (firstMeta.extraFieldIds() != null) { @@ -127,6 +151,10 @@ protected CompletableFuture> eval( float[] vector, @Nullable RoaringNavigableMap64 includeRowIds, ExecutorService executor) { + if (vectorIndexFiles.isEmpty()) { + return CompletableFuture.completedFuture(Optional.empty()); + } + List indexIOMetaList = buildIOMetaList(indexPathFactory, vectorIndexFiles); @SuppressWarnings("resource") @@ -151,6 +179,10 @@ protected CompletableFuture>> evalBatch( float[][] vectors, @Nullable RoaringNavigableMap64 includeRowIds, ExecutorService executor) { + if (vectorIndexFiles.isEmpty()) { + return CompletableFuture.completedFuture(emptyOptionalResults(vectors.length)); + } + List indexIOMetaList = buildIOMetaList(indexPathFactory, vectorIndexFiles); @SuppressWarnings("resource") @@ -166,6 +198,36 @@ protected CompletableFuture>> evalBatch( .whenComplete((r, t) -> IOUtils.closeQuietly(reader)); } + protected boolean slowSearchEnabled() { + return !fastSearch(); + } + + protected boolean fastSearch() { + return table.coreOptions().globalIndexFastSearch(); + } + + protected ScoredGlobalIndexResult withSlowSearch( + ScoredGlobalIndexResult result, + List splits, + @Nullable GlobalIndexer globalIndexer, + float[] queryVector) { + if (!slowSearchEnabled()) { + return result.topK(limit); + } + + ScoredGlobalIndexResult rawResult = + readSlowSearch(splits, slowSearchMetric(globalIndexer), queryVector); + return result.or(rawResult).topK(limit); + } + + protected ScoredGlobalIndexResult[] emptyScoredResults(int n) { + ScoredGlobalIndexResult[] results = new ScoredGlobalIndexResult[n]; + for (int i = 0; i < n; i++) { + results[i] = ScoredGlobalIndexResult.createEmpty(); + } + return results; + } + private List buildIOMetaList( IndexPathFactory indexPathFactory, List vectorIndexFiles) { List indexIOMetaList = new ArrayList<>(); @@ -179,4 +241,164 @@ private List buildIOMetaList( } return indexIOMetaList; } + + private ScoredGlobalIndexResult readSlowSearch( + List splits, String metric, float[] queryVector) { + RowType readType = SpecialFields.rowTypeWithRowId(table.rowType()); + ReadBuilder rangeDiscoveryBuilder = newRawReadBuilder(readType, false); + TableScan.Plan allDataPlan = rangeDiscoveryBuilder.newScan().plan(); + List nonIndexedRanges = nonIndexedRanges(allDataPlan, splits); + if (nonIndexedRanges.isEmpty()) { + return ScoredGlobalIndexResult.createEmpty(); + } + + TableScan.Plan plan = + newRawReadBuilder(readType, false).withRowRanges(nonIndexedRanges).newScan().plan(); + ReadBuilder readBuilder = newRawReadBuilder(readType, true); + RoaringNavigableMap64 resultBitmap = new RoaringNavigableMap64(); + Map scoreMap = new HashMap<>(); + int vectorIndex = readType.getFieldIndex(vectorColumn.name()); + int rowIdIndex = readType.getFieldIndex(SpecialFields.ROW_ID.name()); + + try (RecordReader reader = + readBuilder.newRead().executeFilter().createReader(plan)) { + reader.forEachRemaining( + row -> { + if (row.isNullAt(vectorIndex)) { + return; + } + float[] stored = getVector(row, vectorIndex); + if (stored.length != queryVector.length) { + throw new IllegalArgumentException( + String.format( + "Query vector dimension mismatch: expected %d, got %d", + stored.length, queryVector.length)); + } + long rowId = row.getLong(rowIdIndex); + resultBitmap.add(rowId); + scoreMap.put(rowId, computeScore(queryVector, stored, metric)); + }); + } catch (IOException e) { + throw new RuntimeException("Failed to read raw vectors for vector slow search.", e); + } + + return ScoredGlobalIndexResult.create(resultBitmap, scoreMap::get).topK(limit); + } + + private ReadBuilder newRawReadBuilder(RowType readType, boolean includeFilter) { + ReadBuilder readBuilder = table.newReadBuilder().withReadType(readType); + if (partitionFilter != null) { + readBuilder.withPartitionFilter(partitionFilter); + } + if (includeFilter && filter != null) { + readBuilder.withFilter(filter); + } + return readBuilder; + } + + private List nonIndexedRanges( + TableScan.Plan allDataPlan, List splits) { + List dataRanges = new ArrayList<>(); + for (Split split : allDataPlan.splits()) { + if (split instanceof IndexedSplit) { + dataRanges.addAll(((IndexedSplit) split).rowRanges()); + continue; + } + if (!(split instanceof DataSplit)) { + continue; + } + for (DataFileMeta file : ((DataSplit) split).dataFiles()) { + if (file.firstRowId() != null) { + dataRanges.add(file.nonNullRowIdRange()); + } + } + } + + List indexedRanges = new ArrayList<>(); + for (VectorSearchSplit split : splits) { + indexedRanges.add(new Range(split.rowRangeStart(), split.rowRangeEnd())); + } + indexedRanges = Range.sortAndMergeOverlap(indexedRanges, true); + + List ranges = new ArrayList<>(); + for (Range dataRange : Range.sortAndMergeOverlap(dataRanges, true)) { + ranges.addAll(dataRange.exclude(indexedRanges)); + } + return Range.sortAndMergeOverlap(ranges, true); + } + + private float[] getVector(InternalRow row, int vectorIndex) { + if (vectorColumn.type().getTypeRoot() == DataTypeRoot.VECTOR) { + InternalVector vector = row.getVector(vectorIndex); + return vector.toFloatArray(); + } else if (vectorColumn.type().getTypeRoot() == DataTypeRoot.ARRAY) { + InternalArray array = row.getArray(vectorIndex); + return array.toFloatArray(); + } + throw new IllegalArgumentException( + "Unsupported vector column type: " + vectorColumn.type()); + } + + private String slowSearchMetric(@Nullable GlobalIndexer globalIndexer) { + String metric = null; + if (globalIndexer != null) { + if (!(globalIndexer instanceof VectorGlobalIndexer)) { + throw new IllegalArgumentException( + "Index type '" + + globalIndexer.getClass().getName() + + "' does not provide vector metric for slow search."); + } + metric = ((VectorGlobalIndexer) globalIndexer).metric(); + } + if (metric == null) { + return "l2"; + } + return metric.toLowerCase().replace('-', '_'); + } + + private static IndexFileMeta firstVectorIndexFile(List splits) { + for (VectorSearchSplit split : splits) { + if (!split.vectorIndexFiles().isEmpty()) { + return split.vectorIndexFiles().get(0); + } + } + throw new IllegalArgumentException("No vector index files found."); + } + + private static float computeScore(float[] query, float[] stored, String metric) { + if ("l2".equals(metric)) { + float sumSq = 0; + for (int i = 0; i < query.length; i++) { + float diff = query[i] - stored[i]; + sumSq += diff * diff; + } + return 1.0f / (1.0f + sumSq); + } else if ("cosine".equals(metric)) { + float dot = 0; + float normA = 0; + float normB = 0; + for (int i = 0; i < query.length; i++) { + dot += query[i] * stored[i]; + normA += query[i] * query[i]; + normB += stored[i] * stored[i]; + } + float denominator = (float) (Math.sqrt(normA) * Math.sqrt(normB)); + return denominator == 0 ? 0 : dot / denominator; + } else if ("inner_product".equals(metric)) { + float dot = 0; + for (int i = 0; i < query.length; i++) { + dot += query[i] * stored[i]; + } + return dot; + } + throw new IllegalArgumentException("Unknown vector search metric: " + metric); + } + + private static List> emptyOptionalResults(int n) { + List> results = new ArrayList<>(n); + for (int i = 0; i < n; i++) { + results.add(Optional.empty()); + } + return results; + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java index 1da9ad554f1c..ae31dd03253c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java @@ -23,6 +23,7 @@ import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; @@ -51,14 +52,25 @@ public BatchVectorReadImpl( DataField vectorColumn, float[][] vectors, Map options) { - super(table, filter, limit, vectorColumn, options); + this(table, null, filter, limit, vectorColumn, vectors, options); + } + + public BatchVectorReadImpl( + FileStoreTable table, + PartitionPredicate partitionFilter, + Predicate filter, + int limit, + DataField vectorColumn, + float[][] vectors, + Map options) { + super(table, partitionFilter, filter, limit, vectorColumn, options); this.vectors = vectors; } @Override public List readBatch(List splits) { int n = vectors.length; - if (splits.isEmpty()) { + if (splits.isEmpty() && fastSearch()) { List empty = new ArrayList<>(n); for (int i = 0; i < n; i++) { empty.add(GlobalIndexResult.createEmpty()); @@ -66,9 +78,22 @@ public List readBatch(List splits) { return empty; } + GlobalIndexer globalIndexer = splits.isEmpty() ? null : createGlobalIndexer(splits); + ScoredGlobalIndexResult[] indexedResults = + splits.isEmpty() ? emptyScoredResults(n) : readIndexedBatch(splits, globalIndexer); + + List results = new ArrayList<>(n); + for (int i = 0; i < n; i++) { + results.add(withSlowSearch(indexedResults[i], splits, globalIndexer, vectors[i])); + } + return results; + } + + protected ScoredGlobalIndexResult[] readIndexedBatch( + List splits, GlobalIndexer globalIndexer) { + int n = vectors.length; RoaringNavigableMap64 preFilter = preFilter(splits).orElse(null); - GlobalIndexer globalIndexer = createGlobalIndexer(splits); IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); int parallelism = table.coreOptions().toConfiguration().get(GLOBAL_INDEX_THREAD_NUM); @@ -104,11 +129,6 @@ public List readBatch(List splits) { } } } - - List results = new ArrayList<>(n); - for (int i = 0; i < n; i++) { - results.add(merged[i].topK(limit)); - } - return results; + return merged; } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilderImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilderImpl.java index 3e8dd877df98..3c15b7e67a99 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilderImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilderImpl.java @@ -114,6 +114,7 @@ public BatchVectorRead newBatchVectorRead() { for (float[] vector : vectors) { checkNotNull(vector, "Search vector element cannot be null"); } - return new BatchVectorReadImpl(table, filter, limit, vectorColumn, vectors, options); + return new BatchVectorReadImpl( + table, partitionFilter, filter, limit, vectorColumn, vectors, options); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index eab7ab273737..75a4ae73fed2 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -23,6 +23,7 @@ import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; @@ -51,19 +52,39 @@ public VectorReadImpl( DataField vectorColumn, float[] vector, Map options) { - super(table, filter, limit, vectorColumn, options); + this(table, null, filter, limit, vectorColumn, vector, options); + } + + public VectorReadImpl( + FileStoreTable table, + PartitionPredicate partitionFilter, + Predicate filter, + int limit, + DataField vectorColumn, + float[] vector, + Map options) { + super(table, partitionFilter, filter, limit, vectorColumn, options); this.vector = vector; } @Override public GlobalIndexResult read(List splits) { - if (splits.isEmpty()) { + if (splits.isEmpty() && fastSearch()) { return GlobalIndexResult.createEmpty(); } + GlobalIndexer globalIndexer = splits.isEmpty() ? null : createGlobalIndexer(splits); + ScoredGlobalIndexResult result = + splits.isEmpty() + ? ScoredGlobalIndexResult.createEmpty() + : readIndexed(splits, globalIndexer); + return withSlowSearch(result, splits, globalIndexer, vector); + } + + protected ScoredGlobalIndexResult readIndexed( + List splits, GlobalIndexer globalIndexer) { RoaringNavigableMap64 preFilter = preFilter(splits).orElse(null); - GlobalIndexer globalIndexer = createGlobalIndexer(splits); IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); int parallelism = table.coreOptions().toConfiguration().get(GLOBAL_INDEX_THREAD_NUM); @@ -93,6 +114,6 @@ public GlobalIndexResult read(List splits) { merged = merged.or(splitResult.get()); } } - return merged.topK(limit); + return merged; } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java index 0ca9612436d4..6f657288d0a4 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java @@ -107,6 +107,6 @@ public VectorScan newVectorScan() { @Override public VectorRead newVectorRead() { checkNotNull(vector, "vector must be set via withVector()"); - return new VectorReadImpl(table, filter, limit, vectorColumn, vector, options); + return new VectorReadImpl(table, partitionFilter, filter, limit, vectorColumn, vector, options); } } diff --git a/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java b/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java index 3fcd8f2ae497..7046c085a686 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java @@ -18,6 +18,7 @@ package org.apache.paimon.table; +import org.apache.paimon.CoreOptions; import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.GenericRow; import org.apache.paimon.globalindex.DataEvolutionBatchScan; @@ -36,6 +37,7 @@ import org.apache.paimon.table.source.DataSplit; import org.apache.paimon.table.source.ReadBuilder; import org.apache.paimon.table.source.Split; +import org.apache.paimon.table.source.TableScan; import org.apache.paimon.types.DataTypes; import org.apache.paimon.utils.Range; import org.apache.paimon.utils.RoaringNavigableMap64; @@ -44,6 +46,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.stream.Collectors; @@ -130,6 +133,47 @@ public void testBTreeGlobalIndexWithCoreScan() throws Exception { assertThat(readF1).containsExactly("a200", "a300", "a400", "a56789"); } + @Test + public void testBTreeGlobalIndexFastSearchControlsUnindexedData() throws Exception { + write(500L); + createIndex("f1"); + + FileStoreTable table = (FileStoreTable) catalog.getTable(identifier()); + BatchWriteBuilder writeBuilder = table.newBatchWriteBuilder(); + try (BatchTableWrite write0 = writeBuilder.newWrite()) { + for (int i = 500; i < 1000; i++) { + write0.write( + GenericRow.of( + i, + BinaryString.fromString("a" + i), + BinaryString.fromString("b" + i))); + } + BatchTableCommit commit = writeBuilder.newCommit(); + commit.commit(write0.prepareCommit()); + } + + table = (FileStoreTable) catalog.getTable(identifier()); + Predicate predicate = + new PredicateBuilder(table.rowType()) + .in( + 1, + Arrays.asList( + BinaryString.fromString("a100"), + BinaryString.fromString("a700"))); + + ReadBuilder readBuilder = table.newReadBuilder().withFilter(predicate); + List fastSearchResult = readF1(readBuilder, readBuilder.newScan().plan()); + assertThat(fastSearchResult).containsExactly("a100"); + + table = + table.copy( + Collections.singletonMap( + CoreOptions.GLOBAL_INDEX_FAST_SEARCH.key(), "false")); + readBuilder = table.newReadBuilder().withFilter(predicate); + List slowSearchResult = readF1(readBuilder, readBuilder.newScan().plan()); + assertThat(slowSearchResult).containsExactly("a100", "a700"); + } + @Test public void testMultipleBTreeIndices() throws Exception { write(100000L); @@ -237,6 +281,16 @@ private List indexSplits( .collect(Collectors.toList()); } + private List readF1(ReadBuilder readBuilder, TableScan.Plan plan) + throws Exception { + List readF1 = new ArrayList<>(); + readBuilder + .newRead() + .createReader(plan) + .forEachRemaining(row -> readF1.add(row.getString(1).toString())); + return readF1; + } + private RoaringNavigableMap64 globalIndexScan(FileStoreTable table, Predicate predicate) throws Exception { try (GlobalIndexScanner scanner = diff --git a/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java b/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java index 522d9fa732ac..07a63dad09eb 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java @@ -30,6 +30,7 @@ import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.globalindex.btree.BTreeGlobalIndexerFactory; +import org.apache.paimon.globalindex.testvector.TestVectorGlobalIndexer; import org.apache.paimon.globalindex.testvector.TestVectorGlobalIndexerFactory; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.io.CompactIncrement; @@ -240,6 +241,108 @@ public void testVectorSearchEmptyResult() throws Exception { assertThat(result.results().isEmpty()).isTrue(); } + @Test + public void testVectorSearchScansUnindexedDataWhenFastSearchDisabled() throws Exception { + catalog.createTable( + identifier("slow_search_cosine_table"), + Schema.newBuilder() + .column("id", DataTypes.INT()) + .column(VECTOR_FIELD_NAME, new ArrayType(DataTypes.FLOAT())) + .option(CoreOptions.BUCKET.key(), "-1") + .option(CoreOptions.ROW_TRACKING_ENABLED.key(), "true") + .option(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true") + .option(CoreOptions.GLOBAL_INDEX_FAST_SEARCH.key(), "false") + .option("test.vector.dimension", String.valueOf(DIMENSION)) + .option("test.vector.metric", "cosine") + .build(), + false); + FileStoreTable table = getTable(identifier("slow_search_cosine_table")); + + float[][] vectors = { + {0.0f, 1.0f}, + {0.1f, 0.9f}, + {0.5f, 0.0f}, + {0.99f, 0.01f} + }; + writeVectors(table, vectors); + + buildAndCommitVectorIndex(table, new float[][] {vectors[0], vectors[1]}, new Range(0, 1)); + + TestVectorGlobalIndexer.resetMetricCalls(); + GlobalIndexResult defaultResult = + table.newVectorSearchBuilder() + .withVector(new float[] {1.0f, 0.0f}) + .withLimit(2) + .withVectorColumn(VECTOR_FIELD_NAME) + .executeLocal(); + + assertThat(defaultResult.results()).containsExactlyInAnyOrder(2L, 3L); + assertThat(TestVectorGlobalIndexer.metricCalls()).isGreaterThan(0); + } + + @Test + public void testVectorSearchFastSearchSkipsUnindexedDataByDefault() throws Exception { + catalog.createTable( + identifier("fast_search_table"), + vectorSchemaBuilder(VECTOR_FIELD_NAME).build(), + false); + FileStoreTable table = getTable(identifier("fast_search_table")); + + float[][] vectors = { + {0.0f, 1.0f}, + {0.1f, 0.9f}, + {1.0f, 0.0f}, + {0.95f, 0.05f} + }; + writeVectors(table, vectors); + + buildAndCommitVectorIndex(table, new float[][] {vectors[0], vectors[1]}, new Range(0, 1)); + + GlobalIndexResult result = + table.newVectorSearchBuilder() + .withVector(new float[] {1.0f, 0.0f}) + .withLimit(2) + .withVectorColumn(VECTOR_FIELD_NAME) + .executeLocal(); + + assertThat(result.results()).doesNotContain(2L, 3L); + } + + @Test + public void testVectorSearchSlowSearchScansFilteredUnindexedData() throws Exception { + catalog.createTable( + identifier("slow_search_filtered_table"), + vectorSchemaBuilder(VECTOR_FIELD_NAME) + .option(CoreOptions.GLOBAL_INDEX_FAST_SEARCH.key(), "false") + .build(), + false); + FileStoreTable table = getTable(identifier("slow_search_filtered_table")); + + float[][] vectors = { + {0.0f, 1.0f}, + {0.1f, 0.9f}, + {1.0f, 0.0f}, + {0.95f, 0.05f} + }; + writeVectors(table, vectors); + + Range indexedRange = new Range(0, 1); + buildAndCommitVectorIndex(table, new float[][] {vectors[0], vectors[1]}, indexedRange); + buildAndCommitBTreeIndex(table, new int[] {0, 1}, indexedRange); + + Predicate filter = new PredicateBuilder(table.rowType()).greaterOrEqual(0, 2); + GlobalIndexResult defaultResult = + table.newVectorSearchBuilder() + .withVector(new float[] {1.0f, 0.0f}) + .withLimit(2) + .withVectorColumn(VECTOR_FIELD_NAME) + .withFilter(filter) + .executeLocal(); + + assertThat(defaultResult).isInstanceOf(ScoredGlobalIndexResult.class); + assertThat(defaultResult.results()).contains(2L, 3L); + } + @Test public void testVectorSearchTopKLimit() throws Exception { createTableDefault(); diff --git a/paimon-lumina/src/main/java/org/apache/paimon/lumina/index/LuminaVectorGlobalIndexer.java b/paimon-lumina/src/main/java/org/apache/paimon/lumina/index/LuminaVectorGlobalIndexer.java index 276cc5aa8543..69b73a6bce94 100644 --- a/paimon-lumina/src/main/java/org/apache/paimon/lumina/index/LuminaVectorGlobalIndexer.java +++ b/paimon-lumina/src/main/java/org/apache/paimon/lumina/index/LuminaVectorGlobalIndexer.java @@ -21,7 +21,7 @@ import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReader; import org.apache.paimon.globalindex.GlobalIndexWriter; -import org.apache.paimon.globalindex.GlobalIndexer; +import org.apache.paimon.globalindex.VectorGlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; import org.apache.paimon.options.Options; @@ -31,7 +31,7 @@ import java.util.concurrent.ExecutorService; /** Lumina vector global indexer. */ -public class LuminaVectorGlobalIndexer implements GlobalIndexer { +public class LuminaVectorGlobalIndexer implements VectorGlobalIndexer { private final DataType fieldType; private final LuminaVectorIndexOptions options; @@ -53,4 +53,9 @@ public GlobalIndexReader createReader( ExecutorService executor) { return new LuminaVectorGlobalIndexReader(fileReader, files, fieldType, options, executor); } + + @Override + public String metric() { + return options.metric().getLuminaName(); + } } diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py index 78c8db22e5e1..9a273699c738 100644 --- a/paimon-python/pypaimon/common/options/core_options.py +++ b/paimon-python/pypaimon/common/options/core_options.py @@ -618,6 +618,17 @@ class CoreOptions: .with_description("Whether to enable global index for scan.") ) + GLOBAL_INDEX_FAST_SEARCH: ConfigOption[bool] = ( + ConfigOptions.key("global-index.fast-search") + .boolean_type() + .default_value(True) + .with_description( + "Whether global index queries only search indexed files. " + "Set to false to also scan files not covered by global indexes " + "when supported." + ) + ) + GLOBAL_INDEX_THREAD_NUM: ConfigOption[int] = ( ConfigOptions.key("global-index.thread-num") .int_type() @@ -1091,6 +1102,9 @@ def commit_max_retry_wait(self) -> int: def global_index_enabled(self, default=None): return self.options.get(CoreOptions.GLOBAL_INDEX_ENABLED, default) + def global_index_fast_search(self): + return self.options.get(CoreOptions.GLOBAL_INDEX_FAST_SEARCH) + def global_index_thread_num(self) -> Optional[int]: return self.options.get(CoreOptions.GLOBAL_INDEX_THREAD_NUM) diff --git a/paimon-python/pypaimon/globalindex/global_index_reader.py b/paimon-python/pypaimon/globalindex/global_index_reader.py index 84825c705607..d9a3ae3b6dd1 100644 --- a/paimon-python/pypaimon/globalindex/global_index_reader.py +++ b/paimon-python/pypaimon/globalindex/global_index_reader.py @@ -55,6 +55,9 @@ def on_done(f): class GlobalIndexReader(ABC): """Index reader for global index. All visit methods return Future[Optional[GlobalIndexResult]].""" + def vector_metric(self): + raise NotImplementedError("Vector metric not supported by this reader") + def visit_vector_search(self, vector_search: 'VectorSearch') -> 'Future[Optional[GlobalIndexResult]]': raise NotImplementedError("Vector search not supported by this reader") diff --git a/paimon-python/pypaimon/globalindex/global_index_scanner.py b/paimon-python/pypaimon/globalindex/global_index_scanner.py index 924d30608e27..b945fee5b8a9 100644 --- a/paimon-python/pypaimon/globalindex/global_index_scanner.py +++ b/paimon-python/pypaimon/globalindex/global_index_scanner.py @@ -122,7 +122,7 @@ def index_file_filter(entry): global_index_meta = entry.index_file.global_index_meta if global_index_meta is None: return False - return global_index_meta.index_field_id in filter_field_ids + return _indexed_field_matches(global_index_meta, filter_field_ids) if snapshot is None: snapshot = table.snapshot_manager().get_latest_snapshot() @@ -140,6 +140,64 @@ def index_file_filter(entry): thread_num=table.options.global_index_thread_num(), ) + @staticmethod + def indexed_ranges(table, snapshot) -> list: + from pypaimon.index.index_file_handler import IndexFileHandler + + index_file_handler = IndexFileHandler(table=table) + entries = index_file_handler.scan(snapshot, lambda entry: True) + ranges = [] + for entry in entries: + global_index_meta = entry.index_file.global_index_meta + if global_index_meta is not None: + ranges.append( + Range(global_index_meta.row_range_start, + global_index_meta.row_range_end) + ) + return ranges + + @staticmethod + def predicate_indexed_ranges(table, partition_filter, predicate, snapshot) -> list: + if predicate is None: + return [] + + from pypaimon.index.index_file_handler import IndexFileHandler + + filter_field_names = _get_all_fields(predicate) + filter_field_ids = set() + for field_item in table.fields: + if field_item.name in filter_field_names: + filter_field_ids.add(field_item.id) + + def index_file_filter(entry): + if partition_filter is not None: + if not partition_filter.test(entry.partition): + return False + global_index_meta = entry.index_file.global_index_meta + if global_index_meta is None: + return False + return _indexed_field_matches(global_index_meta, filter_field_ids) + + index_file_handler = IndexFileHandler(table=table) + entries = index_file_handler.scan(snapshot, index_file_filter) + coverage_by_field = {} + for entry in entries: + global_index_meta = entry.index_file.global_index_meta + if global_index_meta is None: + continue + range_key = Range( + global_index_meta.row_range_start, + global_index_meta.row_range_end, + ) + coverage_by_field.setdefault( + global_index_meta.index_field_id, []).append(range_key) + for extra_field_id in global_index_meta.extra_field_ids or []: + coverage_by_field.setdefault(extra_field_id, []).append(range_key) + + field_by_name = {field.name: field for field in table.fields} + return _predicate_indexed_ranges( + predicate, field_by_name, coverage_by_field) or [] + def scan(self, predicate: Optional[Predicate]) -> Optional[GlobalIndexResult]: """Scan the global index with the given predicate.""" return self._evaluator.evaluate(predicate) @@ -214,3 +272,51 @@ def _create_inner_readers(index_type, file_io, index_path, field, io_metas, exec raise ValueError( "Unsupported global-index type in scanner: '%s'" % index_type) + + +def _predicate_indexed_ranges(predicate, field_by_name, coverage_by_field): + method = getattr(predicate, "method", None) + if method not in ("and", "or"): + field = field_by_name.get(getattr(predicate, "field", None)) + if field is None: + return None + coverage = coverage_by_field.get(field.id) + if not coverage: + return None + return Range.sort_and_merge_overlap( + coverage, merge=True, adjacent=True) + + child_coverages = [ + _predicate_indexed_ranges(child, field_by_name, coverage_by_field) + for child in getattr(predicate, "literals", []) or [] + ] + + if method == "or": + result = None + for child_coverage in child_coverages: + if child_coverage is None: + return None + result = ( + Range.and_(result, child_coverage) + if result is not None else child_coverage + ) + return result + + result = None + for child_coverage in child_coverages: + if child_coverage is None: + continue + result = ( + Range.and_(result, child_coverage) + if result is not None else child_coverage + ) + return result + + +def _indexed_field_matches(global_index_meta, filter_field_ids): + if global_index_meta.index_field_id in filter_field_ids: + return True + for field_id in global_index_meta.extra_field_ids or []: + if field_id in filter_field_ids: + return True + return False diff --git a/paimon-python/pypaimon/globalindex/lumina/lumina_vector_global_index_reader.py b/paimon-python/pypaimon/globalindex/lumina/lumina_vector_global_index_reader.py index cabd4919111f..28a8f8ab320e 100644 --- a/paimon-python/pypaimon/globalindex/lumina/lumina_vector_global_index_reader.py +++ b/paimon-python/pypaimon/globalindex/lumina/lumina_vector_global_index_reader.py @@ -118,6 +118,10 @@ def visit_vector_search(self, vector_search): return _completed_future(DictBasedScoredIndexResult(id_to_scores)) + def vector_metric(self): + self._ensure_loaded() + return self._index_meta.distance_metric + def _ensure_loaded(self): if self._searcher is not None: return diff --git a/paimon-python/pypaimon/read/read_builder.py b/paimon-python/pypaimon/read/read_builder.py index 5537c97dd647..cc441cb51166 100644 --- a/paimon-python/pypaimon/read/read_builder.py +++ b/paimon-python/pypaimon/read/read_builder.py @@ -38,6 +38,8 @@ def __init__(self, table): self.table: FileStoreTable = table self._predicate: Optional[Predicate] = None + self._partition_predicate: Optional[Predicate] = None + self._read_type: Optional[List[DataField]] = None # ``_projection`` stores the user-facing name list from # :meth:`with_projection`. When dotted names are present, # ``_nested_paths`` is also populated and takes precedence @@ -50,6 +52,16 @@ def with_filter(self, predicate: Predicate) -> 'ReadBuilder': self._predicate = predicate return self + def with_partition_filter(self, predicate: Predicate) -> 'ReadBuilder': + self._partition_predicate = predicate + return self + + def with_read_type(self, read_type: List[DataField]) -> 'ReadBuilder': + self._read_type = list(read_type) + self._projection = None + self._nested_paths = None + return self + def with_projection(self, projection: List[str]) -> 'ReadBuilder': """Project to the given column names. @@ -62,6 +74,7 @@ def with_projection(self, projection: List[str]) -> 'ReadBuilder': Precedence: if a dotted name matches an actual top-level field, the top-level match wins and the name is not walked as a struct path. """ + self._read_type = None self._projection = projection if projection and any('.' in name for name in projection): self._nested_paths = self._resolve_dotted_paths(projection) @@ -77,7 +90,8 @@ def new_scan(self) -> TableScan: return TableScan( table=self.table, predicate=self._predicate, - limit=self._limit + limit=self._limit, + partition_predicate=self._partition_predicate, ) def new_read(self) -> TableRead: @@ -137,6 +151,9 @@ def explain(self, verbose: bool = False) -> ExplainResult: ) def read_type(self) -> List[DataField]: + if self._read_type is not None: + return self._read_type + table_fields = self.table.fields if not self._projection and not self._nested_paths: diff --git a/paimon-python/pypaimon/read/scanner/file_scanner.py b/paimon-python/pypaimon/read/scanner/file_scanner.py index 650bbe8ca402..67d2c626f3ef 100755 --- a/paimon-python/pypaimon/read/scanner/file_scanner.py +++ b/paimon-python/pypaimon/read/scanner/file_scanner.py @@ -24,6 +24,7 @@ from pypaimon.common.predicate import Predicate from pypaimon.globalindex import ScoredGlobalIndexResult +from pypaimon.globalindex.global_index_result import GlobalIndexResult from pypaimon.manifest.index_manifest_file import IndexManifestFile from pypaimon.manifest.manifest_file_manager import ManifestFileManager from pypaimon.manifest.manifest_list_manager import ManifestListManager @@ -49,6 +50,8 @@ from pypaimon.read.scanner.primary_key_table_split_generator import \ PrimaryKeyTableSplitGenerator from pypaimon.read.split import DataSplit +from pypaimon.table.special_fields import SpecialFields +from pypaimon.utils.roaring_bitmap import RoaringBitmap64 from pypaimon.snapshot.snapshot import Snapshot from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.source.deletion_file import DeletionFile @@ -326,6 +329,14 @@ def _create_data_evolution_split_generator(self): global_index_result = self._global_index_result if self._global_index_result is not None \ else self._eval_global_index(snapshot) if global_index_result is not None: + scan_unindexed_data = ( + self._global_index_result is None + and not isinstance(global_index_result, ScoredGlobalIndexResult) + and not self.table.options.global_index_fast_search() + ) + if scan_unindexed_data: + global_index_result = self._with_unindexed_rows( + global_index_result, manifest_files, snapshot) row_ranges = global_index_result.results().to_range_list() if isinstance(global_index_result, ScoredGlobalIndexResult): score_getter = global_index_result.score_getter() @@ -350,6 +361,64 @@ def _create_data_evolution_split_generator(self): score_getter ) + def _with_unindexed_rows(self, indexed_result, manifest_files, snapshot): + data_ranges = [] + entries = self.read_manifest_entries(manifest_files) + for entry in entries: + first_row_id = entry.file.first_row_id + if first_row_id is not None: + data_ranges.append(entry.file.row_id_range()) + + from pypaimon.globalindex.global_index_scanner import GlobalIndexScanner + from pypaimon.utils.range import Range + + predicate_indexed_ranges = Range.sort_and_merge_overlap( + GlobalIndexScanner.predicate_indexed_ranges( + self.table, + self.partition_key_predicate, + self.predicate, + snapshot, + ), + merge=True, + adjacent=True, + ) + unindexed_ranges = [] + for data_range in Range.sort_and_merge_overlap( + data_ranges, merge=True, adjacent=True): + unindexed_ranges.extend(data_range.exclude(predicate_indexed_ranges)) + unindexed_ranges = Range.sort_and_merge_overlap( + unindexed_ranges, merge=True, adjacent=True) + + bitmap = RoaringBitmap64.or_( + indexed_result.results(), + self._matching_unindexed_rows(entries, unindexed_ranges)) + return GlobalIndexResult.create(bitmap) + + def _matching_unindexed_rows(self, entries, row_ranges): + rows = RoaringBitmap64() + if not row_ranges: + return rows + + entries = _filter_manifest_entries_by_row_ranges(entries, row_ranges) + if not entries: + return rows + + split_generator = DataEvolutionSplitGenerator( + self.table, + self.target_split_size, + self.open_file_cost, + self._deletion_files_map(entries), + row_ranges, + ) + splits = split_generator.create_splits(entries) + read_type = SpecialFields.row_type_with_row_id(self.table.fields) + row_id_index = len(read_type) - 1 + reader = self.table.new_read_builder().with_read_type(read_type) \ + .with_filter(self.predicate).new_read() + for row in reader.to_iterator(splits): + rows.add(int(row.get_field(row_id_index))) + return rows + def plan_files(self) -> List[ManifestEntry]: manifest_files, snapshot = self.manifest_scanner() self._scanned_snapshot = snapshot diff --git a/paimon-python/pypaimon/read/table_scan.py b/paimon-python/pypaimon/read/table_scan.py index 36568c618fa5..bceff4e3afb3 100755 --- a/paimon-python/pypaimon/read/table_scan.py +++ b/paimon-python/pypaimon/read/table_scan.py @@ -33,13 +33,15 @@ def __init__( self, table, predicate: Optional[Predicate], - limit: Optional[int] + limit: Optional[int], + partition_predicate: Optional[Predicate] = None ): from pypaimon.table.file_store_table import FileStoreTable self.table: FileStoreTable = table self.predicate = predicate self.limit = limit + self.partition_predicate = partition_predicate self.file_scanner = self._create_file_scanner() def plan(self) -> Plan: @@ -79,7 +81,9 @@ def _create_file_scanner(self) -> FileScanner: earliest_snapshot = snapshot_manager.try_get_earliest_snapshot() latest_snapshot = snapshot_manager.get_latest_snapshot() if earliest_snapshot is None or latest_snapshot is None: - return FileScanner(self.table, lambda: ([], None)) + return FileScanner( + self.table, lambda: ([], None), + partition_predicate=self.partition_predicate) start_timestamp = int(ts[0]) end_timestamp = int(ts[1]) if start_timestamp >= end_timestamp: @@ -87,7 +91,9 @@ def _create_file_scanner(self) -> FileScanner: "Ending timestamp %s should be >= starting timestamp %s." % (end_timestamp, start_timestamp)) if (start_timestamp == end_timestamp or start_timestamp > latest_snapshot.time_millis or end_timestamp < earliest_snapshot.time_millis): - return FileScanner(self.table, lambda: ([], None)) + return FileScanner( + self.table, lambda: ([], None), + partition_predicate=self.partition_predicate) starting_snapshot = snapshot_manager.earlier_or_equal_time_mills(start_timestamp) earliest_snapshot = snapshot_manager.try_get_earliest_snapshot() @@ -118,7 +124,13 @@ def incremental_manifest(): manifests.extend(manifest_files) return manifests, end_snapshot - return FileScanner(self.table, incremental_manifest, self.predicate, self.limit) + return FileScanner( + self.table, + incremental_manifest, + self.predicate, + self.limit, + self.partition_predicate, + ) if has_time_travel: def time_travel_manifest_scanner(): @@ -135,7 +147,8 @@ def time_travel_manifest_scanner(): self.table, time_travel_manifest_scanner, self.predicate, - self.limit + self.limit, + self.partition_predicate, ) def all_manifests(): @@ -146,7 +159,8 @@ def all_manifests(): self.table, all_manifests, self.predicate, - self.limit + self.limit, + self.partition_predicate, ) def with_shard(self, idx_of_this_subtask, number_of_para_subtasks) -> 'TableScan': diff --git a/paimon-python/pypaimon/table/source/batch_vector_search_builder.py b/paimon-python/pypaimon/table/source/batch_vector_search_builder.py index f987680a3661..98b2d8795749 100644 --- a/paimon-python/pypaimon/table/source/batch_vector_search_builder.py +++ b/paimon-python/pypaimon/table/source/batch_vector_search_builder.py @@ -122,6 +122,7 @@ def new_batch_vector_search_read(self): self._limit, self._vector_column, self._query_vectors, + partition_filter=self._partition_filter, filter_=self._filter, options=self._options, ) diff --git a/paimon-python/pypaimon/table/source/vector_search_builder.py b/paimon-python/pypaimon/table/source/vector_search_builder.py index f98547192903..3f904f9f23ae 100644 --- a/paimon-python/pypaimon/table/source/vector_search_builder.py +++ b/paimon-python/pypaimon/table/source/vector_search_builder.py @@ -244,6 +244,7 @@ def new_vector_search_read(self): self._limit, self._vector_column, self._query_vector, + partition_filter=self._partition_filter, filter_=self._filter, options=self._options, ) diff --git a/paimon-python/pypaimon/table/source/vector_search_read.py b/paimon-python/pypaimon/table/source/vector_search_read.py index 005a2ce85be9..1be4f16761cb 100644 --- a/paimon-python/pypaimon/table/source/vector_search_read.py +++ b/paimon-python/pypaimon/table/source/vector_search_read.py @@ -17,6 +17,7 @@ """Vector search read to read index files.""" +import math from abc import ABC, abstractmethod from concurrent.futures import wait @@ -25,6 +26,10 @@ from pypaimon.globalindex.offset_global_index_reader import OffsetGlobalIndexReader from pypaimon.globalindex.vector_search import VectorSearch from pypaimon.globalindex.vector_search_result import DictBasedScoredIndexResult +from pypaimon.utils.range import Range + + +GLOBAL_INDEX_FAST_SEARCH = "global-index.fast-search" class VectorSearchRead(ABC): @@ -56,12 +61,41 @@ def read_batch(self, splits): class AbstractVectorSearchReadImpl: """Base implementation for vector search reads.""" - def __init__(self, table, limit, vector_column, filter_=None, options=None): + GLOBAL_INDEX_FAST_SEARCH = GLOBAL_INDEX_FAST_SEARCH + + def __init__(self, table, limit, vector_column, filter_=None, + options=None, partition_filter=None): self._table = table self._limit = limit self._vector_column = vector_column self._filter = filter_ + self._partition_filter = partition_filter self._options = dict(options or {}) + self._vector_metric = None + + def _search_one(self, query_vector, splits, pre_filter): + # type: (list, list, Optional[RoaringBitmap64]) -> GlobalIndexResult + """Search one query vector across all splits and merge per-split results.""" + futures = [ + self._eval( + split.row_range_start, split.row_range_end, + split.vector_index_files, query_vector, pre_filter + ) + for split in splits + ] + + wait(futures) + + merged_scores = {} + for future in futures: + split_result = future.result() + if split_result is not None: + score_getter = split_result.score_getter() + for row_id in split_result.results(): + if row_id not in merged_scores: + merged_scores[row_id] = score_getter(row_id) + + return DictBasedScoredIndexResult(merged_scores) def _pre_filter(self, splits): # type: (list) -> Optional[RoaringBitmap64] @@ -122,7 +156,7 @@ def _eval(self, row_range_start, row_range_end, vector_index_files, vector=query_vector, limit=self._limit, field_name=self._vector_column.name, - options=self._options, + options=self._index_options(), ) if include_row_ids is not None: vector_search = vector_search.with_include_row_ids(include_row_ids) @@ -131,47 +165,242 @@ def _eval(self, row_range_start, row_range_end, vector_index_files, index_type, file_io, index_path, index_io_meta_list, options ) + if self._slow_search_enabled() and self._vector_metric is None: + self._vector_metric = reader.vector_metric() offset_reader = OffsetGlobalIndexReader(reader, row_range_start, row_range_end) future = offset_reader.visit_vector_search(vector_search) future.add_done_callback(lambda _: reader.close()) return future + def _slow_search_enabled(self): + return not self._fast_search() + + def _fast_search(self): + return str( + self._table_option(GLOBAL_INDEX_FAST_SEARCH, "true") + ).lower() == "true" + + def _with_slow_search(self, result, splits, query_vector): + if not self._slow_search_enabled(): + return result.top_k(self._limit) + + raw_result = self._read_slow_search(splits, query_vector) + return result.or_(raw_result).top_k(self._limit) + + def _read_slow_search(self, splits, query_vector): + from pypaimon.table.special_fields import SpecialFields + + read_type = self._read_type_with_row_id() + range_discovery_builder = self._new_raw_read_builder( + read_type, include_filter=False) + all_data_plan = range_discovery_builder.new_scan().plan() + non_indexed_ranges = self._non_indexed_ranges(all_data_plan, splits) + if not non_indexed_ranges: + return DictBasedScoredIndexResult({}) + + raw_splits = self._wrap_splits_with_row_ranges( + all_data_plan.splits(), non_indexed_ranges) + if not raw_splits: + return DictBasedScoredIndexResult({}) + + read_builder = self._new_raw_read_builder(read_type, include_filter=True) + arrow_table = read_builder.new_read().to_arrow(raw_splits) + if arrow_table is None or arrow_table.num_rows == 0: + return DictBasedScoredIndexResult({}) + + row_id_name = SpecialFields.ROW_ID.name + vector_name = self._vector_column.name + if row_id_name not in arrow_table.column_names: + raise ValueError( + "Vector slow search requires row tracking column %s." + % row_id_name) + if vector_name not in arrow_table.column_names: + raise ValueError( + "Vector slow search read type does not contain vector column %s." + % vector_name) + + metric = self._slow_search_metric() + query = self._normalize_vector(query_vector) + scores = {} + row_ids = arrow_table.column(row_id_name).to_pylist() + vectors = arrow_table.column(vector_name).to_pylist() + for row_id, stored in zip(row_ids, vectors): + if row_id is None or stored is None: + continue + row_id = int(row_id) + if not self._contains_row_id(non_indexed_ranges, row_id): + continue + stored_vector = self._normalize_vector(stored) + if len(stored_vector) != len(query): + raise ValueError( + "Query vector dimension mismatch: expected %d, got %d" + % (len(stored_vector), len(query))) + scores[row_id] = self._compute_score(query, stored_vector, metric) + + return DictBasedScoredIndexResult(scores).top_k(self._limit) + + def _read_type_with_row_id(self): + from pypaimon.table.special_fields import SpecialFields + + fields = list(self._table.fields) + if any(f.name == SpecialFields.ROW_ID.name for f in fields): + return fields + return SpecialFields.row_type_with_row_id(fields) + + def _new_raw_read_builder(self, read_type, include_filter): + read_builder = self._table.new_read_builder().with_read_type(read_type) + if self._partition_filter is not None: + read_builder = read_builder.with_partition_filter(self._partition_filter) + if include_filter and self._filter is not None: + read_builder = read_builder.with_filter(self._filter) + return read_builder + + def _non_indexed_ranges(self, all_data_plan, splits): + data_ranges = [] + for split in all_data_plan.splits(): + data_ranges.extend(self._split_row_ranges(split)) + + indexed_ranges = [ + Range(split.row_range_start, split.row_range_end) + for split in splits + ] + indexed_ranges = Range.sort_and_merge_overlap(indexed_ranges, True) + + ranges = [] + for data_range in Range.sort_and_merge_overlap(data_ranges, True): + ranges.extend(data_range.exclude(indexed_ranges)) + return Range.sort_and_merge_overlap(ranges, True) + + def _split_row_ranges(self, split): + from pypaimon.globalindex.indexed_split import IndexedSplit + + if isinstance(split, IndexedSplit): + return list(split.row_ranges()) + + ranges = [] + for data_file in getattr(split, "files", []) or []: + row_range = self._file_row_range(data_file) + if row_range is not None: + ranges.append(row_range) + return ranges + + def _file_row_range(self, data_file): + try: + row_range = data_file.row_id_range() + except Exception: + row_range = None + if row_range is not None: + return row_range + + first_row_id = getattr(data_file, "first_row_id", None) + row_count = getattr(data_file, "row_count", None) + if first_row_id is None or row_count is None or row_count <= 0: + return None + return Range(int(first_row_id), int(first_row_id) + int(row_count) - 1) + + def _wrap_splits_with_row_ranges(self, splits, row_ranges): + from pypaimon.globalindex.indexed_split import IndexedSplit + + indexed_splits = [] + for split in splits: + if isinstance(split, IndexedSplit): + data_split = split.data_split() + available_ranges = list(split.row_ranges()) + else: + data_split = split + available_ranges = self._split_row_ranges(split) + + expected = Range.and_(available_ranges, row_ranges) + if expected: + indexed_splits.append(IndexedSplit(data_split, expected)) + return indexed_splits + + def _contains_row_id(self, ranges, row_id): + for row_range in ranges: + if row_range.contains(row_id): + return True + return False + + def _index_options(self): + return dict(self._options) + + def _slow_search_metric(self): + metric = self._vector_metric + if metric is None: + return "l2" + return str(metric).lower().replace("-", "_") + + def _table_option(self, key, default=None): + options = self._table_schema_options() + if isinstance(options, dict): + return options.get(key, default) + if hasattr(options, "get"): + try: + value = options.get(key) + return default if value is None else value + except TypeError: + pass + return default + + def _table_schema_options(self): + table_schema = getattr(self._table, "table_schema", None) + return getattr(table_schema, "options", {}) or {} + + @staticmethod + def _normalize_vector(vector): + if hasattr(vector, "as_py"): + vector = vector.as_py() + if hasattr(vector, "tolist"): + vector = vector.tolist() + return [float(v) for v in vector] + + @staticmethod + def _compute_score(query, stored, metric): + if metric == "l2": + sum_sq = 0.0 + for query_value, stored_value in zip(query, stored): + diff = query_value - stored_value + sum_sq += diff * diff + return 1.0 / (1.0 + sum_sq) + if metric == "cosine": + dot = 0.0 + norm_a = 0.0 + norm_b = 0.0 + for query_value, stored_value in zip(query, stored): + dot += query_value * stored_value + norm_a += query_value * query_value + norm_b += stored_value * stored_value + denominator = math.sqrt(norm_a) * math.sqrt(norm_b) + return 0.0 if denominator == 0.0 else dot / denominator + if metric == "inner_product": + dot = 0.0 + for query_value, stored_value in zip(query, stored): + dot += query_value * stored_value + return dot + raise ValueError("Unknown vector search metric: %s" % metric) + class VectorSearchReadImpl(AbstractVectorSearchReadImpl, VectorSearchRead): """Implementation for VectorSearchRead.""" def __init__(self, table, limit, vector_column, query_vector, filter_=None, - options=None): + options=None, partition_filter=None): super().__init__(table, limit, vector_column, - filter_=filter_, options=options) + filter_=filter_, options=options, + partition_filter=partition_filter) self._query_vector = query_vector def read(self, splits): # type: (List[VectorSearchSplit]) -> GlobalIndexResult - if not splits: + if not splits and self._fast_search(): return GlobalIndexResult.create_empty() - pre_filter = self._pre_filter(splits) - futures = [ - self._eval( - split.row_range_start, split.row_range_end, - split.vector_index_files, self._query_vector, pre_filter - ) - for split in splits - ] - - wait(futures) - - merged_scores = {} - for future in futures: - split_result = future.result() - if split_result is not None: - score_getter = split_result.score_getter() - for row_id in split_result.results(): - if row_id not in merged_scores: - merged_scores[row_id] = score_getter(row_id) - - return DictBasedScoredIndexResult(merged_scores).top_k(self._limit) + result = ( + DictBasedScoredIndexResult({}) + if not splits + else self._search_one(self._query_vector, splits, self._pre_filter(splits)) + ) + return self._with_slow_search(result, splits, self._query_vector) class BatchVectorSearchReadImpl(AbstractVectorSearchReadImpl, @@ -179,43 +408,27 @@ class BatchVectorSearchReadImpl(AbstractVectorSearchReadImpl, """Batch vector search read; result ``i`` corresponds to query vector ``i``.""" def __init__(self, table, limit, vector_column, query_vectors, - filter_=None, options=None): + filter_=None, options=None, partition_filter=None): super().__init__(table, limit, vector_column, - filter_=filter_, options=options) + filter_=filter_, options=options, + partition_filter=partition_filter) self._query_vectors = list(query_vectors) def read_batch(self, splits): # type: (List[VectorSearchSplit]) -> List[GlobalIndexResult] n = len(self._query_vectors) - if not splits: + if not splits and self._fast_search(): return [GlobalIndexResult.create_empty() for _ in range(n)] - pre_filter = self._pre_filter(splits) - futures_by_vector = [ - [ - self._eval( - split.row_range_start, split.row_range_end, - split.vector_index_files, vector, pre_filter - ) - for split in splits - ] - for vector in self._query_vectors - ] - - for futures in futures_by_vector: - wait(futures) - results = [] - for futures in futures_by_vector: - merged_scores = {} - for future in futures: - split_result = future.result() - if split_result is not None: - score_getter = split_result.score_getter() - for row_id in split_result.results(): - if row_id not in merged_scores: - merged_scores[row_id] = score_getter(row_id) - results.append(DictBasedScoredIndexResult(merged_scores).top_k(self._limit)) + pre_filter = self._pre_filter(splits) if splits else None + for vector in self._query_vectors: + result = ( + DictBasedScoredIndexResult({}) + if not splits + else self._search_one(vector, splits, pre_filter) + ) + results.append(self._with_slow_search(result, splits, vector)) return results diff --git a/paimon-python/pypaimon/table/source/vector_search_scan.py b/paimon-python/pypaimon/table/source/vector_search_scan.py index 5b8b300d5c19..ecfa709bed54 100644 --- a/paimon-python/pypaimon/table/source/vector_search_scan.py +++ b/paimon-python/pypaimon/table/source/vector_search_scan.py @@ -96,7 +96,12 @@ def index_file_filter(entry): field_id = global_index_meta.index_field_id if vector_column.id == field_id: return True - return field_id in filter_field_ids + if field_id in filter_field_ids: + return True + for extra_field_id in global_index_meta.extra_field_ids or []: + if extra_field_id in filter_field_ids: + return True + return False entries = index_file_handler.scan(snapshot, index_file_filter) all_index_files = [entry.index_file for entry in entries] diff --git a/paimon-python/pypaimon/tests/global_index_test.py b/paimon-python/pypaimon/tests/global_index_test.py index 873e93358455..fcecc7b88a1b 100644 --- a/paimon-python/pypaimon/tests/global_index_test.py +++ b/paimon-python/pypaimon/tests/global_index_test.py @@ -20,13 +20,16 @@ import pyarrow as pa +from pypaimon.common.predicate_builder import PredicateBuilder from pypaimon.globalindex.global_index_result import GlobalIndexResult +from pypaimon.globalindex.global_index_scanner import GlobalIndexScanner from pypaimon.index.index_file_handler import IndexFileHandler from pypaimon.snapshot.snapshot_manager import SnapshotManager from pypaimon.tests.data_evolution_test_helpers import ( BatchModeMixin, DataEvolutionTestBase, ) +from pypaimon.utils.roaring_bitmap import RoaringBitmap64 from pypaimon.utils.range import Range @@ -122,3 +125,34 @@ def spy_scan(self_h, snapshot, entry_filter=None): "GlobalIndexScanner.create self-fetched latest snapshot, " "so global index used latest while manifest used the " "time-travel snapshot — silent correctness bug.") + + def test_fast_search_false_filters_unindexed_rows_exactly(self): + table = self._create_table().copy({'global-index.fast-search': 'false'}) + self._write_arrow(table, pa.table( + {'id': [0, 1, 2, 3], 'name': ['a', 'b', 'c', 'd'], + 'age': [0, 1, 2, 3], 'city': ['x', 'x', 'y', 'y']}, + schema=self.pa_schema)) + + predicate = PredicateBuilder(table.fields).is_in('id', [0, 2]) + + indexed = RoaringBitmap64() + indexed.add(0) + + class FakeScanner: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + def scan(self, _predicate): + return GlobalIndexResult.create(indexed) + + with patch.object(GlobalIndexScanner, 'create', + return_value=FakeScanner()), \ + patch.object(GlobalIndexScanner, 'predicate_indexed_ranges', + return_value=[Range(0, 1)]): + rb = table.new_read_builder().with_filter(predicate) + result = rb.new_read().to_arrow(rb.new_scan().plan().splits()) + + self.assertEqual([0, 2], result.column('id').to_pylist()) diff --git a/paimon-python/pypaimon/tests/vector_search_filter_test.py b/paimon-python/pypaimon/tests/vector_search_filter_test.py index 5aa886220faf..c08a78e84380 100644 --- a/paimon-python/pypaimon/tests/vector_search_filter_test.py +++ b/paimon-python/pypaimon/tests/vector_search_filter_test.py @@ -30,6 +30,8 @@ from typing import List from unittest import mock +import pyarrow as pa + from pypaimon.common.predicate import Predicate from pypaimon.common.predicate_builder import PredicateBuilder from pypaimon.globalindex.btree.btree_index_meta import BTreeIndexMeta @@ -50,19 +52,19 @@ class _StubSchema: - def __init__(self): - self.options = {} + def __init__(self, options=None): + self.options = dict(options or {}) class _StubTable: """Minimal FileStoreTable stand-in.""" - def __init__(self, fields, entries, partition_fields=None): + def __init__(self, fields, entries, partition_fields=None, options=None): self.fields = fields self.partition_keys_fields = partition_fields or [] self.partition_keys: List[str] = [ f.name for f in self.partition_keys_fields] - self.table_schema = _StubSchema() + self.table_schema = _StubSchema(options) self.file_io = object() self._entries = entries @@ -100,11 +102,13 @@ def _field(fid, name, dtype="INT"): def _entry(partition_row, field_id, index_type, file_name, - row_range_start, row_range_end, external_path=None): + row_range_start, row_range_end, external_path=None, + extra_field_ids=None): meta = GlobalIndexMeta( row_range_start=row_range_start, row_range_end=row_range_end, index_field_id=field_id, + extra_field_ids=extra_field_ids, index_meta=b"", ) index_file = IndexFileMeta( @@ -875,6 +879,29 @@ def test_scan_attaches_overlapping_scalar_index_files(self): (splits_sorted[1].row_range_start, splits_sorted[1].row_range_end)) + def test_scan_attaches_scalar_index_when_filter_hits_extra_field(self): + category_field = _field(2, "category") + entry = _entry( + None, field_id=0, index_type="btree", + file_name="id-category-btree.index", + row_range_start=0, row_range_end=9, + extra_field_ids=[category_field.id]) + self.entries = [self.entries[0], self.entries[1], entry] + self.table = _StubTable( + fields=[self.id_field, self.embedding_field, category_field], + entries=self.entries) + mock.patch.stopall() + _patch_snapshot(self, self.entries) + + filter_pred = Predicate(method="equal", index=2, field="category", + literals=[1]) + splits = self._builder(filter_pred).new_vector_search_scan().scan().splits() + + for split in splits: + self.assertEqual( + ["id-category-btree.index"], + [f.file_name for f in split.scalar_index_files]) + def test_read_threads_prefilter_bitmap_as_include_row_ids(self): """preFilter bitmap from scanner.scan(filter) must reach each split's VectorSearch, offset-rebased to local coords by OffsetGlobalIndexReader. @@ -1431,6 +1458,137 @@ def test_hybrid_search_partition_filter_prunes_full_text_route(self): class VectorSearchManySplitsTest(unittest.TestCase): + def test_fast_search_controls_unindexed_range_scan(self): + from pypaimon.globalindex.vector_search_result import ( + DictBasedScoredIndexResult, + ) + from pypaimon.manifest.schema.data_file_meta import DataFileMeta + from pypaimon.manifest.schema.simple_stats import SimpleStats + from pypaimon.read.plan import Plan + from pypaimon.read.split import DataSplit + from pypaimon.table.source.vector_search_read import VectorSearchReadImpl + from pypaimon.table.source.vector_search_split import VectorSearchSplit + from pypaimon.utils.range import Range + + embedding_field = _field(1, "embedding", "FLOAT") + entry = _entry(None, field_id=1, index_type="lumina-vector-ann", + file_name="vec-0.index", + row_range_start=0, row_range_end=0) + raw_file = DataFileMeta( + file_name="data-0.orc", + file_size=10, + row_count=3, + min_key=GenericRow([], []), + max_key=GenericRow([], []), + key_stats=SimpleStats.empty_stats(), + value_stats=SimpleStats.empty_stats(), + min_sequence_number=0, + max_sequence_number=0, + schema_id=0, + level=0, + extra_files=[], + first_row_id=0, + ) + raw_split = DataSplit(files=[raw_file], partition=None, bucket=0) + table = _StubTable(fields=[embedding_field], entries=[entry]) + table.index_search_options = [] + + class _RawScan: + def plan(self_inner): + return Plan([raw_split]) + + class _RawRead: + def to_arrow(self_inner, splits): + table.raw_read_splits = splits + return pa.table({ + "embedding": [[2.0, 0.0], [4.0, 0.0]], + "_ROW_ID": [1, 2], + }) + + class _RawReadBuilder: + def __init__(self_inner): + self_inner.read_type = None + self_inner.filter = None + + def with_read_type(self_inner, read_type): + self_inner.read_type = read_type + table.raw_read_type = read_type + return self_inner + + def with_filter(self_inner, predicate): + self_inner.filter = predicate + table.raw_filter = predicate + return self_inner + + def with_partition_filter(self_inner, partition_filter): + table.raw_partition_filter = partition_filter + return self_inner + + def new_scan(self_inner): + return _RawScan() + + def new_read(self_inner): + return _RawRead() + + table.new_read_builder = lambda: _RawReadBuilder() + + def _fake_create(index_type, file_io, index_path, + index_io_meta_list, options=None): + class _FakeReader: + def vector_metric(self_inner): + table.metric_calls = getattr(table, "metric_calls", 0) + 1 + return "l2" + + def visit_vector_search(self_inner, vs): + table.index_search_options.append(dict(vs.options)) + return _completed_future( + DictBasedScoredIndexResult({0: 0.9})) + + def close(self_inner): + pass + + return _FakeReader() + + splits = [ + VectorSearchSplit( + row_range_start=0, row_range_end=0, + vector_index_files=[entry.index_file]) + ] + + with mock.patch( + "pypaimon.table.source.vector_search_read._create_vector_reader", + side_effect=_fake_create): + disabled = VectorSearchReadImpl( + table, limit=2, vector_column=embedding_field, + query_vector=[4.0, 0.0], filter_=None) + self.assertEqual([0], sorted(disabled.read(splits).results())) + + table.table_schema.options["global-index.fast-search"] = "false" + enabled = VectorSearchReadImpl( + table, limit=2, vector_column=embedding_field, + query_vector=[4.0, 0.0], filter_=None) + result = enabled.read(splits) + + self.assertEqual([0, 2], sorted(result.results())) + self.assertEqual([Range(1, 2)], + table.raw_read_splits[0].row_ranges()) + self.assertEqual({}, table.index_search_options[-1]) + self.assertEqual(1, table.metric_calls) + + table.metric_calls = 0 + table.index_search_options = [] + table.raw_read_splits = [] + with mock.patch( + "pypaimon.table.source.vector_search_read._create_vector_reader", + side_effect=_fake_create): + repeated_read = VectorSearchReadImpl( + table, limit=2, vector_column=embedding_field, + query_vector=[4.0, 0.0], filter_=None) + result = repeated_read.read(splits) + + self.assertEqual([0, 2], sorted(result.results())) + self.assertEqual(1, table.metric_calls) + def test_vector_search_with_many_splits(self): from pypaimon.globalindex.vector_search_result import ( DictBasedScoredIndexResult, diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java index 1ddc9c9ac96f..c85bba61f503 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java @@ -25,6 +25,7 @@ import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.source.VectorReadImpl; @@ -36,6 +37,8 @@ import org.apache.spark.broadcast.Broadcast; +import javax.annotation.Nullable; + import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -61,11 +64,27 @@ public SparkVectorReadImpl( DataField vectorColumn, float[] vector, Map options) { - super(table, filter, limit, vectorColumn, vector, options); + this(table, null, filter, limit, vectorColumn, vector, options); + } + + public SparkVectorReadImpl( + FileStoreTable table, + @Nullable PartitionPredicate partitionFilter, + Predicate filter, + int limit, + DataField vectorColumn, + float[] vector, + Map options) { + super(table, partitionFilter, filter, limit, vectorColumn, vector, options); } @Override public GlobalIndexResult read(List splits) { + // Slow search scans table data and should run in the coordinator with normal Paimon split + // planning; Spark distribution below is only for index-only evaluation. + if (slowSearchEnabled()) { + return super.read(splits); + } if (splits.isEmpty()) { return GlobalIndexResult.createEmpty(); } diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorSearchBuilderImpl.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorSearchBuilderImpl.java index be8d3d8cad5f..87044862582e 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorSearchBuilderImpl.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorSearchBuilderImpl.java @@ -38,6 +38,7 @@ public SparkVectorSearchBuilderImpl(InnerTable table) { @Override public VectorRead newVectorRead() { - return new SparkVectorReadImpl(table, filter, limit, vectorColumn, vector, options); + return new SparkVectorReadImpl( + table, partitionFilter, filter, limit, vectorColumn, vector, options); } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java index a3f3bf51fb9a..deee2bdcdfaa 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java @@ -21,7 +21,6 @@ import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReader; import org.apache.paimon.globalindex.GlobalIndexWriter; -import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; import org.apache.paimon.types.DataType; @@ -32,7 +31,7 @@ import java.util.concurrent.ExecutorService; /** Vector global indexer backed by paimon-vector-index. */ -public class VectorGlobalIndexer implements GlobalIndexer { +public class VectorGlobalIndexer implements org.apache.paimon.globalindex.VectorGlobalIndexer { private final DataType fieldType; private final Map options; @@ -56,4 +55,9 @@ public GlobalIndexReader createReader( ExecutorService executor) { return new VectorGlobalIndexReader(fileReader, files, fieldType, executor); } + + @Override + public String metric() { + return options.getOrDefault("metric", "l2"); + } } From 447e3fc46457b78a4945b55f128269b067198a9c Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 11:40:37 +0800 Subject: [PATCH 02/10] fix checkstyle --- .../paimon/globalindex/DataEvolutionBatchScan.java | 11 ++++++++--- .../apache/paimon/globalindex/GlobalIndexScanner.java | 9 +++++---- .../paimon/table/source/VectorSearchBuilderImpl.java | 3 ++- .../paimon/table/BtreeGlobalIndexTableTest.java | 3 +-- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java index 741b281ff607..e1b587a6bf05 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java @@ -295,7 +295,10 @@ private RoaringNavigableMap64 withUnindexedRows(RoaringNavigableMap64 indexedRes table, batchScan.snapshotReader().manifestsReader().partitionFilter(), filter, - batchScan.snapshotReader().snapshotManager().snapshot(snapshotId(allDataPlan))); + batchScan + .snapshotReader() + .snapshotManager() + .snapshot(snapshotId(allDataPlan))); predicateIndexedRanges = Range.sortAndMergeOverlap(predicateIndexedRanges, true); List unindexedRanges = new ArrayList<>(); @@ -319,7 +322,8 @@ private RoaringNavigableMap64 matchingRows(List ranges) { RowType readType = rowTypeWithRowId(table.rowType()); RowRangeIndex rowRangeIndex = RowRangeIndex.create(ranges); ReadBuilder readBuilder = table.newReadBuilder().withReadType(readType).withFilter(filter); - readBuilder.withPartitionFilter(batchScan.snapshotReader().manifestsReader().partitionFilter()); + readBuilder.withPartitionFilter( + batchScan.snapshotReader().manifestsReader().partitionFilter()); List splits = readBuilder.withRowRangeIndex(rowRangeIndex).newScan().plan().splits(); int rowIdIndex = readType.getFieldIndex(ROW_ID.name()); try { @@ -347,7 +351,8 @@ private long snapshotId(TableScan.Plan plan) { private TableScan.Plan allDataPlan() { ReadBuilder readBuilder = table.newReadBuilder(); - readBuilder.withPartitionFilter(batchScan.snapshotReader().manifestsReader().partitionFilter()); + readBuilder.withPartitionFilter( + batchScan.snapshotReader().manifestsReader().partitionFilter()); return readBuilder.newScan().plan(); } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 21e36fffd94a..d5b13ef61075 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -211,7 +211,8 @@ public static List indexedRanges( continue; } Range range = new Range(globalIndex.rowRangeStart(), globalIndex.rowRangeEnd()); - coverageByField.computeIfAbsent(globalIndex.indexFieldId(), k -> new ArrayList<>()) + coverageByField + .computeIfAbsent(globalIndex.indexFieldId(), k -> new ArrayList<>()) .add(range); if (globalIndex.extraFieldIds() != null) { for (int id : globalIndex.extraFieldIds()) { @@ -226,8 +227,7 @@ public static List indexedRanges( public static Optional create( FileStoreTable table, PartitionPredicate partitionFilter, Predicate filter) { List indexFiles = - table.store() - .newIndexFileHandler() + table.store().newIndexFileHandler() .scan( tryTravelOrLatest(table), indexFileFilter(table, partitionFilter, filter)) @@ -283,7 +283,8 @@ private static Optional> indexedRanges( if (!fieldRef.isPresent() || !rowType.containsField(fieldRef.get().name())) { return Optional.empty(); } - List coverage = coverageByField.get(rowType.getField(fieldRef.get().name()).id()); + List coverage = + coverageByField.get(rowType.getField(fieldRef.get().name()).id()); if (coverage == null || coverage.isEmpty()) { return Optional.empty(); } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java index 6f657288d0a4..c2f9f440b792 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilderImpl.java @@ -107,6 +107,7 @@ public VectorScan newVectorScan() { @Override public VectorRead newVectorRead() { checkNotNull(vector, "vector must be set via withVector()"); - return new VectorReadImpl(table, partitionFilter, filter, limit, vectorColumn, vector, options); + return new VectorReadImpl( + table, partitionFilter, filter, limit, vectorColumn, vector, options); } } diff --git a/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java b/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java index 7046c085a686..8c02fd392476 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java @@ -281,8 +281,7 @@ private List indexSplits( .collect(Collectors.toList()); } - private List readF1(ReadBuilder readBuilder, TableScan.Plan plan) - throws Exception { + private List readF1(ReadBuilder readBuilder, TableScan.Plan plan) throws Exception { List readF1 = new ArrayList<>(); readBuilder .newRead() From b2350bcc79aa8cf6996445f8484d69c57afb665b Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 11:59:29 +0800 Subject: [PATCH 03/10] [vector] Rename native vector global indexer --- ...orGlobalIndexer.java => NativeVectorGlobalIndexer.java} | 7 ++++--- .../paimon/vector/index/VectorGlobalIndexerFactory.java | 2 +- .../apache/paimon/vector/index/VectorGlobalIndexTest.java | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) rename paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/{VectorGlobalIndexer.java => NativeVectorGlobalIndexer.java} (88%) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java similarity index 88% rename from paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java index deee2bdcdfaa..fff0aaa65bd1 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java @@ -30,14 +30,15 @@ import java.util.Objects; import java.util.concurrent.ExecutorService; -/** Vector global indexer backed by paimon-vector-index. */ -public class VectorGlobalIndexer implements org.apache.paimon.globalindex.VectorGlobalIndexer { +/** Native vector global indexer backed by paimon-vector-index. */ +public class NativeVectorGlobalIndexer implements org.apache.paimon.globalindex.VectorGlobalIndexer { private final DataType fieldType; private final Map options; private final String identifier; - public VectorGlobalIndexer(DataType fieldType, Map options, String identifier) { + public NativeVectorGlobalIndexer( + DataType fieldType, Map options, String identifier) { this.fieldType = fieldType; this.options = Objects.requireNonNull(options, "options must not be null"); this.identifier = Objects.requireNonNull(identifier, "identifier must not be null"); diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java index 936772311478..54934c42e3f0 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java @@ -36,7 +36,7 @@ public abstract class VectorGlobalIndexerFactory implements GlobalIndexerFactory @Override public GlobalIndexer create(DataField field, Options options) { String identifier = identifier(); - return new VectorGlobalIndexer( + return new NativeVectorGlobalIndexer( field.type(), nativeOptions(field.type(), options, identifier, field.name()), identifier); diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index f08c072984d7..ad56430e8275 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -348,8 +348,8 @@ public void testViaIndexer() throws IOException { new float[] {0.7f, 0.7f} }; - VectorGlobalIndexer indexer = - new VectorGlobalIndexer( + NativeVectorGlobalIndexer indexer = + new NativeVectorGlobalIndexer( vectorType, VectorGlobalIndexerFactory.nativeOptions( vectorType, options, IVF_PQ_IDENTIFIER, fieldName), From 03ca56c18a982f67b3276430e7a8ea99056e1cb8 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 12:04:38 +0800 Subject: [PATCH 04/10] [vector] Add native prefix to vector index classes --- .../IvfFlatVectorGlobalIndexerFactory.java | 2 +- ...IvfHnswFlatVectorGlobalIndexerFactory.java | 2 +- .../IvfHnswSqVectorGlobalIndexerFactory.java | 2 +- ...PqAlgorithmVectorGlobalIndexerFactory.java | 2 +- ...ava => NativeVectorGlobalIndexReader.java} | 4 +- ...ava => NativeVectorGlobalIndexWriter.java} | 6 +- .../index/NativeVectorGlobalIndexer.java | 4 +- ... => NativeVectorGlobalIndexerFactory.java} | 2 +- ....java => NativeVectorGlobalIndexTest.java} | 76 +++++++++---------- ...NativeVectorGlobalIndexerFactoryTest.java} | 20 ++--- .../SeekableStreamVectorIndexInputTest.java | 10 +-- 11 files changed, 65 insertions(+), 65 deletions(-) rename paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/{VectorGlobalIndexReader.java => NativeVectorGlobalIndexReader.java} (99%) rename paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/{VectorGlobalIndexWriter.java => NativeVectorGlobalIndexWriter.java} (98%) rename paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/{VectorGlobalIndexerFactory.java => NativeVectorGlobalIndexerFactory.java} (98%) rename paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/{VectorGlobalIndexTest.java => NativeVectorGlobalIndexTest.java} (87%) rename paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/{VectorGlobalIndexerFactoryTest.java => NativeVectorGlobalIndexerFactoryTest.java} (92%) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java index 572c7cf4edb2..12da3c121df3 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java @@ -19,7 +19,7 @@ package org.apache.paimon.vector.index; /** Factory for the {@code ivf-flat} vector index identifier. */ -public class IvfFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { +public class IvfFlatVectorGlobalIndexerFactory extends NativeVectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-flat"; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java index 159e7af6f1ba..764920d5d769 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java @@ -19,7 +19,7 @@ package org.apache.paimon.vector.index; /** Factory for the {@code ivf-hnsw-flat} vector index identifier. */ -public class IvfHnswFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { +public class IvfHnswFlatVectorGlobalIndexerFactory extends NativeVectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-hnsw-flat"; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java index 51c72cd8f39c..3f09984db5f5 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java @@ -19,7 +19,7 @@ package org.apache.paimon.vector.index; /** Factory for the {@code ivf-hnsw-sq} vector index identifier. */ -public class IvfHnswSqVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { +public class IvfHnswSqVectorGlobalIndexerFactory extends NativeVectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-hnsw-sq"; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java index f3932de46ed6..f00adfcbfc3e 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java @@ -19,7 +19,7 @@ package org.apache.paimon.vector.index; /** Factory for the {@code ivf-pq} vector index identifier. */ -public class IvfPqAlgorithmVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { +public class IvfPqAlgorithmVectorGlobalIndexerFactory extends NativeVectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-pq"; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexReader.java similarity index 99% rename from paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexReader.java index cde8d2a83dd3..92f064d27089 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexReader.java @@ -60,7 +60,7 @@ *

    Each shard has exactly one vector index file. The reader lazily opens the index and performs * vector similarity search. */ -public class VectorGlobalIndexReader implements GlobalIndexReader { +public class NativeVectorGlobalIndexReader implements GlobalIndexReader { private static final String NPROBE_PARAMETER = "ivf.nprobe"; private static final String EF_SEARCH_PARAMETER = "hnsw.ef_search"; @@ -78,7 +78,7 @@ public class VectorGlobalIndexReader implements GlobalIndexReader { private volatile VectorIndexReader vectorReader; private SeekableInputStream openStream; - public VectorGlobalIndexReader( + public NativeVectorGlobalIndexReader( GlobalIndexFileReader fileReader, List ioMetas, DataType fieldType, diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexWriter.java similarity index 98% rename from paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexWriter.java index 0554bb78620e..fac897976eaf 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexWriter.java @@ -54,11 +54,11 @@ * *

    Thread safety: This class is not thread-safe. */ -public class VectorGlobalIndexWriter implements GlobalIndexSingleColumnWriter, Closeable { +public class NativeVectorGlobalIndexWriter implements GlobalIndexSingleColumnWriter, Closeable { private static final String FILE_NAME_PREFIX = "vector"; - private static final Logger LOG = LoggerFactory.getLogger(VectorGlobalIndexWriter.class); + private static final Logger LOG = LoggerFactory.getLogger(NativeVectorGlobalIndexWriter.class); private static final int IO_BUFFER_SIZE = 8 * 1024 * 1024; private static final int ADD_BATCH_SIZE = 10000; @@ -79,7 +79,7 @@ public class VectorGlobalIndexWriter implements GlobalIndexSingleColumnWriter, C private long rowCount; - public VectorGlobalIndexWriter( + public NativeVectorGlobalIndexWriter( GlobalIndexFileWriter fileWriter, DataType fieldType, Map options, diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java index fff0aaa65bd1..832573399238 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java @@ -46,7 +46,7 @@ public NativeVectorGlobalIndexer( @Override public GlobalIndexWriter createWriter(GlobalIndexFileWriter fileWriter) { - return new VectorGlobalIndexWriter(fileWriter, fieldType, options, identifier); + return new NativeVectorGlobalIndexWriter(fileWriter, fieldType, options, identifier); } @Override @@ -54,7 +54,7 @@ public GlobalIndexReader createReader( GlobalIndexFileReader fileReader, List files, ExecutorService executor) { - return new VectorGlobalIndexReader(fileReader, files, fieldType, executor); + return new NativeVectorGlobalIndexReader(fileReader, files, fieldType, executor); } @Override diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexerFactory.java similarity index 98% rename from paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexerFactory.java index 54934c42e3f0..41f09d6a7a60 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexerFactory.java @@ -29,7 +29,7 @@ import java.util.Map; /** Factory for creating vector indexes backed by paimon-vector-index. */ -public abstract class VectorGlobalIndexerFactory implements GlobalIndexerFactory { +public abstract class NativeVectorGlobalIndexerFactory implements GlobalIndexerFactory { private static final int DEFAULT_DIMENSION = 128; diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java similarity index 87% rename from paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java rename to paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java index ad56430e8275..ea00c0ec2fd4 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java @@ -58,8 +58,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -/** Tests for {@link VectorGlobalIndexWriter} and {@link VectorGlobalIndexReader}. */ -public class VectorGlobalIndexTest { +/** Tests for {@link NativeVectorGlobalIndexWriter} and {@link NativeVectorGlobalIndexReader}. */ +public class NativeVectorGlobalIndexTest { @TempDir java.nio.file.Path tempDir; @@ -104,7 +104,7 @@ public void cleanup() throws IOException { public void testDimensionMismatch() { Options options = createDefaultOptions(64); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); float[] wrongDimVector = new float[32]; assertThatThrownBy(() -> writer.write(wrongDimVector, 0)) @@ -129,7 +129,7 @@ public void testNanInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); assertThatThrownBy(() -> writer.write(new float[] {1.0f, Float.NaN}, 0)) .isInstanceOf(IllegalArgumentException.class) @@ -143,7 +143,7 @@ public void testInfinityInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writer.write(null, 0); // row 0 - null assertThatThrownBy(() -> writer.write(new float[] {Float.POSITIVE_INFINITY, 0.0f}, 1)) @@ -158,7 +158,7 @@ public void testAllNullReturnsEmpty() { Options options = createDefaultOptions(2); options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writer.write(null, 0); writer.write(null, 1); @@ -185,18 +185,18 @@ public void testVectorSearchParameterParsing() { parameters.put("hnsw.ef_search", "80"); parameters.put("ignored", "bad"); - assertThat(VectorGlobalIndexReader.nprobe(parameters)).isEqualTo(24); - assertThat(VectorGlobalIndexReader.efSearch(parameters)).isEqualTo(80); - assertThat(VectorGlobalIndexReader.nprobe(Collections.emptyMap())).isEqualTo(16); - assertThat(VectorGlobalIndexReader.efSearch(Collections.emptyMap())).isEqualTo(0); + assertThat(NativeVectorGlobalIndexReader.nprobe(parameters)).isEqualTo(24); + assertThat(NativeVectorGlobalIndexReader.efSearch(parameters)).isEqualTo(80); + assertThat(NativeVectorGlobalIndexReader.nprobe(Collections.emptyMap())).isEqualTo(16); + assertThat(NativeVectorGlobalIndexReader.efSearch(Collections.emptyMap())).isEqualTo(0); } @Test public void testVectorSearchParameterRangeValidationDelegatedToNative() { - assertThat(VectorGlobalIndexReader.nprobe(Collections.singletonMap("ivf.nprobe", "0"))) + assertThat(NativeVectorGlobalIndexReader.nprobe(Collections.singletonMap("ivf.nprobe", "0"))) .isEqualTo(0); assertThat( - VectorGlobalIndexReader.efSearch( + NativeVectorGlobalIndexReader.efSearch( Collections.singletonMap("hnsw.ef_search", "-1"))) .isEqualTo(-1); } @@ -223,7 +223,7 @@ public void testFloatVectorEndToEnd() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); for (int i = 0; i < vectors.length; i++) { writer.write(vectors[i], i); } @@ -231,8 +231,8 @@ public void testFloatVectorEndToEnd() throws IOException { List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { + try (NativeVectorGlobalIndexReader reader = + new NativeVectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); assertThat(result.results().getLongCardinality()).isEqualTo(3); @@ -262,7 +262,7 @@ public void testSearchWithRoaringFilter() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); for (int i = 0; i < vectors.length; i++) { writer.write(vectors[i], i); } @@ -270,8 +270,8 @@ public void testSearchWithRoaringFilter() throws IOException { List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { + try (NativeVectorGlobalIndexReader reader = + new NativeVectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { // Filter to rows {1, 4} only RoaringNavigableMap64 filter = new RoaringNavigableMap64(); @@ -303,7 +303,7 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writer.write(vectors[0], 0); // row 0 writer.write(null, 1); // row 1 - null @@ -318,8 +318,8 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { + try (NativeVectorGlobalIndexReader reader = + new NativeVectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); assertThat(result.results().getLongCardinality()).isEqualTo(3); @@ -351,12 +351,12 @@ public void testViaIndexer() throws IOException { NativeVectorGlobalIndexer indexer = new NativeVectorGlobalIndexer( vectorType, - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( vectorType, options, IVF_PQ_IDENTIFIER, fieldName), IVF_PQ_IDENTIFIER); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = (VectorGlobalIndexWriter) indexer.createWriter(fileWriter); + NativeVectorGlobalIndexWriter writer = (NativeVectorGlobalIndexWriter) indexer.createWriter(fileWriter); for (int i = 0; i < vectors.length; i++) { writer.write(vectors[i], i); } @@ -364,8 +364,8 @@ public void testViaIndexer() throws IOException { List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - (VectorGlobalIndexReader) indexer.createReader(fileReader, metas, executor)) { + try (NativeVectorGlobalIndexReader reader = + (NativeVectorGlobalIndexReader) indexer.createReader(fileReader, metas, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 2, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); assertThat(result.results().getLongCardinality()).isEqualTo(2); @@ -393,14 +393,14 @@ public void testBatchVectorSearch() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writeVectors(writer, vectors); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { + try (NativeVectorGlobalIndexReader reader = + new NativeVectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { float[][] queryVectors = new float[][] { new float[] {1.0f, 0.0f}, @@ -445,14 +445,14 @@ public void testBatchVectorSearchWithFilter() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writeVectors(writer, vectors); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { + try (NativeVectorGlobalIndexReader reader = + new NativeVectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { float[][] queryVectors = new float[][] {new float[] {1.0f, 0.0f}, new float[] {-1.0f, 0.0f}}; @@ -500,14 +500,14 @@ public void testBatchConsistentWithSingle() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); + NativeVectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writeVectors(writer, vectors); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { + try (NativeVectorGlobalIndexReader reader = + new NativeVectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { float[][] queryVectors = new float[][] { new float[] {1.0f, 0.0f}, @@ -541,12 +541,12 @@ public void testBatchConsistentWithSingle() throws IOException { // =================== Helpers ===================== - private VectorGlobalIndexWriter createIvfPqWriter( + private NativeVectorGlobalIndexWriter createIvfPqWriter( GlobalIndexFileWriter fileWriter, DataType fieldType, Options options) { - return new VectorGlobalIndexWriter( + return new NativeVectorGlobalIndexWriter( fileWriter, fieldType, - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( fieldType, options, IVF_PQ_IDENTIFIER, fieldName), IVF_PQ_IDENTIFIER); } @@ -558,7 +558,7 @@ private Options createDefaultOptions(int dimension) { return options; } - private void writeVectors(VectorGlobalIndexWriter writer, float[][] vectors) { + private void writeVectors(NativeVectorGlobalIndexWriter writer, float[][] vectors) { for (int i = 0; i < vectors.length; i++) { writer.write(vectors[i], i); } diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexerFactoryTest.java similarity index 92% rename from paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java rename to paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexerFactoryTest.java index b0f33d770621..92c56b648522 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexerFactoryTest.java @@ -32,7 +32,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; /** Tests for vector global indexer factory SPI registration. */ -public class VectorGlobalIndexerFactoryTest { +public class NativeVectorGlobalIndexerFactoryTest { @Test public void testIdentifier() { @@ -67,7 +67,7 @@ public void testNativeOptionsOnlyUsesIdentifierPrefix() { options.setString("ivf-pq.nlist", "256"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -90,7 +90,7 @@ public void testNativeOptionsUsesVectorTypeDimension() { options.setString("ivf-flat.dimension", "32"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new VectorType(8, new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -106,7 +106,7 @@ public void testInvalidDimension() { assertThatThrownBy( () -> - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -124,7 +124,7 @@ public void testFieldLevelOptionsOverrideIndexTypeOptions() { options.setString("fields.vec.nlist", "256"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -143,7 +143,7 @@ public void testFieldLevelDimensionOverridesIndexTypeDimension() { options.setString("fields.vec.dimension", "64"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -159,7 +159,7 @@ public void testFieldLevelOptionsOnlyApplyToMatchingField() { options.setString("fields.vec.nlist", "256"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -175,7 +175,7 @@ public void testFieldLevelOptionsRequireExactFieldName() { options.setString("fields.vec_extra.nlist", "512"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -190,7 +190,7 @@ public void testFieldLevelOptionsWithoutIndexTypeOption() { options.setString("fields.vec.distance.metric", "cosine"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, @@ -207,7 +207,7 @@ public void testFieldLevelVectorOptionsCoexistWithCoreFieldOptions() { options.setString("fields.vec.aggregate-function", "sum"); Map nativeOptions = - VectorGlobalIndexerFactory.nativeOptions( + NativeVectorGlobalIndexerFactory.nativeOptions( new ArrayType(new FloatType()), options, IvfFlatVectorGlobalIndexerFactory.IDENTIFIER, diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java index ddb29eab6e97..82eeee198e7e 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java @@ -30,15 +30,15 @@ import static org.assertj.core.api.Assertions.assertThat; -/** Tests for {@link VectorGlobalIndexReader.SeekableStreamVectorIndexInput}. */ +/** Tests for {@link NativeVectorGlobalIndexReader.SeekableStreamVectorIndexInput}. */ public class SeekableStreamVectorIndexInputTest { @Test public void testVectoredReadableInputUsesParallelPositionReads() throws Exception { byte[] data = data(128 * 1024); TestVectoredSeekableInputStream input = new TestVectoredSeekableInputStream(data, 2); - VectorGlobalIndexReader.SeekableStreamVectorIndexInput indexInput = - new VectorGlobalIndexReader.SeekableStreamVectorIndexInput(input); + NativeVectorGlobalIndexReader.SeekableStreamVectorIndexInput indexInput = + new NativeVectorGlobalIndexReader.SeekableStreamVectorIndexInput(input); byte[][] buffers = new byte[][] {new byte[64], new byte[64]}; indexInput.pread(new long[] {0, 32 * 1024}, buffers); @@ -54,8 +54,8 @@ public void testVectoredReadableInputUsesParallelPositionReads() throws Exceptio public void testFallbackToSequentialReadWhenRangesOverlap() { byte[] data = data(1024); TestVectoredSeekableInputStream input = new TestVectoredSeekableInputStream(data, 0); - VectorGlobalIndexReader.SeekableStreamVectorIndexInput indexInput = - new VectorGlobalIndexReader.SeekableStreamVectorIndexInput(input); + NativeVectorGlobalIndexReader.SeekableStreamVectorIndexInput indexInput = + new NativeVectorGlobalIndexReader.SeekableStreamVectorIndexInput(input); byte[][] buffers = new byte[][] {new byte[64], new byte[64]}; indexInput.pread(new long[] {0, 32}, buffers); From 8e0ea12bbd0951f06c9461345fecf2d5207eee91 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 12:22:11 +0800 Subject: [PATCH 05/10] [vector] Fix native vector index formatting --- .../paimon/vector/index/NativeVectorGlobalIndexer.java | 3 ++- .../paimon/vector/index/NativeVectorGlobalIndexTest.java | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java index 832573399238..cb0e8a5cd9cd 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexer.java @@ -31,7 +31,8 @@ import java.util.concurrent.ExecutorService; /** Native vector global indexer backed by paimon-vector-index. */ -public class NativeVectorGlobalIndexer implements org.apache.paimon.globalindex.VectorGlobalIndexer { +public class NativeVectorGlobalIndexer + implements org.apache.paimon.globalindex.VectorGlobalIndexer { private final DataType fieldType; private final Map options; diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java index ea00c0ec2fd4..3d83b06f773c 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/NativeVectorGlobalIndexTest.java @@ -193,7 +193,9 @@ public void testVectorSearchParameterParsing() { @Test public void testVectorSearchParameterRangeValidationDelegatedToNative() { - assertThat(NativeVectorGlobalIndexReader.nprobe(Collections.singletonMap("ivf.nprobe", "0"))) + assertThat( + NativeVectorGlobalIndexReader.nprobe( + Collections.singletonMap("ivf.nprobe", "0"))) .isEqualTo(0); assertThat( NativeVectorGlobalIndexReader.efSearch( @@ -356,7 +358,8 @@ public void testViaIndexer() throws IOException { IVF_PQ_IDENTIFIER); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - NativeVectorGlobalIndexWriter writer = (NativeVectorGlobalIndexWriter) indexer.createWriter(fileWriter); + NativeVectorGlobalIndexWriter writer = + (NativeVectorGlobalIndexWriter) indexer.createWriter(fileWriter); for (int i = 0; i < vectors.length; i++) { writer.write(vectors[i], i); } From a5b64714dfb1a1d284ffff20636624bf3fb483dc Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 13:07:41 +0800 Subject: [PATCH 06/10] [lance] Add test dependency for vector fast search --- paimon-lance/pom.xml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/paimon-lance/pom.xml b/paimon-lance/pom.xml index cb6a70dcb33f..4306989eaa9b 100644 --- a/paimon-lance/pom.xml +++ b/paimon-lance/pom.xml @@ -120,6 +120,43 @@ under the License. + + + org.apache.hadoop + hadoop-mapreduce-client-core + ${hadoop.version} + test + + + org.apache.avro + avro + + + com.google.protobuf + protobuf-java + + + ch.qos.reload4j + reload4j + + + org.slf4j + slf4j-reload4j + + + log4j + log4j + + + org.slf4j + slf4j-log4j12 + + + jdk.tools + jdk.tools + + + From a49d70c0218bf94d304f96bf4d9538558ee185e8 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 15:28:02 +0800 Subject: [PATCH 07/10] [core][python] Add global index search modes --- docs/docs/multimodal-table/global-index.mdx | 13 +- docs/generated/core_configuration.html | 12 +- .../java/org/apache/paimon/CoreOptions.java | 48 ++++++-- .../globalindex/DataEvolutionBatchScan.java | 63 +++++++--- .../table/source/AbstractVectorRead.java | 67 +++++++--- .../paimon/table/source/BatchVectorRead.java | 7 ++ .../table/source/BatchVectorReadImpl.java | 17 ++- .../source/BatchVectorSearchBuilder.java | 3 +- .../paimon/table/source/VectorRead.java | 6 + .../paimon/table/source/VectorReadImpl.java | 15 ++- .../paimon/table/source/VectorScan.java | 7 ++ .../paimon/table/source/VectorScanImpl.java | 13 +- .../table/source/VectorSearchBuilder.java | 3 +- .../table/BtreeGlobalIndexTableTest.java | 12 +- .../table/source/VectorSearchBuilderTest.java | 18 +-- .../pypaimon/common/options/core_options.py | 23 ++-- .../pypaimon/read/scanner/file_scanner.py | 32 +++-- .../table/source/vector_search_read.py | 116 +++++++++++------- .../table/source/vector_search_scan.py | 8 +- .../pypaimon/tests/global_index_test.py | 4 +- .../tests/vector_search_filter_test.py | 8 +- 21 files changed, 341 insertions(+), 154 deletions(-) diff --git a/docs/docs/multimodal-table/global-index.mdx b/docs/docs/multimodal-table/global-index.mdx index 001175278c87..1cecce799387 100644 --- a/docs/docs/multimodal-table/global-index.mdx +++ b/docs/docs/multimodal-table/global-index.mdx @@ -132,14 +132,17 @@ new rows are not automatically covered by the existing index files. Run `create_ to build index files for newly uncovered data. By default, queries use fast search and only read indexed row ranges; rows in uncovered ranges are not returned for that indexed query. -To improve freshness for query types that support slow search, set: +To improve freshness for query types that support raw-data search, set: ```sql -ALTER TABLE my_table SET ('global-index.fast-search' = 'false'); +ALTER TABLE my_table SET ('global-index.search-mode' = 'full'); ``` -With fast search disabled, supported global-index queries merge indexed results with a scan over -files not covered by global indexes. +With `full` search, supported global-index queries first use the snapshot `nextRowId` and global +index row-id coverage to detect whether any row range is missing from the index. Raw data is scanned +only when such a gap exists. Use `detail` search when data files may have been rewritten or updated +after index creation; it scans data file metadata to find the exact unindexed row ranges and can +handle index invalidation caused by updates or rewrites. To temporarily disable global-index scan acceleration while keeping the index files, set: @@ -156,7 +159,7 @@ These table options affect global index build and read behavior: | Option | Default | Description | |---|---|---| | `global-index.enabled` | `true` | Whether scans can use global indexes. | -| `global-index.fast-search` | `true` | Whether global index queries only search indexed files. Set to `false` to also scan files not covered by global indexes when supported. | +| `global-index.search-mode` | `fast` | Search mode for global-index queries. `fast` searches indexed data only. `full` checks snapshot `nextRowId` against global index row-id coverage and scans raw data only if a gap exists. `detail` scans data file metadata to find exact unindexed rows and can handle index invalidation caused by updates or rewrites. | | `global-index.external-path` | Not set | Root directory for global index files. If not set, files are stored under the table index directory. | | `sorted-index.records-per-range` | `10000000` | Expected number of records per sorted global index file for BTree and Bitmap builds. | | `sorted-index.build.max-parallelism` | `4096` | Maximum Flink or Spark parallelism for building sorted global indexes. | diff --git a/docs/generated/core_configuration.html b/docs/generated/core_configuration.html index 8b395657ae05..5917d7067744 100644 --- a/docs/generated/core_configuration.html +++ b/docs/generated/core_configuration.html @@ -734,18 +734,18 @@ Boolean Whether to enable global index for scan. - -

    global-index.fast-search
    - true - Boolean - Whether global index queries only search indexed files. Set to false to also scan files not covered by global indexes when supported. -
    global-index.external-path
    (none) String Global index root directory, if not set, the global index files will be stored under the <table-root-directory>/index. + +
    global-index.search-mode
    + fast +

    Enum

    + Search mode for global index queries. Supported values are 'fast', 'full', and 'detail'.

    Possible values:
    • "fast": Only search indexed data.
    • "full": Use snapshot next row id and global index coverage to detect missing row ids, and scan raw data only when a gap exists.
    • "detail": Scan data files to find exact unindexed rows. This can handle index invalidation caused by updates or rewrites.
    +
    global-index.row-count-per-shard
    100000 diff --git a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java index 400571509f2f..a884752434cf 100644 --- a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java +++ b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java @@ -2543,14 +2543,13 @@ public InlineElement getDescription() { .defaultValue(true) .withDescription("Whether to enable global index for scan."); - public static final ConfigOption GLOBAL_INDEX_FAST_SEARCH = - key("global-index.fast-search") - .booleanType() - .defaultValue(true) + public static final ConfigOption GLOBAL_INDEX_SEARCH_MODE = + key("global-index.search-mode") + .enumType(GlobalIndexSearchMode.class) + .defaultValue(GlobalIndexSearchMode.FAST) .withDescription( - "Whether global index queries only search indexed files. " - + "Set to false to also scan files not covered by global indexes " - + "when supported."); + "Search mode for global index queries. " + + "Supported values are 'fast', 'full', and 'detail'."); public static final ConfigOption GLOBAL_INDEX_THREAD_NUM = key("global-index.thread-num") @@ -4058,8 +4057,8 @@ public boolean globalIndexEnabled() { return options.get(GLOBAL_INDEX_ENABLED); } - public boolean globalIndexFastSearch() { - return options.get(GLOBAL_INDEX_FAST_SEARCH); + public GlobalIndexSearchMode globalIndexSearchMode() { + return options.get(GLOBAL_INDEX_SEARCH_MODE); } public Integer globalIndexThreadNum() { @@ -4859,4 +4858,35 @@ public enum GlobalIndexColumnUpdateAction { /** Drop all global index entries for the whole partitions affected by the update. */ DROP_PARTITION_INDEX } + + /** Search mode for global index queries. */ + public enum GlobalIndexSearchMode implements DescribedEnum { + FAST("fast", "Only search indexed data."), + FULL( + "full", + "Use snapshot next row id and global index coverage to detect missing row ids, " + + "and scan raw data only when a gap exists."), + DETAIL( + "detail", + "Scan data files to find exact unindexed rows. " + + "This can handle index invalidation caused by updates or rewrites."); + + private final String value; + private final String description; + + GlobalIndexSearchMode(String value, String description) { + this.value = value; + this.description = description; + } + + @Override + public String toString() { + return value; + } + + @Override + public InlineElement getDescription() { + return text(description); + } + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java index e1b587a6bf05..a9b4fdb09126 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java @@ -19,6 +19,8 @@ package org.apache.paimon.globalindex; import org.apache.paimon.CoreOptions; +import org.apache.paimon.CoreOptions.GlobalIndexSearchMode; +import org.apache.paimon.Snapshot; import org.apache.paimon.annotation.VisibleForTesting; import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; @@ -62,6 +64,7 @@ import static org.apache.paimon.table.SpecialFields.ROW_ID; import static org.apache.paimon.table.SpecialFields.rowTypeWithRowId; +import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; import static org.apache.paimon.utils.ManifestReadThreadPool.randomlyExecuteSequentialReturn; /** Scan for data evolution table. */ @@ -256,7 +259,8 @@ public Plan plan() { boolean scanUnindexedRanges = globalIndexResult == null && !(result instanceof ScoredGlobalIndexResult) - && !table.coreOptions().globalIndexFastSearch(); + && table.coreOptions().globalIndexSearchMode() + != GlobalIndexSearchMode.FAST; if (scanUnindexedRanges) { rowIds = withUnindexedRows(rowIds); rowRanges = rowIds.toRangeList(); @@ -277,17 +281,46 @@ public Plan plan() { } private RoaringNavigableMap64 withUnindexedRows(RoaringNavigableMap64 indexedResultRows) { - TableScan.Plan allDataPlan = allDataPlan(); + TableScan.Plan allDataPlan = null; + Snapshot snapshot; + if (table.coreOptions().globalIndexSearchMode() == GlobalIndexSearchMode.DETAIL) { + allDataPlan = allDataPlan(); + snapshot = + batchScan.snapshotReader().snapshotManager().snapshot(snapshotId(allDataPlan)); + } else { + snapshot = tryTravelOrLatest(table); + } + + List unindexedRanges = unindexedRanges(allDataPlan, snapshot); + + RoaringNavigableMap64 rows = new RoaringNavigableMap64(); + rows.or(indexedResultRows); + rows.or(matchingRows(unindexedRanges)); + return rows; + } + + private List unindexedRanges(@Nullable TableScan.Plan allDataPlan, Snapshot snapshot) { + if (snapshot == null || snapshot.nextRowId() == null || snapshot.nextRowId() <= 0) { + return Collections.emptyList(); + } + List dataRanges = new ArrayList<>(); - for (Split split : allDataPlan.splits()) { - if (!(split instanceof DataSplit)) { - continue; + if (table.coreOptions().globalIndexSearchMode() == GlobalIndexSearchMode.DETAIL) { + if (allDataPlan == null) { + return Collections.emptyList(); } - for (DataFileMeta file : ((DataSplit) split).dataFiles()) { - if (file.firstRowId() != null) { - dataRanges.add(file.nonNullRowIdRange()); + for (Split split : allDataPlan.splits()) { + if (!(split instanceof DataSplit)) { + continue; + } + for (DataFileMeta file : ((DataSplit) split).dataFiles()) { + if (file.firstRowId() != null) { + dataRanges.add(file.nonNullRowIdRange()); + } } } + } else { + dataRanges.add(new Range(0, snapshot.nextRowId() - 1)); } List predicateIndexedRanges = @@ -295,22 +328,14 @@ private RoaringNavigableMap64 withUnindexedRows(RoaringNavigableMap64 indexedRes table, batchScan.snapshotReader().manifestsReader().partitionFilter(), filter, - batchScan - .snapshotReader() - .snapshotManager() - .snapshot(snapshotId(allDataPlan))); + snapshot); predicateIndexedRanges = Range.sortAndMergeOverlap(predicateIndexedRanges, true); List unindexedRanges = new ArrayList<>(); for (Range dataRange : Range.sortAndMergeOverlap(dataRanges, true)) { unindexedRanges.addAll(dataRange.exclude(predicateIndexedRanges)); } - unindexedRanges = Range.sortAndMergeOverlap(unindexedRanges, true); - - RoaringNavigableMap64 rows = new RoaringNavigableMap64(); - rows.or(indexedResultRows); - rows.or(matchingRows(unindexedRanges)); - return rows; + return Range.sortAndMergeOverlap(unindexedRanges, true); } private RoaringNavigableMap64 matchingRows(List ranges) { @@ -334,7 +359,7 @@ private RoaringNavigableMap64 matchingRows(List ranges) { } } catch (IOException e) { throw new RuntimeException( - "Failed to scan unindexed data for global index slow search.", e); + "Failed to scan unindexed data for global index raw search.", e); } return rows; } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java index 74e7623bd436..de9b8eee7af1 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractVectorRead.java @@ -18,6 +18,7 @@ package org.apache.paimon.table.source; +import org.apache.paimon.CoreOptions.GlobalIndexSearchMode; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.InternalVector; @@ -198,25 +199,26 @@ protected CompletableFuture>> evalBatch( .whenComplete((r, t) -> IOUtils.closeQuietly(reader)); } - protected boolean slowSearchEnabled() { - return !fastSearch(); + protected boolean rawSearchEnabled() { + return searchMode() != GlobalIndexSearchMode.FAST; } - protected boolean fastSearch() { - return table.coreOptions().globalIndexFastSearch(); + protected GlobalIndexSearchMode searchMode() { + return table.coreOptions().globalIndexSearchMode(); } - protected ScoredGlobalIndexResult withSlowSearch( + protected ScoredGlobalIndexResult withRawSearch( ScoredGlobalIndexResult result, List splits, @Nullable GlobalIndexer globalIndexer, + @Nullable Long nextRowId, float[] queryVector) { - if (!slowSearchEnabled()) { + if (!rawSearchEnabled()) { return result.topK(limit); } ScoredGlobalIndexResult rawResult = - readSlowSearch(splits, slowSearchMetric(globalIndexer), queryVector); + readRawSearch(splits, rawSearchMetric(globalIndexer), nextRowId, queryVector); return result.or(rawResult).topK(limit); } @@ -242,12 +244,13 @@ private List buildIOMetaList( return indexIOMetaList; } - private ScoredGlobalIndexResult readSlowSearch( - List splits, String metric, float[] queryVector) { + private ScoredGlobalIndexResult readRawSearch( + List splits, + String metric, + @Nullable Long nextRowId, + float[] queryVector) { RowType readType = SpecialFields.rowTypeWithRowId(table.rowType()); - ReadBuilder rangeDiscoveryBuilder = newRawReadBuilder(readType, false); - TableScan.Plan allDataPlan = rangeDiscoveryBuilder.newScan().plan(); - List nonIndexedRanges = nonIndexedRanges(allDataPlan, splits); + List nonIndexedRanges = nonIndexedRanges(readType, splits, nextRowId); if (nonIndexedRanges.isEmpty()) { return ScoredGlobalIndexResult.createEmpty(); } @@ -279,7 +282,7 @@ private ScoredGlobalIndexResult readSlowSearch( scoreMap.put(rowId, computeScore(queryVector, stored, metric)); }); } catch (IOException e) { - throw new RuntimeException("Failed to read raw vectors for vector slow search.", e); + throw new RuntimeException("Failed to read raw vectors for vector raw search.", e); } return ScoredGlobalIndexResult.create(resultBitmap, scoreMap::get).topK(limit); @@ -297,6 +300,26 @@ private ReadBuilder newRawReadBuilder(RowType readType, boolean includeFilter) { } private List nonIndexedRanges( + RowType readType, List splits, @Nullable Long nextRowId) { + if (searchMode() == GlobalIndexSearchMode.DETAIL) { + ReadBuilder rangeDiscoveryBuilder = newRawReadBuilder(readType, false); + TableScan.Plan allDataPlan = rangeDiscoveryBuilder.newScan().plan(); + return nonIndexedRangesByDataFiles(allDataPlan, splits); + } + return nonIndexedRangesByNextRowId(splits, nextRowId); + } + + private List nonIndexedRangesByNextRowId( + List splits, @Nullable Long nextRowId) { + if (nextRowId == null || nextRowId <= 0) { + return Collections.emptyList(); + } + + List indexedRanges = indexedRanges(splits); + return Range.sortAndMergeOverlap(new Range(0, nextRowId - 1).exclude(indexedRanges), true); + } + + private List nonIndexedRangesByDataFiles( TableScan.Plan allDataPlan, List splits) { List dataRanges = new ArrayList<>(); for (Split split : allDataPlan.splits()) { @@ -314,11 +337,7 @@ private List nonIndexedRanges( } } - List indexedRanges = new ArrayList<>(); - for (VectorSearchSplit split : splits) { - indexedRanges.add(new Range(split.rowRangeStart(), split.rowRangeEnd())); - } - indexedRanges = Range.sortAndMergeOverlap(indexedRanges, true); + List indexedRanges = indexedRanges(splits); List ranges = new ArrayList<>(); for (Range dataRange : Range.sortAndMergeOverlap(dataRanges, true)) { @@ -327,6 +346,14 @@ private List nonIndexedRanges( return Range.sortAndMergeOverlap(ranges, true); } + private static List indexedRanges(List splits) { + List indexedRanges = new ArrayList<>(); + for (VectorSearchSplit split : splits) { + indexedRanges.add(new Range(split.rowRangeStart(), split.rowRangeEnd())); + } + return Range.sortAndMergeOverlap(indexedRanges, true); + } + private float[] getVector(InternalRow row, int vectorIndex) { if (vectorColumn.type().getTypeRoot() == DataTypeRoot.VECTOR) { InternalVector vector = row.getVector(vectorIndex); @@ -339,14 +366,14 @@ private float[] getVector(InternalRow row, int vectorIndex) { "Unsupported vector column type: " + vectorColumn.type()); } - private String slowSearchMetric(@Nullable GlobalIndexer globalIndexer) { + private String rawSearchMetric(@Nullable GlobalIndexer globalIndexer) { String metric = null; if (globalIndexer != null) { if (!(globalIndexer instanceof VectorGlobalIndexer)) { throw new IllegalArgumentException( "Index type '" + globalIndexer.getClass().getName() - + "' does not provide vector metric for slow search."); + + "' does not provide vector metric for raw search."); } metric = ((VectorGlobalIndexer) globalIndexer).metric(); } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorRead.java index 7b2169779ddb..c1356da5873c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorRead.java @@ -20,6 +20,8 @@ import org.apache.paimon.globalindex.GlobalIndexResult; +import javax.annotation.Nullable; + import java.util.List; /** Batch vector read to read index files for multiple query vectors. */ @@ -27,6 +29,11 @@ public interface BatchVectorRead { /** Read batch results; result {@code i} corresponds to input vector {@code i}. */ default List readBatch(VectorScan.Plan plan) { + return readBatch(plan, plan.nextRowId()); + } + + /** Read batch results; result {@code i} corresponds to input vector {@code i}. */ + default List readBatch(VectorScan.Plan plan, @Nullable Long nextRowId) { return readBatch(plan.splits()); } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java index ae31dd03253c..8062a61d8d86 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorReadImpl.java @@ -29,6 +29,8 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.utils.RoaringNavigableMap64; +import javax.annotation.Nullable; + import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -69,8 +71,18 @@ public BatchVectorReadImpl( @Override public List readBatch(List splits) { + return readBatch(splits, null); + } + + @Override + public List readBatch(VectorScan.Plan plan, @Nullable Long nextRowId) { + return readBatch(plan.splits(), nextRowId); + } + + private List readBatch( + List splits, @Nullable Long nextRowId) { int n = vectors.length; - if (splits.isEmpty() && fastSearch()) { + if (splits.isEmpty() && !rawSearchEnabled()) { List empty = new ArrayList<>(n); for (int i = 0; i < n; i++) { empty.add(GlobalIndexResult.createEmpty()); @@ -84,7 +96,8 @@ public List readBatch(List splits) { List results = new ArrayList<>(n); for (int i = 0; i < n; i++) { - results.add(withSlowSearch(indexedResults[i], splits, globalIndexer, vectors[i])); + results.add( + withRawSearch(indexedResults[i], splits, globalIndexer, nextRowId, vectors[i])); } return results; } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilder.java b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilder.java index f4037249a368..97bb0306e8c5 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilder.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/BatchVectorSearchBuilder.java @@ -64,6 +64,7 @@ default BatchVectorSearchBuilder withOptions(Map options) { /** Execute batch vector search locally; result {@code i} corresponds to {@code vectors[i]}. */ default List executeBatchLocal() { - return newBatchVectorRead().readBatch(newVectorScan().scan()); + VectorScan.Plan plan = newVectorScan().scan(); + return newBatchVectorRead().readBatch(plan); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorRead.java index 74e17e2845b8..f0ca306863d6 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorRead.java @@ -20,12 +20,18 @@ import org.apache.paimon.globalindex.GlobalIndexResult; +import javax.annotation.Nullable; + import java.util.List; /** Vector read to read index files. */ public interface VectorRead { default GlobalIndexResult read(VectorScan.Plan plan) { + return read(plan, plan.nextRowId()); + } + + default GlobalIndexResult read(VectorScan.Plan plan, @Nullable Long nextRowId) { return read(plan.splits()); } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index 75a4ae73fed2..51b9a743e126 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -29,6 +29,8 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.utils.RoaringNavigableMap64; +import javax.annotation.Nullable; + import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -69,7 +71,16 @@ public VectorReadImpl( @Override public GlobalIndexResult read(List splits) { - if (splits.isEmpty() && fastSearch()) { + return read(splits, null); + } + + @Override + public GlobalIndexResult read(VectorScan.Plan plan, @Nullable Long nextRowId) { + return read(plan.splits(), nextRowId); + } + + private GlobalIndexResult read(List splits, @Nullable Long nextRowId) { + if (splits.isEmpty() && !rawSearchEnabled()) { return GlobalIndexResult.createEmpty(); } @@ -78,7 +89,7 @@ public GlobalIndexResult read(List splits) { splits.isEmpty() ? ScoredGlobalIndexResult.createEmpty() : readIndexed(splits, globalIndexer); - return withSlowSearch(result, splits, globalIndexer, vector); + return withRawSearch(result, splits, globalIndexer, nextRowId, vector); } protected ScoredGlobalIndexResult readIndexed( diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScan.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScan.java index 80f667cc0bef..a056abc85a92 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScan.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScan.java @@ -18,6 +18,8 @@ package org.apache.paimon.table.source; +import javax.annotation.Nullable; + import java.util.List; /** Vector scan to pre-filter and scan index files. */ @@ -28,5 +30,10 @@ public interface VectorScan { /** Plan of vector scan. */ interface Plan { List splits(); + + @Nullable + default Long nextRowId() { + return null; + } } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java index 74ae2afe8d69..2f787efb0ed4 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java @@ -127,7 +127,18 @@ public Plan scan() { splits.add(new VectorSearchSplit(range.from, range.to, vectorFiles, scalarFiles)); } - return () -> splits; + Long nextRowId = snapshot == null ? null : snapshot.nextRowId(); + return new Plan() { + @Override + public List splits() { + return splits; + } + + @Override + public Long nextRowId() { + return nextRowId; + } + }; } private static boolean isPrimaryColumn(GlobalIndexMeta meta, int fieldId) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilder.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilder.java index e33f1028669a..9a4f6462fd9b 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilder.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorSearchBuilder.java @@ -63,6 +63,7 @@ default VectorSearchBuilder withOptions(Map options) { /** Execute vector index search in local. */ default GlobalIndexResult executeLocal() { - return newVectorRead().read(newVectorScan().scan()); + VectorScan.Plan plan = newVectorScan().scan(); + return newVectorRead().read(plan); } } diff --git a/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java b/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java index 8c02fd392476..028338e9888e 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/BtreeGlobalIndexTableTest.java @@ -134,7 +134,7 @@ public void testBTreeGlobalIndexWithCoreScan() throws Exception { } @Test - public void testBTreeGlobalIndexFastSearchControlsUnindexedData() throws Exception { + public void testBTreeGlobalIndexSearchModeControlsUnindexedData() throws Exception { write(500L); createIndex("f1"); @@ -162,16 +162,16 @@ public void testBTreeGlobalIndexFastSearchControlsUnindexedData() throws Excepti BinaryString.fromString("a700"))); ReadBuilder readBuilder = table.newReadBuilder().withFilter(predicate); - List fastSearchResult = readF1(readBuilder, readBuilder.newScan().plan()); - assertThat(fastSearchResult).containsExactly("a100"); + List fastModeResult = readF1(readBuilder, readBuilder.newScan().plan()); + assertThat(fastModeResult).containsExactly("a100"); table = table.copy( Collections.singletonMap( - CoreOptions.GLOBAL_INDEX_FAST_SEARCH.key(), "false")); + CoreOptions.GLOBAL_INDEX_SEARCH_MODE.key(), "full")); readBuilder = table.newReadBuilder().withFilter(predicate); - List slowSearchResult = readF1(readBuilder, readBuilder.newScan().plan()); - assertThat(slowSearchResult).containsExactly("a100", "a700"); + List fullSearchResult = readF1(readBuilder, readBuilder.newScan().plan()); + assertThat(fullSearchResult).containsExactly("a100", "a700"); } @Test diff --git a/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java b/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java index 07a63dad09eb..71fc5db0ad07 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/source/VectorSearchBuilderTest.java @@ -242,21 +242,21 @@ public void testVectorSearchEmptyResult() throws Exception { } @Test - public void testVectorSearchScansUnindexedDataWhenFastSearchDisabled() throws Exception { + public void testVectorSearchFullModeScansUnindexedData() throws Exception { catalog.createTable( - identifier("slow_search_cosine_table"), + identifier("full_search_cosine_table"), Schema.newBuilder() .column("id", DataTypes.INT()) .column(VECTOR_FIELD_NAME, new ArrayType(DataTypes.FLOAT())) .option(CoreOptions.BUCKET.key(), "-1") .option(CoreOptions.ROW_TRACKING_ENABLED.key(), "true") .option(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true") - .option(CoreOptions.GLOBAL_INDEX_FAST_SEARCH.key(), "false") + .option(CoreOptions.GLOBAL_INDEX_SEARCH_MODE.key(), "full") .option("test.vector.dimension", String.valueOf(DIMENSION)) .option("test.vector.metric", "cosine") .build(), false); - FileStoreTable table = getTable(identifier("slow_search_cosine_table")); + FileStoreTable table = getTable(identifier("full_search_cosine_table")); float[][] vectors = { {0.0f, 1.0f}, @@ -281,7 +281,7 @@ public void testVectorSearchScansUnindexedDataWhenFastSearchDisabled() throws Ex } @Test - public void testVectorSearchFastSearchSkipsUnindexedDataByDefault() throws Exception { + public void testVectorSearchFastModeSkipsUnindexedDataByDefault() throws Exception { catalog.createTable( identifier("fast_search_table"), vectorSchemaBuilder(VECTOR_FIELD_NAME).build(), @@ -309,14 +309,14 @@ public void testVectorSearchFastSearchSkipsUnindexedDataByDefault() throws Excep } @Test - public void testVectorSearchSlowSearchScansFilteredUnindexedData() throws Exception { + public void testVectorSearchFullModeScansFilteredUnindexedData() throws Exception { catalog.createTable( - identifier("slow_search_filtered_table"), + identifier("full_search_filtered_table"), vectorSchemaBuilder(VECTOR_FIELD_NAME) - .option(CoreOptions.GLOBAL_INDEX_FAST_SEARCH.key(), "false") + .option(CoreOptions.GLOBAL_INDEX_SEARCH_MODE.key(), "full") .build(), false); - FileStoreTable table = getTable(identifier("slow_search_filtered_table")); + FileStoreTable table = getTable(identifier("full_search_filtered_table")); float[][] vectors = { {0.0f, 1.0f}, diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py index 9a273699c738..bc1bbdea3bb8 100644 --- a/paimon-python/pypaimon/common/options/core_options.py +++ b/paimon-python/pypaimon/common/options/core_options.py @@ -90,6 +90,12 @@ class GlobalIndexColumnUpdateAction(str, Enum): DROP_PARTITION_INDEX = "DROP_PARTITION_INDEX" +class GlobalIndexSearchMode(str, Enum): + FAST = "fast" + FULL = "full" + DETAIL = "detail" + + class CoreOptions: """Core options for Paimon tables.""" # File format constants @@ -618,14 +624,13 @@ class CoreOptions: .with_description("Whether to enable global index for scan.") ) - GLOBAL_INDEX_FAST_SEARCH: ConfigOption[bool] = ( - ConfigOptions.key("global-index.fast-search") - .boolean_type() - .default_value(True) + GLOBAL_INDEX_SEARCH_MODE: ConfigOption[GlobalIndexSearchMode] = ( + ConfigOptions.key("global-index.search-mode") + .enum_type(GlobalIndexSearchMode) + .default_value(GlobalIndexSearchMode.FAST) .with_description( - "Whether global index queries only search indexed files. " - "Set to false to also scan files not covered by global indexes " - "when supported." + "Search mode for global index queries. Supported values are " + "'fast', 'full', and 'detail'." ) ) @@ -1102,8 +1107,8 @@ def commit_max_retry_wait(self) -> int: def global_index_enabled(self, default=None): return self.options.get(CoreOptions.GLOBAL_INDEX_ENABLED, default) - def global_index_fast_search(self): - return self.options.get(CoreOptions.GLOBAL_INDEX_FAST_SEARCH) + def global_index_search_mode(self): + return self.options.get(CoreOptions.GLOBAL_INDEX_SEARCH_MODE) def global_index_thread_num(self) -> Optional[int]: return self.options.get(CoreOptions.GLOBAL_INDEX_THREAD_NUM) diff --git a/paimon-python/pypaimon/read/scanner/file_scanner.py b/paimon-python/pypaimon/read/scanner/file_scanner.py index 67d2c626f3ef..ae7ee818b837 100755 --- a/paimon-python/pypaimon/read/scanner/file_scanner.py +++ b/paimon-python/pypaimon/read/scanner/file_scanner.py @@ -22,6 +22,7 @@ logger = logging.getLogger(__name__) +from pypaimon.common.options.core_options import GlobalIndexSearchMode from pypaimon.common.predicate import Predicate from pypaimon.globalindex import ScoredGlobalIndexResult from pypaimon.globalindex.global_index_result import GlobalIndexResult @@ -51,10 +52,11 @@ PrimaryKeyTableSplitGenerator from pypaimon.read.split import DataSplit from pypaimon.table.special_fields import SpecialFields -from pypaimon.utils.roaring_bitmap import RoaringBitmap64 from pypaimon.snapshot.snapshot import Snapshot from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.source.deletion_file import DeletionFile +from pypaimon.utils.range import Range +from pypaimon.utils.roaring_bitmap import RoaringBitmap64 def _row_ranges_from_predicate(predicate: Optional[Predicate]) -> Optional[List]: @@ -332,7 +334,8 @@ def _create_data_evolution_split_generator(self): scan_unindexed_data = ( self._global_index_result is None and not isinstance(global_index_result, ScoredGlobalIndexResult) - and not self.table.options.global_index_fast_search() + and self.table.options.global_index_search_mode() + != GlobalIndexSearchMode.FAST ) if scan_unindexed_data: global_index_result = self._with_unindexed_rows( @@ -362,15 +365,21 @@ def _create_data_evolution_split_generator(self): ) def _with_unindexed_rows(self, indexed_result, manifest_files, snapshot): - data_ranges = [] - entries = self.read_manifest_entries(manifest_files) - for entry in entries: - first_row_id = entry.file.first_row_id - if first_row_id is not None: - data_ranges.append(entry.file.row_id_range()) + mode = self.table.options.global_index_search_mode() + entries = None + if mode == GlobalIndexSearchMode.DETAIL: + entries = self.read_manifest_entries(manifest_files) + data_ranges = [] + for entry in entries: + first_row_id = entry.file.first_row_id + if first_row_id is not None: + data_ranges.append(entry.file.row_id_range()) + elif snapshot is not None and snapshot.next_row_id is not None and snapshot.next_row_id > 0: + data_ranges = [Range(0, int(snapshot.next_row_id) - 1)] + else: + data_ranges = [] from pypaimon.globalindex.global_index_scanner import GlobalIndexScanner - from pypaimon.utils.range import Range predicate_indexed_ranges = Range.sort_and_merge_overlap( GlobalIndexScanner.predicate_indexed_ranges( @@ -389,6 +398,11 @@ def _with_unindexed_rows(self, indexed_result, manifest_files, snapshot): unindexed_ranges = Range.sort_and_merge_overlap( unindexed_ranges, merge=True, adjacent=True) + if entries is None and unindexed_ranges: + entries = self.read_manifest_entries(manifest_files) + if entries is None: + entries = [] + bitmap = RoaringBitmap64.or_( indexed_result.results(), self._matching_unindexed_rows(entries, unindexed_ranges)) diff --git a/paimon-python/pypaimon/table/source/vector_search_read.py b/paimon-python/pypaimon/table/source/vector_search_read.py index 1be4f16761cb..5209289e9aba 100644 --- a/paimon-python/pypaimon/table/source/vector_search_read.py +++ b/paimon-python/pypaimon/table/source/vector_search_read.py @@ -21,6 +21,7 @@ from abc import ABC, abstractmethod from concurrent.futures import wait +from pypaimon.common.options.core_options import GlobalIndexSearchMode from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta from pypaimon.globalindex.global_index_result import GlobalIndexResult from pypaimon.globalindex.offset_global_index_reader import OffsetGlobalIndexReader @@ -29,19 +30,16 @@ from pypaimon.utils.range import Range -GLOBAL_INDEX_FAST_SEARCH = "global-index.fast-search" - - class VectorSearchRead(ABC): """Vector search read to read index files.""" def read_plan(self, plan): # type: (VectorSearchScanPlan) -> GlobalIndexResult - return self.read(plan.splits()) + return self.read(plan.splits(), next_row_id=getattr(plan, "next_row_id", None)) @abstractmethod - def read(self, splits): - # type: (List[VectorSearchSplit]) -> GlobalIndexResult + def read(self, splits, next_row_id=None): + # type: (List[VectorSearchSplit], Optional[int]) -> GlobalIndexResult pass @@ -50,19 +48,18 @@ class BatchVectorSearchRead(ABC): def read_batch_plan(self, plan): # type: (VectorSearchScanPlan) -> List[GlobalIndexResult] - return self.read_batch(plan.splits()) + return self.read_batch( + plan.splits(), next_row_id=getattr(plan, "next_row_id", None)) @abstractmethod - def read_batch(self, splits): - # type: (List[VectorSearchSplit]) -> List[GlobalIndexResult] + def read_batch(self, splits, next_row_id=None): + # type: (List[VectorSearchSplit], Optional[int]) -> List[GlobalIndexResult] pass class AbstractVectorSearchReadImpl: """Base implementation for vector search reads.""" - GLOBAL_INDEX_FAST_SEARCH = GLOBAL_INDEX_FAST_SEARCH - def __init__(self, table, limit, vector_column, filter_=None, options=None, partition_filter=None): self._table = table @@ -165,39 +162,45 @@ def _eval(self, row_range_start, row_range_end, vector_index_files, index_type, file_io, index_path, index_io_meta_list, options ) - if self._slow_search_enabled() and self._vector_metric is None: + if self._raw_search_enabled() and self._vector_metric is None: self._vector_metric = reader.vector_metric() offset_reader = OffsetGlobalIndexReader(reader, row_range_start, row_range_end) future = offset_reader.visit_vector_search(vector_search) future.add_done_callback(lambda _: reader.close()) return future - def _slow_search_enabled(self): - return not self._fast_search() - - def _fast_search(self): - return str( - self._table_option(GLOBAL_INDEX_FAST_SEARCH, "true") - ).lower() == "true" - - def _with_slow_search(self, result, splits, query_vector): - if not self._slow_search_enabled(): + def _raw_search_enabled(self): + return self._search_mode() != GlobalIndexSearchMode.FAST + + def _search_mode(self): + options = getattr(self._table, "options", None) + if options is not None and hasattr(options, "global_index_search_mode"): + return options.global_index_search_mode() + raw = self._table_option( + "global-index.search-mode", GlobalIndexSearchMode.FAST.value) + if isinstance(raw, GlobalIndexSearchMode): + return raw + return GlobalIndexSearchMode(str(raw).strip().lower()) + + def _with_raw_search(self, result, splits, query_vector, next_row_id): + if not self._raw_search_enabled(): return result.top_k(self._limit) - raw_result = self._read_slow_search(splits, query_vector) + raw_result = self._read_raw_search(splits, query_vector, next_row_id) return result.or_(raw_result).top_k(self._limit) - def _read_slow_search(self, splits, query_vector): + def _read_raw_search(self, splits, query_vector, next_row_id): from pypaimon.table.special_fields import SpecialFields read_type = self._read_type_with_row_id() - range_discovery_builder = self._new_raw_read_builder( - read_type, include_filter=False) - all_data_plan = range_discovery_builder.new_scan().plan() - non_indexed_ranges = self._non_indexed_ranges(all_data_plan, splits) + non_indexed_ranges = self._non_indexed_ranges( + read_type, splits, next_row_id) if not non_indexed_ranges: return DictBasedScoredIndexResult({}) + range_discovery_builder = self._new_raw_read_builder( + read_type, include_filter=False) + all_data_plan = range_discovery_builder.new_scan().plan() raw_splits = self._wrap_splits_with_row_ranges( all_data_plan.splits(), non_indexed_ranges) if not raw_splits: @@ -212,14 +215,14 @@ def _read_slow_search(self, splits, query_vector): vector_name = self._vector_column.name if row_id_name not in arrow_table.column_names: raise ValueError( - "Vector slow search requires row tracking column %s." + "Vector raw search requires row tracking column %s." % row_id_name) if vector_name not in arrow_table.column_names: raise ValueError( - "Vector slow search read type does not contain vector column %s." + "Vector raw search read type does not contain vector column %s." % vector_name) - metric = self._slow_search_metric() + metric = self._raw_search_metric() query = self._normalize_vector(query_vector) scores = {} row_ids = arrow_table.column(row_id_name).to_pylist() @@ -255,22 +258,41 @@ def _new_raw_read_builder(self, read_type, include_filter): read_builder = read_builder.with_filter(self._filter) return read_builder - def _non_indexed_ranges(self, all_data_plan, splits): + def _non_indexed_ranges(self, read_type, splits, next_row_id): + if self._search_mode() == GlobalIndexSearchMode.DETAIL: + range_discovery_builder = self._new_raw_read_builder( + read_type, include_filter=False) + all_data_plan = range_discovery_builder.new_scan().plan() + return self._non_indexed_ranges_by_data_files(all_data_plan, splits) + return self._non_indexed_ranges_by_next_row_id(splits, next_row_id) + + def _non_indexed_ranges_by_next_row_id(self, splits, next_row_id): + if next_row_id is None or int(next_row_id) <= 0: + return [] + + data_range = Range(0, int(next_row_id) - 1) + return Range.sort_and_merge_overlap( + data_range.exclude(self._indexed_ranges(splits)), True) + + def _non_indexed_ranges_by_data_files(self, all_data_plan, splits): data_ranges = [] for split in all_data_plan.splits(): data_ranges.extend(self._split_row_ranges(split)) - indexed_ranges = [ - Range(split.row_range_start, split.row_range_end) - for split in splits - ] - indexed_ranges = Range.sort_and_merge_overlap(indexed_ranges, True) + indexed_ranges = self._indexed_ranges(splits) ranges = [] for data_range in Range.sort_and_merge_overlap(data_ranges, True): ranges.extend(data_range.exclude(indexed_ranges)) return Range.sort_and_merge_overlap(ranges, True) + def _indexed_ranges(self, splits): + indexed_ranges = [ + Range(split.row_range_start, split.row_range_end) + for split in splits + ] + return Range.sort_and_merge_overlap(indexed_ranges, True) + def _split_row_ranges(self, split): from pypaimon.globalindex.indexed_split import IndexedSplit @@ -324,7 +346,7 @@ def _contains_row_id(self, ranges, row_id): def _index_options(self): return dict(self._options) - def _slow_search_metric(self): + def _raw_search_metric(self): metric = self._vector_metric if metric is None: return "l2" @@ -390,9 +412,9 @@ def __init__(self, table, limit, vector_column, query_vector, filter_=None, partition_filter=partition_filter) self._query_vector = query_vector - def read(self, splits): - # type: (List[VectorSearchSplit]) -> GlobalIndexResult - if not splits and self._fast_search(): + def read(self, splits, next_row_id=None): + # type: (List[VectorSearchSplit], Optional[int]) -> GlobalIndexResult + if not splits and not self._raw_search_enabled(): return GlobalIndexResult.create_empty() result = ( @@ -400,7 +422,8 @@ def read(self, splits): if not splits else self._search_one(self._query_vector, splits, self._pre_filter(splits)) ) - return self._with_slow_search(result, splits, self._query_vector) + return self._with_raw_search( + result, splits, self._query_vector, next_row_id) class BatchVectorSearchReadImpl(AbstractVectorSearchReadImpl, @@ -414,10 +437,10 @@ def __init__(self, table, limit, vector_column, query_vectors, partition_filter=partition_filter) self._query_vectors = list(query_vectors) - def read_batch(self, splits): - # type: (List[VectorSearchSplit]) -> List[GlobalIndexResult] + def read_batch(self, splits, next_row_id=None): + # type: (List[VectorSearchSplit], Optional[int]) -> List[GlobalIndexResult] n = len(self._query_vectors) - if not splits and self._fast_search(): + if not splits and not self._raw_search_enabled(): return [GlobalIndexResult.create_empty() for _ in range(n)] results = [] @@ -428,7 +451,8 @@ def read_batch(self, splits): if not splits else self._search_one(vector, splits, pre_filter) ) - results.append(self._with_slow_search(result, splits, vector)) + results.append( + self._with_raw_search(result, splits, vector, next_row_id)) return results diff --git a/paimon-python/pypaimon/table/source/vector_search_scan.py b/paimon-python/pypaimon/table/source/vector_search_scan.py index ecfa709bed54..11866e263083 100644 --- a/paimon-python/pypaimon/table/source/vector_search_scan.py +++ b/paimon-python/pypaimon/table/source/vector_search_scan.py @@ -27,9 +27,10 @@ class VectorSearchScanPlan: """Plan of vector search scan.""" - def __init__(self, splits): - # type: (List[VectorSearchSplit]) -> None + def __init__(self, splits, next_row_id=None): + # type: (List[VectorSearchSplit], Optional[int]) -> None self._splits = splits + self.next_row_id = next_row_id def splits(self): # type: () -> List[VectorSearchSplit] @@ -138,4 +139,5 @@ def index_file_filter(entry): ) ) - return VectorSearchScanPlan(splits) + next_row_id = None if snapshot is None else snapshot.next_row_id + return VectorSearchScanPlan(splits, next_row_id=next_row_id) diff --git a/paimon-python/pypaimon/tests/global_index_test.py b/paimon-python/pypaimon/tests/global_index_test.py index fcecc7b88a1b..fc04667037ab 100644 --- a/paimon-python/pypaimon/tests/global_index_test.py +++ b/paimon-python/pypaimon/tests/global_index_test.py @@ -126,8 +126,8 @@ def spy_scan(self_h, snapshot, entry_filter=None): "so global index used latest while manifest used the " "time-travel snapshot — silent correctness bug.") - def test_fast_search_false_filters_unindexed_rows_exactly(self): - table = self._create_table().copy({'global-index.fast-search': 'false'}) + def test_search_mode_detail_filters_unindexed_rows_exactly(self): + table = self._create_table().copy({'global-index.search-mode': 'detail'}) self._write_arrow(table, pa.table( {'id': [0, 1, 2, 3], 'name': ['a', 'b', 'c', 'd'], 'age': [0, 1, 2, 3], 'city': ['x', 'x', 'y', 'y']}, diff --git a/paimon-python/pypaimon/tests/vector_search_filter_test.py b/paimon-python/pypaimon/tests/vector_search_filter_test.py index c08a78e84380..ff170b8a0fc2 100644 --- a/paimon-python/pypaimon/tests/vector_search_filter_test.py +++ b/paimon-python/pypaimon/tests/vector_search_filter_test.py @@ -1458,7 +1458,7 @@ def test_hybrid_search_partition_filter_prunes_full_text_route(self): class VectorSearchManySplitsTest(unittest.TestCase): - def test_fast_search_controls_unindexed_range_scan(self): + def test_search_mode_controls_unindexed_range_scan(self): from pypaimon.globalindex.vector_search_result import ( DictBasedScoredIndexResult, ) @@ -1563,11 +1563,11 @@ def close(self_inner): query_vector=[4.0, 0.0], filter_=None) self.assertEqual([0], sorted(disabled.read(splits).results())) - table.table_schema.options["global-index.fast-search"] = "false" + table.table_schema.options["global-index.search-mode"] = "full" enabled = VectorSearchReadImpl( table, limit=2, vector_column=embedding_field, query_vector=[4.0, 0.0], filter_=None) - result = enabled.read(splits) + result = enabled.read(splits, next_row_id=3) self.assertEqual([0, 2], sorted(result.results())) self.assertEqual([Range(1, 2)], @@ -1584,7 +1584,7 @@ def close(self_inner): repeated_read = VectorSearchReadImpl( table, limit=2, vector_column=embedding_field, query_vector=[4.0, 0.0], filter_=None) - result = repeated_read.read(splits) + result = repeated_read.read(splits, next_row_id=3) self.assertEqual([0, 2], sorted(result.results())) self.assertEqual(1, table.metric_calls) From 81820230092a634f558bce6f223e4e52100c7b60 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 15:54:49 +0800 Subject: [PATCH 08/10] [core][python] Refactor unindexed global index scan --- .../globalindex/DataEvolutionBatchScan.java | 128 ++----------- .../globalindex/GlobalIndexScanner.java | 13 +- .../GlobalIndexUnindexedRowsScanner.java | 170 +++++++++++++++++ .../global_index_unindexed_rows_scanner.py | 171 ++++++++++++++++++ .../pypaimon/read/scanner/file_scanner.py | 78 ++------ 5 files changed, 380 insertions(+), 180 deletions(-) create mode 100644 paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexUnindexedRowsScanner.java create mode 100644 paimon-python/pypaimon/globalindex/global_index_unindexed_rows_scanner.py diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java index a9b4fdb09126..dbac26d4d21b 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/DataEvolutionBatchScan.java @@ -23,7 +23,6 @@ import org.apache.paimon.Snapshot; import org.apache.paimon.annotation.VisibleForTesting; import org.apache.paimon.data.BinaryRow; -import org.apache.paimon.data.InternalRow; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.manifest.PartitionEntry; import org.apache.paimon.metrics.MetricRegistry; @@ -38,11 +37,7 @@ import org.apache.paimon.table.source.DataTableBatchScan; import org.apache.paimon.table.source.DataTableScan; import org.apache.paimon.table.source.InnerTableScan; -import org.apache.paimon.table.source.ReadBuilder; import org.apache.paimon.table.source.Split; -import org.apache.paimon.table.source.TableRead; -import org.apache.paimon.table.source.TableScan; -import org.apache.paimon.table.source.snapshot.SnapshotReader; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.Filter; import org.apache.paimon.utils.Range; @@ -63,7 +58,6 @@ import java.util.function.Function; import static org.apache.paimon.table.SpecialFields.ROW_ID; -import static org.apache.paimon.table.SpecialFields.rowTypeWithRowId; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; import static org.apache.paimon.utils.ManifestReadThreadPool.randomlyExecuteSequentialReturn; @@ -251,7 +245,16 @@ public Plan plan() { ScoreGetter scoreGetter = null; if (rowRangeIndex == null) { - Optional indexResult = evalGlobalIndex(); + Snapshot snapshot = null; + Optional indexResult; + if (globalIndexResult != null) { + indexResult = Optional.of(globalIndexResult); + } else if (filter == null || !table.coreOptions().globalIndexEnabled()) { + indexResult = Optional.empty(); + } else { + snapshot = tryTravelOrLatest(table); + indexResult = evalGlobalIndex(snapshot); + } if (indexResult.isPresent()) { GlobalIndexResult result = indexResult.get(); RoaringNavigableMap64 rowIds = result.results(); @@ -262,7 +265,7 @@ public Plan plan() { && table.coreOptions().globalIndexSearchMode() != GlobalIndexSearchMode.FAST; if (scanUnindexedRanges) { - rowIds = withUnindexedRows(rowIds); + rowIds = unindexedRowsScanner(snapshot).withUnindexedRows(rowIds); rowRanges = rowIds.toRangeList(); } rowRangeIndex = RowRangeIndex.create(rowRanges); @@ -280,108 +283,15 @@ public Plan plan() { return wrapToIndexSplits(splits, rowRangeIndex, scoreGetter); } - private RoaringNavigableMap64 withUnindexedRows(RoaringNavigableMap64 indexedResultRows) { - TableScan.Plan allDataPlan = null; - Snapshot snapshot; - if (table.coreOptions().globalIndexSearchMode() == GlobalIndexSearchMode.DETAIL) { - allDataPlan = allDataPlan(); - snapshot = - batchScan.snapshotReader().snapshotManager().snapshot(snapshotId(allDataPlan)); - } else { - snapshot = tryTravelOrLatest(table); - } - - List unindexedRanges = unindexedRanges(allDataPlan, snapshot); - - RoaringNavigableMap64 rows = new RoaringNavigableMap64(); - rows.or(indexedResultRows); - rows.or(matchingRows(unindexedRanges)); - return rows; - } - - private List unindexedRanges(@Nullable TableScan.Plan allDataPlan, Snapshot snapshot) { - if (snapshot == null || snapshot.nextRowId() == null || snapshot.nextRowId() <= 0) { - return Collections.emptyList(); - } - - List dataRanges = new ArrayList<>(); - if (table.coreOptions().globalIndexSearchMode() == GlobalIndexSearchMode.DETAIL) { - if (allDataPlan == null) { - return Collections.emptyList(); - } - for (Split split : allDataPlan.splits()) { - if (!(split instanceof DataSplit)) { - continue; - } - for (DataFileMeta file : ((DataSplit) split).dataFiles()) { - if (file.firstRowId() != null) { - dataRanges.add(file.nonNullRowIdRange()); - } - } - } - } else { - dataRanges.add(new Range(0, snapshot.nextRowId() - 1)); - } - - List predicateIndexedRanges = - GlobalIndexScanner.indexedRanges( - table, - batchScan.snapshotReader().manifestsReader().partitionFilter(), - filter, - snapshot); - predicateIndexedRanges = Range.sortAndMergeOverlap(predicateIndexedRanges, true); - - List unindexedRanges = new ArrayList<>(); - for (Range dataRange : Range.sortAndMergeOverlap(dataRanges, true)) { - unindexedRanges.addAll(dataRange.exclude(predicateIndexedRanges)); - } - return Range.sortAndMergeOverlap(unindexedRanges, true); - } - - private RoaringNavigableMap64 matchingRows(List ranges) { - RoaringNavigableMap64 rows = new RoaringNavigableMap64(); - if (ranges.isEmpty()) { - return rows; - } - - RowType readType = rowTypeWithRowId(table.rowType()); - RowRangeIndex rowRangeIndex = RowRangeIndex.create(ranges); - ReadBuilder readBuilder = table.newReadBuilder().withReadType(readType).withFilter(filter); - readBuilder.withPartitionFilter( - batchScan.snapshotReader().manifestsReader().partitionFilter()); - List splits = readBuilder.withRowRangeIndex(rowRangeIndex).newScan().plan().splits(); - int rowIdIndex = readType.getFieldIndex(ROW_ID.name()); - try { - TableRead read = readBuilder.newRead(); - try (org.apache.paimon.reader.RecordReader reader = - read.executeFilter().createReader(splits)) { - reader.forEachRemaining(row -> rows.add(row.getLong(rowIdIndex))); - } - } catch (IOException e) { - throw new RuntimeException( - "Failed to scan unindexed data for global index raw search.", e); - } - return rows; - } - - private long snapshotId(TableScan.Plan plan) { - if (plan instanceof SnapshotReader.Plan) { - Long snapshotId = ((SnapshotReader.Plan) plan).snapshotId(); - if (snapshotId != null) { - return snapshotId; - } - } - throw new IllegalStateException("Cannot read global index coverage without a snapshot."); - } - - private TableScan.Plan allDataPlan() { - ReadBuilder readBuilder = table.newReadBuilder(); - readBuilder.withPartitionFilter( - batchScan.snapshotReader().manifestsReader().partitionFilter()); - return readBuilder.newScan().plan(); + private GlobalIndexUnindexedRowsScanner unindexedRowsScanner(Snapshot snapshot) { + return new GlobalIndexUnindexedRowsScanner( + table, + snapshot, + batchScan.snapshotReader().manifestsReader().partitionFilter(), + filter); } - private Optional evalGlobalIndex() { + private Optional evalGlobalIndex(Snapshot snapshot) { if (this.globalIndexResult != null) { return Optional.of(globalIndexResult); } @@ -395,7 +305,7 @@ private Optional evalGlobalIndex() { PartitionPredicate partitionFilter = batchScan.snapshotReader().manifestsReader().partitionFilter(); Optional optionalScanner = - GlobalIndexScanner.create(table, partitionFilter, filter); + GlobalIndexScanner.create(table, partitionFilter, filter, snapshot); if (!optionalScanner.isPresent()) { return Optional.empty(); } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index d5b13ef61075..dcdbe1dc59d1 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -226,12 +226,17 @@ public static List indexedRanges( public static Optional create( FileStoreTable table, PartitionPredicate partitionFilter, Predicate filter) { + return create(table, partitionFilter, filter, tryTravelOrLatest(table)); + } + + public static Optional create( + FileStoreTable table, + PartitionPredicate partitionFilter, + Predicate filter, + Snapshot snapshot) { List indexFiles = table.store().newIndexFileHandler() - .scan( - tryTravelOrLatest(table), - indexFileFilter(table, partitionFilter, filter)) - .stream() + .scan(snapshot, indexFileFilter(table, partitionFilter, filter)).stream() .map(IndexManifestEntry::indexFile) .collect(Collectors.toList()); return create(table, indexFiles); diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexUnindexedRowsScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexUnindexedRowsScanner.java new file mode 100644 index 000000000000..11998b42b7c9 --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexUnindexedRowsScanner.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.globalindex; + +import org.apache.paimon.CoreOptions.GlobalIndexSearchMode; +import org.apache.paimon.Snapshot; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.io.DataFileMeta; +import org.apache.paimon.partition.PartitionPredicate; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.source.DataSplit; +import org.apache.paimon.table.source.ReadBuilder; +import org.apache.paimon.table.source.ScanMode; +import org.apache.paimon.table.source.Split; +import org.apache.paimon.table.source.TableRead; +import org.apache.paimon.table.source.snapshot.SnapshotReader; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Range; +import org.apache.paimon.utils.RoaringNavigableMap64; +import org.apache.paimon.utils.RowRangeIndex; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.paimon.CoreOptions.SCAN_CREATION_TIME_MILLIS; +import static org.apache.paimon.CoreOptions.SCAN_FILE_CREATION_TIME_MILLIS; +import static org.apache.paimon.CoreOptions.SCAN_MODE; +import static org.apache.paimon.CoreOptions.SCAN_SNAPSHOT_ID; +import static org.apache.paimon.CoreOptions.SCAN_TAG_NAME; +import static org.apache.paimon.CoreOptions.SCAN_TIMESTAMP; +import static org.apache.paimon.CoreOptions.SCAN_TIMESTAMP_MILLIS; +import static org.apache.paimon.CoreOptions.SCAN_VERSION; +import static org.apache.paimon.CoreOptions.SCAN_WATERMARK; +import static org.apache.paimon.CoreOptions.StartupMode.FROM_SNAPSHOT; +import static org.apache.paimon.table.SpecialFields.ROW_ID; +import static org.apache.paimon.table.SpecialFields.rowTypeWithRowId; + +/** Scans raw rows whose row ids are not covered by a global index query. */ +public class GlobalIndexUnindexedRowsScanner { + + private final FileStoreTable table; + private final Snapshot snapshot; + private final PartitionPredicate partitionFilter; + private final Predicate filter; + + public GlobalIndexUnindexedRowsScanner( + FileStoreTable table, + Snapshot snapshot, + PartitionPredicate partitionFilter, + Predicate filter) { + this.table = table; + this.snapshot = snapshot; + this.partitionFilter = partitionFilter; + this.filter = filter; + } + + public RoaringNavigableMap64 withUnindexedRows(RoaringNavigableMap64 indexedResultRows) { + RoaringNavigableMap64 rows = new RoaringNavigableMap64(); + rows.or(indexedResultRows); + rows.or(matchingRows(unindexedRanges())); + return rows; + } + + private List unindexedRanges() { + if (snapshot == null || snapshot.nextRowId() == null || snapshot.nextRowId() <= 0) { + return Collections.emptyList(); + } + + List dataRanges; + if (table.coreOptions().globalIndexSearchMode() == GlobalIndexSearchMode.DETAIL) { + dataRanges = dataRangesByDataFiles(); + } else { + dataRanges = Collections.singletonList(new Range(0, snapshot.nextRowId() - 1)); + } + + List predicateIndexedRanges = + GlobalIndexScanner.indexedRanges(table, partitionFilter, filter, snapshot); + predicateIndexedRanges = Range.sortAndMergeOverlap(predicateIndexedRanges, true); + + List unindexedRanges = new ArrayList<>(); + for (Range dataRange : Range.sortAndMergeOverlap(dataRanges, true)) { + unindexedRanges.addAll(dataRange.exclude(predicateIndexedRanges)); + } + return Range.sortAndMergeOverlap(unindexedRanges, true); + } + + private List dataRangesByDataFiles() { + SnapshotReader snapshotReader = + table.newSnapshotReader() + .withPartitionFilter(partitionFilter) + .withMode(ScanMode.ALL) + .withSnapshot(snapshot); + List dataRanges = new ArrayList<>(); + for (Split split : snapshotReader.read().splits()) { + if (!(split instanceof DataSplit)) { + continue; + } + for (DataFileMeta file : ((DataSplit) split).dataFiles()) { + if (file.firstRowId() != null) { + dataRanges.add(file.nonNullRowIdRange()); + } + } + } + return dataRanges; + } + + private RoaringNavigableMap64 matchingRows(List ranges) { + RoaringNavigableMap64 rows = new RoaringNavigableMap64(); + if (ranges.isEmpty()) { + return rows; + } + + RowType readType = rowTypeWithRowId(table.rowType()); + RowRangeIndex rowRangeIndex = RowRangeIndex.create(ranges); + ReadBuilder readBuilder = + table.copyWithoutTimeTravel(snapshotReadOptions()) + .newReadBuilder() + .withReadType(readType) + .withFilter(filter) + .withPartitionFilter(partitionFilter); + List splits = readBuilder.withRowRangeIndex(rowRangeIndex).newScan().plan().splits(); + int rowIdIndex = readType.getFieldIndex(ROW_ID.name()); + try { + TableRead read = readBuilder.newRead(); + try (org.apache.paimon.reader.RecordReader reader = + read.executeFilter().createReader(splits)) { + reader.forEachRemaining(row -> rows.add(row.getLong(rowIdIndex))); + } + } catch (IOException e) { + throw new RuntimeException( + "Failed to scan unindexed data for global index raw search.", e); + } + return rows; + } + + private Map snapshotReadOptions() { + Map options = new HashMap<>(); + options.put(SCAN_MODE.key(), FROM_SNAPSHOT.toString()); + options.put(SCAN_SNAPSHOT_ID.key(), String.valueOf(snapshot.id())); + options.put(SCAN_TAG_NAME.key(), null); + options.put(SCAN_WATERMARK.key(), null); + options.put(SCAN_TIMESTAMP.key(), null); + options.put(SCAN_TIMESTAMP_MILLIS.key(), null); + options.put(SCAN_FILE_CREATION_TIME_MILLIS.key(), null); + options.put(SCAN_CREATION_TIME_MILLIS.key(), null); + options.put(SCAN_VERSION.key(), null); + return options; + } +} diff --git a/paimon-python/pypaimon/globalindex/global_index_unindexed_rows_scanner.py b/paimon-python/pypaimon/globalindex/global_index_unindexed_rows_scanner.py new file mode 100644 index 000000000000..58438b99ccd4 --- /dev/null +++ b/paimon-python/pypaimon/globalindex/global_index_unindexed_rows_scanner.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Scan raw rows whose row ids are not covered by a global index query.""" + +from typing import Callable, List + +from pypaimon.common.options.core_options import ( + CoreOptions, + GlobalIndexSearchMode, + StartupMode, +) +from pypaimon.globalindex.global_index_result import GlobalIndexResult +from pypaimon.read.scanner.data_evolution_split_generator import ( + DataEvolutionSplitGenerator, +) +from pypaimon.table.special_fields import SpecialFields +from pypaimon.utils.range import Range +from pypaimon.utils.roaring_bitmap import RoaringBitmap64 + + +class GlobalIndexUnindexedRowsScanner: + + def __init__( + self, + table, + snapshot, + manifest_files, + partition_filter, + predicate, + target_split_size, + open_file_cost, + read_manifest_entries: Callable, + deletion_files_map: Callable, + ): + self._table = table + self._snapshot = snapshot + self._manifest_files = manifest_files + self._partition_filter = partition_filter + self._predicate = predicate + self._target_split_size = target_split_size + self._open_file_cost = open_file_cost + self._read_manifest_entries = read_manifest_entries + self._deletion_files_map = deletion_files_map + + def with_unindexed_rows(self, indexed_result): + entries = None + data_ranges = [] + mode = self._table.options.global_index_search_mode() + + if mode == GlobalIndexSearchMode.DETAIL: + entries = self._read_manifest_entries(self._manifest_files) + for entry in entries: + first_row_id = entry.file.first_row_id + if first_row_id is not None: + data_ranges.append(entry.file.row_id_range()) + elif (self._snapshot is not None + and self._snapshot.next_row_id is not None + and self._snapshot.next_row_id > 0): + data_ranges = [Range(0, int(self._snapshot.next_row_id) - 1)] + + unindexed_ranges = self._unindexed_ranges(data_ranges) + + if entries is None and unindexed_ranges: + entries = self._read_manifest_entries(self._manifest_files) + if entries is None: + entries = [] + + bitmap = RoaringBitmap64.or_( + indexed_result.results(), + self._matching_rows(entries, unindexed_ranges), + ) + return GlobalIndexResult.create(bitmap) + + def _unindexed_ranges(self, data_ranges): + if (self._snapshot is None + or self._snapshot.next_row_id is None + or self._snapshot.next_row_id <= 0): + return [] + + from pypaimon.globalindex.global_index_scanner import GlobalIndexScanner + + predicate_indexed_ranges = Range.sort_and_merge_overlap( + GlobalIndexScanner.predicate_indexed_ranges( + self._table, + self._partition_filter, + self._predicate, + self._snapshot, + ), + merge=True, + adjacent=True, + ) + + unindexed_ranges = [] + for data_range in Range.sort_and_merge_overlap( + data_ranges, merge=True, adjacent=True): + unindexed_ranges.extend(data_range.exclude(predicate_indexed_ranges)) + return Range.sort_and_merge_overlap( + unindexed_ranges, merge=True, adjacent=True) + + def _matching_rows(self, entries, row_ranges): + rows = RoaringBitmap64() + if not row_ranges: + return rows + + entries = _filter_manifest_entries_by_row_ranges(entries, row_ranges) + if not entries: + return rows + + split_generator = DataEvolutionSplitGenerator( + self._table, + self._target_split_size, + self._open_file_cost, + self._deletion_files_map(entries), + row_ranges, + ) + splits = split_generator.create_splits(entries) + read_type = SpecialFields.row_type_with_row_id(self._table.fields) + row_id_index = len(read_type) - 1 + reader = self._table.copy(self._snapshot_read_options()) \ + .new_read_builder().with_read_type(read_type) \ + .with_filter(self._predicate).new_read() + for row in reader.to_iterator(splits): + rows.add(int(row.get_field(row_id_index))) + return rows + + def _snapshot_read_options(self): + options = { + CoreOptions.SCAN_MODE.key(): StartupMode.FROM_SNAPSHOT.value, + CoreOptions.SCAN_SNAPSHOT_ID.key(): str(self._snapshot.id), + } + current_options = self._table.options.options.to_map() + for option in ( + CoreOptions.SCAN_TAG_NAME, + CoreOptions.SCAN_WATERMARK, + CoreOptions.SCAN_TIMESTAMP, + CoreOptions.SCAN_TIMESTAMP_MILLIS, + CoreOptions.SCAN_FILE_CREATION_TIME_MILLIS, + CoreOptions.SCAN_CREATION_TIME_MILLIS, + ): + if option.key() in current_options: + options[option.key()] = None + return options + + +def _filter_manifest_entries_by_row_ranges(entries: List, row_ranges: List) -> List: + filtered_entries = [] + for entry in entries: + file = entry.file + first_row_id = file.first_row_id + if first_row_id is None: + filtered_entries.append(entry) + continue + file_range = file.row_id_range() + if any(file_range.overlaps(row_range) for row_range in row_ranges): + filtered_entries.append(entry) + return filtered_entries diff --git a/paimon-python/pypaimon/read/scanner/file_scanner.py b/paimon-python/pypaimon/read/scanner/file_scanner.py index ae7ee818b837..0fac746f0879 100755 --- a/paimon-python/pypaimon/read/scanner/file_scanner.py +++ b/paimon-python/pypaimon/read/scanner/file_scanner.py @@ -25,7 +25,9 @@ from pypaimon.common.options.core_options import GlobalIndexSearchMode from pypaimon.common.predicate import Predicate from pypaimon.globalindex import ScoredGlobalIndexResult -from pypaimon.globalindex.global_index_result import GlobalIndexResult +from pypaimon.globalindex.global_index_unindexed_rows_scanner import ( + GlobalIndexUnindexedRowsScanner, +) from pypaimon.manifest.index_manifest_file import IndexManifestFile from pypaimon.manifest.manifest_file_manager import ManifestFileManager from pypaimon.manifest.manifest_list_manager import ManifestListManager @@ -51,12 +53,10 @@ from pypaimon.read.scanner.primary_key_table_split_generator import \ PrimaryKeyTableSplitGenerator from pypaimon.read.split import DataSplit -from pypaimon.table.special_fields import SpecialFields from pypaimon.snapshot.snapshot import Snapshot from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.source.deletion_file import DeletionFile from pypaimon.utils.range import Range -from pypaimon.utils.roaring_bitmap import RoaringBitmap64 def _row_ranges_from_predicate(predicate: Optional[Predicate]) -> Optional[List]: @@ -365,73 +365,17 @@ def _create_data_evolution_split_generator(self): ) def _with_unindexed_rows(self, indexed_result, manifest_files, snapshot): - mode = self.table.options.global_index_search_mode() - entries = None - if mode == GlobalIndexSearchMode.DETAIL: - entries = self.read_manifest_entries(manifest_files) - data_ranges = [] - for entry in entries: - first_row_id = entry.file.first_row_id - if first_row_id is not None: - data_ranges.append(entry.file.row_id_range()) - elif snapshot is not None and snapshot.next_row_id is not None and snapshot.next_row_id > 0: - data_ranges = [Range(0, int(snapshot.next_row_id) - 1)] - else: - data_ranges = [] - - from pypaimon.globalindex.global_index_scanner import GlobalIndexScanner - - predicate_indexed_ranges = Range.sort_and_merge_overlap( - GlobalIndexScanner.predicate_indexed_ranges( + return GlobalIndexUnindexedRowsScanner( self.table, + snapshot, + manifest_files, self.partition_key_predicate, self.predicate, - snapshot, - ), - merge=True, - adjacent=True, - ) - unindexed_ranges = [] - for data_range in Range.sort_and_merge_overlap( - data_ranges, merge=True, adjacent=True): - unindexed_ranges.extend(data_range.exclude(predicate_indexed_ranges)) - unindexed_ranges = Range.sort_and_merge_overlap( - unindexed_ranges, merge=True, adjacent=True) - - if entries is None and unindexed_ranges: - entries = self.read_manifest_entries(manifest_files) - if entries is None: - entries = [] - - bitmap = RoaringBitmap64.or_( - indexed_result.results(), - self._matching_unindexed_rows(entries, unindexed_ranges)) - return GlobalIndexResult.create(bitmap) - - def _matching_unindexed_rows(self, entries, row_ranges): - rows = RoaringBitmap64() - if not row_ranges: - return rows - - entries = _filter_manifest_entries_by_row_ranges(entries, row_ranges) - if not entries: - return rows - - split_generator = DataEvolutionSplitGenerator( - self.table, - self.target_split_size, - self.open_file_cost, - self._deletion_files_map(entries), - row_ranges, - ) - splits = split_generator.create_splits(entries) - read_type = SpecialFields.row_type_with_row_id(self.table.fields) - row_id_index = len(read_type) - 1 - reader = self.table.new_read_builder().with_read_type(read_type) \ - .with_filter(self.predicate).new_read() - for row in reader.to_iterator(splits): - rows.add(int(row.get_field(row_id_index))) - return rows + self.target_split_size, + self.open_file_cost, + self.read_manifest_entries, + self._deletion_files_map, + ).with_unindexed_rows(indexed_result) def plan_files(self) -> List[ManifestEntry]: manifest_files, snapshot = self.manifest_scanner() From 33e1e9700ee1d5f7fa68ad9b7b0a2bdd9475bb76 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 18:11:50 +0800 Subject: [PATCH 09/10] [spark] Fix vector search mode compile --- .../paimon/spark/read/SparkVectorReadImpl.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java index c85bba61f503..9a2bc68d5d09 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/read/SparkVectorReadImpl.java @@ -29,6 +29,7 @@ import org.apache.paimon.predicate.Predicate; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.source.VectorReadImpl; +import org.apache.paimon.table.source.VectorScan; import org.apache.paimon.table.source.VectorSearchSplit; import org.apache.paimon.types.DataField; import org.apache.paimon.utils.InstantiationUtil; @@ -78,11 +79,21 @@ public SparkVectorReadImpl( super(table, partitionFilter, filter, limit, vectorColumn, vector, options); } + @Override + public GlobalIndexResult read(VectorScan.Plan plan, @Nullable Long nextRowId) { + // Raw search scans table data and should run in the coordinator with normal Paimon split + // planning; Spark distribution below is only for index-only evaluation. + if (rawSearchEnabled()) { + return super.read(plan, nextRowId); + } + return read(plan.splits()); + } + @Override public GlobalIndexResult read(List splits) { - // Slow search scans table data and should run in the coordinator with normal Paimon split + // Raw search scans table data and should run in the coordinator with normal Paimon split // planning; Spark distribution below is only for index-only evaluation. - if (slowSearchEnabled()) { + if (rawSearchEnabled()) { return super.read(splits); } if (splits.isEmpty()) { From be05b44d5158bacfcf54093f8e13e40f0c34c65c Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 19 Jun 2026 18:31:51 +0800 Subject: [PATCH 10/10] [python] Fix global index search lint --- .../pypaimon/read/scanner/file_scanner.py | 19 +++++++++---------- .../source/batch_vector_search_builder.py | 1 - 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/paimon-python/pypaimon/read/scanner/file_scanner.py b/paimon-python/pypaimon/read/scanner/file_scanner.py index 0fac746f0879..13e106474a2c 100755 --- a/paimon-python/pypaimon/read/scanner/file_scanner.py +++ b/paimon-python/pypaimon/read/scanner/file_scanner.py @@ -56,7 +56,6 @@ from pypaimon.snapshot.snapshot import Snapshot from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.source.deletion_file import DeletionFile -from pypaimon.utils.range import Range def _row_ranges_from_predicate(predicate: Optional[Predicate]) -> Optional[List]: @@ -366,15 +365,15 @@ def _create_data_evolution_split_generator(self): def _with_unindexed_rows(self, indexed_result, manifest_files, snapshot): return GlobalIndexUnindexedRowsScanner( - self.table, - snapshot, - manifest_files, - self.partition_key_predicate, - self.predicate, - self.target_split_size, - self.open_file_cost, - self.read_manifest_entries, - self._deletion_files_map, + self.table, + snapshot, + manifest_files, + self.partition_key_predicate, + self.predicate, + self.target_split_size, + self.open_file_cost, + self.read_manifest_entries, + self._deletion_files_map, ).with_unindexed_rows(indexed_result) def plan_files(self) -> List[ManifestEntry]: diff --git a/paimon-python/pypaimon/table/source/batch_vector_search_builder.py b/paimon-python/pypaimon/table/source/batch_vector_search_builder.py index 98b2d8795749..f3afff7198f6 100644 --- a/paimon-python/pypaimon/table/source/batch_vector_search_builder.py +++ b/paimon-python/pypaimon/table/source/batch_vector_search_builder.py @@ -23,7 +23,6 @@ AbstractVectorSearchBuilderImpl, ) from pypaimon.table.source.vector_search_read import ( - BatchVectorSearchRead, BatchVectorSearchReadImpl, )