optimize parquet footer reader #24007

Closed
@@ -14,57 +14,29 @@
package io.trino.parquet.metadata;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSourceId;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.ColumnPath;

import java.util.List;
import java.util.Map;
import java.util.Set;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.util.Arrays.asList;
import static java.util.function.Function.identity;

public final class PrunedBlockMetadata
{
/**
* Stores only the necessary columns metadata from BlockMetadata and indexes them by path for efficient look-ups
*/
public static PrunedBlockMetadata createPrunedColumnsMetadata(BlockMetadata blockMetadata, ParquetDataSourceId dataSourceId, Map<List<String>, ColumnDescriptor> descriptorsByPath)
throws ParquetCorruptionException
{
Set<List<String>> requiredPaths = descriptorsByPath.keySet();
Map<List<String>, ColumnChunkMetadata> columnMetadataByPath = blockMetadata.columns().stream()
.collect(toImmutableMap(
column -> asList(column.getPath().toArray()),
identity(),
// Same column name may occur more than once when the file is written by case-sensitive tools
(oldValue, _) -> oldValue));
ImmutableMap.Builder<List<String>, ColumnChunkMetadata> columnMetadataByPathBuilder = ImmutableMap.builderWithExpectedSize(requiredPaths.size());
for (Map.Entry<List<String>, ColumnDescriptor> entry : descriptorsByPath.entrySet()) {
List<String> requiredPath = entry.getKey();
ColumnDescriptor columnDescriptor = entry.getValue();
ColumnChunkMetadata columnChunkMetadata = columnMetadataByPath.get(requiredPath);
if (columnChunkMetadata == null) {
throw new ParquetCorruptionException(dataSourceId, "Metadata is missing for column: %s", columnDescriptor);
}
columnMetadataByPathBuilder.put(requiredPath, columnChunkMetadata);
}
return new PrunedBlockMetadata(blockMetadata.rowCount(), dataSourceId, columnMetadataByPathBuilder.buildOrThrow());
}

private final long rowCount;
private final ParquetDataSourceId dataSourceId;
private final Map<List<String>, ColumnChunkMetadata> columnMetadataByPath;
private final Map<ColumnPath, ColumnChunkMetadata> columnMetadataByPath;
private final BlockMetadata blockMetadata;

private PrunedBlockMetadata(long rowCount, ParquetDataSourceId dataSourceId, Map<List<String>, ColumnChunkMetadata> columnMetadataByPath)
public PrunedBlockMetadata(long rowCount, ParquetDataSourceId dataSourceId, Map<ColumnPath, ColumnChunkMetadata> columnMetadataByPath)
{
this.rowCount = rowCount;
this.dataSourceId = dataSourceId;
this.columnMetadataByPath = columnMetadataByPath;
this.blockMetadata = new BlockMetadata(rowCount, ImmutableList.copyOf(columnMetadataByPath.values()));
}

public long getRowCount()
@@ -77,10 +49,15 @@ public List<ColumnChunkMetadata> getColumns()
return ImmutableList.copyOf(columnMetadataByPath.values());
}

public BlockMetadata getBlockMetadata()
{
return blockMetadata;
}

public ColumnChunkMetadata getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
throws ParquetCorruptionException
{
ColumnChunkMetadata columnChunkMetadata = columnMetadataByPath.get(asList(columnDescriptor.getPath()));
ColumnChunkMetadata columnChunkMetadata = columnMetadataByPath.get(ColumnPath.get(columnDescriptor.getPath()));
if (columnChunkMetadata == null) {
throw new ParquetCorruptionException(dataSourceId, "Metadata is missing for column: %s", columnDescriptor);
}
@@ -93,6 +70,7 @@ public String toString()
return toStringHelper(this)
.add("rowCount", rowCount)
.add("columnMetadataByPath", columnMetadataByPath)
.add("blockMetadata", blockMetadata)
.toString();
}
}
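
For reference, a minimal sketch of how a caller could use the reworked class now that the constructor is public and the lookup map is keyed by ColumnPath instead of List<String>. The helper method, its arguments, and the example class name are hypothetical; in practice the column chunk metadata comes from a decoded Parquet footer.

```java
import com.google.common.collect.ImmutableMap;
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.metadata.ColumnChunkMetadata;
import io.trino.parquet.metadata.PrunedBlockMetadata;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.ColumnPath;

import java.util.Map;

final class PrunedBlockMetadataExample
{
    private PrunedBlockMetadataExample() {}

    // Builds a PrunedBlockMetadata from already-decoded footer metadata and performs a
    // single look-up. The map is keyed by ColumnPath, matching the new public constructor.
    static ColumnChunkMetadata lookUp(
            long rowCount,
            ParquetDataSourceId dataSourceId,
            ColumnDescriptor descriptor,
            ColumnChunkMetadata chunkMetadata)
            throws ParquetCorruptionException
    {
        Map<ColumnPath, ColumnChunkMetadata> columnMetadataByPath = ImmutableMap.of(
                ColumnPath.get(descriptor.getPath()), chunkMetadata);
        PrunedBlockMetadata pruned = new PrunedBlockMetadata(rowCount, dataSourceId, columnMetadataByPath);
        // Look-ups now go through ColumnPath.get(descriptor.getPath()) instead of asList(descriptor.getPath())
        return pruned.getColumnChunkMetaData(descriptor);
    }
}
```

Keying by ColumnPath matches what ColumnChunkMetadata already exposes as its path, so the old per-column asList(column.getPath().toArray()) conversion when building the map is no longer needed.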
@@ -27,6 +27,7 @@
import io.trino.parquet.ParquetReaderOptions;
import io.trino.parquet.metadata.BlockMetadata;
import io.trino.parquet.metadata.ColumnChunkMetadata;
import io.trino.parquet.metadata.ParquetMetadata;
import io.trino.parquet.metadata.PrunedBlockMetadata;
import io.trino.parquet.reader.RowGroupInfo;
import io.trino.spi.predicate.TupleDomain;
@@ -39,6 +40,7 @@
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.Util;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.io.ParquetDecodingException;
@@ -54,11 +56,11 @@
import java.util.Optional;
import java.util.Set;

import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static io.trino.parquet.BloomFilterStore.getBloomFilterStore;
import static io.trino.parquet.ParquetCompressionUtils.decompress;
import static io.trino.parquet.ParquetReaderUtils.isOnlyDictionaryEncodingPages;
import static io.trino.parquet.ParquetTypeUtils.getParquetEncoding;
import static io.trino.parquet.metadata.PrunedBlockMetadata.createPrunedColumnsMetadata;
import static io.trino.parquet.reader.TrinoColumnIndexStore.getColumnIndexStore;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.DateType.DATE;
@@ -180,10 +182,8 @@ public static boolean predicateMatches(
}

public static List<RowGroupInfo> getFilteredRowGroups(
long splitStart,
long splitLength,
ParquetMetadata parquetMetadata,
ParquetDataSource dataSource,
List<BlockMetadata> blocksMetaData,
List<TupleDomain<ColumnDescriptor>> parquetTupleDomains,
List<TupleDomainParquetPredicate> parquetPredicates,
Map<List<String>, ColumnDescriptor> descriptorsByPath,
@@ -192,35 +192,37 @@ public static List<RowGroupInfo> getFilteredRowGroups(
ParquetReaderOptions options)
throws IOException
{
long fileRowCount = 0;
Set<ColumnPath> columnPaths = descriptorsByPath.keySet().stream()
.map(p -> p.toArray(new String[0]))
.map(ColumnPath::get)
.collect(toImmutableSet());

List<RowGroupInfo> rowGroupInfos = parquetMetadata.getRowGroupInfo(Optional.of(dataSource), Optional.of(descriptorsByPath));
ImmutableList.Builder<RowGroupInfo> rowGroupInfoBuilder = ImmutableList.builder();
for (BlockMetadata block : blocksMetaData) {
long blockStart = block.getStartingPos();
boolean splitContainsBlock = splitStart <= blockStart && blockStart < splitStart + splitLength;
if (splitContainsBlock) {
for (int i = 0; i < parquetTupleDomains.size(); i++) {
TupleDomain<ColumnDescriptor> parquetTupleDomain = parquetTupleDomains.get(i);
TupleDomainParquetPredicate parquetPredicate = parquetPredicates.get(i);
Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
Optional<BloomFilterStore> bloomFilterStore = getBloomFilterStore(dataSource, block, parquetTupleDomain, options);
PrunedBlockMetadata columnsMetadata = createPrunedColumnsMetadata(block, dataSource.getId(), descriptorsByPath);
if (predicateMatches(
parquetPredicate,
columnsMetadata,
dataSource,
descriptorsByPath,
parquetTupleDomain,
columnIndex,
bloomFilterStore,
timeZone,
domainCompactionThreshold)) {
rowGroupInfoBuilder.add(new RowGroupInfo(columnsMetadata, fileRowCount, columnIndex));
break;
}
for (RowGroupInfo rowGroupInfo : rowGroupInfos) {
BlockMetadata block = rowGroupInfo.prunedBlockMetadata().getBlockMetadata();

for (int i = 0; i < parquetTupleDomains.size(); i++) {
TupleDomain<ColumnDescriptor> parquetTupleDomain = parquetTupleDomains.get(i);
TupleDomainParquetPredicate parquetPredicate = parquetPredicates.get(i);
Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, columnPaths, parquetTupleDomain, options);
Optional<BloomFilterStore> bloomFilterStore = getBloomFilterStore(dataSource, block, parquetTupleDomain, options);
if (predicateMatches(
parquetPredicate,
rowGroupInfo.prunedBlockMetadata(),
dataSource,
descriptorsByPath,
parquetTupleDomain,
columnIndex,
bloomFilterStore,
timeZone,
domainCompactionThreshold)) {
rowGroupInfoBuilder.add(new RowGroupInfo(rowGroupInfo.prunedBlockMetadata(), rowGroupInfo.fileRowOffset(), columnIndex));
break;
}
}
fileRowCount += block.rowCount();
}

return rowGroupInfoBuilder.build();
}
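
And a rough call-site sketch of the reworked getFilteredRowGroups: split bounds and a pre-built BlockMetadata list are no longer passed in; the decoded ParquetMetadata is handed over instead, and the row groups come from parquetMetadata.getRowGroupInfo(...). The enclosing class name (assumed here to be PredicateUtils) and the two parameters hidden by the collapsed diff (a Joda time zone and the domain compaction threshold, inferred from the predicateMatches call above) are assumptions, not part of this change.

```java
import com.google.common.collect.ImmutableList;
import io.trino.parquet.ParquetDataSource;
import io.trino.parquet.ParquetReaderOptions;
import io.trino.parquet.metadata.ParquetMetadata;
import io.trino.parquet.predicate.PredicateUtils;
import io.trino.parquet.predicate.TupleDomainParquetPredicate;
import io.trino.parquet.reader.RowGroupInfo;
import io.trino.spi.predicate.TupleDomain;
import org.apache.parquet.column.ColumnDescriptor;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.util.List;
import java.util.Map;

final class RowGroupFilteringExample
{
    private RowGroupFilteringExample() {}

    // Sketch of the new call shape: the decoded footer (ParquetMetadata) is handed over
    // instead of splitStart/splitLength plus a pre-built List<BlockMetadata>.
    static List<RowGroupInfo> filter(
            ParquetMetadata parquetMetadata,
            ParquetDataSource dataSource,
            TupleDomain<ColumnDescriptor> parquetTupleDomain,
            TupleDomainParquetPredicate parquetPredicate,
            Map<List<String>, ColumnDescriptor> descriptorsByPath,
            DateTimeZone timeZone,             // assumed hidden parameter
            int domainCompactionThreshold,     // assumed hidden parameter
            ParquetReaderOptions options)
            throws IOException
    {
        return PredicateUtils.getFilteredRowGroups(
                parquetMetadata,
                dataSource,
                ImmutableList.of(parquetTupleDomain),
                ImmutableList.of(parquetPredicate),
                descriptorsByPath,
                timeZone,
                domainCompactionThreshold,
                options);
    }
}
```

The per-block createPrunedColumnsMetadata call is gone; the pruned, path-indexed metadata now arrives ready-made inside each RowGroupInfo.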
