use v2 filter

apache · Jan 17, 2025 · dd71d2b · dd71d2b
1 parent dbd129d
commit dd71d2b
Show file tree

Hide file tree

Showing 24 changed files with 931 additions and 189 deletions.
diff --git a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/PaimonScan.scala b/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/PaimonScan.scala
@@ -35,7 +35,7 @@ case class PaimonScan(
     filters: Seq[Predicate],
     reservedFilters: Seq[Filter],
     override val pushDownLimit: Option[Int],
-    disableBucketedScan: Boolean = false)
+    disableBucketedScan: Boolean = true)
   extends PaimonBaseScan(table, requiredSchema, filters, reservedFilters, pushDownLimit)
   with SupportsRuntimeFiltering {
 
@@ -57,11 +57,9 @@ case class PaimonScan(
       case _ => None
     }
     if (partitionFilter.nonEmpty) {
-      this.runtimeFilters = filters
       readBuilder.withFilter(partitionFilter.head)
       // set inputPartitions null to trigger to get the new splits.
       inputPartitions = null
     }
   }
-
 }
diff --git a/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/PaimonScanBuilder.scala b/paimon-spark/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/PaimonScanBuilder.scala
@@ -18,6 +18,61 @@
 
 package org.apache.paimon.spark
 
+import org.apache.paimon.predicate.{PartitionPredicateVisitor, Predicate}
 import org.apache.paimon.table.Table
 
-class PaimonScanBuilder(table: Table) extends PaimonBaseScanBuilder(table)
+import org.apache.spark.sql.connector.read.SupportsPushDownFilters
+import org.apache.spark.sql.sources.Filter
+
+import scala.collection.mutable
+
+/**
+ * Scan builder that accepts Spark source [[Filter]]s through [[SupportsPushDownFilters]] and
+ * records the ones that can be converted to Paimon predicates.
+ */
+class PaimonScanBuilder(table: Table)
+  extends PaimonBaseScanBuilder(table)
+  with SupportsPushDownFilters {
+
+  private var pushedSparkFilters = Array.empty[Filter]
+
+  /**
+   * Splits the given filters into pushed, reserved and post-scan groups.
+   *
+   * A filter that cannot be converted to a Paimon predicate is returned as a post-scan filter.
+   * A filter that can be converted is always recorded as pushed; it is additionally recorded as
+   * reserved when the partition predicate visitor accepts it, and returned as a post-scan filter
+   * otherwise.
+   *
+   * @param filters
+   *   the filters offered by Spark, interpreted as ANDed together
+   * @return
+   *   the filters that still have to be evaluated after scanning, in their original order
+   */
+  override def pushFilters(filters: Array[Filter]): Array[Filter] = {
+    val filterConverter = new SparkFilterConverter(table.rowType)
+    val partitionVisitor = new PartitionPredicateVisitor(table.partitionKeys())
+
+    // One pass over the input, in order: (filter, converted predicate if any, accepted by the
+    // partition visitor). The visitor only runs for filters that were converted.
+    val classified = filters.map {
+      sparkFilter =>
+        val converted = Option(filterConverter.convertIgnoreFailure(sparkFilter))
+        val partitionOnly = converted.exists(p => p.visit(partitionVisitor))
+        (sparkFilter, converted, partitionOnly)
+    }
+
+    val pushedPairs = classified.collect { case (f, Some(p), _) => (f, p) }
+    val reservedOnly = classified.collect { case (f, _, true) => f }
+    val remaining = classified.collect { case (f, _, false) => f }
+
+    // Inherited builder state is only overwritten when there is something to record.
+    if (pushedPairs.nonEmpty) {
+      this.pushedSparkFilters = pushedPairs.map(_._1)
+      this.pushedPaimonPredicates = pushedPairs.map(_._2)
+    }
+    if (reservedOnly.nonEmpty) {
+      this.reservedFilters = reservedOnly
+    }
+    if (remaining.nonEmpty) {
+      this.hasPostScanPredicates = true
+    }
+    remaining
+  }
+
+  /** Returns the Spark filters recorded as pushed by the last effective `pushFilters` call. */
+  override def pushedFilters(): Array[Filter] = pushedSparkFilters
+}
diff --git a/...park/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/PaimonSplitScanBuilder.scala b/...park/paimon-spark-3.2/src/main/scala/org/apache/paimon/spark/PaimonSplitScanBuilder.scala
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark
+
+import org.apache.paimon.table.KnownSplitsTable
+
+import org.apache.spark.sql.connector.read.Scan
+
+/** Scan builder for a [[KnownSplitsTable]]; the scan is built from the table's own splits. */
+class PaimonSplitScanBuilder(table: KnownSplitsTable) extends PaimonScanBuilder(table) {
+
+  /**
+   * Creates a [[PaimonSplitScan]] from the table, its splits, the required schema and the
+   * pushed Paimon predicates.
+   */
+  override def build(): Scan =
+    PaimonSplitScan(
+      table,
+      table.splits(),
+      requiredSchema,
+      pushedPaimonPredicates)
+}
diff --git a/...c/main/scala/org/apache/paimon/spark/catalyst/analysis/expressions/ExpressionHelper.scala b/...c/main/scala/org/apache/paimon/spark/catalyst/analysis/expressions/ExpressionHelper.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.catalyst.analysis.expressions
+
+import org.apache.paimon.predicate.{Predicate, PredicateBuilder}
+import org.apache.paimon.spark.SparkFilterConverter
+import org.apache.paimon.types.RowType
+
+import org.apache.spark.sql.PaimonUtils.{normalizeExprs, translateFilter}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
+
+trait ExpressionHelper extends ExpressionHelperBase {
+
+  /**
+   * Converts a Catalyst condition into a single Paimon predicate.
+   *
+   * The condition is normalized against `output`, split into its conjuncts, each conjunct is
+   * translated to a Spark source filter, and each filter is converted to a Paimon predicate.
+   * The resulting predicates are ANDed together.
+   *
+   * @param condition
+   *   the Catalyst expression to convert
+   * @param output
+   *   the attributes the condition is normalized against
+   * @param rowType
+   *   the row type used to build the filter converter
+   * @param ignorePartialFailure
+   *   when false, a conjunct that cannot be translated to a source filter raises a
+   *   RuntimeException; when true, such conjuncts are dropped. The flag is also forwarded to
+   *   the filter converter.
+   * @return
+   *   the combined predicate, or None when no predicate could be produced
+   */
+  def convertConditionToPaimonPredicate(
+      condition: Expression,
+      output: Seq[Attribute],
+      rowType: RowType,
+      ignorePartialFailure: Boolean = false): Option[Predicate] = {
+    val filterConverter = new SparkFilterConverter(rowType)
+
+    val conjuncts =
+      normalizeExprs(Seq(condition), output).flatMap(expr => splitConjunctivePredicates(expr))
+
+    val sourceFilters = conjuncts.flatMap {
+      conjunct =>
+        translateFilter(conjunct, supportNestedPredicatePushdown = true).orElse {
+          // Untranslatable conjunct: either drop it silently or fail, depending on the flag.
+          if (ignorePartialFailure) {
+            None
+          } else {
+            throw new RuntimeException(
+              "Exec update failed:" +
+                s" cannot translate expression to source filter: $conjunct")
+          }
+        }
+    }
+
+    // The converter may return null; those results are discarded.
+    val predicates =
+      sourceFilters.flatMap(f => Option(filterConverter.convert(f, ignorePartialFailure)))
+
+    if (predicates.nonEmpty) {
+      Some(PredicateBuilder.and(predicates: _*))
+    } else {
+      None
+    }
+  }
+}
diff --git a/...park/paimon-spark-3.2/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala b/...park/paimon-spark-3.2/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, Literal}
+import org.apache.spark.sql.catalyst.plans.logical.Filter
+
+/** Push-down test that inspects the optimized logical plan for remaining `Filter` nodes. */
+class PaimonPushDownTest extends PaimonPushDownTestBase {
+
+  /** Returns true when the optimized plan of `sql` contains at least one `Filter` node. */
+  override def checkFilterExists(sql: String): Boolean = {
+    val optimized = spark.sql(sql).queryExecution.optimizedPlan
+    optimized.collectFirst { case node @ Filter(_: Expression, _) => node }.nonEmpty
+  }
+
+  /**
+   * Returns true when the optimized plan of `sql` contains a `Filter` node whose condition
+   * includes an `EqualTo` between an attribute named `name` and the literal `value`.
+   */
+  override def checkEqualToFilterExists(sql: String, name: String, value: Literal): Boolean = {
+    // True when the condition tree holds the expected `attribute = literal` comparison.
+    def hasExpectedEqualTo(condition: Expression): Boolean =
+      condition.collectFirst {
+        case equalTo @ EqualTo(attr: AttributeReference, lit: Literal)
+            if attr.name == name && lit == value =>
+          equalTo
+      }.nonEmpty
+
+    val optimized = spark.sql(sql).queryExecution.optimizedPlan
+    optimized.collectFirst {
+      case node @ Filter(condition: Expression, _) if hasExpectedEqualTo(condition) => node
+    }.nonEmpty
+  }
+}
diff --git a/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/PaimonScan.scala b/paimon-spark/paimon-spark-3.3/src/main/scala/org/apache/paimon/spark/PaimonScan.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark
+
+import org.apache.paimon.predicate.Predicate
+import org.apache.paimon.table.{BucketMode, FileStoreTable, Table}
+import org.apache.paimon.table.source.{DataSplit, Split}
+
+import org.apache.spark.sql.PaimonUtils.fieldReference
+import org.apache.spark.sql.connector.expressions._
+import org.apache.spark.sql.connector.read.{SupportsReportPartitioning, SupportsRuntimeFiltering}
+import org.apache.spark.sql.connector.read.partitioning.{KeyGroupedPartitioning, Partitioning, UnknownPartitioning}
+import org.apache.spark.sql.sources.{Filter, In}
+import org.apache.spark.sql.types.StructType
+
+import scala.collection.JavaConverters._
+
+/**
+ * Paimon scan for Spark that supports runtime filtering on partition keys and reports its output
+ * partitioning to Spark.
+ *
+ * @param bucketedScanDisabled
+ *   when true, input partitions are never grouped by bucket (see `getInputPartitions`)
+ */
+case class PaimonScan(
+    table: Table,
+    requiredSchema: StructType,
+    filters: Seq[Predicate],
+    reservedFilters: Seq[Filter],
+    override val pushDownLimit: Option[Int],
+    bucketedScanDisabled: Boolean = false)
+  extends PaimonBaseScan(table, requiredSchema, filters, reservedFilters, pushDownLimit)
+  with SupportsRuntimeFiltering
+  with SupportsReportPartitioning {
+
+  /** Returns a copy of this scan with `bucketedScanDisabled` set to true. */
+  def disableBucketedScan(): PaimonScan = {
+    copy(bucketedScanDisabled = true)
+  }
+
+  /**
+   * The bucket transform of this table, if one can be reported. It is defined only for a
+   * [[FileStoreTable]] in `HASH_FIXED` bucket mode with a single bucket key that resolves to a
+   * field of `requiredSchema`.
+   */
+  @transient
+  private lazy val extractBucketTransform: Option[Transform] = {
+    table match {
+      case fileStoreTable: FileStoreTable =>
+        val bucketSpec = fileStoreTable.bucketSpec()
+        if (bucketSpec.getBucketMode != BucketMode.HASH_FIXED) {
+          None
+        } else if (bucketSpec.getBucketKeys.size() > 1) {
+          None
+        } else {
+          // Spark does not support bucket with several input attributes,
+          // so we only support one bucket key case.
+          assert(bucketSpec.getNumBuckets > 0)
+          assert(bucketSpec.getBucketKeys.size() == 1)
+          val bucketKey = bucketSpec.getBucketKeys.get(0)
+          // The transform is only reported when the bucket key is among the required fields.
+          if (requiredSchema.exists(f => conf.resolver(f.name, bucketKey))) {
+            Some(Expressions.bucket(bucketSpec.getNumBuckets, bucketKey))
+          } else {
+            None
+          }
+        }
+
+      case _ => None
+    }
+  }
+
+  /**
+   * True when bucketed scan has not been disabled on this scan, `conf.v2BucketingEnabled` is
+   * set, and a bucket transform is available.
+   */
+  private def shouldDoBucketedScan: Boolean = {
+    !bucketedScanDisabled && conf.v2BucketingEnabled && extractBucketTransform.isDefined
+  }
+
+  // Since Spark 3.3
+  /**
+   * Reports a [[KeyGroupedPartitioning]] on the bucket transform with
+   * `lazyInputPartitions.size` partitions, or `UnknownPartitioning(0)` when there is no bucket
+   * transform.
+   *
+   * NOTE(review): this depends only on `extractBucketTransform`, not on `shouldDoBucketedScan`,
+   * so key-grouped partitioning is still reported when bucketed scan is disabled - confirm
+   * this is intended.
+   */
+  override def outputPartitioning: Partitioning = {
+    extractBucketTransform
+      .map(bucket => new KeyGroupedPartitioning(Array(bucket), lazyInputPartitions.size))
+      .getOrElse(new UnknownPartitioning(0))
+  }
+
+  /**
+   * Builds the input partitions for the given splits. When bucketed scan applies and every
+   * split is a [[DataSplit]], the splits are grouped by bucket into one
+   * `PaimonBucketedInputPartition` per bucket; otherwise the parent implementation is used.
+   */
+  override def getInputPartitions(splits: Array[Split]): Seq[PaimonInputPartition] = {
+    if (!shouldDoBucketedScan || splits.exists(!_.isInstanceOf[DataSplit])) {
+      return super.getInputPartitions(splits)
+    }
+
+    splits
+      .map(_.asInstanceOf[DataSplit])
+      .groupBy(_.bucket())
+      .map {
+        case (bucket, groupedSplits) =>
+          PaimonBucketedInputPartition(groupedSplits, bucket)
+      }
+      .toSeq
+  }
+
+  // Since Spark 3.2
+  /**
+   * Returns the table's partition keys that are also present in the read type, as field
+   * references.
+   */
+  override def filterAttributes(): Array[NamedReference] = {
+    val requiredFields = readBuilder.readType().getFieldNames.asScala
+    table
+      .partitionKeys()
+      .asScala
+      .toArray
+      .filter(requiredFields.contains)
+      .map(fieldReference)
+  }
+
+  /**
+   * Applies runtime filters. Only `In` filters whose attribute is a partition key are converted
+   * to Paimon predicates; all other filters are ignored.
+   *
+   * NOTE(review): only the first converted predicate (`partitionFilter.head`) is passed to
+   * `readBuilder`; any further partition filters are dropped - confirm this is intended.
+   */
+  override def filter(filters: Array[Filter]): Unit = {
+    val converter = new SparkFilterConverter(table.rowType())
+    val partitionFilter = filters.flatMap {
+      case in @ In(attr, _) if table.partitionKeys().contains(attr) =>
+        Some(converter.convert(in))
+      case _ => None
+    }
+    if (partitionFilter.nonEmpty) {
+      readBuilder.withFilter(partitionFilter.head)
+      // Reset inputPartitions to null so that new splits are fetched with the filter applied.
+      inputPartitions = null
+    }
+  }
+}