
Commit a93d914

yabola authored and dongjoon-hyun committed
[SPARK-41470][SQL] SPJ: Relax the constraint that Storage-Partitioned Join assumes InternalRow implements equals and hashCode
### What changes were proposed in this pull request?

Introduce a new wrapper class for a comparable InternalRow (the row returned by `HasPartitionKey`, together with its data types) and remove `InternalRowSet`, enabling easy `groupBy`, `Set`, `Map`, and other operations.

### Why are the changes needed?

Currently SPJ (Storage-Partitioned Join) assumes that the `InternalRow` returned by `HasPartitionKey` implements equals and hashCode. We should remove this restriction. For example, see the [comments](https://github.com/apache/iceberg/pull/6371/files#r1056852402) in the Iceberg PR suggesting that [StructInternalRow](https://github.com/apache/iceberg/blob/master/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java#L362) should implement equals and hashCode; that is actually not necessary.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Existing tests

Closes apache#39687 from yabola/InternalRow_hashcode.

Authored-by: chenliang.lu <[email protected]>
Signed-off-by: Chao Sun <[email protected]>
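To see the idea concretely, here is a minimal sketch (not part of the patch) of how the wrapper restores value-based `Set` semantics. The rows and data types are illustrative; note that `GenericInternalRow` happens to implement value equality already, the point is that the DSv2 contract does not require a connector's rows to do so:

```scala
import scala.collection.mutable

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.InternalRowComparableWrapper
import org.apache.spark.sql.types.IntegerType

// Two distinct row instances carrying the same partition value.
val left: InternalRow = new GenericInternalRow(Array[Any](1))
val right: InternalRow = new GenericInternalRow(Array[Any](1))

// Wrapping each row with its data types gives a murmur-hash based hashCode
// and a RowOrdering-based equals, so hash collections compare by value.
val dataTypes = Seq(IntegerType)
val set = mutable.HashSet(
  new InternalRowComparableWrapper(left, dataTypes),
  new InternalRowComparableWrapper(right, dataTypes))
assert(set.size == 1) // deduplicated by value, not by reference
```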
1 parent d009d26 commit a93d914

File tree

9 files changed: +149 −179 lines

sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/util/InternalRowSet.scala

Lines changed: 0 additions & 65 deletions
This file was deleted.

sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/util/InternalRowSet.scala

Lines changed: 0 additions & 69 deletions
This file was deleted.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala

Lines changed: 4 additions & 4 deletions
@@ -22,6 +22,7 @@ import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.InternalRowComparableWrapper
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{DataType, IntegerType}
 
@@ -677,9 +678,6 @@ case class KeyGroupedShuffleSpec(
     }
   }
 
-  lazy val ordering: Ordering[InternalRow] =
-    RowOrdering.createNaturalAscendingOrdering(partitioning.expressions.map(_.dataType))
-
  override def numPartitions: Int = partitioning.numPartitions
 
  override def isCompatibleWith(other: ShuffleSpec): Boolean = other match {
@@ -697,7 +695,9 @@ case class KeyGroupedShuffleSpec(
       distribution.clustering.length == otherDistribution.clustering.length &&
         numPartitions == other.numPartitions && areKeysCompatible(otherSpec) &&
         partitioning.partitionValues.zip(otherPartitioning.partitionValues).forall {
-          case (left, right) => ordering.compare(left, right) == 0
+          case (left, right) =>
+            InternalRowComparableWrapper(left, partitioning.expressions)
+              .equals(InternalRowComparableWrapper(right, partitioning.expressions))
         }
     case ShuffleSpecCollection(specs) =>
       specs.exists(isCompatibleWith)
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapper.scala

Lines changed: 84 additions & 0 deletions
This file was added.

@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Expression, Murmur3HashFunction, RowOrdering}
+import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning
+import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition}
+import org.apache.spark.sql.types.{DataType, StructField, StructType}
+
+/**
+ * Wraps an [[InternalRow]] with its corresponding [[DataType]]s so that rows can be
+ * compared by value.
+ * It uses Spark's internal murmur hash to compute the hash code of a row, and uses
+ * [[RowOrdering]] to perform equality checks.
+ *
+ * @param dataTypes the data types for the row
+ */
+class InternalRowComparableWrapper(val row: InternalRow, val dataTypes: Seq[DataType]) {
+
+  private val structType = StructType(dataTypes.map(t => StructField("f", t)))
+  private val ordering = RowOrdering.createNaturalAscendingOrdering(dataTypes)
+
+  override def hashCode(): Int = Murmur3HashFunction.hash(row, structType, 42L).toInt
+
+  override def equals(other: Any): Boolean = {
+    if (!other.isInstanceOf[InternalRowComparableWrapper]) {
+      return false
+    }
+    val otherWrapper = other.asInstanceOf[InternalRowComparableWrapper]
+    if (!otherWrapper.dataTypes.equals(this.dataTypes)) {
+      return false
+    }
+    ordering.compare(row, otherWrapper.row) == 0
+  }
+}
+
+object InternalRowComparableWrapper {
+
+  def apply(
+      partition: InputPartition with HasPartitionKey,
+      partitionExpression: Seq[Expression]): InternalRowComparableWrapper = {
+    new InternalRowComparableWrapper(
+      partition.asInstanceOf[HasPartitionKey].partitionKey(), partitionExpression.map(_.dataType))
+  }
+
+  def apply(
+      partitionRow: InternalRow,
+      partitionExpression: Seq[Expression]): InternalRowComparableWrapper = {
+    new InternalRowComparableWrapper(partitionRow, partitionExpression.map(_.dataType))
+  }
+
+  def mergePartitions(
+      leftPartitioning: KeyGroupedPartitioning,
+      rightPartitioning: KeyGroupedPartitioning,
+      partitionExpression: Seq[Expression]): Seq[InternalRow] = {
+    val partitionDataTypes = partitionExpression.map(_.dataType)
+    val partitionsSet = new mutable.HashSet[InternalRowComparableWrapper]
+    leftPartitioning.partitionValues
+      .map(new InternalRowComparableWrapper(_, partitionDataTypes))
+      .foreach(partition => partitionsSet.add(partition))
+    rightPartitioning.partitionValues
+      .map(new InternalRowComparableWrapper(_, partitionDataTypes))
+      .foreach(partition => partitionsSet.add(partition))
+    partitionsSet.map(_.row).toSeq
+  }
+}
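As a hedged usage sketch (illustrative values, not from the patch), the companion `apply` overloads make grouping partition rows straightforward; only the data types of the expressions matter to the wrapper:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, GenericInternalRow}
import org.apache.spark.sql.catalyst.util.InternalRowComparableWrapper
import org.apache.spark.sql.types.IntegerType

// Hypothetical partition keys, as a connector might report via HasPartitionKey.
val keys: Seq[InternalRow] = Seq(
  new GenericInternalRow(Array[Any](0)),
  new GenericInternalRow(Array[Any](1)),
  new GenericInternalRow(Array[Any](0)))

// Only the data type of each expression contributes to comparison.
val exprs: Seq[Expression] = Seq(AttributeReference("key", IntegerType)())

// groupBy works even if the underlying rows lack equals/hashCode.
val grouped = keys.groupBy(InternalRowComparableWrapper(_, exprs))
assert(grouped.size == 2)
```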

sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala

Lines changed: 22 additions & 3 deletions
@@ -24,6 +24,7 @@ import java.util.OptionalLong
 
 import scala.collection.mutable
 
+import com.google.common.base.Objects
 import org.scalatest.Assertions._
 
 import org.apache.spark.sql.catalyst.InternalRow
@@ -541,13 +542,31 @@ class BufferedRows(val key: Seq[Any] = Seq.empty) extends WriterCommitMessage
 
   def keyString(): String = key.toArray.mkString("/")
 
-  override def partitionKey(): InternalRow = {
-    InternalRow.fromSeq(key)
-  }
+  override def partitionKey(): InternalRow = PartitionInternalRow(key.toArray)
 
   def clear(): Unit = rows.clear()
 }
 
+/**
+ * Theoretically, the [[InternalRow]] returned by [[HasPartitionKey#partitionKey()]]
+ * does not need to implement equals and hashCode methods.
+ * But [[GenericInternalRow]] already implements equals and hashCode. Here we override them
+ * to simulate an implementation that lacks them, in order to verify the code's correctness.
+ */
+case class PartitionInternalRow(keys: Array[Any])
+  extends GenericInternalRow(keys) {
+  override def equals(other: Any): Boolean = {
+    if (!other.isInstanceOf[PartitionInternalRow]) {
+      return false
+    }
+    // Just compare by reference, not by value
+    this.keys == other.asInstanceOf[PartitionInternalRow].keys
+  }
+  override def hashCode: Int = {
+    Objects.hashCode(keys)
+  }
+}
+
 private class BufferedRowsReaderFactory(
     metadataColumnNames: Seq[String],
     nonMetaDataColumns: Seq[StructField],
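A short sketch (not in the patch) of why this helper is a good test double: its `equals` compares the backing arrays by reference, so value-equal keys look distinct, exactly like a connector row without value semantics:

```scala
// Two value-equal keys backed by different Array instances.
val a = PartitionInternalRow(Array[Any](1))
val b = PartitionInternalRow(Array[Any](1))

assert(a != b) // the arrays are compared by reference: not equal
assert(a == a) // same array instance: equal

// Consequently, code that used such rows directly as Set or Map keys would
// treat equal partition values as distinct, which is the bug class that
// InternalRowComparableWrapper guards against.
```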

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala

Lines changed: 21 additions & 20 deletions
@@ -25,10 +25,9 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.physical.{KeyGroupedPartitioning, Partitioning, SinglePartition}
-import org.apache.spark.sql.catalyst.util.InternalRowSet
-import org.apache.spark.sql.catalyst.util.truncatedString
+import org.apache.spark.sql.catalyst.util.{truncatedString, InternalRowComparableWrapper}
 import org.apache.spark.sql.connector.catalog.Table
-import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition, PartitionReaderFactory, Scan, SupportsRuntimeV2Filtering}
+import org.apache.spark.sql.connector.read._
 
 /**
  * Physical plan node for scanning a batch of data from a data source v2.
@@ -80,24 +79,24 @@ case class BatchScanExec(
           "during runtime filtering: not all partitions implement HasPartitionKey after " +
           "filtering")
       }
-
-      val newRows = new InternalRowSet(p.expressions.map(_.dataType))
-      newRows ++= newPartitions.map(_.asInstanceOf[HasPartitionKey].partitionKey())
-
-      val oldRows = p.partitionValues.toSet
-      // We require the new number of partition keys to be equal or less than the old number
-      // of partition keys here. In the case of less than, empty partitions will be added for
-      // those missing keys that are not present in the new input partitions.
-      if (oldRows.size < newRows.size) {
+      val newPartitionValues = newPartitions.map(partition =>
+        InternalRowComparableWrapper(partition.asInstanceOf[HasPartitionKey], p.expressions))
+        .toSet
+      val oldPartitionValues = p.partitionValues
+        .map(partition => InternalRowComparableWrapper(partition, p.expressions)).toSet
+      // We require the new number of partition values to be equal or less than the old number
+      // of partition values here. In the case of less than, empty partitions will be added for
+      // those missing values that are not present in the new input partitions.
+      if (oldPartitionValues.size < newPartitionValues.size) {
         throw new SparkException("During runtime filtering, data source must either report " +
-          "the same number of partition keys, or a subset of partition keys from the " +
-          s"original. Before: ${oldRows.size} partition keys. After: ${newRows.size} " +
-          "partition keys")
+          "the same number of partition values, or a subset of partition values from the " +
+          s"original. Before: ${oldPartitionValues.size} partition values. " +
+          s"After: ${newPartitionValues.size} partition values")
       }
 
-      if (!newRows.forall(oldRows.contains)) {
+      if (!newPartitionValues.forall(oldPartitionValues.contains)) {
         throw new SparkException("During runtime filtering, data source must not report new " +
-          "partition keys that are not present in the original partitioning.")
+          "partition values that are not present in the original partitioning.")
       }
 
       groupPartitions(newPartitions).get.map(_._2)
@@ -132,11 +131,13 @@ case class BatchScanExec(
 
     outputPartitioning match {
       case p: KeyGroupedPartitioning =>
-        val partitionMapping = finalPartitions.map(s =>
-          s.head.asInstanceOf[HasPartitionKey].partitionKey() -> s).toMap
+        val partitionMapping = finalPartitions.map(s => InternalRowComparableWrapper(
+          s.head.asInstanceOf[HasPartitionKey], p.expressions) -> s)
+          .toMap
        finalPartitions = p.partitionValues.map { partValue =>
          // Use empty partition for those partition values that are not present
-          partitionMapping.getOrElse(partValue, Seq.empty)
+          partitionMapping.getOrElse(
+            InternalRowComparableWrapper(partValue, p.expressions), Seq.empty)
        }
      case _ =>
    }
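The runtime-filtering validation above boils down to a subset check over wrapped partition values. A condensed sketch of the invariant, with hypothetical helper names:

```scala
import org.apache.spark.SparkException
import org.apache.spark.sql.catalyst.util.InternalRowComparableWrapper

// After runtime filtering, the data source may drop partition values but must
// not invent new ones; dropped values are later backed by empty partitions.
def validateFilteredPartitions(
    oldValues: Set[InternalRowComparableWrapper],
    newValues: Set[InternalRowComparableWrapper]): Unit = {
  if (oldValues.size < newValues.size) {
    throw new SparkException("Data source must not report more partition values than before")
  }
  if (!newValues.forall(oldValues.contains)) {
    throw new SparkException("Data source must not report new partition values")
  }
}
```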
