Skip to content

Commit c8ff41d

Browse files
allisonwang-db and dongjoon-hyun
authored and committed
[SPARK-40149][SQL][FOLLOWUP] Avoid adding extra Project in AddMetadataColumns
This PR is a follow-up for apache#37758. It updates the rule `AddMetadataColumns` to avoid introducing extra `Project`. To fix an issue introduced by apache#37758. ```sql -- t1: [key, value] t2: [key, value] select t1.key, t2.key from t1 full outer join t2 using (key) ``` Before this PR, the rule `AddMetadataColumns` will add a new Project between the using join and the select list: ``` Project [key, key] +- Project [key, key, key, key] <--- extra project +- Project [coalesce(key, key) AS key, value, value, key, key] +- Join FullOuter, (key = key) :- LocalRelation <empty>, [key#0, value#0] +- LocalRelation <empty>, [key#0, value#0] ``` After this PR, this extra Project will be removed. No Add a new UT. Closes apache#39895 from allisonwang-db/spark-40149-follow-up. Authored-by: allisonwang-db <[email protected]> Signed-off-by: Wenchen Fan <[email protected]> (cherry picked from commit 286d336) Signed-off-by: Wenchen Fan <[email protected]>
1 parent 7d2d995 commit c8ff41d

File tree

2 files changed

+31
-14
lines changed

2 files changed

+31
-14
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -992,7 +992,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
992992
if (metaCols.isEmpty) {
993993
node
994994
} else {
995-
val newNode = addMetadataCol(node, metaCols.map(_.exprId).toSet)
995+
val newNode = node.mapChildren(addMetadataCol(_, metaCols.map(_.exprId).toSet))
996996
// We should not change the output schema of the plan. We should project away the extra
997997
// metadata columns if necessary.
998998
if (newNode.sameOutput(node)) {

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
3636
import org.apache.spark.sql.catalyst.expressions._
3737
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Count, Sum}
3838
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan
39-
import org.apache.spark.sql.catalyst.plans.{Cross, Inner}
39+
import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, UsingJoin}
4040
import org.apache.spark.sql.catalyst.plans.logical._
4141
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning}
4242
import org.apache.spark.sql.catalyst.util._
@@ -1359,6 +1359,20 @@ class AnalysisSuite extends AnalysisTest with Matchers {
13591359
parsePlan("SELECT c FROM a WHERE c < 20"))
13601360
}
13611361

1362+
test("SPARK-41489: type of filter expression should be a bool") {
1363+
assertAnalysisErrorClass(parsePlan(
1364+
s"""
1365+
|WITH t1 as (SELECT 1 user_id)
1366+
|SELECT *
1367+
|FROM t1
1368+
|WHERE 'true'""".stripMargin),
1369+
expectedErrorClass = "DATATYPE_MISMATCH.FILTER_NOT_BOOLEAN",
1370+
expectedMessageParameters = Map(
1371+
"sqlExpr" -> "\"true\"", "filter" -> "\"true\"", "type" -> "\"STRING\"")
1372+
,
1373+
queryContext = Array(ExpectedContext("SELECT *\nFROM t1\nWHERE 'true'", 31, 59)))
1374+
}
1375+
13621376
test("SPARK-38591: resolve left and right CoGroup sort order on respective side only") {
13631377
def func(k: Int, left: Iterator[Int], right: Iterator[Int]): Iterator[Int] = {
13641378
Iterator.empty
@@ -1403,17 +1417,20 @@ class AnalysisSuite extends AnalysisTest with Matchers {
14031417
}
14041418
}
14051419

1406-
test("SPARK-41489: type of filter expression should be a bool") {
1407-
assertAnalysisErrorClass(parsePlan(
1408-
s"""
1409-
|WITH t1 as (SELECT 1 user_id)
1410-
|SELECT *
1411-
|FROM t1
1412-
|WHERE 'true'""".stripMargin),
1413-
expectedErrorClass = "DATATYPE_MISMATCH.FILTER_NOT_BOOLEAN",
1414-
expectedMessageParameters = Map(
1415-
"sqlExpr" -> "\"true\"", "filter" -> "\"true\"", "type" -> "\"STRING\"")
1416-
,
1417-
queryContext = Array(ExpectedContext("SELECT *\nFROM t1\nWHERE 'true'", 31, 59)))
1420+
test("SPARK-40149: add metadata column with no extra project") {
1421+
val t1 = LocalRelation($"key".int, $"value".string).as("t1")
1422+
val t2 = LocalRelation($"key".int, $"value".string).as("t2")
1423+
val query =
1424+
Project(Seq($"t1.key", $"t2.key"),
1425+
Join(t1, t2, UsingJoin(FullOuter, Seq("key")), None, JoinHint.NONE))
1426+
checkAnalysis(
1427+
query,
1428+
Project(Seq($"t1.key", $"t2.key"),
1429+
Project(Seq(coalesce($"t1.key", $"t2.key").as("key"),
1430+
$"t1.value", $"t2.value", $"t1.key", $"t2.key"),
1431+
Join(t1, t2, FullOuter, Some($"t1.key" === $"t2.key"), JoinHint.NONE)
1432+
)
1433+
).analyze
1434+
)
14181435
}
14191436
}

0 commit comments

Comments (0)