apache · dengziming · Jul 28, 2025 · Jul 29, 2025 · Jul 29, 2025 · Jul 29, 2025
diff --git a/.../src/test/scala/org/apache/spark/sql/jdbc/v2/join/MySQLJoinPushdownIntegrationSuite.scala b/.../src/test/scala/org/apache/spark/sql/jdbc/v2/join/MySQLJoinPushdownIntegrationSuite.scala
@@ -43,6 +43,8 @@ class MySQLJoinPushdownIntegrationSuite
 
   override def caseConvert(identifier: String): String = identifier.toUpperCase(Locale.ROOT)
 
+  override def remainColumnCase(identifier: String): String = s"`$identifier`"
+
   // This method comes from DockerJDBCIntegrationSuite
   override def dataPreparation(connection: Connection): Unit = {
     super.dataPreparation()

diff --git a/...src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/...src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources.v2
 
+import java.util.Locale
+
 import scala.collection.mutable
 
 import org.apache.spark.internal.LogKeys.{AGGREGATE_FUNCTIONS, GROUP_BY_EXPRS, POST_SCAN_FILTERS, PUSHED_FILTERS, RELATION_NAME, RELATION_OUTPUT}
@@ -137,42 +139,11 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
         // Cross joins are not supported because they increase the amount of data.
         condition.isDefined &&
         lBuilder.isOtherSideCompatibleForJoin(rBuilder) =>
-      val leftSideRequiredColumnNames = getRequiredColumnNames(leftProjections, leftHolder)
-      val rightSideRequiredColumnNames = getRequiredColumnNames(rightProjections, rightHolder)
-
-      // Alias the duplicated columns from left side of the join. We are creating the
-      // Map[String, Int] to tell how many times each column name has occured within one side.
-      val leftSideNameCounts: Map[String, Int] =
-        leftSideRequiredColumnNames.groupBy(identity).view.mapValues(_.size).toMap
-      val rightSideNameCounts: Map[String, Int] =
-        rightSideRequiredColumnNames.groupBy(identity).view.mapValues(_.size).toMap
-      // It's more performant to call contains on Set than on Seq
-      val rightSideColumnNamesSet = rightSideRequiredColumnNames.toSet
-
-      val leftSideRequiredColumnsWithAliases = leftSideRequiredColumnNames.map { name =>
-        val aliasName =
-          if (leftSideNameCounts(name) > 1 || rightSideColumnNamesSet.contains(name)) {
-            generateJoinOutputAlias(name)
-          } else {
-            null
-          }
-
-        new SupportsPushDownJoin.ColumnWithAlias(name, aliasName)
-      }
-
-      // Aliasing of duplicated columns in right side is done only if there are duplicates in
-      // right side only. There won't be a conflict with left side columns because they are
-      // already aliased.
-      val rightSideRequiredColumnsWithAliases = rightSideRequiredColumnNames.map { name =>
-        val aliasName =
-          if (rightSideNameCounts(name) > 1) {
-            generateJoinOutputAlias(name)
-          } else {
-            null
-          }
-
-        new SupportsPushDownJoin.ColumnWithAlias(name, aliasName)
-      }
+      // Process left and right columns in original order
+      val (leftSideRequiredColumnsWithAliases, rightSideRequiredColumnsWithAliases) =
+        generateColumnAliasesForDuplicatedName(
+          getRequiredColumnNames(leftProjections, leftHolder),
+          getRequiredColumnNames(rightProjections, rightHolder))
 
       // Create the AttributeMap that holds (Attribute -> Attribute with up to date name) mapping.
       val pushedJoinOutputMap = AttributeMap[Expression](
@@ -225,11 +196,80 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
         node
       }
   }
+  /**
+   * Generates unique column aliases for join operations to avoid naming conflicts.
+   * Comparison is case-insensitive to handle differing databases (SQL Server, MySQL, etc.).
+   *
+   * @param leftSideRequiredColumnNames  Column names from the left side (processed first)
+   * @param rightSideRequiredColumnNames Column names from the right side (processed second)
+   * @return Tuple of (leftColumnsWithAliases, rightColumnsWithAliases); null alias means no rename
+   */
+  private[v2] def generateColumnAliasesForDuplicatedName(
+    leftSideRequiredColumnNames: Array[String],
+    rightSideRequiredColumnNames: Array[String]
+  ): (Array[SupportsPushDownJoin.ColumnWithAlias],
+    Array[SupportsPushDownJoin.ColumnWithAlias]) = {
+    // Normalize all column names to lowercase for case-insensitive comparison
+    val normalizeCase: String => String = _.toLowerCase(Locale.ROOT)
+
+    // Count occurrences of each column name (case-insensitive) across both sides
+    val allRequiredColumnNames = leftSideRequiredColumnNames ++ rightSideRequiredColumnNames
+    val allNameCounts: Map[String, Int] =
+      allRequiredColumnNames.map(normalizeCase)
+        .groupBy(identity)
+        .view
+        .mapValues(_.length)
+        .toMap
+
+    // Track claimed aliases using normalized names (a Set gives O(1) lookups).
+    // Names that appear only once are claimed up front so that they always keep
+    // their original name and no generated alias can collide with them.
+    val allClaimedAliases = mutable.Set.from(
+      allNameCounts.filter(_._2 == 1).keys
+    )
+
+    // Next suffix index to try for each normalized base name (defaults to 0, meaning "try the
+    // original name"); resuming from it avoids the O(n^2) worst case of alias generation.
+    val aliasSuffixIndex = mutable.HashMap[String, Int]().withDefaultValue(0)
+
+    def processColumn(originalName: String): SupportsPushDownJoin.ColumnWithAlias = {
+      val normalizedName = normalizeCase(originalName)
+
+      // No alias needed for unique column names
+      if (allNameCounts(normalizedName) == 1) {
+        new SupportsPushDownJoin.ColumnWithAlias(originalName, null)
+      } else {
+        var attempt = aliasSuffixIndex(normalizedName)
+        var candidate = if (attempt == 0) originalName else s"${originalName}_$attempt"
+        var normalizedCandidate = normalizeCase(candidate)
+
+        // Find the first unclaimed alias: try the original name on the first attempt, then
+        // append an increasing numeric suffix ("_1", "_2", ...) on later attempts.
+        while (allClaimedAliases.contains(normalizedCandidate)) {
+          attempt += 1
+          candidate = s"${originalName}_$attempt"
+          normalizedCandidate = normalizeCase(candidate)
+        }
+
+        // Record the next suffix to try for this name and claim the chosen alias
+        aliasSuffixIndex(normalizedName) = attempt + 1
+        allClaimedAliases.add(normalizedCandidate)
 
-  def generateJoinOutputAlias(name: String): String =
-    s"${name}_${java.util.UUID.randomUUID().toString.replace("-", "_")}"
+        if (originalName == candidate) {
+          new SupportsPushDownJoin.ColumnWithAlias(originalName, null)
+        } else {
+          new SupportsPushDownJoin.ColumnWithAlias(originalName, candidate)
+        }
+      }
+    }
+
+    (
+      leftSideRequiredColumnNames.map(processColumn),
+      rightSideRequiredColumnNames.map(processColumn)
+    )
+  }
 
-  // projections' names are maybe not up to date if the joins have been previously pushed down.
+  // Projections' names may not be up to date if the joins have been previously pushed down.
   // For this reason, we need to use pushedJoinOutputMap to get up to date names.
   def getRequiredColumnNames(
       projections: Seq[NamedExpression],

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourcePushdownTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourcePushdownTestUtils.scala
@@ -182,9 +182,7 @@ trait DataSourcePushdownTestUtils extends ExplainSuiteHelper {
 
           assert(dfSchema.length == schema.length)
           dfSchema.fields.zip(schema.fields).foreach { case (f1, f2) =>
-            if (f2.name.nonEmpty) {
-              assert(f1.name == f2.name)
-            }
+            assert(f1.name == f2.name)
             assert(f1.dataType == f2.dataType)
             assert(f1.nullable == f2.nullable)
           }

diff --git a/.../org/apache/spark/sql/execution/datasources/v2/DSV2JoinPushDownAliasGenerationSuite.scala b/.../org/apache/spark/sql/execution/datasources/v2/DSV2JoinPushDownAliasGenerationSuite.scala
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.v2
+
+import java.util.Locale
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.connector.read.SupportsPushDownJoin.ColumnWithAlias
+
+class DSV2JoinPushDownAliasGenerationSuite extends SparkFunSuite {
+
+  private def assertAliases(
+    leftInput: Array[String],
+    rightInput: Array[String],
+    expectedLeft: Array[ColumnWithAlias],
+    expectedRight: Array[ColumnWithAlias]
+  ): Unit = {
+    val (actualLeft, actualRight) = V2ScanRelationPushDown
+      .generateColumnAliasesForDuplicatedName(leftInput, rightInput)
+
+    // Every output name must be unique ignoring case: take the alias if set, otherwise the
+    // column name, and lowercase it in both cases so that e.g. "ID" and "id" count as a clash.
+    val uniqName: ColumnWithAlias => String =
+      col => Option(col.alias()).getOrElse(col.colName()).toLowerCase(Locale.ROOT)
+    assert((actualLeft ++ actualRight).map(uniqName).distinct.length
+      == actualLeft.length + actualRight.length)
+
+    assert(
+      actualLeft === expectedLeft,
+      s"""Left side aliases mismatch.
+         |Expected: ${expectedLeft.map(_.alias()).mkString(", ")}
+         |Actual: ${actualLeft.map(_.alias()).mkString(", ")}""".stripMargin
+    )
+
+    assert(
+      actualRight === expectedRight,
+      s"""Right side aliases mismatch.
+         |Expected: ${expectedRight.map(_.alias()).mkString(", ")}
+         |Actual: ${actualRight.map(_.alias()).mkString(", ")}""".stripMargin
+    )
+  }
+
+  test("Basic case with no duplicate column names") {
+    assertAliases(
+      leftInput = Array("id", "name"),
+      rightInput = Array("email", "phone"),
+      expectedLeft = Array(
+        new ColumnWithAlias("id", null),
+        new ColumnWithAlias("name", null)
+      ),
+      expectedRight = Array(
+        new ColumnWithAlias("email", null),
+        new ColumnWithAlias("phone", null)
+      )
+    )
+  }
+
+  test("Extreme duplication scenarios") {
+    assertAliases(
+      leftInput = Array("id", "id", "id"),
+      rightInput = Array("id", "id"),
+      expectedLeft = Array(
+        new ColumnWithAlias("id", null),
+        new ColumnWithAlias("id", "id_1"),
+        new ColumnWithAlias("id", "id_2")
+      ),
+      expectedRight = Array(
+        new ColumnWithAlias("id", "id_3"),
+        new ColumnWithAlias("id", "id_4")
+      )
+    )
+  }
+
+  test("Exact duplicate column names") {
+    assertAliases(
+      leftInput = Array("id", "name"),
+      rightInput = Array("id", "name"),
+      expectedLeft = Array(
+        new ColumnWithAlias("id", null),
+        new ColumnWithAlias("name", null)
+      ),
+      expectedRight = Array(
+        new ColumnWithAlias("id", "id_1"),
+        new ColumnWithAlias("name", "name_1")
+      )
+    )
+  }
+
+  test("Columns with numeric suffixes (id vs id_1)") {
+    assertAliases(
+      leftInput = Array("id", "id_1", "name"),
+      rightInput = Array("id", "name", "value"),
+      expectedLeft = Array(
+        new ColumnWithAlias("id", null),
+        new ColumnWithAlias("id_1", null),
+        new ColumnWithAlias("name", null)
+      ),
+      expectedRight = Array(
+        new ColumnWithAlias("id", "id_2"),
+        new ColumnWithAlias("name", "name_1"),
+        new ColumnWithAlias("value", null)
+      )
+    )
+  }
+
+  test("Case-sensitive conflicts (ID vs id)") {
+    assertAliases(
+      leftInput = Array("ID", "Name"),
+      rightInput = Array("id", "name"),
+      expectedLeft = Array(
+        new ColumnWithAlias("ID", null),
+        new ColumnWithAlias("Name", null)
+      ),
+      expectedRight = Array(
+        new ColumnWithAlias("id", "id_1"),
+        new ColumnWithAlias("name", "name_1")
+      )
+    )
+  }
+
+  test("Mixed case and numeric suffixes") {
+    assertAliases(
+      leftInput = Array("UserID", "user_id", "user_id_1"),
+      rightInput = Array("userId", "USER_ID", "user_id_2"),
+      expectedLeft = Array(
+        new ColumnWithAlias("UserID", null),
+        new ColumnWithAlias("user_id", null),
+        new ColumnWithAlias("user_id_1", null)
+      ),
+      expectedRight = Array(
+        new ColumnWithAlias("userId", "userId_1"),
+        new ColumnWithAlias("USER_ID", "USER_ID_3"),
+        new ColumnWithAlias("user_id_2", null)
+      )
+    )
+  }
+}