From fcdf0f0daff0525ef56a80b0a7661c1d18bd89d9 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Wed, 17 May 2023 14:54:38 +0800 Subject: [PATCH 01/20] Spark 3.4: Support distribute by any predefined transform --- .../ClusterShardByTransformSuite.scala | 98 +++++++++++++++++++ .../WriteDistributionAndOrderingSuite.scala | 8 +- .../spark/sql/clickhouse/ExprUtils.scala | 60 ++++++++++-- .../xenon/clickhouse/ClickHouseCatalog.scala | 10 +- .../xenon/clickhouse/ClickHouseTable.scala | 28 +++--- .../clickhouse/func/ClickHouseXxHash64.scala | 4 +- .../clickhouse/func/FunctionRegistry.scala | 22 ++++- .../scala/xenon/clickhouse/func/Months.scala | 47 +++++++++ .../clickhouse/write/ClickHouseWriter.scala | 39 +++++++- .../write/WriteJobDescription.scala | 13 +-- .../clickhouse/FunctionRegistrySuite.scala | 73 ++++++++++++++ 11 files changed, 361 insertions(+), 41 deletions(-) create mode 100644 spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala create mode 100644 spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala new file mode 100644 index 00000000..fce0f77e --- /dev/null +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.clickhouse.cluster + +import org.apache.spark.SparkConf +import org.apache.spark.sql.Row + +class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { + override protected def sparkConf: SparkConf = { + val _conf = super.sparkConf + .set("spark.clickhouse.write.distributed.convertLocal", "true") + _conf + } + + def runTest(func_name: String, func_args: Array[String]): Unit = { + val func_expr = s"$func_name(${func_args.mkString(",")})" + val cluster = "single_replica" + val db = s"db_${func_name}_shard" + val tbl_dist = s"tbl_${func_name}_shard" + val tbl_local = s"${tbl_dist}_local" + + try { + runClickHouseSQL(s"CREATE DATABASE IF NOT EXISTS $db ON CLUSTER $cluster") + + spark.sql( + s"""CREATE TABLE $db.$tbl_local ( + | create_time TIMESTAMP NOT NULL, + | value STRING NOT NULL + |) USING ClickHouse + |TBLPROPERTIES ( + | cluster = '$cluster', + | engine = 'MergeTree()', + | order_by = 'create_time' + |) + |""".stripMargin + ) + + runClickHouseSQL( + s"""CREATE TABLE $db.$tbl_dist ON CLUSTER $cluster + |AS $db.$tbl_local + |ENGINE = Distributed($cluster, '$db', '$tbl_local', $func_expr) + |""".stripMargin + ) + spark.sql( + s"""INSERT INTO `$db`.`$tbl_dist` + |VALUES + | (timestamp'2021-01-01 10:10:10', '1'), + | (timestamp'2022-02-02 10:10:10', '2'), + | (timestamp'2023-03-03 10:10:10', '3'), + | (timestamp'2024-04-04 10:10:10', '4') AS tab(create_time, value) + |""".stripMargin + ) + // check that data is indeed written + checkAnswer( + spark.table(s"$db.$tbl_dist").select("value").orderBy("create_time"), + Seq(Row("1"), Row("2"), Row("3"), Row("4")) + ) + + // check same data is sharded in the same server comparing native sharding + runClickHouseSQL( + s"""INSERT INTO `$db`.`$tbl_dist` + |VALUES + | (timestamp'2021-01-01 10:10:10', '1'), + | (timestamp'2022-02-02 10:10:10', '2'), + | (timestamp'2023-03-03 10:10:10', '3'), + | (timestamp'2024-04-04 10:10:10', '4') + |""".stripMargin + ) + checkAnswer( + spark.table(s"$db.$tbl_local") + .groupBy("value").count().filter("count != 2"), + Seq.empty + ) + + } finally { + runClickHouseSQL(s"DROP TABLE IF EXISTS $db.$tbl_dist ON CLUSTER $cluster") + runClickHouseSQL(s"DROP TABLE IF EXISTS $db.$tbl_local ON CLUSTER $cluster") + runClickHouseSQL(s"DROP DATABASE IF EXISTS $db ON CLUSTER $cluster") + } + } + + Seq(("xxHash64", Array("value")), ("toYYYYMM", Array("create_time"))).foreach { case (func_name, func_args) => + test(s"shard by $func_name")(runTest(func_name, func_args)) + } + +} diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala index fe9ba535..7fc0972d 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala @@ -78,12 +78,8 @@ class WriteDistributionAndOrderingSuite extends SparkClickHouseSingleTest { WRITE_REPARTITION_BY_PARTITION.key -> repartitionByPartition.toString, WRITE_LOCAL_SORT_BY_KEY.key -> localSortByKey.toString ) { - if (!ignoreUnsupportedTransform && repartitionByPartition) { - intercept[AnalysisException](write()) - } else { - write() - check() - } + write() + check() } Seq(true, false).foreach { ignoreUnsupportedTransform => diff --git 
a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 314c65f3..d7116cc9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -16,18 +16,21 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression} +import org.apache.spark.sql.catalyst.analysis.NoSuchFunctionException +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, TransformExpression} import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.IGNORE_UNSUPPORTED_TRANSFORM +import org.apache.spark.sql.connector.catalog.Identifier +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.connector.expressions.Expressions._ import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, _} import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.expr._ +import xenon.clickhouse.func.FunctionRegistry -import scala.annotation.tailrec import scala.util.{Failure, Success, Try} -object ExprUtils extends SQLConfHelper { +class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray @@ -47,7 +50,28 @@ object ExprUtils extends SQLConfHelper { toSparkTransformOpt(expr).map(trans => Expressions.sort(trans, direction, nullOrder)) }.toArray - @tailrec + private def loadV2FunctionOpt( + name: String, + args: Seq[Expression] + ): Option[BoundFunction] = { + def loadFunction(ident: Identifier): UnboundFunction = + functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident)) + val inputType = StructType(args.zipWithIndex.map { + case (exp, pos) => StructField(s"_$pos", exp.dataType, exp.nullable) + }) + try { + val unbound = loadFunction(Identifier.of(Array.empty, name)) + Some(unbound.bind(inputType)) + } catch { + case e: NoSuchFunctionException => + throw e + case _: UnsupportedOperationException if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => + None + case e: UnsupportedOperationException => + throw new AnalysisException(e.getMessage, cause = Some(e)) + } + } + def toCatalyst(v2Expr: V2Expression, fields: Array[StructField]): Expression = v2Expr match { case IdentityTransform(ref) => toCatalyst(ref, fields) @@ -57,8 +81,15 @@ object ExprUtils extends SQLConfHelper { .find { case (field, _) => field.name == ref.fieldNames.head } .getOrElse(throw CHClientException(s"Invalid field reference: $ref")) BoundReference(ordinal, field.dataType, field.nullable) + case t: Transform => + val catalystArgs = t.arguments().map(toCatalyst(_, fields)) + loadV2FunctionOpt(t.name(), catalystArgs).map { bound => + TransformExpression(bound, catalystArgs) + }.getOrElse { + throw CHClientException(s"Unsupported expression: $v2Expr") + } case _ => throw CHClientException( - s"Unsupported V2 expression: $v2Expr, SPARK-33779: Spark 3.3 only support IdentityTransform" + s"Unsupported expression: $v2Expr" ) } @@ -83,10 +114,10 @@ 
object ExprUtils extends SQLConfHelper { case FuncExpr("toYYYYMMDD", List(FieldRef(col))) => days(col) case FuncExpr("toHour", List(FieldRef(col))) => hours(col) case FuncExpr("HOUR", List(FieldRef(col))) => hours(col) - // TODO support arbitrary functions - // case FuncExpr("xxHash64", List(FieldRef(col))) => apply("ck_xx_hash64", column(col)) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) + case FuncExpr(funName, List(FieldRef(col))) if functionRegistry.getFuncMappingByCk.contains(funName) => + apply(functionRegistry.getFuncMappingByCk(funName), column(col)) case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") } @@ -96,7 +127,8 @@ object ExprUtils extends SQLConfHelper { case DaysTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMMDD", List(FieldRef(col))) case HoursTransform(FieldReference(Seq(col))) => FuncExpr("toHour", List(FieldRef(col))) case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) - case ApplyTransform(name, args) => FuncExpr(name, args.map(arg => SQLExpr(arg.describe())).toList) + case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => + FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket") case other: Transform => throw CHClientException(s"Unsupported transform: $other") } @@ -113,8 +145,18 @@ object ExprUtils extends SQLConfHelper { case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) - case ckXxhHash64 @ ApplyTransform("ck_xx_hash64", _) => StructField(ckXxhHash64.toString, LongType) + case t @ ApplyTransform(transformName, _) => + val resType = + functionRegistry.load(transformName).getOrElse(throw new NoSuchFunctionException(transformName)) match { + case f: ScalarFunction[_] => f.resultType() + case other => throw CHClientException(s"Unsupported function: $other") + } + StructField(t.toString, resType) case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket") case other: Transform => throw CHClientException(s"Unsupported transform: $other") } } + +object ExprUtils { + def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry) +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index 02862392..b625560d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -85,12 +85,15 @@ class ClickHouseCatalog extends TableCatalog val dynamicFunctionRegistry = new DynamicFunctionRegistry val xxHash64ShardFunc = new ClickHouseXxHash64Shard(clusterSpecs) + val monthsFunc = new Months() dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc) + dynamicFunctionRegistry.register("months", monthsFunc) this.functionRegistry = new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) log.info(s"Detect ${clusterSpecs.size} ClickHouse 
clusters: ${clusterSpecs.map(_.name).mkString(",")}") log.info(s"ClickHouse clusters' detail: $clusterSpecs") + log.info(s"functionRegistry: ${this.functionRegistry.list.mkString(",")}") } override def name(): String = catalogName @@ -141,7 +144,8 @@ class ClickHouseCatalog extends TableCatalog tableClusterSpec, _tz, tableSpec, - tableEngineSpec + tableEngineSpec, + functionRegistry ) } @@ -206,7 +210,7 @@ class ClickHouseCatalog extends TableCatalog val partitionsClause = partitions match { case transforms if transforms.nonEmpty => - transforms.map(ExprUtils.toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")") + transforms.map(ExprUtils(functionRegistry).toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")") case _ => "" } @@ -297,7 +301,7 @@ class ClickHouseCatalog extends TableCatalog } tableOpt match { case None => false - case Some(ClickHouseTable(_, cluster, _, tableSpec, _)) => + case Some(ClickHouseTable(_, cluster, _, tableSpec, _, _)) => val (db, tbl) = (tableSpec.database, tableSpec.name) val isAtomic = loadNamespaceMetadata(Array(db)).get("engine").equalsIgnoreCase("atomic") val syncClause = if (isAtomic) "SYNC" else "" diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala index 59b3ca9f..f4e19071 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala @@ -14,16 +14,12 @@ package xenon.clickhouse -import java.lang.{Integer => JInt, Long => JLong} -import java.time.{LocalDate, ZoneId} -import java.util -import scala.collection.JavaConverters._ -import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.clickhouse.{ExprUtils, ReadOptions, WriteOptions} +import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.{READ_DISTRIBUTED_CONVERT_LOCAL, USE_NULLABLE_QUERY_SCHEMA} -import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.clickhouse.{ExprUtils, ReadOptions, WriteOptions} import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.read.ScanBuilder import org.apache.spark.sql.connector.write.LogicalWriteInfo @@ -34,16 +30,23 @@ import org.apache.spark.unsafe.types.UTF8String import xenon.clickhouse.Utils._ import xenon.clickhouse.client.NodeClient import xenon.clickhouse.expr.{Expr, OrderExpr} +import xenon.clickhouse.func.FunctionRegistry import xenon.clickhouse.read.{ClickHouseMetadataColumn, ClickHouseScanBuilder, ScanJobDescription} import xenon.clickhouse.spec._ import xenon.clickhouse.write.{ClickHouseWriteBuilder, WriteJobDescription} +import java.lang.{Integer => JInt, Long => JLong} +import java.time.{LocalDate, ZoneId} +import java.util +import scala.collection.JavaConverters._ + case class ClickHouseTable( node: NodeSpec, cluster: Option[ClusterSpec], implicit val tz: ZoneId, spec: TableSpec, - engineSpec: TableEngineSpec + engineSpec: TableEngineSpec, + functionRegistry: FunctionRegistry ) extends Table with SupportsRead with SupportsWrite @@ -130,10 +133,12 @@ case class ClickHouseTable( private lazy val metadataSchema: StructType = 
StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField)) - override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey) + override lazy val partitioning: Array[Transform] = ExprUtils(functionRegistry).toSparkPartitions(partitionKey) override lazy val partitionSchema: StructType = StructType( - partitioning.map(partTransform => ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)) + partitioning.map(partTransform => + ExprUtils(functionRegistry).inferTransformSchema(schema, metadataSchema, partTransform) + ) ) override lazy val properties: util.Map[String, String] = spec.toJavaMap @@ -170,7 +175,8 @@ case class ClickHouseTable( shardingKey = shardingKey, partitionKey = partitionKey, sortingKey = sortingKey, - writeOptions = new WriteOptions(info.options.asCaseSensitiveMap()) + writeOptions = new WriteOptions(info.options.asCaseSensitiveMap()), + functionRegistry = functionRegistry ) new ClickHouseWriteBuilder(writeJob) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala index e7f223b0..dab34932 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala @@ -26,12 +26,14 @@ import xenon.clickhouse.spec.{ClusterSpec, ShardUtils} * select xxHash64(concat(project_id, toString(seq)) * }}} */ -object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] { +object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { override def name: String = "clickhouse_xxHash64" override def canonicalName: String = s"clickhouse.$name" + override val ckFuncNames: Array[String] = Array("xxHash64") + override def description: String = s"$name: (value: string) => hash_value: long" override def bind(inputType: StructType): BoundFunction = inputType.fields match { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index b41a7d1a..c10ce864 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -18,11 +18,19 @@ import org.apache.spark.sql.connector.catalog.functions.UnboundFunction import scala.collection.mutable -trait FunctionRegistry { +trait FunctionRegistry extends Serializable { def list: Array[String] def load(name: String): Option[UnboundFunction] + + def getFuncMappingBySpark: Map[String, String] + + def getFuncMappingByCk: Map[String, String] = getFuncMappingBySpark.map(_.swap) +} + +trait ClickhouseEquivFunction { + val ckFuncNames: Array[String] } class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends FunctionRegistry { @@ -30,6 +38,8 @@ class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends Fun override def list: Array[String] = registries.flatMap(_.list) override def load(name: String): Option[UnboundFunction] = registries.flatMap(_.load(name)).headOption + + override def getFuncMappingBySpark: Map[String, String] = registries.flatMap(_.getFuncMappingBySpark).toMap } object StaticFunctionRegistry extends FunctionRegistry { @@ -42,6 +52,11 @@ object StaticFunctionRegistry 
extends FunctionRegistry { override def list: Array[String] = functions.keys.toArray override def load(name: String): Option[UnboundFunction] = functions.get(name) + + override val getFuncMappingBySpark: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) + } } class DynamicFunctionRegistry extends FunctionRegistry { @@ -56,4 +71,9 @@ class DynamicFunctionRegistry extends FunctionRegistry { override def list: Array[String] = functions.keys.toArray override def load(name: String): Option[UnboundFunction] = functions.get(name) + + override def getFuncMappingBySpark: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) + }.toMap } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala new file mode 100644 index 00000000..d3f40814 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ + +import java.sql.Timestamp + +class Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "months" + + override def canonicalName: String = s"months" + + override val ckFuncNames: Array[String] = Array("toYYYYMM") + + override def description: String = s"$name: (time: timestamp) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, TimestampType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(TimestampType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(time: Long): Int = { + val ts = new Timestamp(time / 1000).toLocalDateTime + ts.getYear * 100 + ts.getMonthValue + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index d18319e5..07733442 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -17,9 +17,10 @@ package xenon.clickhouse.write import com.clickhouse.client.ClickHouseProtocol import com.clickhouse.data.ClickHouseCompression import org.apache.commons.io.IOUtils -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection} -import org.apache.spark.sql.catalyst.{expressions, InternalRow} +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression, V2ExpressionUtils} +import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.clickhouse.ExprUtils +import org.apache.spark.sql.connector.catalog.functions.ScalarFunction import org.apache.spark.sql.connector.metric.CustomTaskMetric import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} import org.apache.spark.sql.types._ @@ -56,7 +57,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardExpr: Option[Expression] = writeJob.sparkShardExpr match { case None => None case Some(v2Expr) => - val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields) + val catalystExpr = ExprUtils(writeJob.functionRegistry).toCatalyst(v2Expr, writeJob.dataSetSchema.fields) catalystExpr match { case BoundReference(_, dataType, _) if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType` @@ -66,6 +67,11 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) Some(catalystExpr) case BoundReference(_, dataType, _) => throw CHClientException(s"Invalid data type of sharding field: $dataType") + case TransformExpression(function, _, _) => + function.resultType() match { + case ByteType | ShortType | IntegerType | LongType => Some(catalystExpr) + case _ => throw CHClientException(s"Invalid data type of sharding field: ${function.resultType()}") + } case unsupported: Expression => log.warn(s"Unsupported expression of sharding field: $unsupported") None @@ -74,7 +80,23 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardProjection: Option[expressions.Projection] = shardExpr .filter(_ => writeJob.writeOptions.convertDistributedToLocal) - .map(expr => SafeProjection.create(Seq(expr))) + .flatMap(expr => + expr match { + case BoundReference(_, _, _) => + Some(SafeProjection.create(Seq(expr))) + case TransformExpression(function, args, _) => + val retType = function.resultType() match { + case ByteType => classOf[Byte] + case ShortType => classOf[Short] + case IntegerType => classOf[Int] + case LongType => classOf[Long] + case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," + + s"sharding field: ${function.resultType()}") + } + val expr = 
V2ExpressionUtils.resolveScalarFunction(function.asInstanceOf[ScalarFunction[retType.type]], args) + Some(SafeProjection.create(Seq(expr))) + } + ) // put the node select strategy in executor side because we need to calculate shard and don't know the records // util DataWriter#write(InternalRow) invoked. @@ -107,6 +129,15 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) case _ => None } shardValue.map(value => ShardUtils.calcShard(writeJob.cluster.get, value).num) + case (Some(TransformExpression(function, _, _)), Some(projection)) => + val shardValue = function.resultType() match { + case ByteType => Some(projection(record).getByte(0).toLong) + case ShortType => Some(projection(record).getShort(0).toLong) + case IntegerType => Some(projection(record).getInt(0).toLong) + case LongType => Some(projection(record).getLong(0)) + case _ => None + } + shardValue.map(value => ShardUtils.calcShard(writeJob.cluster.get, value).num) case _ => None } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 9cd8262f..b374c996 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -15,11 +15,11 @@ package xenon.clickhouse.write import java.time.ZoneId - import org.apache.spark.sql.clickhouse.{ExprUtils, WriteOptions} import org.apache.spark.sql.connector.expressions.{Expression, SortOrder, Transform} import org.apache.spark.sql.types.StructType import xenon.clickhouse.expr.{Expr, FuncExpr, OrderExpr} +import xenon.clickhouse.func.FunctionRegistry import xenon.clickhouse.spec._ case class WriteJobDescription( @@ -37,7 +37,8 @@ case class WriteJobDescription( shardingKey: Option[Expr], partitionKey: Option[List[Expr]], sortingKey: Option[List[OrderExpr]], - writeOptions: WriteOptions + writeOptions: WriteOptions, + functionRegistry: FunctionRegistry ) { def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match { @@ -56,20 +57,20 @@ case class WriteJobDescription( } def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match { - case Some(expr) => ExprUtils.toSparkTransformOpt(expr) + case Some(expr) => ExprUtils(functionRegistry).toSparkTransformOpt(expr) case _ => None } def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils.toSparkSplits(shardingKeyIgnoreRand, partitionKey) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) } else { - ExprUtils.toSparkSplits(shardingKeyIgnoreRand, None) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) } def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey) + ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey) } } diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala new file mode 100644 index 00000000..c7c1cfb3 --- /dev/null +++ 
b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.clickhouse + +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.scalatest.funsuite.AnyFunSuite +import xenon.clickhouse.ClickHouseHelper +import xenon.clickhouse.func.{ + ClickHouseXxHash64, + CompositeFunctionRegistry, + DynamicFunctionRegistry, + StaticFunctionRegistry +} + +import scala.collection.JavaConverters._ + +class FunctionRegistrySuite extends AnyFunSuite { + + val staticFunctionRegistry: StaticFunctionRegistry.type = StaticFunctionRegistry + val dynamicFunctionRegistry = new DynamicFunctionRegistry + dynamicFunctionRegistry.register("ck_xx_hash64", ClickHouseXxHash64) + dynamicFunctionRegistry.register("clickhouse_xxHash64", ClickHouseXxHash64) + + test("check StaticFunctionRegistry mappings") { + assert(staticFunctionRegistry.getFuncMappingBySpark === Map( + "ck_xx_hash64" -> "xxHash64", + "clickhouse_xxHash64" -> "xxHash64" + )) + assert((staticFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "clickhouse_xxHash64" + )) || (staticFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "ck_xx_hash64" + ))) + } + + test("check DynamicFunctionRegistry mappings") { + assert(dynamicFunctionRegistry.getFuncMappingBySpark === Map( + "ck_xx_hash64" -> "xxHash64", + "clickhouse_xxHash64" -> "xxHash64" + )) + assert((dynamicFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "clickhouse_xxHash64" + )) || (dynamicFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "ck_xx_hash64" + ))) + } + + test("check CompositeFunctionRegistry mappings") { + val compositeFunctionRegistry = + new CompositeFunctionRegistry(Array(staticFunctionRegistry, dynamicFunctionRegistry)) + assert(compositeFunctionRegistry.getFuncMappingBySpark === Map( + "ck_xx_hash64" -> "xxHash64", + "clickhouse_xxHash64" -> "xxHash64" + )) + assert((compositeFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "clickhouse_xxHash64" + )) || (compositeFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "ck_xx_hash64" + ))) + } +} From e52b7144412642b8629fa08f1c9c40970bd08b2f Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Thu, 18 May 2023 16:54:05 +0800 Subject: [PATCH 02/20] Spark 3.4: add udf: years, days, hours, murmurHash2 and murmurHash3. 
Amend testing --- .../ClickHouseClusterHashUDFSuite.scala | 67 ++++++++++++++++ .../cluster/ClickHouseClusterUDFSuite.scala | 55 ------------- .../ClusterShardByTransformSuite.scala | 35 ++++++--- .../spark/sql/clickhouse/ExprUtils.scala | 14 ---- .../xenon/clickhouse/ClickHouseCatalog.scala | 3 +- .../clickhouse/func/FunctionRegistry.scala | 25 +++++- .../clickhouse/func/clickhouse/Days.scala | 52 +++++++++++++ .../{Months.scala => clickhouse/Hours.scala} | 20 +++-- .../clickhouse/func/clickhouse/Months.scala | 52 +++++++++++++ .../func/clickhouse/MurmurHash2.scala | 77 ++++++++++++++++++ .../func/clickhouse/MurmurHash3.scala | 78 +++++++++++++++++++ .../XxHash64.scala} | 4 +- .../clickhouse/func/clickhouse/Years.scala | 52 +++++++++++++ .../clickhouse/write/ClickHouseWriter.scala | 10 ++- .../clickhouse/FunctionRegistrySuite.scala | 48 +++++------- 15 files changed, 469 insertions(+), 123 deletions(-) create mode 100644 spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala delete mode 100644 spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{Months.scala => clickhouse/Hours.scala} (68%) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{ClickHouseXxHash64.scala => clickhouse/XxHash64.scala} (96%) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala new file mode 100644 index 00000000..9ef15241 --- /dev/null +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.clickhouse.cluster + +import org.apache.spark.sql.clickhouse.TestUtils.om +import xenon.clickhouse.func.{CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} +import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard + +import java.lang.{Long => JLong} + +class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { + // only for query function names + val dummyRegistry: CompositeFunctionRegistry = { + val dynamicFunctionRegistry = new DynamicFunctionRegistry + val xxHash64ShardFunc = new ClickHouseXxHash64Shard(Seq.empty) + dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible + dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc) + new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) + } + + def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = { + val sparkResult = spark.sql( + s"""SELECT + | $funcSparkName($stringVal) AS hash_value + |""".stripMargin + ).collect + assert(sparkResult.length == 1) + val sparkHashVal = sparkResult.head.getAs[Long]("hash_value") + + val clickhouseResultJsonStr = runClickHouseSQL( + s"""SELECT + | $funcCkName($stringVal) AS hash_value + |""".stripMargin + ).head.getString(0) + val clickhouseResultJson = om.readTree(clickhouseResultJsonStr) + val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText) + assert(sparkHashVal == clickhouseHashVal) + } + + Seq( + "clickhouse_xxHash64", + "clickhouse_murmurHash3_64", + "clickhouse_murmurHash3_32", + "clickhouse_murmurHash2_64", + "clickhouse_murmurHash2_32" + ).foreach { funcSparkName => + val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) + test(s"UDF $funcSparkName") { + Seq("spark-clickhouse-connector", "Apache Spark", "ClickHouse", "Yandex", "热爱", "🇨🇳").foreach { rawStringVal => + val stringVal = s"\'$rawStringVal\'" + runTest(funcSparkName, funcCkName, stringVal) + } + } + } +} diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala deleted file mode 100644 index 3d97cc25..00000000 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.clickhouse.cluster - -import org.apache.spark.sql.clickhouse.TestUtils.om - -import java.lang.{Long => JLong} - -class ClickHouseClusterUDFSuite extends SparkClickHouseClusterTest { - - test("UDF ck_xx_hash64") { - Seq("spark-clickhouse-connector", "Apache Spark", "ClickHouse", "Yandex", "热爱", "🇨🇳").foreach { stringVal => - val sparkResult = spark.sql( - s"""SELECT - | ck_xx_hash64('$stringVal') AS hash_value_legacy, - | clickhouse_xxHash64('$stringVal') AS hash_value, - | ck_xx_hash64_shard('single_replica', '$stringVal') AS shard_num_legacy, -- one based ordinal defined in `remote_servers.xml` - | clickhouse_shard_xxHash64('single_replica', '$stringVal') AS shard_num -- one based ordinal defined in `remote_servers.xml` - |""".stripMargin - ).collect - assert(sparkResult.length == 1) - val sparkHashValLegacy = sparkResult.head.getAs[Long]("hash_value_legacy") - val sparkHashVal = sparkResult.head.getAs[Long]("hash_value") - assert(sparkHashValLegacy === sparkHashVal) - val sparkShardNumLegacy = sparkResult.head.getAs[Int]("shard_num_legacy") - val sparkShardNum = sparkResult.head.getAs[Int]("shard_num") - assert(sparkShardNumLegacy === sparkShardNum) - - val clickhouseResultJsonStr = runClickHouseSQL( - s"""SELECT - | xxHash64('$stringVal') AS hash_value, - | xxHash64('$stringVal') % 4 AS shard_num -- zero based ordinal - |""".stripMargin - ).head.getString(0) - val clickhouseResultJson = om.readTree(clickhouseResultJsonStr) - val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText) - val clickhouseShardNum = JLong.parseUnsignedLong(clickhouseResultJson.get("shard_num").asText) - - assert(sparkHashVal == clickhouseHashVal) - assert(sparkShardNum == clickhouseShardNum + 1) - } - } -} diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index fce0f77e..db8a3036 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -27,7 +27,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { def runTest(func_name: String, func_args: Array[String]): Unit = { val func_expr = s"$func_name(${func_args.mkString(",")})" val cluster = "single_replica" - val db = s"db_${func_name}_shard" + val db = s"db_${func_name}_shard_transform" val tbl_dist = s"tbl_${func_name}_shard" val tbl_local = s"${tbl_dist}_local" @@ -37,6 +37,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { spark.sql( s"""CREATE TABLE $db.$tbl_local ( | create_time TIMESTAMP NOT NULL, + | create_date DATE NOT NULL, | value STRING NOT NULL |) USING ClickHouse |TBLPROPERTIES ( @@ -56,10 +57,11 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { spark.sql( s"""INSERT INTO `$db`.`$tbl_dist` |VALUES - | (timestamp'2021-01-01 10:10:10', '1'), - | (timestamp'2022-02-02 10:10:10', '2'), - | (timestamp'2023-03-03 10:10:10', '3'), - | (timestamp'2024-04-04 10:10:10', '4') AS tab(create_time, value) + | (timestamp'2021-01-01 10:10:10', date'2021-01-01', '1'), + | (timestamp'2022-02-02 11:10:10', date'2022-02-02', '2'), + | (timestamp'2023-03-03 12:10:10', date'2023-03-03', '3'), + | (timestamp'2024-04-04 
13:10:10', date'2024-04-04', '4') + | AS tab(create_time, create_date, value) |""".stripMargin ) // check that data is indeed written @@ -72,10 +74,10 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { runClickHouseSQL( s"""INSERT INTO `$db`.`$tbl_dist` |VALUES - | (timestamp'2021-01-01 10:10:10', '1'), - | (timestamp'2022-02-02 10:10:10', '2'), - | (timestamp'2023-03-03 10:10:10', '3'), - | (timestamp'2024-04-04 10:10:10', '4') + | (timestamp'2021-01-01 10:10:10', date'2021-01-01', '1'), + | (timestamp'2022-02-02 11:10:10', date'2022-02-02', '2'), + | (timestamp'2023-03-03 12:10:10', date'2023-03-03', '3'), + | (timestamp'2024-04-04 13:10:10', date'2024-04-04', '4') |""".stripMargin ) checkAnswer( @@ -91,8 +93,19 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { } } - Seq(("xxHash64", Array("value")), ("toYYYYMM", Array("create_time"))).foreach { case (func_name, func_args) => - test(s"shard by $func_name")(runTest(func_name, func_args)) + Seq( + ("toYear", Array("create_date")), + ("toYYYYMM", Array("create_date")), + ("toYYYYMMDD", Array("create_date")), + ("toHour", Array("create_time")), + ("xxHash64", Array("value")), + ("murmurHash2_64", Array("value")), + ("murmurHash2_32", Array("value")), + ("murmurHash3_64", Array("value")), + ("murmurHash3_32", Array("value")) + ).foreach { + case (func_name: String, func_args: Array[String]) => + test(s"shard by $func_name")(runTest(func_name, func_args)) } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index d7116cc9..8a7d1f96 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -108,12 +108,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S def toSparkTransform(expr: Expr): Transform = expr match { case FieldRef(col) => identity(col) - case FuncExpr("toYear", List(FieldRef(col))) => years(col) - case FuncExpr("YEAR", List(FieldRef(col))) => years(col) - case FuncExpr("toYYYYMM", List(FieldRef(col))) => months(col) - case FuncExpr("toYYYYMMDD", List(FieldRef(col))) => days(col) - case FuncExpr("toHour", List(FieldRef(col))) => hours(col) - case FuncExpr("HOUR", List(FieldRef(col))) => hours(col) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) case FuncExpr(funName, List(FieldRef(col))) if functionRegistry.getFuncMappingByCk.contains(funName) => @@ -122,10 +116,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S } def toClickHouse(transform: Transform): Expr = transform match { - case YearsTransform(FieldReference(Seq(col))) => FuncExpr("toYear", List(FieldRef(col))) - case MonthsTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMM", List(FieldRef(col))) - case DaysTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMMDD", List(FieldRef(col))) - case HoursTransform(FieldReference(Seq(col))) => FuncExpr("toHour", List(FieldRef(col))) case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) @@ -138,10 +128,6 @@ class 
ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S secondarySchema: StructType, transform: Transform ): StructField = transform match { - case years: YearsTransform => StructField(years.toString, IntegerType) - case months: MonthsTransform => StructField(months.toString, IntegerType) - case days: DaysTransform => StructField(days.toString, IntegerType) - case hours: HoursTransform => StructField(hours.toString, IntegerType) case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index b625560d..5fd043cd 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -26,6 +26,7 @@ import xenon.clickhouse.Constants._ import xenon.clickhouse.client.NodeClient import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.exception.ClickHouseErrCode._ +import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard import xenon.clickhouse.func.{FunctionRegistry, _} import xenon.clickhouse.spec._ @@ -85,10 +86,8 @@ class ClickHouseCatalog extends TableCatalog val dynamicFunctionRegistry = new DynamicFunctionRegistry val xxHash64ShardFunc = new ClickHouseXxHash64Shard(clusterSpecs) - val monthsFunc = new Months() dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc) - dynamicFunctionRegistry.register("months", monthsFunc) this.functionRegistry = new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) log.info(s"Detect ${clusterSpecs.size} ClickHouse clusters: ${clusterSpecs.map(_.name).mkString(",")}") diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index c10ce864..e6094eaf 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -15,6 +15,7 @@ package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.UnboundFunction +import xenon.clickhouse.func.clickhouse._ import scala.collection.mutable @@ -26,7 +27,7 @@ trait FunctionRegistry extends Serializable { def getFuncMappingBySpark: Map[String, String] - def getFuncMappingByCk: Map[String, String] = getFuncMappingBySpark.map(_.swap) + def getFuncMappingByCk: Map[String, String] } trait ClickhouseEquivFunction { @@ -40,13 +41,23 @@ class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends Fun override def load(name: String): Option[UnboundFunction] = registries.flatMap(_.load(name)).headOption override def getFuncMappingBySpark: Map[String, String] = registries.flatMap(_.getFuncMappingBySpark).toMap + + override def getFuncMappingByCk: Map[String, String] = registries.flatMap(_.getFuncMappingByCk).toMap } object StaticFunctionRegistry extends FunctionRegistry { private val functions = Map[String, UnboundFunction]( "ck_xx_hash64" -> ClickHouseXxHash64, // for compatible - "clickhouse_xxHash64" -> 
ClickHouseXxHash64 + "clickhouse_xxHash64" -> ClickHouseXxHash64, + "clickhouse_murmurHash2_32" -> MurmurHash2_32, + "clickhouse_murmurHash2_64" -> MurmurHash2_64, + "clickhouse_murmurHash3_32" -> MurmurHash3_32, + "clickhouse_murmurHash3_64" -> MurmurHash3_64, + "clickhouse_years" -> Years, + "clickhouse_months" -> Months, + "clickhouse_days" -> Days, + "clickhouse_hours" -> Hours ) override def list: Array[String] = functions.keys.toArray @@ -57,6 +68,11 @@ object StaticFunctionRegistry extends FunctionRegistry { functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) } + + override val getFuncMappingByCk: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k)) + } } class DynamicFunctionRegistry extends FunctionRegistry { @@ -76,4 +92,9 @@ class DynamicFunctionRegistry extends FunctionRegistry { functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) }.toMap + + override def getFuncMappingByCk: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k)) + }.toMap } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala new file mode 100644 index 00000000..9ceca80e --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +import java.time.LocalDate +import java.time.format.DateTimeFormatter + +object Days extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_days" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("toYYYYMMDD") + + override def description: String = s"$name: (date: Date) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, DateType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(DateType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(days: Int): Int = { + val date = LocalDate.ofEpochDay(days) + val formatter = DateTimeFormatter.ofPattern("yyyyMMdd") + date.format(formatter).toInt + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala similarity index 68% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala index d3f40814..77dbe4c2 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala @@ -12,25 +12,28 @@ * limitations under the License. */ -package xenon.clickhouse.func +package xenon.clickhouse.func.clickhouse import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction -import java.sql.Timestamp +import java.sql.{Date, Timestamp} +import java.text.SimpleDateFormat -class Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { +object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { - override def name: String = "months" + override def name: String = "clickhouse_hours" - override def canonicalName: String = s"months" + override def canonicalName: String = s"clickhouse.$name" - override val ckFuncNames: Array[String] = Array("toYYYYMM") + override val ckFuncNames: Array[String] = Array("toHour", "HOUR") override def description: String = s"$name: (time: timestamp) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. $description") } @@ -41,7 +44,8 @@ class Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqu override def isResultNullable: Boolean = false def invoke(time: Long): Int = { - val ts = new Timestamp(time / 1000).toLocalDateTime - ts.getYear * 100 + ts.getMonthValue + val ts = new Timestamp(time / 1000) + val formatter: SimpleDateFormat = new SimpleDateFormat("hh") + formatter.format(ts).toInt } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala new file mode 100644 index 00000000..0be1bc9b --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +import java.time.LocalDate +import java.time.format.DateTimeFormatter + +object Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_months" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("toYYYYMM") + + override def description: String = s"$name: (date: Date) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, DateType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description") + } + + override def inputTypes: Array[DataType] = Array(DateType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(days: Int): Int = { + val date = LocalDate.ofEpochDay(days) + val formatter = DateTimeFormatter.ofPattern("yyyyMM") + date.format(formatter).toInt + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala new file mode 100644 index 00000000..49daaeae --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.ClickhouseEquivFunction + +object MurmurHash2_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash2_64" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash2_64") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + // ignore UInt64 vs Int64 + val data = values.getBytes + MurmurHash2.hash64(data, data.length, 0) + } +} + +object MurmurHash2_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash2_32" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash2_32") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + val data = values.getBytes + val v = MurmurHash2.hash32(data, data.length, 0).toLong + if (v < 0) v + (1L << 32) else v + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala new file mode 100644 index 00000000..db15a8e7 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala @@ -0,0 +1,78 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.commons.codec.digest.MurmurHash3 +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.ClickhouseEquivFunction + +object MurmurHash3_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash3_64" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash3_64") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + // ignore UInt64 vs Int64 + val data = values.getBytes + val hashes = MurmurHash3.hash128x64(data, 0, data.length, 0) + hashes(0) ^ hashes(1) + } +} + +object MurmurHash3_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash3_32" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash3_32") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + val data = values.getBytes + val v = MurmurHash3.hash32x86(data, 0, data.length, 0).toLong + if (v < 0) v + (1L << 32) else v + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala similarity index 96% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala index dab34932..f02af236 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala @@ -12,12 +12,13 @@ * limitations under the License. */ -package xenon.clickhouse.func +package xenon.clickhouse.func.clickhouse import org.apache.spark.sql.catalyst.expressions.XxHash64Function import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.ClickhouseEquivFunction import xenon.clickhouse.spec.{ClusterSpec, ShardUtils} /** @@ -47,6 +48,7 @@ object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with override def isResultNullable: Boolean = false + // ignore UInt64 vs Int64 def invoke(value: UTF8String): Long = XxHash64Function.hash(value, StringType, 0L) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala new file mode 100644 index 00000000..b3c0a135 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +import java.time.LocalDate +import java.time.format.DateTimeFormatter + +object Years extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_years" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("toYear", "YEAR") + + override def description: String = s"$name: (date: Date) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, DateType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description") + } + + override def inputTypes: Array[DataType] = Array(DateType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(days: Int): Int = { + val date = LocalDate.ofEpochDay(days) + val formatter = DateTimeFormatter.ofPattern("yyyy") + date.format(formatter).toInt + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index 07733442..3cd43c5e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -17,8 +17,14 @@ package xenon.clickhouse.write import com.clickhouse.client.ClickHouseProtocol import com.clickhouse.data.ClickHouseCompression import org.apache.commons.io.IOUtils -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression, V2ExpressionUtils} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.expressions.{ + BoundReference, + Expression, + SafeProjection, + TransformExpression, + V2ExpressionUtils +} +import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.clickhouse.ExprUtils import org.apache.spark.sql.connector.catalog.functions.ScalarFunction import org.apache.spark.sql.connector.metric.CustomTaskMetric diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala index c7c1cfb3..34254907 100644 --- a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala +++ b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -17,8 +17,9 @@ package org.apache.spark.sql.clickhouse import 
org.apache.spark.sql.util.CaseInsensitiveStringMap import org.scalatest.funsuite.AnyFunSuite import xenon.clickhouse.ClickHouseHelper +import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64 import xenon.clickhouse.func.{ - ClickHouseXxHash64, + ClickhouseEquivFunction, CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry @@ -34,40 +35,31 @@ class FunctionRegistrySuite extends AnyFunSuite { dynamicFunctionRegistry.register("clickhouse_xxHash64", ClickHouseXxHash64) test("check StaticFunctionRegistry mappings") { - assert(staticFunctionRegistry.getFuncMappingBySpark === Map( - "ck_xx_hash64" -> "xxHash64", - "clickhouse_xxHash64" -> "xxHash64" - )) - assert((staticFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "clickhouse_xxHash64" - )) || (staticFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "ck_xx_hash64" - ))) + assert(staticFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) => + staticFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v) + }) + assert(staticFunctionRegistry.getFuncMappingByCk.forall { case (k, v) => + staticFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k) + }) } test("check DynamicFunctionRegistry mappings") { - assert(dynamicFunctionRegistry.getFuncMappingBySpark === Map( - "ck_xx_hash64" -> "xxHash64", - "clickhouse_xxHash64" -> "xxHash64" - )) - assert((dynamicFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "clickhouse_xxHash64" - )) || (dynamicFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "ck_xx_hash64" - ))) + assert(dynamicFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) => + dynamicFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v) + }) + assert(dynamicFunctionRegistry.getFuncMappingByCk.forall { case (k, v) => + dynamicFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k) + }) } test("check CompositeFunctionRegistry mappings") { val compositeFunctionRegistry = new CompositeFunctionRegistry(Array(staticFunctionRegistry, dynamicFunctionRegistry)) - assert(compositeFunctionRegistry.getFuncMappingBySpark === Map( - "ck_xx_hash64" -> "xxHash64", - "clickhouse_xxHash64" -> "xxHash64" - )) - assert((compositeFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "clickhouse_xxHash64" - )) || (compositeFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "ck_xx_hash64" - ))) + assert(compositeFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) => + compositeFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v) + }) + assert(compositeFunctionRegistry.getFuncMappingByCk.forall { case (k, v) => + compositeFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k) + }) } } From ff243b591c1e411f5cccd31e138d9e1eae722a75 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 19 May 2023 22:09:39 +0800 Subject: [PATCH 03/20] Spark 3.4: Fixup sharding key needs to be mod by cluster weight on local sort --- .../xenon/clickhouse/spec/NodeSpec.scala | 2 + .../spark/sql/clickhouse/ExprUtils.scala | 41 ++++++------ .../clickhouse/func/FunctionRegistry.scala | 3 +- .../clickhouse/func/clickhouse/Pmod.scala | 63 +++++++++++++++++++ .../write/WriteJobDescription.scala | 6 +- 5 files changed, 93 insertions(+), 22 deletions(-) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala diff --git 
a/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala b/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala index 454312df..eb809169 100644 --- a/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala +++ b/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala @@ -97,4 +97,6 @@ case class ClusterSpec( override def toString: String = s"cluster: $name, shards: [${shards.mkString(", ")}]" @JsonIgnore @transient override lazy val nodes: Array[NodeSpec] = shards.sorted.flatMap(_.nodes) + + def totalWeight: Int = shards.map(_.weight).sum } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 8a7d1f96..2b2a2cae 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -23,27 +23,37 @@ import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.connector.expressions.Expressions._ import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, _} -import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} +import org.apache.spark.sql.types.{StructField, StructType} import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.expr._ import xenon.clickhouse.func.FunctionRegistry +import xenon.clickhouse.spec.ClusterSpec import scala.util.{Failure, Success, Try} class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { + private def toSplitWithModulo(shardingKey: Expr, cluster: ClusterSpec): FuncExpr = + FuncExpr("positiveModulo", List(shardingKey, StringLiteral(cluster.totalWeight.toString))) + def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray - def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] = - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + def toSparkSplits( + shardingKey: Option[Expr], + partitionKey: Option[List[Expr]], + cluster: Option[ClusterSpec] + ): Array[Transform] = + (shardingKey.map(k => toSplitWithModulo(k, cluster.get)).seq ++ partitionKey.seq.flatten) + .flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], partitionKey: Option[List[Expr]], - sortingKey: Option[List[OrderExpr]] + sortingKey: Option[List[OrderExpr]], + cluster: Option[ClusterSpec] ): Array[SortOrder] = - toSparkSplits(shardingKeyIgnoreRand, partitionKey).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: + toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST @@ -93,25 +103,20 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S ) } - def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkTransform(expr)) match { - case Success(t) => Some(t) + def toSparkTransformOpt(expr: Expr): Option[Transform] = 
Try(toSparkExpression(expr)) match { + case Success(t: Transform) => Some(t) + case Success(_) => None case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow)) } - // Some functions of ClickHouse which match Spark pre-defined Transforms - // - // toYear, YEAR - Converts a date or date with time to a UInt16 (AD) - // toYYYYMM - Converts a date or date with time to a UInt32 (YYYY*100 + MM) - // toYYYYMMDD - Converts a date or date with time to a UInt32 (YYYY*10000 + MM*100 + DD) - // toHour, HOUR - Converts a date with time to a UInt8 (0-23) - - def toSparkTransform(expr: Expr): Transform = expr match { + def toSparkExpression(expr: Expr): V2Expression = expr match { case FieldRef(col) => identity(col) + case StringLiteral(value) => literal(value) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) - case FuncExpr(funName, List(FieldRef(col))) if functionRegistry.getFuncMappingByCk.contains(funName) => - apply(functionRegistry.getFuncMappingByCk(funName), column(col)) + case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => + apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") } @@ -131,7 +136,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) - case t @ ApplyTransform(transformName, _) => + case t @ ApplyTransform(transformName, _) if functionRegistry.load(transformName).isDefined => val resType = functionRegistry.load(transformName).getOrElse(throw new NoSuchFunctionException(transformName)) match { case f: ScalarFunction[_] => f.resultType() diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index e6094eaf..8a7ec436 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -57,7 +57,8 @@ object StaticFunctionRegistry extends FunctionRegistry { "clickhouse_years" -> Years, "clickhouse_months" -> Months, "clickhouse_days" -> Days, - "clickhouse_hours" -> Hours + "clickhouse_hours" -> Hours, + "sharding_pmod" -> Pmod ) override def list: Array[String] = functions.keys.toArray diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala new file mode 100644 index 00000000..e9eafb8d --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "sharding_pmod" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("positiveModulo", "positive_modulo", "pmod") + + override def description: String = s"$name: (a: long, b: long) => mod: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(a, b) if + (a match { + case StructField(_, LongType, _, _) => true + case StructField(_, IntegerType, _, _) => true + case StructField(_, ShortType, _, _) => true + case StructField(_, ByteType, _, _) => true + case StructField(_, StringType, _, _) => true + case _ => false + }) && + (b match { + case StructField(_, LongType, _, _) => true + case StructField(_, IntegerType, _, _) => true + case StructField(_, ShortType, _, _) => true + case StructField(_, ByteType, _, _) => true + case StructField(_, StringType, _, _) => true + case _ => false + }) => + this + case _ => throw new UnsupportedOperationException(s"Expect 2 integer arguments. 
$description") + } + + override def inputTypes: Array[DataType] = Array(LongType, LongType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(a: Long, b: Long): Long = { + val mod = a % b + if (mod < 0) mod + b else mod + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index b374c996..81a347ee 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -63,14 +63,14 @@ case class WriteJobDescription( def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) } def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey) + ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) } } From a1d4dce4ddbf54039be9f083da0ba7032f3143d7 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Sat, 20 May 2023 01:54:46 +0800 Subject: [PATCH 04/20] Scala 2.13: Fix Spark 3.4 compile issue --- .../scala/xenon/clickhouse/func/FunctionRegistry.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index 8a7ec436..fd12edc1 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -90,12 +90,12 @@ class DynamicFunctionRegistry extends FunctionRegistry { override def load(name: String): Option[UnboundFunction] = functions.get(name) override def getFuncMappingBySpark: Map[String, String] = - functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).toMap.flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) - }.toMap + } override def getFuncMappingByCk: Map[String, String] = - functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).toMap.flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k)) - }.toMap + } } From 5ddb98f60e70674fbd5cd2e044472b421ccaccc6 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Mon, 22 May 2023 10:31:32 +0800 Subject: [PATCH 05/20] Spark 3.4: Optimize sharding key handling when shuffle and sort --- .../apache/spark/sql/clickhouse/ExprUtils.scala | 15 ++++++++++----- .../clickhouse/write/WriteJobDescription.scala | 4 ++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git 
a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 2b2a2cae..cbf57630 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -41,11 +41,11 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S def toSparkSplits( shardingKey: Option[Expr], - partitionKey: Option[List[Expr]], - cluster: Option[ClusterSpec] + partitionKey: Option[List[Expr]] ): Array[Transform] = - (shardingKey.map(k => toSplitWithModulo(k, cluster.get)).seq ++ partitionKey.seq.flatten) - .flatten(toSparkTransformOpt).toArray + // no pmod shard key here, because we want to shuffle it more evenly, + // hence spread the load in Spark tasks to multiple Clickhouse nodes + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], @@ -53,7 +53,11 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] ): Array[SortOrder] = - toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: + // pmod shard key here, because we need same cluster number but not same hash value + // to be sorted together and be written as a batch + toSparkSplits(shardingKeyIgnoreRand.map(k => toSplitWithModulo(k, cluster.get)), partitionKey).map( + Expressions.sort(_, SortDirection.ASCENDING) + ) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST @@ -104,6 +108,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S } def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkExpression(expr)) match { + // need this function because spark `Table`'s `partitioning` field should be `Transform` case Success(t: Transform) => Some(t) case Success(_) => None case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 81a347ee..de28ec87 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -63,9 +63,9 @@ case class WriteJobDescription( def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) } def sparkSortOrders: Array[SortOrder] = { From 000638e0c377d7c687eccd297c88fad0da9d05a3 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Mon, 22 May 2023 18:30:02 +0800 Subject: [PATCH 06/20] Spark 3.4: Optimize sharding key handling when shuffle and sort, approach 2 --- 
.../spark/sql/clickhouse/ExprUtils.scala | 21 +++++++++------- .../write/WriteJobDescription.scala | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index cbf57630..a873fc4d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -41,11 +41,18 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S def toSparkSplits( shardingKey: Option[Expr], - partitionKey: Option[List[Expr]] + partitionKey: Option[List[Expr]], + cluster: Option[ClusterSpec] ): Array[Transform] = - // no pmod shard key here, because we want to shuffle it more evenly, - // hence spread the load in Spark tasks to multiple Clickhouse nodes - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + // Pmod by total weight * constant. Note that this key will be further hashed by Spark. Reasons for doing this: + // - Enlarge the range of the modulo to avoid hash collisions when there are only a few shards, hence + // mitigating the data skew they would cause. + // - Still distribute data from one shard to only a subset of executors. If we did not apply the modulo here (and + // instead applied it during sorting in `toSparkSortOrders`), data belonging to shard 1 would be sorted at the + // front for all tasks, putting instant high pressure on shard 1 when the stage starts. + (shardingKey.map(k => + FuncExpr("positiveModulo", List(k, StringLiteral((cluster.get.totalWeight * 10).toString))) + ).seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], @@ -53,11 +60,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] ): Array[SortOrder] = - // pmod shard key here, because we need same cluster number but not same hash value - // to be sorted together and be written as a batch - toSparkSplits(shardingKeyIgnoreRand.map(k => toSplitWithModulo(k, cluster.get)), partitionKey).map( - Expressions.sort(_, SortDirection.ASCENDING) - ) ++: + toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index de28ec87..81a347ee 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -63,9 +63,9 @@ case class WriteJobDescription( def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) +
ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) } def sparkSortOrders: Array[SortOrder] = { From 59f3bed98e6f4bb66e14761249aa0c747df5e3bd Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 23 May 2023 17:30:41 +0800 Subject: [PATCH 07/20] Spark 3.4: Support variable length arguments for murmurHash (up to 5 string arguments) --- .../ClickHouseClusterHashUDFSuite.scala | 33 ++++++ .../xenon/clickhouse/func/MultiArgsHash.scala | 101 ++++++++++++++++++ .../scala/xenon/clickhouse/func/Util.scala | 52 +++++++++ .../func/clickhouse/MurmurHash2.scala | 60 +++-------- .../func/clickhouse/MurmurHash3.scala | 58 +++------- 5 files changed, 219 insertions(+), 85 deletions(-) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala index 9ef15241..b3556258 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -30,6 +30,15 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) } + def product[A](xs: Seq[Seq[A]]): Seq[Seq[A]] = + xs.toList match { + case Nil => Seq(Seq()) + case head :: tail => for { + h <- head + t <- product(tail) + } yield h +: t + } + def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = { val sparkResult = spark.sql( s"""SELECT @@ -64,4 +73,28 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { } } } + + Seq( + "clickhouse_murmurHash3_64", + "clickhouse_murmurHash3_32", + "clickhouse_murmurHash2_64", + "clickhouse_murmurHash2_32" + ).foreach { funcSparkName => + val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) + test(s"UDF $funcSparkName multiple args") { + val strings = Seq( + "\'spark-clickhouse-connector\'", + "\'Apache Spark\'", + "\'ClickHouse\'", + "\'Yandex\'", + "\'热爱\'", + "\'🇨🇳\'" + ) + val test_5 = strings.combinations(5) + test_5.foreach { seq => + val stringVal = seq.mkString(", ") + runTest(funcSparkName, funcCkName, stringVal) + } + } + } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala new file mode 100644 index 00000000..dc635a27 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala @@ -0,0 +1,101 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package xenon.clickhouse.func + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction { + trait Base extends ScalarFunction[Long] { + // must not be a private object, nor may its successors be, because Spark would compile them + override def canonicalName: String = s"clickhouse.$name" + override def resultType: DataType = LongType + override def isResultNullable: Boolean = false + } + + object Arg1 extends Base { + override def name: String = s"${funcName}_1" + override def inputTypes: Array[DataType] = Array.fill(1)(StringType) + def invoke(value: UTF8String): Long = invokeBase(value) + } + + object Arg2 extends Base { + override def name: String = s"${funcName}_2" + override def inputTypes: Array[DataType] = Array.fill(2)(StringType) + def invoke(v1: UTF8String, v2: UTF8String): Long = Seq(v1, v2).map(invokeBase).reduce(combineHashes) + } + + object Arg3 extends Base { + override def name: String = s"${funcName}_3" + override def inputTypes: Array[DataType] = Array.fill(3)(StringType) + def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String): Long = + Seq(v1, v2, v3).map(invokeBase).reduce(combineHashes) + } + + object Arg4 extends Base { + override def name: String = s"${funcName}_4" + override def inputTypes: Array[DataType] = Array.fill(4)(StringType) + def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String): Long = + Seq(v1, v2, v3, v4).map(invokeBase).reduce(combineHashes) + } + + object Arg5 extends Base { + override def name: String = s"${funcName}_5" + override def inputTypes: Array[DataType] = Array.fill(5)(StringType) + def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String, v5: UTF8String): Long = + Seq(v1, v2, v3, v4, v5).map(invokeBase).reduce(combineHashes) + } + private def isExceptedType(dt: DataType): Boolean = + dt.isInstanceOf[StringType] + + final override def name: String = funcName + final override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, dt, _, _)) if List(dt).forall(isExceptedType) => this.Arg1 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _) + ) if List(dt1, dt2).forall(isExceptedType) => + this.Arg2 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _), + StructField(_, dt3, _, _) + ) if List(dt1, dt2, dt3).forall(isExceptedType) => + this.Arg3 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _), + StructField(_, dt3, _, _), + StructField(_, dt4, _, _) + ) if List(dt1, dt2, dt3, dt4).forall(isExceptedType) => + this.Arg4 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _), + StructField(_, dt3, _, _), + StructField(_, dt4, _, _), + StructField(_, dt5, _, _) + ) if List(dt1, dt2, dt3, dt4, dt5).forall(isExceptedType) => + this.Arg5 + case _ => throw new UnsupportedOperationException(s"Expect up to 5 STRING arguments. $description") + } + + protected def funcName: String + override val ckFuncNames: Array[String] + override def description: String = s"$name: (value: string, ...)
=> hash_value: long" + def invokeBase(value: UTF8String): Long + def combineHashes(v1: Long, v2: Long): Long +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala new file mode 100644 index 00000000..9ba35f10 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func + +object Util { + def intHash64Impl(x: Long): Long = + // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Functions/FunctionsHashing.h#L143 + intHash64(x ^ 0x4cf2d2baae6da887L) + + def intHash64(l: Long): Long = { + // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Common/HashTable/Hash.h#L28 + var x = l + x ^= x >>> 33; + x *= 0xff51afd7ed558ccdL; + x ^= x >>> 33; + x *= 0xc4ceb9fe1a85ec53L; + x ^= x >>> 33; + x + } + + def int32Impl(x: Long): Int = + // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Functions/FunctionsHashing.h#L133 + intHash32(x, 0x75d9543de018bf45L) + + def intHash32(l: Long, salt: Long): Int = { + // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Common/HashTable/Hash.h#L502 + var x = l + + x ^= salt; + x = (~x) + (x << 18) + x = x ^ ((x >>> 31) | (x << 33)) + x = x * 21 + x = x ^ ((x >>> 11) | (x << 53)) + x = x + (x << 6) + x = x ^ ((x >>> 22) | (x << 42)) + x.toInt + } + + def toUInt32Range(v: Long): Long = if (v < 0) v + (1L << 32) else v +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala index 49daaeae..052be5f9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala @@ -14,64 +14,38 @@ package xenon.clickhouse.func.clickhouse -import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.commons.codec.digest.{MurmurHash2, MurmurHash3} import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.ClickhouseEquivFunction +import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} -object MurmurHash2_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - - override def name: String = "clickhouse_murmurHash2_64" - - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash2_64 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L460 + override protected 
def funcName: String = "clickhouse_murmurHash2_64" override val ckFuncNames: Array[String] = Array("murmurHash2_64") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") - } - - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { + override def invokeBase(value: UTF8String): Long = { // ignore UInt64 vs Int64 - val data = values.getBytes + val data = value.getBytes MurmurHash2.hash64(data, data.length, 0) } -} - -object MurmurHash2_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - override def name: String = "clickhouse_murmurHash2_32" + override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 +} - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash2_32 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + override protected def funcName: String = "clickhouse_murmurHash2_32" override val ckFuncNames: Array[String] = Array("murmurHash2_32") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") + override def invokeBase(value: UTF8String): Long = { + val data = value.getBytes + val v = MurmurHash2.hash32(data, data.length, 0) + Util.toUInt32Range(v) } - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { - val data = values.getBytes - val v = MurmurHash2.hash32(data, data.length, 0).toLong - if (v < 0) v + (1L << 32) else v - } + override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala index db15a8e7..f353d1e7 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala @@ -18,61 +18,35 @@ import org.apache.commons.codec.digest.MurmurHash3 import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.ClickhouseEquivFunction +import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} -object MurmurHash3_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - - override def name: String = "clickhouse_murmurHash3_64" - - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash3_64 extends MultiArgsHash { + // 
https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L543 + override protected def funcName: String = "clickhouse_murmurHash3_64" override val ckFuncNames: Array[String] = Array("murmurHash3_64") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") - } - - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { + override def invokeBase(value: UTF8String): Long = { // ignore UInt64 vs Int64 - val data = values.getBytes + val data = value.getBytes val hashes = MurmurHash3.hash128x64(data, 0, data.length, 0) hashes(0) ^ hashes(1) } -} - -object MurmurHash3_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - override def name: String = "clickhouse_murmurHash3_32" + override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 +} - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash3_32 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + override protected def funcName: String = "clickhouse_murmurHash3_32" override val ckFuncNames: Array[String] = Array("murmurHash3_32") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. 
$description") + override def invokeBase(value: UTF8String): Long = { + val data = value.getBytes + val v = MurmurHash3.hash32x86(data, 0, data.length, 0) + Util.toUInt32Range(v) } - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { - val data = values.getBytes - val v = MurmurHash3.hash32x86(data, 0, data.length, 0).toLong - if (v < 0) v + (1L << 32) else v - } + override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) } From af14b3a9fb8706503d332c79fe35a8958de507b2 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Wed, 24 May 2023 13:52:37 +0800 Subject: [PATCH 08/20] Spark 3.4: add CityHash64 --- .../ClickHouseClusterHashUDFSuite.scala | 22 +- .../ClusterShardByTransformSuite.scala | 3 +- .../clickhouse/func/FunctionRegistry.scala | 1 + .../func/clickhouse/CityHash64.scala | 40 ++ .../clickhouse/cityhash/CityHash_v1_0_2.java | 344 ++++++++++++++++++ .../func/clickhouse/cityhash/UInt128.java | 34 ++ 6 files changed, 439 insertions(+), 5 deletions(-) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala index b3556258..adf3d9de 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -55,7 +55,10 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { ).head.getString(0) val clickhouseResultJson = om.readTree(clickhouseResultJsonStr) val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText) - assert(sparkHashVal == clickhouseHashVal) + assert( + sparkHashVal == clickhouseHashVal, + s"ck_function: $funcCkName, spark_function: $funcSparkName, args: ($stringVal)" + ) } Seq( @@ -63,11 +66,20 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { "clickhouse_murmurHash3_64", "clickhouse_murmurHash3_32", "clickhouse_murmurHash2_64", - "clickhouse_murmurHash2_32" + "clickhouse_murmurHash2_32", + "clickhouse_cityHash64" ).foreach { funcSparkName => val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) test(s"UDF $funcSparkName") { - Seq("spark-clickhouse-connector", "Apache Spark", "ClickHouse", "Yandex", "热爱", "🇨🇳").foreach { rawStringVal => + Seq( + "spark-clickhouse-connector", + "Apache Spark", + "ClickHouse", + "Yandex", + "热爱", + "在传统的行式数据库系统中,数据按如下顺序存储:", + "🇨🇳" + ).foreach { rawStringVal => val stringVal = s"\'$rawStringVal\'" runTest(funcSparkName, funcCkName, stringVal) } @@ -78,7 +90,8 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { "clickhouse_murmurHash3_64", "clickhouse_murmurHash3_32", "clickhouse_murmurHash2_64", - "clickhouse_murmurHash2_32" + "clickhouse_murmurHash2_32", + "clickhouse_cityHash64" ).foreach { 
funcSparkName => val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) test(s"UDF $funcSparkName multiple args") { @@ -88,6 +101,7 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { "\'ClickHouse\'", "\'Yandex\'", "\'热爱\'", + "\'在传统的行式数据库系统中,数据按如下顺序存储:\'", "\'🇨🇳\'" ) val test_5 = strings.combinations(5) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index db8a3036..21e984bc 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -102,7 +102,8 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { ("murmurHash2_64", Array("value")), ("murmurHash2_32", Array("value")), ("murmurHash3_64", Array("value")), - ("murmurHash3_32", Array("value")) + ("murmurHash3_32", Array("value")), + ("cityHash64", Array("value")) ).foreach { case (func_name: String, func_args: Array[String]) => test(s"shard by $func_name")(runTest(func_name, func_args)) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index fd12edc1..a509f07e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -54,6 +54,7 @@ object StaticFunctionRegistry extends FunctionRegistry { "clickhouse_murmurHash2_64" -> MurmurHash2_64, "clickhouse_murmurHash3_32" -> MurmurHash3_32, "clickhouse_murmurHash3_64" -> MurmurHash3_64, + "clickhouse_cityHash64" -> CityHash64, "clickhouse_years" -> Years, "clickhouse_months" -> Months, "clickhouse_days" -> Days, diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala new file mode 100644 index 00000000..fa599cbd --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala @@ -0,0 +1,40 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package xenon.clickhouse.func.clickhouse + +import io.netty.buffer.{ByteBuf, Unpooled} +import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.MultiArgsHash +import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128} + +object CityHash64 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L694 + + override protected def funcName: String = "clickhouse_cityHash64" + override val ckFuncNames: Array[String] = Array("cityHash64") + + def convertToByteBuf(array: Array[Byte]): ByteBuf = { + val byteBuf = Unpooled.buffer(array.length).writeBytes(array) + byteBuf + } + + override def invokeBase(value: UTF8String): Long = { + // ignore UInt64 vs Int64 + val data = value.getBytes + CityHash_v1_0_2.CityHash64(convertToByteBuf(data), 0, data.length) + } + + override def combineHashes(v1: Long, v2: Long): Long = CityHash_v1_0_2.Hash128to64(new UInt128(v1, v2)) +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java new file mode 100644 index 00000000..df218df3 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java @@ -0,0 +1,344 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse.cityhash; + +import io.netty.buffer.ByteBuf; + +// copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/CityHash_v1_0_2.java +// fixed some bugs involving int32 to uint32 conversion +final public class CityHash_v1_0_2 { + + private static final long kMul = 0x9ddfea08eb382d69L; + // Some primes between 2^63 and 2^64 for various uses. + private static final long k0 = 0xc3a5c85c97cb3127L; + private static final long k1 = 0xb492b66fbe98f273L; + private static final long k2 = 0x9ae16a3b2f90404fL; + private static final long k3 = 0xc949d7c7509e6557L; + + private CityHash_v1_0_2() { /* restricted */ } + + private static long Fetch64(ByteBuf p, int index) { + return p.getLongLE(index); + } + + private static int Fetch32(ByteBuf p, int index) { + return p.getIntLE(index); + } + + private static long toUint32(int x) { + return x & 0xFFFFFFFFL; + } + + // Equivalent to Rotate(), but requires the second arg to be non-zero. +// On x86-64, and probably others, it's possible for this to compile +// to a single instruction if both args are already in registers. + private static long RotateByAtLeast1(long val, int shift) { + return (val >>> shift) | (val << (64 - shift)); + } + + private static long ShiftMix(long val) { + return val ^ (val >>> 47); + } + + private static long Uint128Low64(UInt128 x) { + return x.first; + } + + private static long Rotate(long val, int shift) { + return shift == 0 ? 
val : (val >>> shift) | (val << (64 - shift)); + } + + private static long Uint128High64(UInt128 x) { + return x.second; + } + + // Hash 128 input bits down to 64 bits of output. +// This is intended to be a reasonably good hash function. + public static long Hash128to64(UInt128 x) { + // Murmur-inspired hashing. + long a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >>> 47); + long b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >>> 47); + b *= kMul; + return b; + } + + private static long HashLen16(long u, long v) { + return Hash128to64(UInt128.of(u, v)); + } + + private static long HashLen0to16(ByteBuf s, int index, int len) { + if (len > 8) { + long a = Fetch64(s, index); + long b = Fetch64(s, index + len - 8); + return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; + } + if (len >= 4) { + long a = toUint32(Fetch32(s, index)); + return HashLen16(len + (a << 3), toUint32(Fetch32(s, index + len - 4))); + } + if (len > 0) { + byte a = s.getByte(index); + byte b = s.getByte(index + len >>> 1); + byte c = s.getByte(index + len - 1); + int y = (a & 0xFF) + ((b & 0xFF) << 8); + int z = len + ((c & 0xFF) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; + } + + // This probably works well for 16-byte strings as well, but it may be overkill +// in that case. + private static long HashLen17to32(ByteBuf s, int index, int len) { + long a = Fetch64(s, index) * k1; + long b = Fetch64(s, index + 8); + long c = Fetch64(s, index + len - 8) * k2; + long d = Fetch64(s, index + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) - c + len); + } + + // Return a 16-byte hash for 48 bytes. Quick and dirty. +// Callers do best to use "random-looking" values for a and b. + private static UInt128 WeakHashLen32WithSeeds( + long w, long x, long y, long z, long a, long b) { + a += w; + b = Rotate(b + a + z, 21); + long c = a; + a += x; + a += y; + b += Rotate(a, 44); + return UInt128.of(a + z, b + c); + } + + // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. + private static UInt128 WeakHashLen32WithSeeds(ByteBuf s, int index, long a, long b) { + return WeakHashLen32WithSeeds(Fetch64(s, index), + Fetch64(s, index + 8), + Fetch64(s, index + 16), + Fetch64(s, index + 24), + a, + b); + } + + // Return an 8-byte hash for 33 to 64 bytes. + private static long HashLen33to64(ByteBuf s, int index, int len) { + long z = Fetch64(s, index + 24); + long a = Fetch64(s, index) + (len + Fetch64(s, index + len - 16)) * k0; + long b = Rotate(a + z, 52); + long c = Rotate(a, 37); + a += Fetch64(s, index + 8); + c += Rotate(a, 7); + a += Fetch64(s, index + 16); + long vf = a + z; + long vs = b + Rotate(a, 31) + c; + a = Fetch64(s, index + 16) + Fetch64(s, index + len - 32); + z = Fetch64(s, index + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s, index + len - 24); + c += Rotate(a, 7); + a += Fetch64(s, index + len - 16); + long wf = a + z; + long ws = b + Rotate(a, 31) + c; + long r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; + } + + // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in ssize_t. Based on City and Murmur. 
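+    // Note: the 128-bit variants that follow (CityMurmur, CityHash128WithSeed, CityHash128) come with the
+    // ported code; the Spark cityHash64 UDF in this patch only calls CityHash64 and Hash128to64.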
+ private static UInt128 CityMurmur(ByteBuf s, int index, int len, UInt128 seed) { + long a = Uint128Low64(seed); + long b = Uint128High64(seed); + long c; + long d; + int l = len - 16; + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + HashLen0to16(s, index, len); + d = ShiftMix(a + (len >= 8 ? Fetch64(s, index) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s, index + len - 8) + k1, a); + d = HashLen16(b + len, c + Fetch64(s, index + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s, index) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s, index + 8) * k1) * k1; + c *= k1; + d ^= c; + index += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return UInt128.of(a ^ b, HashLen16(b, a)); + } + + public static long CityHash64(ByteBuf s, int index, int len) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, index, len); + } else { + return HashLen17to32(s, index, len); + } + } else if (len <= 64) { + return HashLen33to64(s, index, len); + } + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. + long x = Fetch64(s, index); + long y = Fetch64(s, index + len - 16) ^ k1; + long z = Fetch64(s, index + len - 56) ^ k0; + UInt128 v = WeakHashLen32WithSeeds(s, len - 64, len, y); + UInt128 w = WeakHashLen32WithSeeds(s, len - 32, len * k1, k0); + z += ShiftMix(v.second) * k1; + x = Rotate(z + x, 39) * k1; + y = Rotate(y, 33) * k1; + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. + len = (len - 1) & ~63; + do { + x = Rotate(x + y + v.first + Fetch64(s, index + 16), 37) * k1; + y = Rotate(y + v.second + Fetch64(s, index + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, index, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s, index + 32, z + w.second, y); + // swap + long t = z; + z = x; + x = t; + index += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); + } + + private static long CityHash64WithSeed(ByteBuf s, int index, int len, long seed) { + return CityHash64WithSeeds(s, index, len, k2, seed); + } + + private static long CityHash64WithSeeds(ByteBuf s, int index, int len, + long seed0, long seed1) { + return HashLen16(CityHash64(s, index, len) - seed0, seed1); + } + + private static UInt128 CityHash128WithSeed(ByteBuf s, int index, int len, UInt128 seed) { + if (len < 128) { + return CityMurmur(s, index, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + UInt128 v, w; + long x = Uint128Low64(seed); + long y = Uint128High64(seed); + long z = len * k1; + long vFirst = Rotate(y ^ k1, 49) * k1 + Fetch64(s, index); + long vSecond = Rotate(vFirst, 42) * k1 + Fetch64(s, index + 8); + long wFirst = Rotate(y + z, 35) * k1 + x; + long wSecond = Rotate(x + Fetch64(s, index + 88), 53) * k1; + +// v = UInt128.of(vFirst, vSecond); +// w = UInt128.of(wFirst, wSecond); + + // This is the same inner loop as CityHash64(), manually unrolled. 
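+        // Each pass of the unrolled loop below consumes two 64-byte blocks (128 bytes), matching the
+        // `len >= 128` condition.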
+ do { + x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; + y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; + x ^= wSecond; + y ^= vFirst; + z = Rotate(z ^ wFirst, 33); + v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); + w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); + + vFirst = v.first; + vSecond = v.second; + wFirst = w.first; + wSecond = w.second; + { + long swap = z; + z = x; + x = swap; + } + index += 64; + x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; + y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; + x ^= wSecond; + y ^= vFirst; + z = Rotate(z ^ wFirst, 33); + v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); + w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); + + vFirst = v.first; + vSecond = v.second; + wFirst = w.first; + wSecond = w.second; + { + long swap = z; + z = x; + x = swap; + } + index += 64; + len -= 128; + } while (len >= 128); + y += Rotate(wFirst, 37) * k0 + z; + x += Rotate(vFirst + z, 49) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for (int tail_done = 0; tail_done < len; ) { + tail_done += 32; + y = Rotate(y - x, 42) * k0 + vSecond; + wFirst += Fetch64(s, index + len - tail_done + 16); + x = Rotate(x, 49) * k0 + wFirst; + wFirst += vFirst; + v = WeakHashLen32WithSeeds(s, index + len - tail_done, vFirst, vSecond); + + vFirst = v.first; + vSecond = v.second; + } + // At this point our 48 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 48-byte-to-8-byte hashes to get a 16-byte final result. + x = HashLen16(x, vFirst); + y = HashLen16(y, wFirst); + return UInt128.of(HashLen16(x + vSecond, wSecond) + y, + HashLen16(x + wSecond, y + vSecond)); + } + + public static UInt128 CityHash128(ByteBuf s, int len) { + if (len >= 16) { + return CityHash128WithSeed(s, 16, + len - 16, + UInt128.of(Fetch64(s, 0) ^ k3, + Fetch64(s, 8))); + } else if (len >= 8) { + return CityHash128WithSeed(null, + 0, 0, + UInt128.of(Fetch64(s, 0) ^ (len * k0), + Fetch64(s, len - 8) ^ k1)); + } else { + return CityHash128WithSeed(s, 0, len, UInt128.of(k0, k1)); + } + } +} + diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java new file mode 100644 index 00000000..2ba6c1f7 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package xenon.clickhouse.func.clickhouse.cityhash; + +/** + * @author Dmitriy Poluyanov + * @since 15/02/2018 + * copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/UInt128.java + */ +final public class UInt128 { + final public long first; + final public long second; + + public UInt128(long first, long second) { + this.first = first; + this.second = second; + } + + static UInt128 of(long first, long second) { + return new UInt128(first, second); + } +} From 22f191a4701e81b0c5b7f89ffc44fce014dc8f7b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 26 May 2023 11:13:02 +0800 Subject: [PATCH 09/20] Spark 3.4: Optimize sharding key handling when shuffle and sort, approach 3 --- .../spark/sql/clickhouse/ExprUtils.scala | 27 +++++++------------ .../write/WriteJobDescription.scala | 16 +++++++++-- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index a873fc4d..8ec72448 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -33,26 +33,11 @@ import scala.util.{Failure, Success, Try} class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { - private def toSplitWithModulo(shardingKey: Expr, cluster: ClusterSpec): FuncExpr = - FuncExpr("positiveModulo", List(shardingKey, StringLiteral(cluster.totalWeight.toString))) - def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray - def toSparkSplits( - shardingKey: Option[Expr], - partitionKey: Option[List[Expr]], - cluster: Option[ClusterSpec] - ): Array[Transform] = - // Pmod by total weight * constant. Note that this key will be further hashed by spark. Reasons of doing this: - // - Enlarged range of modulo to avoid hash collision of small number of shards, hence mitigate data skew caused - // by this. - // - Still distribute data from one shard to only a subset of executors. If we do not apply modulo (instead we - // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the - // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. 
- (shardingKey.map(k => - FuncExpr("positiveModulo", List(k, StringLiteral((cluster.get.totalWeight * 10).toString))) - ).seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] = + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], @@ -60,7 +45,10 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] ): Array[SortOrder] = - toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: + toSparkSplits( + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)), + partitionKey + ).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST @@ -158,4 +146,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S object ExprUtils { def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry) + + def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr = + FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString))) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 81a347ee..f0d9a5d9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -62,10 +62,22 @@ case class WriteJobDescription( } def sparkSplits: Array[Transform] = + // Pmod by total weight * constant. Note that this key will be further hashed by spark. Reasons of doing this: + // - Enlarged range of modulo to avoid hash collision of small number of shards, hence mitigate data skew caused + // by this. + // - Still distribute data from one shard to only a subset of executors. If we do not apply modulo (instead we + // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the + // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. 
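+    // Illustrative sketch (assumed cluster, not from the change itself): with 4 shards of weight 1 each
+    // (totalWeight = 4) and a table sharded by xxHash64(value), the split key built below is roughly
+    // positiveModulo(xxHash64(value), 40), which Spark hashes once more when it repartitions the write.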
if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) + ExprUtils(functionRegistry).toSparkSplits( + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + partitionKey + ) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) + ExprUtils(functionRegistry).toSparkSplits( + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + None + ) } def sparkSortOrders: Array[SortOrder] = { From ea5ed0e236d3862c6b4a63362ce0496b31faafcc Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 26 May 2023 15:00:47 +0800 Subject: [PATCH 10/20] Spark 3.4 UDF: Amend input type, Make clickhouse function nullable, better spark help text --- .../src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala | 2 +- .../main/scala/xenon/clickhouse/func/clickhouse/Days.scala | 5 +++-- .../main/scala/xenon/clickhouse/func/clickhouse/Hours.scala | 5 +++-- .../main/scala/xenon/clickhouse/func/clickhouse/Months.scala | 5 +++-- .../main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala | 2 ++ .../scala/xenon/clickhouse/func/clickhouse/XxHash64.scala | 2 ++ .../main/scala/xenon/clickhouse/func/clickhouse/Years.scala | 5 +++-- 7 files changed, 17 insertions(+), 9 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala index dc635a27..555001fa 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala @@ -23,7 +23,7 @@ abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunctio // must not be private object, nor do it successors, because spark would compile them override def canonicalName: String = s"clickhouse.$name" override def resultType: DataType = LongType - override def isResultNullable: Boolean = false + override def toString: String = name } object Arg1 extends Base { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala index 9ceca80e..672fd44f 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala @@ -27,14 +27,15 @@ object Days extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqui override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toYYYYMMDD") override def description: String = s"$name: (date: Date) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, DateType, _, _)) => this - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this +// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala index 77dbe4c2..0abe25cb 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala @@ -27,13 +27,14 @@ object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqu override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toHour", "HOUR") override def description: String = s"$name: (time: timestamp) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. $description") } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala index 0be1bc9b..846dd245 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala @@ -27,14 +27,15 @@ object Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEq override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toYYYYMM") override def description: String = s"$name: (date: Date) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, DateType, _, _)) => this - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this +// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala index e9eafb8d..f7c3e228 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala @@ -24,6 +24,8 @@ object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEqu override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("positiveModulo", "positive_modulo", "pmod") override def description: String = s"$name: (a: long, b: long) => mod: long" diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala index f02af236..241ae9d8 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala @@ -33,6 +33,8 @@ object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("xxHash64") override def description: String = s"$name: (value: string) => hash_value: long" diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala index b3c0a135..4b2e650d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala @@ -27,14 +27,15 @@ object Years extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqu override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toYear", "YEAR") override def description: String = s"$name: (date: Date) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, DateType, _, _)) => this - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this +// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") } From a8bdcbf1a58c0f9f16bdbe907b29a28047dce74d Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 30 May 2023 18:42:14 +0800 Subject: [PATCH 11/20] Spark 3.4: Optimize sharding key handling when shuffle and sort, amend approach 3 --- .../scala/xenon/clickhouse/write/WriteJobDescription.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index f0d9a5d9..4fb5afcf 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -70,12 +70,12 @@ case class WriteJobDescription( // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. if (writeOptions.repartitionByPartition) { ExprUtils(functionRegistry).toSparkSplits( - shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), partitionKey ) } else { ExprUtils(functionRegistry).toSparkSplits( - shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), None ) } From 3dcdd81e421019a5fd00a41052522198f908bdaa Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 2 Jun 2023 10:46:12 +0800 Subject: [PATCH 12/20] Spark 3.4: Change ExprUtils to implicit --- .../ClusterShardByTransformSuite.scala | 2 +- .../spark/sql/clickhouse/ExprUtils.scala | 73 ++++++++++--------- .../xenon/clickhouse/ClickHouseCatalog.scala | 2 +- .../xenon/clickhouse/ClickHouseTable.scala | 4 +- .../clickhouse/write/ClickHouseWriter.scala | 2 +- .../write/WriteJobDescription.scala | 10 ++- 6 files changed, 50 insertions(+), 43 deletions(-) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index 21e984bc..32e4fc5d 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -106,7 +106,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { ("cityHash64", Array("value")) ).foreach { case (func_name: String, func_args: Array[String]) => - test(s"shard by $func_name")(runTest(func_name, func_args)) + test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args)) } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 8ec72448..55350ebe 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -31,20 +31,24 @@ import xenon.clickhouse.spec.ClusterSpec import scala.util.{Failure, Success, Try} -class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { +object ExprUtils 
extends SQLConfHelper with Serializable { - def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = - partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray + def toSparkPartitions(partitionKey: Option[List[Expr]])(implicit + functionRegistry: FunctionRegistry + ): Array[Transform] = + partitionKey.seq.flatten.flatten(toSparkTransformOpt(_)).toArray - def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] = - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]])(implicit + functionRegistry: FunctionRegistry + ): Array[Transform] = + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_)).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], partitionKey: Option[List[Expr]], sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] - ): Array[SortOrder] = + )(implicit functionRegistry: FunctionRegistry): Array[SortOrder] = toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)), partitionKey @@ -52,13 +56,15 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST - toSparkTransformOpt(expr).map(trans => Expressions.sort(trans, direction, nullOrder)) + toSparkTransformOpt(expr).map(trans => + Expressions.sort(trans, direction, nullOrder) + ) }.toArray private def loadV2FunctionOpt( name: String, args: Seq[Expression] - ): Option[BoundFunction] = { + )(implicit functionRegistry: FunctionRegistry): Option[BoundFunction] = { def loadFunction(ident: Identifier): UnboundFunction = functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident)) val inputType = StructType(args.zipWithIndex.map { @@ -77,7 +83,10 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S } } - def toCatalyst(v2Expr: V2Expression, fields: Array[StructField]): Expression = + def toCatalyst( + v2Expr: V2Expression, + fields: Array[StructField] + )(implicit functionRegistry: FunctionRegistry): Expression = v2Expr match { case IdentityTransform(ref) => toCatalyst(ref, fields) case ref: NamedReference if ref.fieldNames.length == 1 => @@ -88,9 +97,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S BoundReference(ordinal, field.dataType, field.nullable) case t: Transform => val catalystArgs = t.arguments().map(toCatalyst(_, fields)) - loadV2FunctionOpt(t.name(), catalystArgs).map { bound => - TransformExpression(bound, catalystArgs) - }.getOrElse { + loadV2FunctionOpt(t.name(), catalystArgs).map(bound => TransformExpression(bound, catalystArgs)).getOrElse { throw CHClientException(s"Unsupported expression: $v2Expr") } case _ => throw CHClientException( @@ -98,25 +105,27 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S ) } - def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkExpression(expr)) match { - // need this function because spark `Table`'s `partitioning` field should be `Transform` - case Success(t: Transform) => Some(t) - case Success(_) => None - case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None - case Failure(rethrow) => throw new 
AnalysisException(rethrow.getMessage, cause = Some(rethrow)) - } + def toSparkTransformOpt(expr: Expr)(implicit functionRegistry: FunctionRegistry): Option[Transform] = + Try(toSparkExpression(expr)) match { + // need this function because spark `Table`'s `partitioning` field should be `Transform` + case Success(t: Transform) => Some(t) + case Success(_) => None + case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None + case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow)) + } - def toSparkExpression(expr: Expr): V2Expression = expr match { - case FieldRef(col) => identity(col) - case StringLiteral(value) => literal(value) - case FuncExpr("rand", Nil) => apply("rand") - case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) - case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => - apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) - case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") - } + def toSparkExpression(expr: Expr)(implicit functionRegistry: FunctionRegistry): V2Expression = + expr match { + case FieldRef(col) => identity(col) + case StringLiteral(value) => literal(value) + case FuncExpr("rand", Nil) => apply("rand") + case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) + case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => + apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) + case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") + } - def toClickHouse(transform: Transform): Expr = transform match { + def toClickHouse(transform: Transform)(implicit functionRegistry: FunctionRegistry): Expr = transform match { case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) @@ -128,7 +137,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S primarySchema: StructType, secondarySchema: StructType, transform: Transform - ): StructField = transform match { + )(implicit functionRegistry: FunctionRegistry): StructField = transform match { case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) @@ -142,10 +151,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket") case other: Transform => throw CHClientException(s"Unsupported transform: $other") } -} - -object ExprUtils { - def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry) def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr = FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString))) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index 5fd043cd..83327fa4 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ 
b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -209,7 +209,7 @@ class ClickHouseCatalog extends TableCatalog val partitionsClause = partitions match { case transforms if transforms.nonEmpty => - transforms.map(ExprUtils(functionRegistry).toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")") + transforms.map(ExprUtils.toClickHouse(_)(functionRegistry).sql).mkString("PARTITION BY (", ", ", ")") case _ => "" } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala index f4e19071..ced6e07e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala @@ -133,11 +133,11 @@ case class ClickHouseTable( private lazy val metadataSchema: StructType = StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField)) - override lazy val partitioning: Array[Transform] = ExprUtils(functionRegistry).toSparkPartitions(partitionKey) + override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey)(functionRegistry) override lazy val partitionSchema: StructType = StructType( partitioning.map(partTransform => - ExprUtils(functionRegistry).inferTransformSchema(schema, metadataSchema, partTransform) + ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)(functionRegistry) ) ) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index 3cd43c5e..c9e14c2d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -63,7 +63,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardExpr: Option[Expression] = writeJob.sparkShardExpr match { case None => None case Some(v2Expr) => - val catalystExpr = ExprUtils(writeJob.functionRegistry).toCatalyst(v2Expr, writeJob.dataSetSchema.fields) + val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields)(writeJob.functionRegistry) catalystExpr match { case BoundReference(_, dataType, _) if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType` diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 4fb5afcf..bb6cca02 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -41,6 +41,8 @@ case class WriteJobDescription( functionRegistry: FunctionRegistry ) { + implicit val _functionRegistry: FunctionRegistry = functionRegistry + def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match { case dist: DistributedEngineSpec if convert2Local => dist.local_db case _ => tableSpec.database @@ -57,7 +59,7 @@ case class WriteJobDescription( } def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match { - case Some(expr) => ExprUtils(functionRegistry).toSparkTransformOpt(expr) + case Some(expr) => ExprUtils.toSparkTransformOpt(expr) case _ => None } @@ 
-69,12 +71,12 @@ case class WriteJobDescription( // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits( + ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), partitionKey ) } else { - ExprUtils(functionRegistry).toSparkSplits( + ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), None ) @@ -83,6 +85,6 @@ case class WriteJobDescription( def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) + ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) } } From 386ddb0daa420f82eb3874f319785cba841facc2 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Sun, 25 Jun 2023 13:51:20 +0800 Subject: [PATCH 13/20] Spark 3.4 UDF: clickhouse code reference using tag from commit hash --- .../src/main/scala/xenon/clickhouse/func/Util.scala | 8 ++++---- .../xenon/clickhouse/func/clickhouse/CityHash64.scala | 2 +- .../xenon/clickhouse/func/clickhouse/MurmurHash2.scala | 4 ++-- .../xenon/clickhouse/func/clickhouse/MurmurHash3.scala | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala index 9ba35f10..ac7c331f 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala @@ -16,11 +16,11 @@ package xenon.clickhouse.func object Util { def intHash64Impl(x: Long): Long = - // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Functions/FunctionsHashing.h#L143 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L140 intHash64(x ^ 0x4cf2d2baae6da887L) def intHash64(l: Long): Long = { - // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Common/HashTable/Hash.h#L28 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Common/HashTable/Hash.h#L26 var x = l x ^= x >>> 33; x *= 0xff51afd7ed558ccdL; @@ -31,11 +31,11 @@ object Util { } def int32Impl(x: Long): Int = - // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Functions/FunctionsHashing.h#L133 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L133 intHash32(x, 0x75d9543de018bf45L) def intHash32(l: Long, salt: Long): Int = { - // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Common/HashTable/Hash.h#L502 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Common/HashTable/Hash.h#L502 var x = l x ^= salt; diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala index fa599cbd..160d45e9 100644 --- 
a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala @@ -20,7 +20,7 @@ import xenon.clickhouse.func.MultiArgsHash import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128} object CityHash64 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L694 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694 override protected def funcName: String = "clickhouse_cityHash64" override val ckFuncNames: Array[String] = Array("cityHash64") diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala index 052be5f9..f2ff9ed2 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala @@ -21,7 +21,7 @@ import org.apache.spark.unsafe.types.UTF8String import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} object MurmurHash2_64 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L460 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460 override protected def funcName: String = "clickhouse_murmurHash2_64" override val ckFuncNames: Array[String] = Array("murmurHash2_64") @@ -36,7 +36,7 @@ object MurmurHash2_64 extends MultiArgsHash { } object MurmurHash2_32 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 override protected def funcName: String = "clickhouse_murmurHash2_32" override val ckFuncNames: Array[String] = Array("murmurHash2_32") diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala index f353d1e7..1db654c1 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala @@ -21,7 +21,7 @@ import org.apache.spark.unsafe.types.UTF8String import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} object MurmurHash3_64 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L543 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L543 override protected def funcName: String = "clickhouse_murmurHash3_64" override val ckFuncNames: Array[String] = Array("murmurHash3_64") @@ -37,7 +37,7 @@ object MurmurHash3_64 extends MultiArgsHash { } object MurmurHash3_32 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 
override protected def funcName: String = "clickhouse_murmurHash3_32" override val ckFuncNames: Array[String] = Array("murmurHash3_32") From 286c21faae13f5a9f33ded1f5f3d45755d9a88c3 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Mon, 26 Jun 2023 18:04:44 +0800 Subject: [PATCH 14/20] Spark 3.4 UDF: support varargs for Hash UDFs --- .../xenon/clickhouse/func/MultiArgsHash.scala | 85 ++++--------------- 1 file changed, 17 insertions(+), 68 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala index 555001fa..adc3a382 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala @@ -14,83 +14,32 @@ package xenon.clickhouse.func +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction { - trait Base extends ScalarFunction[Long] { - // must not be private object, nor do it successors, because spark would compile them - override def canonicalName: String = s"clickhouse.$name" - override def resultType: DataType = LongType - override def toString: String = name - } - - object Arg1 extends Base { - override def name: String = s"${funcName}_1" - override def inputTypes: Array[DataType] = Array.fill(1)(StringType) - def invoke(value: UTF8String): Long = invokeBase(value) - } - - object Arg2 extends Base { - override def name: String = s"${funcName}_2" - override def inputTypes: Array[DataType] = Array.fill(2)(StringType) - def invoke(v1: UTF8String, v2: UTF8String): Long = Seq(v1, v2).map(invokeBase).reduce(combineHashes) - } - - object Arg3 extends Base { - override def name: String = s"${funcName}_3" - override def inputTypes: Array[DataType] = Array.fill(3)(StringType) - def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String): Long = - Seq(v1, v2, v3).map(invokeBase).reduce(combineHashes) - } - - object Arg4 extends Base { - override def name: String = s"${funcName}_4" - override def inputTypes: Array[DataType] = Array.fill(4)(StringType) - def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String): Long = - Seq(v1, v2, v3, v4).map(invokeBase).reduce(combineHashes) - } - - object Arg5 extends Base { - override def name: String = s"${funcName}_4" - override def inputTypes: Array[DataType] = Array.fill(5)(StringType) - def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String, v5: UTF8String): Long = - Seq(v1, v2, v3, v4, v5).map(invokeBase).reduce(combineHashes) - } private def isExceptedType(dt: DataType): Boolean = dt.isInstanceOf[StringType] final override def name: String = funcName - final override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, dt, _, _)) if List(dt).forall(isExceptedType) => this.Arg1 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _) - ) if List(dt1, dt2).forall(isExceptedType) => - this.Arg2 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _), - StructField(_, dt3, _, _) - ) if List(dt1, dt2, dt3).forall(isExceptedType) => - this.Arg3 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _), - StructField(_, 
dt3, _, _), - StructField(_, dt4, _, _) - ) if List(dt1, dt2, dt3, dt4).forall(isExceptedType) => - this.Arg4 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _), - StructField(_, dt3, _, _), - StructField(_, dt4, _, _), - StructField(_, dt5, _, _) - ) if List(dt1, dt2, dt3, dt4, dt5).forall(isExceptedType) => - this.Arg5 - case _ => throw new UnsupportedOperationException(s"Expect up to 5 STRING argument. $description") + final override def bind(inputType: StructType): BoundFunction = { + val inputDataTypes = inputType.fields.map(_.dataType) + if (inputDataTypes.forall(isExceptedType)) new ScalarFunction[Long] { + override def inputTypes(): Array[DataType] = inputDataTypes + override def name: String = funcName + override def canonicalName: String = s"clickhouse.$name" + override def resultType: DataType = LongType + override def toString: String = name + override def produceResult(input: InternalRow): Long = { + val inputStrings: Seq[UTF8String] = + input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]] + inputStrings.map(invokeBase).reduce(combineHashes) + } + } + else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description") + } protected def funcName: String From e5809f7b7d77117456864e62d8473d7b11dc2b0b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 27 Jun 2023 10:05:04 +0800 Subject: [PATCH 15/20] Spark 3.4: refactor implicit into normal arg in ExprUtils --- .../spark/sql/clickhouse/ExprUtils.scala | 60 +++++++++++-------- .../xenon/clickhouse/ClickHouseCatalog.scala | 2 +- .../xenon/clickhouse/ClickHouseTable.scala | 4 +- .../clickhouse/write/ClickHouseWriter.scala | 2 +- .../write/WriteJobDescription.scala | 12 ++-- 5 files changed, 46 insertions(+), 34 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 55350ebe..1626267c 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -33,38 +33,44 @@ import scala.util.{Failure, Success, Try} object ExprUtils extends SQLConfHelper with Serializable { - def toSparkPartitions(partitionKey: Option[List[Expr]])(implicit + def toSparkPartitions( + partitionKey: Option[List[Expr]], functionRegistry: FunctionRegistry ): Array[Transform] = - partitionKey.seq.flatten.flatten(toSparkTransformOpt(_)).toArray + partitionKey.seq.flatten.flatten(toSparkTransformOpt(_, functionRegistry)).toArray - def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]])(implicit + def toSparkSplits( + shardingKey: Option[Expr], + partitionKey: Option[List[Expr]], functionRegistry: FunctionRegistry ): Array[Transform] = - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_)).toArray + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_, functionRegistry)).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], partitionKey: Option[List[Expr]], sortingKey: Option[List[OrderExpr]], - cluster: Option[ClusterSpec] - )(implicit functionRegistry: FunctionRegistry): Array[SortOrder] = + cluster: Option[ClusterSpec], + functionRegistry: FunctionRegistry + ): Array[SortOrder] = toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)), - partitionKey + partitionKey, + 
functionRegistry ).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST - toSparkTransformOpt(expr).map(trans => + toSparkTransformOpt(expr, functionRegistry).map(trans => Expressions.sort(trans, direction, nullOrder) ) }.toArray private def loadV2FunctionOpt( name: String, - args: Seq[Expression] - )(implicit functionRegistry: FunctionRegistry): Option[BoundFunction] = { + args: Seq[Expression], + functionRegistry: FunctionRegistry + ): Option[BoundFunction] = { def loadFunction(ident: Identifier): UnboundFunction = functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident)) val inputType = StructType(args.zipWithIndex.map { @@ -85,10 +91,11 @@ object ExprUtils extends SQLConfHelper with Serializable { def toCatalyst( v2Expr: V2Expression, - fields: Array[StructField] - )(implicit functionRegistry: FunctionRegistry): Expression = + fields: Array[StructField], + functionRegistry: FunctionRegistry + ): Expression = v2Expr match { - case IdentityTransform(ref) => toCatalyst(ref, fields) + case IdentityTransform(ref) => toCatalyst(ref, fields, functionRegistry) case ref: NamedReference if ref.fieldNames.length == 1 => val (field, ordinal) = fields .zipWithIndex @@ -96,17 +103,18 @@ object ExprUtils extends SQLConfHelper with Serializable { .getOrElse(throw CHClientException(s"Invalid field reference: $ref")) BoundReference(ordinal, field.dataType, field.nullable) case t: Transform => - val catalystArgs = t.arguments().map(toCatalyst(_, fields)) - loadV2FunctionOpt(t.name(), catalystArgs).map(bound => TransformExpression(bound, catalystArgs)).getOrElse { - throw CHClientException(s"Unsupported expression: $v2Expr") - } + val catalystArgs = t.arguments().map(toCatalyst(_, fields, functionRegistry)) + loadV2FunctionOpt(t.name(), catalystArgs, functionRegistry) + .map(bound => TransformExpression(bound, catalystArgs)).getOrElse { + throw CHClientException(s"Unsupported expression: $v2Expr") + } case _ => throw CHClientException( s"Unsupported expression: $v2Expr" ) } - def toSparkTransformOpt(expr: Expr)(implicit functionRegistry: FunctionRegistry): Option[Transform] = - Try(toSparkExpression(expr)) match { + def toSparkTransformOpt(expr: Expr, functionRegistry: FunctionRegistry): Option[Transform] = + Try(toSparkExpression(expr, functionRegistry)) match { // need this function because spark `Table`'s `partitioning` field should be `Transform` case Success(t: Transform) => Some(t) case Success(_) => None @@ -114,18 +122,21 @@ object ExprUtils extends SQLConfHelper with Serializable { case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow)) } - def toSparkExpression(expr: Expr)(implicit functionRegistry: FunctionRegistry): V2Expression = + def toSparkExpression(expr: Expr, functionRegistry: FunctionRegistry): V2Expression = expr match { case FieldRef(col) => identity(col) case StringLiteral(value) => literal(value) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => - apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) + apply(functionRegistry.getFuncMappingByCk(funName), 
args.map(toSparkExpression(_, functionRegistry)): _*) case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") } - def toClickHouse(transform: Transform)(implicit functionRegistry: FunctionRegistry): Expr = transform match { + def toClickHouse( + transform: Transform, + functionRegistry: FunctionRegistry + ): Expr = transform match { case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) @@ -136,8 +147,9 @@ object ExprUtils extends SQLConfHelper with Serializable { def inferTransformSchema( primarySchema: StructType, secondarySchema: StructType, - transform: Transform - )(implicit functionRegistry: FunctionRegistry): StructField = transform match { + transform: Transform, + functionRegistry: FunctionRegistry + ): StructField = transform match { case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index 83327fa4..caff6a50 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -209,7 +209,7 @@ class ClickHouseCatalog extends TableCatalog val partitionsClause = partitions match { case transforms if transforms.nonEmpty => - transforms.map(ExprUtils.toClickHouse(_)(functionRegistry).sql).mkString("PARTITION BY (", ", ", ")") + transforms.map(ExprUtils.toClickHouse(_, functionRegistry).sql).mkString("PARTITION BY (", ", ", ")") case _ => "" } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala index ced6e07e..eda3a1a4 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala @@ -133,11 +133,11 @@ case class ClickHouseTable( private lazy val metadataSchema: StructType = StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField)) - override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey)(functionRegistry) + override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey, functionRegistry) override lazy val partitionSchema: StructType = StructType( partitioning.map(partTransform => - ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)(functionRegistry) + ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform, functionRegistry) ) ) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index c9e14c2d..65a4bc33 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -63,7 +63,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val 
shardExpr: Option[Expression] = writeJob.sparkShardExpr match { case None => None case Some(v2Expr) => - val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields)(writeJob.functionRegistry) + val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields, writeJob.functionRegistry) catalystExpr match { case BoundReference(_, dataType, _) if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType` diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index bb6cca02..646d6ca5 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -41,8 +41,6 @@ case class WriteJobDescription( functionRegistry: FunctionRegistry ) { - implicit val _functionRegistry: FunctionRegistry = functionRegistry - def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match { case dist: DistributedEngineSpec if convert2Local => dist.local_db case _ => tableSpec.database @@ -59,7 +57,7 @@ case class WriteJobDescription( } def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match { - case Some(expr) => ExprUtils.toSparkTransformOpt(expr) + case Some(expr) => ExprUtils.toSparkTransformOpt(expr, functionRegistry) case _ => None } @@ -73,18 +71,20 @@ case class WriteJobDescription( if (writeOptions.repartitionByPartition) { ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), - partitionKey + partitionKey, + functionRegistry ) } else { ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), - None + None, + functionRegistry ) } def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) + ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster, functionRegistry) } } From 5ae4f3df1fb53972355e308b2a84ead667d85abc Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 27 Jun 2023 15:50:48 +0800 Subject: [PATCH 16/20] Spark 3.4: Cast type when calling projection, support recursive resolve (cherry picked from commit 936a18af65c37ca2cfad97b63645e579632ff72d) --- .../ClusterShardByTransformSuite.scala | 7 +++- .../spark/sql/clickhouse/ExprUtils.scala | 25 ++++++++++- .../clickhouse/write/ClickHouseWriter.scala | 41 ++++++++----------- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index 32e4fc5d..06b7f9b4 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -94,16 +94,21 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { } Seq( + // wait for SPARK-44180 to be fixed, then add 
implicit cast test cases ("toYear", Array("create_date")), +// ("toYear", Array("create_time")), ("toYYYYMM", Array("create_date")), +// ("toYYYYMM", Array("create_time")), ("toYYYYMMDD", Array("create_date")), +// ("toYYYYMMDD", Array("create_time")), ("toHour", Array("create_time")), ("xxHash64", Array("value")), ("murmurHash2_64", Array("value")), ("murmurHash2_32", Array("value")), ("murmurHash3_64", Array("value")), ("murmurHash3_32", Array("value")), - ("cityHash64", Array("value")) + ("cityHash64", Array("value")), + ("positiveModulo", Array("toYYYYMM(create_date)", "10")) ).foreach { case (func_name: String, func_args: Array[String]) => test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args)) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 1626267c..8c2f6d6d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -15,9 +15,15 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.NoSuchFunctionException -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, TransformExpression} +import org.apache.spark.sql.catalyst.{expressions, SQLConfHelper} +import org.apache.spark.sql.catalyst.expressions.{ + BoundReference, + Cast, + Expression, + TransformExpression, + V2ExpressionUtils +} import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.IGNORE_UNSUPPORTED_TRANSFORM import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} @@ -89,6 +95,20 @@ object ExprUtils extends SQLConfHelper with Serializable { } } + def resolveTransformCatalyst( + catalystExpr: Expression, + timeZoneId: Option[String] = None + ): Expression = catalystExpr match { + case TransformExpression(function: ScalarFunction[_], args, _) => + val resolvedArgs: Seq[Expression] = args.map(resolveTransformCatalyst(_, timeZoneId)) + val castedArgs: Seq[Expression] = resolvedArgs.zip(function.inputTypes()).map { + case (arg, expectedType) if !arg.dataType.sameType(expectedType) => Cast(arg, expectedType, timeZoneId) + case (arg, _) => arg + } + V2ExpressionUtils.resolveScalarFunction(function, castedArgs) + case other => other + } + def toCatalyst( v2Expr: V2Expression, fields: Array[StructField], @@ -108,6 +128,7 @@ object ExprUtils extends SQLConfHelper with Serializable { .map(bound => TransformExpression(bound, catalystArgs)).getOrElse { throw CHClientException(s"Unsupported expression: $v2Expr") } + case literal: LiteralValue[Any] => expressions.Literal(literal.value) case _ => throw CHClientException( s"Unsupported expression: $v2Expr" ) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index 65a4bc33..a6b5a5fe 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -17,16 +17,9 @@ package xenon.clickhouse.write import com.clickhouse.client.ClickHouseProtocol import 
com.clickhouse.data.ClickHouseCompression import org.apache.commons.io.IOUtils -import org.apache.spark.sql.catalyst.expressions.{ - BoundReference, - Expression, - SafeProjection, - TransformExpression, - V2ExpressionUtils -} +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression} import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.clickhouse.ExprUtils -import org.apache.spark.sql.connector.catalog.functions.ScalarFunction import org.apache.spark.sql.connector.metric.CustomTaskMetric import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} import org.apache.spark.sql.types._ @@ -86,23 +79,21 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardProjection: Option[expressions.Projection] = shardExpr .filter(_ => writeJob.writeOptions.convertDistributedToLocal) - .flatMap(expr => - expr match { - case BoundReference(_, _, _) => - Some(SafeProjection.create(Seq(expr))) - case TransformExpression(function, args, _) => - val retType = function.resultType() match { - case ByteType => classOf[Byte] - case ShortType => classOf[Short] - case IntegerType => classOf[Int] - case LongType => classOf[Long] - case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," + - s"sharding field: ${function.resultType()}") - } - val expr = V2ExpressionUtils.resolveScalarFunction(function.asInstanceOf[ScalarFunction[retType.type]], args) - Some(SafeProjection.create(Seq(expr))) - } - ) + .flatMap { + case expr: BoundReference => + Some(SafeProjection.create(Seq(expr))) + case expr @ TransformExpression(function, _, _) => + // result type must be integer class + function.resultType() match { + case ByteType => classOf[Byte] + case ShortType => classOf[Short] + case IntegerType => classOf[Int] + case LongType => classOf[Long] + case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," + + s"sharding field: ${function.resultType()}") + } + Some(SafeProjection.create(Seq(ExprUtils.resolveTransformCatalyst(expr, Some(writeJob.tz.getId))))) + } // put the node select strategy in executor side because we need to calculate shard and don't know the records // util DataWriter#write(InternalRow) invoked. 
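Note on the projection change above: the shard projection now goes through ExprUtils.resolveTransformCatalyst, which recursively resolves nested TransformExpressions and wraps any argument whose type differs from the ScalarFunction's declared input type in a Cast before calling V2ExpressionUtils.resolveScalarFunction. Below is a minimal, self-contained sketch of the same cast-then-project idea against a TIMESTAMP shard column; it is illustrative only (the object name and literal timestamp are made up for the example) and does not reproduce the connector's TransformExpression path.

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast, SafeProjection}
    import org.apache.spark.sql.types.{DateType, TimestampType}

    object CastThenProjectSketch {
      def main(args: Array[String]): Unit = {
        // column 0 is a TIMESTAMP (micros since epoch), but the sharding function wants a DATE,
        // so the argument is wrapped in a Cast before the projection is built
        val createTime = BoundReference(0, TimestampType, nullable = false)
        val asDate     = Cast(createTime, DateType, Some("UTC"))

        val projection = SafeProjection.create(Seq(asDate))
        val row = InternalRow(1609495810000000L) // 2021-01-01 10:10:10 UTC in micros
        println(projection(row).getInt(0))       // DATE is stored as days since epoch
      }
    }
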
From 088bf3dc8da996ba241052ba2a7323aa35865a9b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 14 Jul 2023 17:31:38 +0800 Subject: [PATCH 17/20] Spark 3.4 UDF: change pmod to mod because positiveModulo does not exist in early versions of ClickHouse (cherry picked from commit ea0592d3f6f9262e931141af9868441f6422977b) (cherry picked from commit 8a270a24441fa2d0b5b9ff7426b54e5357c66b92) --- .../cluster/ClusterShardByTransformSuite.scala | 2 +- .../org/apache/spark/sql/clickhouse/ExprUtils.scala | 2 +- .../xenon/clickhouse/func/FunctionRegistry.scala | 2 +- .../func/clickhouse/{Pmod.scala => Mod.scala} | 11 ++++------- 4 files changed, 7 insertions(+), 10 deletions(-) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/{Pmod.scala => Mod.scala} (86%) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index 06b7f9b4..e02dad11 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -108,7 +108,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { ("murmurHash3_64", Array("value")), ("murmurHash3_32", Array("value")), ("cityHash64", Array("value")), - ("positiveModulo", Array("toYYYYMM(create_date)", "10")) + ("modulo", Array("toYYYYMM(create_date)", "10")) ).foreach { case (func_name: String, func_args: Array[String]) => test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args)) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 8c2f6d6d..7ba7ad62 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -186,5 +186,5 @@ object ExprUtils extends SQLConfHelper with Serializable { } def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr = - FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString))) + FuncExpr("modulo", List(shardingKey, StringLiteral(weight.toString))) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index a509f07e..c6f01110 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -59,7 +59,7 @@ object StaticFunctionRegistry extends FunctionRegistry { "clickhouse_months" -> Months, "clickhouse_days" -> Days, "clickhouse_hours" -> Hours, - "sharding_pmod" -> Pmod + "sharding_mod" -> Mod ) override def list: Array[String] = functions.keys.toArray diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala similarity index 86% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala rename to
spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala index f7c3e228..3e8c5182 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala @@ -18,15 +18,15 @@ import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFu import org.apache.spark.sql.types._ import xenon.clickhouse.func.ClickhouseEquivFunction -object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { +object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - override def name: String = "sharding_pmod" + override def name: String = "sharding_mod" override def canonicalName: String = s"clickhouse.$name" override def toString: String = name - override val ckFuncNames: Array[String] = Array("positiveModulo", "positive_modulo", "pmod") + override val ckFuncNames: Array[String] = Array("modulo", "remainder") override def description: String = s"$name: (a: long, b: long) => mod: long" @@ -58,8 +58,5 @@ object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEqu override def isResultNullable: Boolean = false - def invoke(a: Long, b: Long): Long = { - val mod = a % b - if (mod < 0) mod + b else mod - } + def invoke(a: Long, b: Long): Long = a % b } From 85a025f659ef7122dfcde8a4d5fa38d7f58a5c2b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 14 Jul 2023 17:37:57 +0800 Subject: [PATCH 18/20] Docs: add comment for modulo UDF (cherry picked from commit d2bb743f1be1c27c7e133f3c4bd43a41427eadb2) (cherry picked from commit f4ae4ad42cb2b7f59b672435b0a0a2ef6adb6e5d) --- .../src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala index 3e8c5182..b10f0f7e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala @@ -26,6 +26,8 @@ object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEqui override def toString: String = name + // remainder is not a Clickhouse function, but modulo will be parsed to remainder in the connector. + // Added remainder as a synonym. 
override val ckFuncNames: Array[String] = Array("modulo", "remainder") override def description: String = s"$name: (a: long, b: long) => mod: long" From 4e201d61cfd9ca6ab3323c2aae6b1555f28c7e78 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 25 Jul 2023 18:31:57 +0800 Subject: [PATCH 19/20] Spark 3.4: Adapt to hash function under clickhouse-core --- .../ClickHouseClusterHashUDFSuite.scala | 17 +- .../xenon/clickhouse/ClickHouseCatalog.scala | 3 +- .../func/{clickhouse => }/CityHash64.scala | 22 +- .../func/{clickhouse => }/Days.scala | 3 +- .../clickhouse/func/FunctionRegistry.scala | 1 - .../func/{clickhouse => }/Hours.scala | 5 +- .../func/{clickhouse => }/Mod.scala | 3 +- .../func/{clickhouse => }/Months.scala | 3 +- ...gsHash.scala => MultiStringArgsHash.scala} | 45 ++- .../func/{clickhouse => }/MurmurHash2.scala | 29 +- .../func/{clickhouse => }/MurmurHash3.scala | 30 +- .../func/{clickhouse => }/XxHash64.scala | 3 +- .../func/{clickhouse => }/Years.scala | 3 +- .../clickhouse/cityhash/CityHash_v1_0_2.java | 344 ------------------ .../func/clickhouse/cityhash/UInt128.java | 34 -- .../clickhouse/FunctionRegistrySuite.scala | 8 +- 16 files changed, 60 insertions(+), 493 deletions(-) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/CityHash64.scala (52%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Days.scala (95%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Hours.scala (93%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Mod.scala (96%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Months.scala (95%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{MultiArgsHash.scala => MultiStringArgsHash.scala} (55%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/MurmurHash2.scala (52%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/MurmurHash3.scala (51%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/XxHash64.scala (97%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Years.scala (95%) delete mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java delete mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala index adf3d9de..65f667b2 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -15,8 +15,12 @@ package org.apache.spark.sql.clickhouse.cluster import org.apache.spark.sql.clickhouse.TestUtils.om -import xenon.clickhouse.func.{CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} -import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard +import xenon.clickhouse.func.{ + ClickHouseXxHash64Shard, + CompositeFunctionRegistry, + DynamicFunctionRegistry, + StaticFunctionRegistry +} import java.lang.{Long => 
JLong} @@ -30,15 +34,6 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) } - def product[A](xs: Seq[Seq[A]]): Seq[Seq[A]] = - xs.toList match { - case Nil => Seq(Seq()) - case head :: tail => for { - h <- head - t <- product(tail) - } yield h +: t - } - def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = { val sparkResult = spark.sql( s"""SELECT diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index caff6a50..6db307f3 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -26,8 +26,7 @@ import xenon.clickhouse.Constants._ import xenon.clickhouse.client.NodeClient import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.exception.ClickHouseErrCode._ -import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard -import xenon.clickhouse.func.{FunctionRegistry, _} +import xenon.clickhouse.func.{ClickHouseXxHash64Shard, FunctionRegistry, _} import xenon.clickhouse.spec._ import java.time.ZoneId diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/CityHash64.scala similarity index 52% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/CityHash64.scala index 160d45e9..b78f8ee3 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/CityHash64.scala @@ -12,29 +12,15 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func -import io.netty.buffer.{ByteBuf, Unpooled} -import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.MultiArgsHash -import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128} +import xenon.clickhouse.hash -object CityHash64 extends MultiArgsHash { +object CityHash64 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694 override protected def funcName: String = "clickhouse_cityHash64" override val ckFuncNames: Array[String] = Array("cityHash64") - def convertToByteBuf(array: Array[Byte]): ByteBuf = { - val byteBuf = Unpooled.buffer(array.length).writeBytes(array) - byteBuf - } - - override def invokeBase(value: UTF8String): Long = { - // ignore UInt64 vs Int64 - val data = value.getBytes - CityHash_v1_0_2.CityHash64(convertToByteBuf(data), 0, data.length) - } - - override def combineHashes(v1: Long, v2: Long): Long = CityHash_v1_0_2.Hash128to64(new UInt128(v1, v2)) + override def applyHash(input: Array[Any]): Long = hash.CityHash64(input) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Days.scala similarity index 95% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Days.scala index 672fd44f..3008d7fd 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Days.scala @@ -12,11 +12,10 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction import java.time.LocalDate import java.time.format.DateTimeFormatter diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index c6f01110..d7856c3c 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -15,7 +15,6 @@ package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.UnboundFunction -import xenon.clickhouse.func.clickhouse._ import scala.collection.mutable diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Hours.scala similarity index 93% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Hours.scala index 0abe25cb..e88907be 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Hours.scala @@ -12,13 +12,12 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction -import java.sql.{Date, Timestamp} +import java.sql.Timestamp import java.text.SimpleDateFormat object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Mod.scala similarity index 96% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Mod.scala index b10f0f7e..69fdedc9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Mod.scala @@ -12,11 +12,10 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala similarity index 95% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala index 846dd245..13e06d88 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala @@ -12,11 +12,10 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction import java.time.LocalDate import java.time.format.DateTimeFormatter diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala similarity index 55% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala index adc3a382..69ce07c1 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala @@ -19,32 +19,41 @@ import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFu import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction { +abstract class MultiStringArgsHash extends UnboundFunction with ClickhouseEquivFunction { + + def applyHash(input: Array[Any]): Long + + protected def funcName: String + + override val ckFuncNames: Array[String] + + override def description: String = s"$name: (value: string, ...) => hash_value: long" + private def isExceptedType(dt: DataType): Boolean = dt.isInstanceOf[StringType] final override def name: String = funcName + final override def bind(inputType: StructType): BoundFunction = { val inputDataTypes = inputType.fields.map(_.dataType) - if (inputDataTypes.forall(isExceptedType)) new ScalarFunction[Long] { - override def inputTypes(): Array[DataType] = inputDataTypes - override def name: String = funcName - override def canonicalName: String = s"clickhouse.$name" - override def resultType: DataType = LongType - override def toString: String = name - override def produceResult(input: InternalRow): Long = { - val inputStrings: Seq[UTF8String] = - input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]] - inputStrings.map(invokeBase).reduce(combineHashes) + if (inputDataTypes.forall(isExceptedType)) { + // need to new a ScalarFunction instance for each bind, + // because we do not know the number of arguments in advance + new ScalarFunction[Long] { + override def inputTypes(): Array[DataType] = inputDataTypes + override def name: String = funcName + override def canonicalName: String = s"clickhouse.$name" + override def resultType: DataType = LongType + override def toString: String = name + override def produceResult(input: InternalRow): Long = { + val inputStrings: Array[Any] = + input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]].toArray + .map(_.getBytes) + applyHash(inputStrings) + } } - } - else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description") + } else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description") } - protected def funcName: String - override val ckFuncNames: Array[String] - override def description: String = s"$name: (value: string, ...) 
=> hash_value: long" - def invokeBase(value: UTF8String): Long - def combineHashes(v1: Long, v2: Long): Long } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash2.scala similarity index 52% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash2.scala index f2ff9ed2..9fac4d60 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash2.scala @@ -12,40 +12,25 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func -import org.apache.commons.codec.digest.{MurmurHash2, MurmurHash3} -import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} +import xenon.clickhouse.hash +import xenon.clickhouse.hash.HashUtils -object MurmurHash2_64 extends MultiArgsHash { +object MurmurHash2_64 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460 override protected def funcName: String = "clickhouse_murmurHash2_64" override val ckFuncNames: Array[String] = Array("murmurHash2_64") - override def invokeBase(value: UTF8String): Long = { - // ignore UInt64 vs Int64 - val data = value.getBytes - MurmurHash2.hash64(data, data.length, 0) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 + override def applyHash(input: Array[Any]): Long = hash.Murmurhash2_64(input) } -object MurmurHash2_32 extends MultiArgsHash { +object MurmurHash2_32 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 override protected def funcName: String = "clickhouse_murmurHash2_32" override val ckFuncNames: Array[String] = Array("murmurHash2_32") - override def invokeBase(value: UTF8String): Long = { - val data = value.getBytes - val v = MurmurHash2.hash32(data, data.length, 0) - Util.toUInt32Range(v) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) + override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash2_32(input)) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash3.scala similarity index 51% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash3.scala index 1db654c1..848bb3b0 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash3.scala @@ -12,41 +12,25 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func -import org.apache.commons.codec.digest.MurmurHash3 -import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} +import xenon.clickhouse.hash +import xenon.clickhouse.hash.HashUtils -object MurmurHash3_64 extends MultiArgsHash { +object MurmurHash3_64 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L543 override protected def funcName: String = "clickhouse_murmurHash3_64" override val ckFuncNames: Array[String] = Array("murmurHash3_64") - override def invokeBase(value: UTF8String): Long = { - // ignore UInt64 vs Int64 - val data = value.getBytes - val hashes = MurmurHash3.hash128x64(data, 0, data.length, 0) - hashes(0) ^ hashes(1) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 + override def applyHash(input: Array[Any]): Long = hash.Murmurhash3_64(input) } -object MurmurHash3_32 extends MultiArgsHash { +object MurmurHash3_32 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 override protected def funcName: String = "clickhouse_murmurHash3_32" override val ckFuncNames: Array[String] = Array("murmurHash3_32") - override def invokeBase(value: UTF8String): Long = { - val data = value.getBytes - val v = MurmurHash3.hash32x86(data, 0, data.length, 0) - Util.toUInt32Range(v) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) + override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash3_32(input)) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/XxHash64.scala similarity index 97% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/XxHash64.scala index 241ae9d8..3c4a5b1a 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/XxHash64.scala @@ -12,13 +12,12 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.catalyst.expressions.XxHash64Function import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.ClickhouseEquivFunction import xenon.clickhouse.spec.{ClusterSpec, ShardUtils} /** diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Years.scala similarity index 95% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Years.scala index 4b2e650d..6bf987fb 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Years.scala @@ -12,11 +12,10 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction import java.time.LocalDate import java.time.format.DateTimeFormatter diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java deleted file mode 100644 index df218df3..00000000 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package xenon.clickhouse.func.clickhouse.cityhash; - -import io.netty.buffer.ByteBuf; - -// copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/CityHash_v1_0_2.java -// fixed some bugs involving int32 to uint32 conversion -final public class CityHash_v1_0_2 { - - private static final long kMul = 0x9ddfea08eb382d69L; - // Some primes between 2^63 and 2^64 for various uses. - private static final long k0 = 0xc3a5c85c97cb3127L; - private static final long k1 = 0xb492b66fbe98f273L; - private static final long k2 = 0x9ae16a3b2f90404fL; - private static final long k3 = 0xc949d7c7509e6557L; - - private CityHash_v1_0_2() { /* restricted */ } - - private static long Fetch64(ByteBuf p, int index) { - return p.getLongLE(index); - } - - private static int Fetch32(ByteBuf p, int index) { - return p.getIntLE(index); - } - - private static long toUint32(int x) { - return x & 0xFFFFFFFFL; - } - - // Equivalent to Rotate(), but requires the second arg to be non-zero. 
-// On x86-64, and probably others, it's possible for this to compile -// to a single instruction if both args are already in registers. - private static long RotateByAtLeast1(long val, int shift) { - return (val >>> shift) | (val << (64 - shift)); - } - - private static long ShiftMix(long val) { - return val ^ (val >>> 47); - } - - private static long Uint128Low64(UInt128 x) { - return x.first; - } - - private static long Rotate(long val, int shift) { - return shift == 0 ? val : (val >>> shift) | (val << (64 - shift)); - } - - private static long Uint128High64(UInt128 x) { - return x.second; - } - - // Hash 128 input bits down to 64 bits of output. -// This is intended to be a reasonably good hash function. - public static long Hash128to64(UInt128 x) { - // Murmur-inspired hashing. - long a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; - a ^= (a >>> 47); - long b = (Uint128High64(x) ^ a) * kMul; - b ^= (b >>> 47); - b *= kMul; - return b; - } - - private static long HashLen16(long u, long v) { - return Hash128to64(UInt128.of(u, v)); - } - - private static long HashLen0to16(ByteBuf s, int index, int len) { - if (len > 8) { - long a = Fetch64(s, index); - long b = Fetch64(s, index + len - 8); - return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; - } - if (len >= 4) { - long a = toUint32(Fetch32(s, index)); - return HashLen16(len + (a << 3), toUint32(Fetch32(s, index + len - 4))); - } - if (len > 0) { - byte a = s.getByte(index); - byte b = s.getByte(index + len >>> 1); - byte c = s.getByte(index + len - 1); - int y = (a & 0xFF) + ((b & 0xFF) << 8); - int z = len + ((c & 0xFF) << 2); - return ShiftMix(y * k2 ^ z * k3) * k2; - } - return k2; - } - - // This probably works well for 16-byte strings as well, but it may be overkill -// in that case. - private static long HashLen17to32(ByteBuf s, int index, int len) { - long a = Fetch64(s, index) * k1; - long b = Fetch64(s, index + 8); - long c = Fetch64(s, index + len - 8) * k2; - long d = Fetch64(s, index + len - 16) * k0; - return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, - a + Rotate(b ^ k3, 20) - c + len); - } - - // Return a 16-byte hash for 48 bytes. Quick and dirty. -// Callers do best to use "random-looking" values for a and b. - private static UInt128 WeakHashLen32WithSeeds( - long w, long x, long y, long z, long a, long b) { - a += w; - b = Rotate(b + a + z, 21); - long c = a; - a += x; - a += y; - b += Rotate(a, 44); - return UInt128.of(a + z, b + c); - } - - // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. - private static UInt128 WeakHashLen32WithSeeds(ByteBuf s, int index, long a, long b) { - return WeakHashLen32WithSeeds(Fetch64(s, index), - Fetch64(s, index + 8), - Fetch64(s, index + 16), - Fetch64(s, index + 24), - a, - b); - } - - // Return an 8-byte hash for 33 to 64 bytes. 
- private static long HashLen33to64(ByteBuf s, int index, int len) { - long z = Fetch64(s, index + 24); - long a = Fetch64(s, index) + (len + Fetch64(s, index + len - 16)) * k0; - long b = Rotate(a + z, 52); - long c = Rotate(a, 37); - a += Fetch64(s, index + 8); - c += Rotate(a, 7); - a += Fetch64(s, index + 16); - long vf = a + z; - long vs = b + Rotate(a, 31) + c; - a = Fetch64(s, index + 16) + Fetch64(s, index + len - 32); - z = Fetch64(s, index + len - 8); - b = Rotate(a + z, 52); - c = Rotate(a, 37); - a += Fetch64(s, index + len - 24); - c += Rotate(a, 7); - a += Fetch64(s, index + len - 16); - long wf = a + z; - long ws = b + Rotate(a, 31) + c; - long r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); - return ShiftMix(r * k0 + vs) * k2; - } - - // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings -// of any length representable in ssize_t. Based on City and Murmur. - private static UInt128 CityMurmur(ByteBuf s, int index, int len, UInt128 seed) { - long a = Uint128Low64(seed); - long b = Uint128High64(seed); - long c; - long d; - int l = len - 16; - if (l <= 0) { // len <= 16 - a = ShiftMix(a * k1) * k1; - c = b * k1 + HashLen0to16(s, index, len); - d = ShiftMix(a + (len >= 8 ? Fetch64(s, index) : c)); - } else { // len > 16 - c = HashLen16(Fetch64(s, index + len - 8) + k1, a); - d = HashLen16(b + len, c + Fetch64(s, index + len - 16)); - a += d; - do { - a ^= ShiftMix(Fetch64(s, index) * k1) * k1; - a *= k1; - b ^= a; - c ^= ShiftMix(Fetch64(s, index + 8) * k1) * k1; - c *= k1; - d ^= c; - index += 16; - l -= 16; - } while (l > 0); - } - a = HashLen16(a, c); - b = HashLen16(d, b); - return UInt128.of(a ^ b, HashLen16(b, a)); - } - - public static long CityHash64(ByteBuf s, int index, int len) { - if (len <= 32) { - if (len <= 16) { - return HashLen0to16(s, index, len); - } else { - return HashLen17to32(s, index, len); - } - } else if (len <= 64) { - return HashLen33to64(s, index, len); - } - - // For strings over 64 bytes we hash the end first, and then as we - // loop we keep 56 bytes of state: v, w, x, y, and z. - long x = Fetch64(s, index); - long y = Fetch64(s, index + len - 16) ^ k1; - long z = Fetch64(s, index + len - 56) ^ k0; - UInt128 v = WeakHashLen32WithSeeds(s, len - 64, len, y); - UInt128 w = WeakHashLen32WithSeeds(s, len - 32, len * k1, k0); - z += ShiftMix(v.second) * k1; - x = Rotate(z + x, 39) * k1; - y = Rotate(y, 33) * k1; - - // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
- len = (len - 1) & ~63; - do { - x = Rotate(x + y + v.first + Fetch64(s, index + 16), 37) * k1; - y = Rotate(y + v.second + Fetch64(s, index + 48), 42) * k1; - x ^= w.second; - y ^= v.first; - z = Rotate(z ^ w.first, 33); - v = WeakHashLen32WithSeeds(s, index, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s, index + 32, z + w.second, y); - // swap - long t = z; - z = x; - x = t; - index += 64; - len -= 64; - } while (len != 0); - return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, - HashLen16(v.second, w.second) + x); - } - - private static long CityHash64WithSeed(ByteBuf s, int index, int len, long seed) { - return CityHash64WithSeeds(s, index, len, k2, seed); - } - - private static long CityHash64WithSeeds(ByteBuf s, int index, int len, - long seed0, long seed1) { - return HashLen16(CityHash64(s, index, len) - seed0, seed1); - } - - private static UInt128 CityHash128WithSeed(ByteBuf s, int index, int len, UInt128 seed) { - if (len < 128) { - return CityMurmur(s, index, len, seed); - } - - // We expect len >= 128 to be the common case. Keep 56 bytes of state: - // v, w, x, y, and z. - UInt128 v, w; - long x = Uint128Low64(seed); - long y = Uint128High64(seed); - long z = len * k1; - long vFirst = Rotate(y ^ k1, 49) * k1 + Fetch64(s, index); - long vSecond = Rotate(vFirst, 42) * k1 + Fetch64(s, index + 8); - long wFirst = Rotate(y + z, 35) * k1 + x; - long wSecond = Rotate(x + Fetch64(s, index + 88), 53) * k1; - -// v = UInt128.of(vFirst, vSecond); -// w = UInt128.of(wFirst, wSecond); - - // This is the same inner loop as CityHash64(), manually unrolled. - do { - x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; - y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; - x ^= wSecond; - y ^= vFirst; - z = Rotate(z ^ wFirst, 33); - v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); - w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); - - vFirst = v.first; - vSecond = v.second; - wFirst = w.first; - wSecond = w.second; - { - long swap = z; - z = x; - x = swap; - } - index += 64; - x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; - y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; - x ^= wSecond; - y ^= vFirst; - z = Rotate(z ^ wFirst, 33); - v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); - w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); - - vFirst = v.first; - vSecond = v.second; - wFirst = w.first; - wSecond = w.second; - { - long swap = z; - z = x; - x = swap; - } - index += 64; - len -= 128; - } while (len >= 128); - y += Rotate(wFirst, 37) * k0 + z; - x += Rotate(vFirst + z, 49) * k0; - // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. - for (int tail_done = 0; tail_done < len; ) { - tail_done += 32; - y = Rotate(y - x, 42) * k0 + vSecond; - wFirst += Fetch64(s, index + len - tail_done + 16); - x = Rotate(x, 49) * k0 + wFirst; - wFirst += vFirst; - v = WeakHashLen32WithSeeds(s, index + len - tail_done, vFirst, vSecond); - - vFirst = v.first; - vSecond = v.second; - } - // At this point our 48 bytes of state should contain more than - // enough information for a strong 128-bit hash. We use two - // different 48-byte-to-8-byte hashes to get a 16-byte final result. 
- x = HashLen16(x, vFirst); - y = HashLen16(y, wFirst); - return UInt128.of(HashLen16(x + vSecond, wSecond) + y, - HashLen16(x + wSecond, y + vSecond)); - } - - public static UInt128 CityHash128(ByteBuf s, int len) { - if (len >= 16) { - return CityHash128WithSeed(s, 16, - len - 16, - UInt128.of(Fetch64(s, 0) ^ k3, - Fetch64(s, 8))); - } else if (len >= 8) { - return CityHash128WithSeed(null, - 0, 0, - UInt128.of(Fetch64(s, 0) ^ (len * k0), - Fetch64(s, len - 8) ^ k1)); - } else { - return CityHash128WithSeed(s, 0, len, UInt128.of(k0, k1)); - } - } -} - diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java deleted file mode 100644 index 2ba6c1f7..00000000 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package xenon.clickhouse.func.clickhouse.cityhash; - -/** - * @author Dmitriy Poluyanov - * @since 15/02/2018 - * copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/UInt128.java - */ -final public class UInt128 { - final public long first; - final public long second; - - public UInt128(long first, long second) { - this.first = first; - this.second = second; - } - - static UInt128 of(long first, long second) { - return new UInt128(first, second); - } -} diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala index 34254907..d241e87b 100644 --- a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala +++ b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -17,13 +17,7 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.scalatest.funsuite.AnyFunSuite import xenon.clickhouse.ClickHouseHelper -import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64 -import xenon.clickhouse.func.{ - ClickhouseEquivFunction, - CompositeFunctionRegistry, - DynamicFunctionRegistry, - StaticFunctionRegistry -} +import xenon.clickhouse.func.{ClickHouseXxHash64, ClickhouseEquivFunction, CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} import scala.collection.JavaConverters._ From 085b3adf28bc0916943841085ef61b4d705c023a Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Wed, 26 Jul 2023 12:29:11 +0800 Subject: [PATCH 20/20] fix style --- .../spark/sql/clickhouse/FunctionRegistrySuite.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala 
b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala index d241e87b..33369cb1 100644 --- a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala +++ b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -17,7 +17,13 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.scalatest.funsuite.AnyFunSuite import xenon.clickhouse.ClickHouseHelper -import xenon.clickhouse.func.{ClickHouseXxHash64, ClickhouseEquivFunction, CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} +import xenon.clickhouse.func.{ + ClickHouseXxHash64, + ClickhouseEquivFunction, + CompositeFunctionRegistry, + DynamicFunctionRegistry, + StaticFunctionRegistry +} import scala.collection.JavaConverters._
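
Closing note on the sharding change in patches 17/18 above: sharding_mod now computes a plain remainder (a % b), matching ClickHouse's modulo (and its alias remainder), whose result takes the sign of the dividend, whereas the old sharding_pmod always returned a non-negative value. A small standalone Scala illustration of the difference (not connector code; names are made up for the example):

    object ModuloSemanticsSketch {
      // what sharding_mod does now: Scala/Java remainder, sign follows the dividend,
      // which lines up with ClickHouse's modulo()
      def mod(a: Long, b: Long): Long = a % b

      // what the old sharding_pmod did: always non-negative
      def pmod(a: Long, b: Long): Long = {
        val m = a % b
        if (m < 0) m + b else m
      }

      def main(args: Array[String]): Unit = {
        println(mod(-7, 5))  // -2
        println(pmod(-7, 5)) // 3
      }
    }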