From fcdf0f0daff0525ef56a80b0a7661c1d18bd89d9 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Wed, 17 May 2023 14:54:38 +0800 Subject: [PATCH 01/20] Spark 3.4: Support distribute by any predefined transform --- .../ClusterShardByTransformSuite.scala | 98 +++++++++++++++++++ .../WriteDistributionAndOrderingSuite.scala | 8 +- .../spark/sql/clickhouse/ExprUtils.scala | 60 ++++++++++-- .../xenon/clickhouse/ClickHouseCatalog.scala | 10 +- .../xenon/clickhouse/ClickHouseTable.scala | 28 +++--- .../clickhouse/func/ClickHouseXxHash64.scala | 4 +- .../clickhouse/func/FunctionRegistry.scala | 22 ++++- .../scala/xenon/clickhouse/func/Months.scala | 47 +++++++++ .../clickhouse/write/ClickHouseWriter.scala | 39 +++++++- .../write/WriteJobDescription.scala | 13 +-- .../clickhouse/FunctionRegistrySuite.scala | 73 ++++++++++++++ 11 files changed, 361 insertions(+), 41 deletions(-) create mode 100644 spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala create mode 100644 spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala new file mode 100644 index 00000000..fce0f77e --- /dev/null +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.clickhouse.cluster + +import org.apache.spark.SparkConf +import org.apache.spark.sql.Row + +class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { + override protected def sparkConf: SparkConf = { + val _conf = super.sparkConf + .set("spark.clickhouse.write.distributed.convertLocal", "true") + _conf + } + + def runTest(func_name: String, func_args: Array[String]): Unit = { + val func_expr = s"$func_name(${func_args.mkString(",")})" + val cluster = "single_replica" + val db = s"db_${func_name}_shard" + val tbl_dist = s"tbl_${func_name}_shard" + val tbl_local = s"${tbl_dist}_local" + + try { + runClickHouseSQL(s"CREATE DATABASE IF NOT EXISTS $db ON CLUSTER $cluster") + + spark.sql( + s"""CREATE TABLE $db.$tbl_local ( + | create_time TIMESTAMP NOT NULL, + | value STRING NOT NULL + |) USING ClickHouse + |TBLPROPERTIES ( + | cluster = '$cluster', + | engine = 'MergeTree()', + | order_by = 'create_time' + |) + |""".stripMargin + ) + + runClickHouseSQL( + s"""CREATE TABLE $db.$tbl_dist ON CLUSTER $cluster + |AS $db.$tbl_local + |ENGINE = Distributed($cluster, '$db', '$tbl_local', $func_expr) + |""".stripMargin + ) + spark.sql( + s"""INSERT INTO `$db`.`$tbl_dist` + |VALUES + | (timestamp'2021-01-01 10:10:10', '1'), + | (timestamp'2022-02-02 10:10:10', '2'), + | (timestamp'2023-03-03 10:10:10', '3'), + | (timestamp'2024-04-04 10:10:10', '4') AS tab(create_time, value) + |""".stripMargin + ) + // check that data is indeed written + checkAnswer( + spark.table(s"$db.$tbl_dist").select("value").orderBy("create_time"), + Seq(Row("1"), Row("2"), Row("3"), Row("4")) + ) + + // check same data is sharded in the same server comparing native sharding + runClickHouseSQL( + s"""INSERT INTO `$db`.`$tbl_dist` + |VALUES + | (timestamp'2021-01-01 10:10:10', '1'), + | (timestamp'2022-02-02 10:10:10', '2'), + | (timestamp'2023-03-03 10:10:10', '3'), + | (timestamp'2024-04-04 10:10:10', '4') + |""".stripMargin + ) + checkAnswer( + spark.table(s"$db.$tbl_local") + .groupBy("value").count().filter("count != 2"), + Seq.empty + ) + + } finally { + runClickHouseSQL(s"DROP TABLE IF EXISTS $db.$tbl_dist ON CLUSTER $cluster") + runClickHouseSQL(s"DROP TABLE IF EXISTS $db.$tbl_local ON CLUSTER $cluster") + runClickHouseSQL(s"DROP DATABASE IF EXISTS $db ON CLUSTER $cluster") + } + } + + Seq(("xxHash64", Array("value")), ("toYYYYMM", Array("create_time"))).foreach { case (func_name, func_args) => + test(s"shard by $func_name")(runTest(func_name, func_args)) + } + +} diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala index fe9ba535..7fc0972d 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/single/WriteDistributionAndOrderingSuite.scala @@ -78,12 +78,8 @@ class WriteDistributionAndOrderingSuite extends SparkClickHouseSingleTest { WRITE_REPARTITION_BY_PARTITION.key -> repartitionByPartition.toString, WRITE_LOCAL_SORT_BY_KEY.key -> localSortByKey.toString ) { - if (!ignoreUnsupportedTransform && repartitionByPartition) { - intercept[AnalysisException](write()) - } else { - write() - check() - } + write() + check() } Seq(true, false).foreach { ignoreUnsupportedTransform => diff --git 
a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 314c65f3..d7116cc9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -16,18 +16,21 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression} +import org.apache.spark.sql.catalyst.analysis.NoSuchFunctionException +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, TransformExpression} import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.IGNORE_UNSUPPORTED_TRANSFORM +import org.apache.spark.sql.connector.catalog.Identifier +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.connector.expressions.Expressions._ import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, _} import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.expr._ +import xenon.clickhouse.func.FunctionRegistry -import scala.annotation.tailrec import scala.util.{Failure, Success, Try} -object ExprUtils extends SQLConfHelper { +class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray @@ -47,7 +50,28 @@ object ExprUtils extends SQLConfHelper { toSparkTransformOpt(expr).map(trans => Expressions.sort(trans, direction, nullOrder)) }.toArray - @tailrec + private def loadV2FunctionOpt( + name: String, + args: Seq[Expression] + ): Option[BoundFunction] = { + def loadFunction(ident: Identifier): UnboundFunction = + functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident)) + val inputType = StructType(args.zipWithIndex.map { + case (exp, pos) => StructField(s"_$pos", exp.dataType, exp.nullable) + }) + try { + val unbound = loadFunction(Identifier.of(Array.empty, name)) + Some(unbound.bind(inputType)) + } catch { + case e: NoSuchFunctionException => + throw e + case _: UnsupportedOperationException if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => + None + case e: UnsupportedOperationException => + throw new AnalysisException(e.getMessage, cause = Some(e)) + } + } + def toCatalyst(v2Expr: V2Expression, fields: Array[StructField]): Expression = v2Expr match { case IdentityTransform(ref) => toCatalyst(ref, fields) @@ -57,8 +81,15 @@ object ExprUtils extends SQLConfHelper { .find { case (field, _) => field.name == ref.fieldNames.head } .getOrElse(throw CHClientException(s"Invalid field reference: $ref")) BoundReference(ordinal, field.dataType, field.nullable) + case t: Transform => + val catalystArgs = t.arguments().map(toCatalyst(_, fields)) + loadV2FunctionOpt(t.name(), catalystArgs).map { bound => + TransformExpression(bound, catalystArgs) + }.getOrElse { + throw CHClientException(s"Unsupported expression: $v2Expr") + } case _ => throw CHClientException( - s"Unsupported V2 expression: $v2Expr, SPARK-33779: Spark 3.3 only support IdentityTransform" + s"Unsupported expression: $v2Expr" ) } @@ -83,10 +114,10 @@ 
object ExprUtils extends SQLConfHelper { case FuncExpr("toYYYYMMDD", List(FieldRef(col))) => days(col) case FuncExpr("toHour", List(FieldRef(col))) => hours(col) case FuncExpr("HOUR", List(FieldRef(col))) => hours(col) - // TODO support arbitrary functions - // case FuncExpr("xxHash64", List(FieldRef(col))) => apply("ck_xx_hash64", column(col)) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) + case FuncExpr(funName, List(FieldRef(col))) if functionRegistry.getFuncMappingByCk.contains(funName) => + apply(functionRegistry.getFuncMappingByCk(funName), column(col)) case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") } @@ -96,7 +127,8 @@ object ExprUtils extends SQLConfHelper { case DaysTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMMDD", List(FieldRef(col))) case HoursTransform(FieldReference(Seq(col))) => FuncExpr("toHour", List(FieldRef(col))) case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) - case ApplyTransform(name, args) => FuncExpr(name, args.map(arg => SQLExpr(arg.describe())).toList) + case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => + FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket") case other: Transform => throw CHClientException(s"Unsupported transform: $other") } @@ -113,8 +145,18 @@ object ExprUtils extends SQLConfHelper { case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) - case ckXxhHash64 @ ApplyTransform("ck_xx_hash64", _) => StructField(ckXxhHash64.toString, LongType) + case t @ ApplyTransform(transformName, _) => + val resType = + functionRegistry.load(transformName).getOrElse(throw new NoSuchFunctionException(transformName)) match { + case f: ScalarFunction[_] => f.resultType() + case other => throw CHClientException(s"Unsupported function: $other") + } + StructField(t.toString, resType) case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket") case other: Transform => throw CHClientException(s"Unsupported transform: $other") } } + +object ExprUtils { + def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry) +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index 02862392..b625560d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -85,12 +85,15 @@ class ClickHouseCatalog extends TableCatalog val dynamicFunctionRegistry = new DynamicFunctionRegistry val xxHash64ShardFunc = new ClickHouseXxHash64Shard(clusterSpecs) + val monthsFunc = new Months() dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc) + dynamicFunctionRegistry.register("months", monthsFunc) this.functionRegistry = new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) log.info(s"Detect ${clusterSpecs.size} ClickHouse 
clusters: ${clusterSpecs.map(_.name).mkString(",")}") log.info(s"ClickHouse clusters' detail: $clusterSpecs") + log.info(s"functionRegistry: ${this.functionRegistry.list.mkString(",")}") } override def name(): String = catalogName @@ -141,7 +144,8 @@ class ClickHouseCatalog extends TableCatalog tableClusterSpec, _tz, tableSpec, - tableEngineSpec + tableEngineSpec, + functionRegistry ) } @@ -206,7 +210,7 @@ class ClickHouseCatalog extends TableCatalog val partitionsClause = partitions match { case transforms if transforms.nonEmpty => - transforms.map(ExprUtils.toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")") + transforms.map(ExprUtils(functionRegistry).toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")") case _ => "" } @@ -297,7 +301,7 @@ class ClickHouseCatalog extends TableCatalog } tableOpt match { case None => false - case Some(ClickHouseTable(_, cluster, _, tableSpec, _)) => + case Some(ClickHouseTable(_, cluster, _, tableSpec, _, _)) => val (db, tbl) = (tableSpec.database, tableSpec.name) val isAtomic = loadNamespaceMetadata(Array(db)).get("engine").equalsIgnoreCase("atomic") val syncClause = if (isAtomic) "SYNC" else "" diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala index 59b3ca9f..f4e19071 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala @@ -14,16 +14,12 @@ package xenon.clickhouse -import java.lang.{Integer => JInt, Long => JLong} -import java.time.{LocalDate, ZoneId} -import java.util -import scala.collection.JavaConverters._ -import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.clickhouse.{ExprUtils, ReadOptions, WriteOptions} +import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.{READ_DISTRIBUTED_CONVERT_LOCAL, USE_NULLABLE_QUERY_SCHEMA} -import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.clickhouse.{ExprUtils, ReadOptions, WriteOptions} import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.read.ScanBuilder import org.apache.spark.sql.connector.write.LogicalWriteInfo @@ -34,16 +30,23 @@ import org.apache.spark.unsafe.types.UTF8String import xenon.clickhouse.Utils._ import xenon.clickhouse.client.NodeClient import xenon.clickhouse.expr.{Expr, OrderExpr} +import xenon.clickhouse.func.FunctionRegistry import xenon.clickhouse.read.{ClickHouseMetadataColumn, ClickHouseScanBuilder, ScanJobDescription} import xenon.clickhouse.spec._ import xenon.clickhouse.write.{ClickHouseWriteBuilder, WriteJobDescription} +import java.lang.{Integer => JInt, Long => JLong} +import java.time.{LocalDate, ZoneId} +import java.util +import scala.collection.JavaConverters._ + case class ClickHouseTable( node: NodeSpec, cluster: Option[ClusterSpec], implicit val tz: ZoneId, spec: TableSpec, - engineSpec: TableEngineSpec + engineSpec: TableEngineSpec, + functionRegistry: FunctionRegistry ) extends Table with SupportsRead with SupportsWrite @@ -130,10 +133,12 @@ case class ClickHouseTable( private lazy val metadataSchema: StructType = 
StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField)) - override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey) + override lazy val partitioning: Array[Transform] = ExprUtils(functionRegistry).toSparkPartitions(partitionKey) override lazy val partitionSchema: StructType = StructType( - partitioning.map(partTransform => ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)) + partitioning.map(partTransform => + ExprUtils(functionRegistry).inferTransformSchema(schema, metadataSchema, partTransform) + ) ) override lazy val properties: util.Map[String, String] = spec.toJavaMap @@ -170,7 +175,8 @@ case class ClickHouseTable( shardingKey = shardingKey, partitionKey = partitionKey, sortingKey = sortingKey, - writeOptions = new WriteOptions(info.options.asCaseSensitiveMap()) + writeOptions = new WriteOptions(info.options.asCaseSensitiveMap()), + functionRegistry = functionRegistry ) new ClickHouseWriteBuilder(writeJob) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala index e7f223b0..dab34932 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala @@ -26,12 +26,14 @@ import xenon.clickhouse.spec.{ClusterSpec, ShardUtils} * select xxHash64(concat(project_id, toString(seq)) * }}} */ -object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] { +object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { override def name: String = "clickhouse_xxHash64" override def canonicalName: String = s"clickhouse.$name" + override val ckFuncNames: Array[String] = Array("xxHash64") + override def description: String = s"$name: (value: string) => hash_value: long" override def bind(inputType: StructType): BoundFunction = inputType.fields match { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index b41a7d1a..c10ce864 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -18,11 +18,19 @@ import org.apache.spark.sql.connector.catalog.functions.UnboundFunction import scala.collection.mutable -trait FunctionRegistry { +trait FunctionRegistry extends Serializable { def list: Array[String] def load(name: String): Option[UnboundFunction] + + def getFuncMappingBySpark: Map[String, String] + + def getFuncMappingByCk: Map[String, String] = getFuncMappingBySpark.map(_.swap) +} + +trait ClickhouseEquivFunction { + val ckFuncNames: Array[String] } class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends FunctionRegistry { @@ -30,6 +38,8 @@ class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends Fun override def list: Array[String] = registries.flatMap(_.list) override def load(name: String): Option[UnboundFunction] = registries.flatMap(_.load(name)).headOption + + override def getFuncMappingBySpark: Map[String, String] = registries.flatMap(_.getFuncMappingBySpark).toMap } object StaticFunctionRegistry extends FunctionRegistry { @@ -42,6 +52,11 @@ object StaticFunctionRegistry 
extends FunctionRegistry { override def list: Array[String] = functions.keys.toArray override def load(name: String): Option[UnboundFunction] = functions.get(name) + + override val getFuncMappingBySpark: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) + } } class DynamicFunctionRegistry extends FunctionRegistry { @@ -56,4 +71,9 @@ class DynamicFunctionRegistry extends FunctionRegistry { override def list: Array[String] = functions.keys.toArray override def load(name: String): Option[UnboundFunction] = functions.get(name) + + override def getFuncMappingBySpark: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) + }.toMap } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala new file mode 100644 index 00000000..d3f40814 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ + +import java.sql.Timestamp + +class Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "months" + + override def canonicalName: String = s"months" + + override val ckFuncNames: Array[String] = Array("toYYYYMM") + + override def description: String = s"$name: (time: timestamp) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, TimestampType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(TimestampType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(time: Long): Int = { + val ts = new Timestamp(time / 1000).toLocalDateTime + ts.getYear * 100 + ts.getMonthValue + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index d18319e5..07733442 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -17,9 +17,10 @@ package xenon.clickhouse.write import com.clickhouse.client.ClickHouseProtocol import com.clickhouse.data.ClickHouseCompression import org.apache.commons.io.IOUtils -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection} -import org.apache.spark.sql.catalyst.{expressions, InternalRow} +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression, V2ExpressionUtils} +import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.clickhouse.ExprUtils +import org.apache.spark.sql.connector.catalog.functions.ScalarFunction import org.apache.spark.sql.connector.metric.CustomTaskMetric import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} import org.apache.spark.sql.types._ @@ -56,7 +57,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardExpr: Option[Expression] = writeJob.sparkShardExpr match { case None => None case Some(v2Expr) => - val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields) + val catalystExpr = ExprUtils(writeJob.functionRegistry).toCatalyst(v2Expr, writeJob.dataSetSchema.fields) catalystExpr match { case BoundReference(_, dataType, _) if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType` @@ -66,6 +67,11 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) Some(catalystExpr) case BoundReference(_, dataType, _) => throw CHClientException(s"Invalid data type of sharding field: $dataType") + case TransformExpression(function, _, _) => + function.resultType() match { + case ByteType | ShortType | IntegerType | LongType => Some(catalystExpr) + case _ => throw CHClientException(s"Invalid data type of sharding field: ${function.resultType()}") + } case unsupported: Expression => log.warn(s"Unsupported expression of sharding field: $unsupported") None @@ -74,7 +80,23 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardProjection: Option[expressions.Projection] = shardExpr .filter(_ => writeJob.writeOptions.convertDistributedToLocal) - .map(expr => SafeProjection.create(Seq(expr))) + .flatMap(expr => + expr match { + case BoundReference(_, _, _) => + Some(SafeProjection.create(Seq(expr))) + case TransformExpression(function, args, _) => + val retType = function.resultType() match { + case ByteType => classOf[Byte] + case ShortType => classOf[Short] + case IntegerType => classOf[Int] + case LongType => classOf[Long] + case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," + + s"sharding field: ${function.resultType()}") + } + val expr = 
V2ExpressionUtils.resolveScalarFunction(function.asInstanceOf[ScalarFunction[retType.type]], args) + Some(SafeProjection.create(Seq(expr))) + } + ) // put the node select strategy in executor side because we need to calculate shard and don't know the records // util DataWriter#write(InternalRow) invoked. @@ -107,6 +129,15 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) case _ => None } shardValue.map(value => ShardUtils.calcShard(writeJob.cluster.get, value).num) + case (Some(TransformExpression(function, _, _)), Some(projection)) => + val shardValue = function.resultType() match { + case ByteType => Some(projection(record).getByte(0).toLong) + case ShortType => Some(projection(record).getShort(0).toLong) + case IntegerType => Some(projection(record).getInt(0).toLong) + case LongType => Some(projection(record).getLong(0)) + case _ => None + } + shardValue.map(value => ShardUtils.calcShard(writeJob.cluster.get, value).num) case _ => None } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 9cd8262f..b374c996 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -15,11 +15,11 @@ package xenon.clickhouse.write import java.time.ZoneId - import org.apache.spark.sql.clickhouse.{ExprUtils, WriteOptions} import org.apache.spark.sql.connector.expressions.{Expression, SortOrder, Transform} import org.apache.spark.sql.types.StructType import xenon.clickhouse.expr.{Expr, FuncExpr, OrderExpr} +import xenon.clickhouse.func.FunctionRegistry import xenon.clickhouse.spec._ case class WriteJobDescription( @@ -37,7 +37,8 @@ case class WriteJobDescription( shardingKey: Option[Expr], partitionKey: Option[List[Expr]], sortingKey: Option[List[OrderExpr]], - writeOptions: WriteOptions + writeOptions: WriteOptions, + functionRegistry: FunctionRegistry ) { def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match { @@ -56,20 +57,20 @@ case class WriteJobDescription( } def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match { - case Some(expr) => ExprUtils.toSparkTransformOpt(expr) + case Some(expr) => ExprUtils(functionRegistry).toSparkTransformOpt(expr) case _ => None } def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils.toSparkSplits(shardingKeyIgnoreRand, partitionKey) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) } else { - ExprUtils.toSparkSplits(shardingKeyIgnoreRand, None) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) } def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey) + ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey) } } diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala new file mode 100644 index 00000000..c7c1cfb3 --- /dev/null +++ 
b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.clickhouse + +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.scalatest.funsuite.AnyFunSuite +import xenon.clickhouse.ClickHouseHelper +import xenon.clickhouse.func.{ + ClickHouseXxHash64, + CompositeFunctionRegistry, + DynamicFunctionRegistry, + StaticFunctionRegistry +} + +import scala.collection.JavaConverters._ + +class FunctionRegistrySuite extends AnyFunSuite { + + val staticFunctionRegistry: StaticFunctionRegistry.type = StaticFunctionRegistry + val dynamicFunctionRegistry = new DynamicFunctionRegistry + dynamicFunctionRegistry.register("ck_xx_hash64", ClickHouseXxHash64) + dynamicFunctionRegistry.register("clickhouse_xxHash64", ClickHouseXxHash64) + + test("check StaticFunctionRegistry mappings") { + assert(staticFunctionRegistry.getFuncMappingBySpark === Map( + "ck_xx_hash64" -> "xxHash64", + "clickhouse_xxHash64" -> "xxHash64" + )) + assert((staticFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "clickhouse_xxHash64" + )) || (staticFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "ck_xx_hash64" + ))) + } + + test("check DynamicFunctionRegistry mappings") { + assert(dynamicFunctionRegistry.getFuncMappingBySpark === Map( + "ck_xx_hash64" -> "xxHash64", + "clickhouse_xxHash64" -> "xxHash64" + )) + assert((dynamicFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "clickhouse_xxHash64" + )) || (dynamicFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "ck_xx_hash64" + ))) + } + + test("check CompositeFunctionRegistry mappings") { + val compositeFunctionRegistry = + new CompositeFunctionRegistry(Array(staticFunctionRegistry, dynamicFunctionRegistry)) + assert(compositeFunctionRegistry.getFuncMappingBySpark === Map( + "ck_xx_hash64" -> "xxHash64", + "clickhouse_xxHash64" -> "xxHash64" + )) + assert((compositeFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "clickhouse_xxHash64" + )) || (compositeFunctionRegistry.getFuncMappingByCk === Map( + "xxHash64" -> "ck_xx_hash64" + ))) + } +} From e52b7144412642b8629fa08f1c9c40970bd08b2f Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Thu, 18 May 2023 16:54:05 +0800 Subject: [PATCH 02/20] Spark 3.4: add udf: years, days, hours, murmurHash2 and murmurHash3. 
Amend testing --- .../ClickHouseClusterHashUDFSuite.scala | 67 ++++++++++++++++ .../cluster/ClickHouseClusterUDFSuite.scala | 55 ------------- .../ClusterShardByTransformSuite.scala | 35 ++++++--- .../spark/sql/clickhouse/ExprUtils.scala | 14 ---- .../xenon/clickhouse/ClickHouseCatalog.scala | 3 +- .../clickhouse/func/FunctionRegistry.scala | 25 +++++- .../clickhouse/func/clickhouse/Days.scala | 52 +++++++++++++ .../{Months.scala => clickhouse/Hours.scala} | 20 +++-- .../clickhouse/func/clickhouse/Months.scala | 52 +++++++++++++ .../func/clickhouse/MurmurHash2.scala | 77 ++++++++++++++++++ .../func/clickhouse/MurmurHash3.scala | 78 +++++++++++++++++++ .../XxHash64.scala} | 4 +- .../clickhouse/func/clickhouse/Years.scala | 52 +++++++++++++ .../clickhouse/write/ClickHouseWriter.scala | 10 ++- .../clickhouse/FunctionRegistrySuite.scala | 48 +++++------- 15 files changed, 469 insertions(+), 123 deletions(-) create mode 100644 spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala delete mode 100644 spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{Months.scala => clickhouse/Hours.scala} (68%) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{ClickHouseXxHash64.scala => clickhouse/XxHash64.scala} (96%) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala new file mode 100644 index 00000000..9ef15241 --- /dev/null +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.clickhouse.cluster + +import org.apache.spark.sql.clickhouse.TestUtils.om +import xenon.clickhouse.func.{CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} +import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard + +import java.lang.{Long => JLong} + +class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { + // only for query function names + val dummyRegistry: CompositeFunctionRegistry = { + val dynamicFunctionRegistry = new DynamicFunctionRegistry + val xxHash64ShardFunc = new ClickHouseXxHash64Shard(Seq.empty) + dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible + dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc) + new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) + } + + def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = { + val sparkResult = spark.sql( + s"""SELECT + | $funcSparkName($stringVal) AS hash_value + |""".stripMargin + ).collect + assert(sparkResult.length == 1) + val sparkHashVal = sparkResult.head.getAs[Long]("hash_value") + + val clickhouseResultJsonStr = runClickHouseSQL( + s"""SELECT + | $funcCkName($stringVal) AS hash_value + |""".stripMargin + ).head.getString(0) + val clickhouseResultJson = om.readTree(clickhouseResultJsonStr) + val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText) + assert(sparkHashVal == clickhouseHashVal) + } + + Seq( + "clickhouse_xxHash64", + "clickhouse_murmurHash3_64", + "clickhouse_murmurHash3_32", + "clickhouse_murmurHash2_64", + "clickhouse_murmurHash2_32" + ).foreach { funcSparkName => + val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) + test(s"UDF $funcSparkName") { + Seq("spark-clickhouse-connector", "Apache Spark", "ClickHouse", "Yandex", "热爱", "🇨🇳").foreach { rawStringVal => + val stringVal = s"\'$rawStringVal\'" + runTest(funcSparkName, funcCkName, stringVal) + } + } + } +} diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala deleted file mode 100644 index 3d97cc25..00000000 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterUDFSuite.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.clickhouse.cluster - -import org.apache.spark.sql.clickhouse.TestUtils.om - -import java.lang.{Long => JLong} - -class ClickHouseClusterUDFSuite extends SparkClickHouseClusterTest { - - test("UDF ck_xx_hash64") { - Seq("spark-clickhouse-connector", "Apache Spark", "ClickHouse", "Yandex", "热爱", "🇨🇳").foreach { stringVal => - val sparkResult = spark.sql( - s"""SELECT - | ck_xx_hash64('$stringVal') AS hash_value_legacy, - | clickhouse_xxHash64('$stringVal') AS hash_value, - | ck_xx_hash64_shard('single_replica', '$stringVal') AS shard_num_legacy, -- one based ordinal defined in `remote_servers.xml` - | clickhouse_shard_xxHash64('single_replica', '$stringVal') AS shard_num -- one based ordinal defined in `remote_servers.xml` - |""".stripMargin - ).collect - assert(sparkResult.length == 1) - val sparkHashValLegacy = sparkResult.head.getAs[Long]("hash_value_legacy") - val sparkHashVal = sparkResult.head.getAs[Long]("hash_value") - assert(sparkHashValLegacy === sparkHashVal) - val sparkShardNumLegacy = sparkResult.head.getAs[Int]("shard_num_legacy") - val sparkShardNum = sparkResult.head.getAs[Int]("shard_num") - assert(sparkShardNumLegacy === sparkShardNum) - - val clickhouseResultJsonStr = runClickHouseSQL( - s"""SELECT - | xxHash64('$stringVal') AS hash_value, - | xxHash64('$stringVal') % 4 AS shard_num -- zero based ordinal - |""".stripMargin - ).head.getString(0) - val clickhouseResultJson = om.readTree(clickhouseResultJsonStr) - val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText) - val clickhouseShardNum = JLong.parseUnsignedLong(clickhouseResultJson.get("shard_num").asText) - - assert(sparkHashVal == clickhouseHashVal) - assert(sparkShardNum == clickhouseShardNum + 1) - } - } -} diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index fce0f77e..db8a3036 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -27,7 +27,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { def runTest(func_name: String, func_args: Array[String]): Unit = { val func_expr = s"$func_name(${func_args.mkString(",")})" val cluster = "single_replica" - val db = s"db_${func_name}_shard" + val db = s"db_${func_name}_shard_transform" val tbl_dist = s"tbl_${func_name}_shard" val tbl_local = s"${tbl_dist}_local" @@ -37,6 +37,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { spark.sql( s"""CREATE TABLE $db.$tbl_local ( | create_time TIMESTAMP NOT NULL, + | create_date DATE NOT NULL, | value STRING NOT NULL |) USING ClickHouse |TBLPROPERTIES ( @@ -56,10 +57,11 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { spark.sql( s"""INSERT INTO `$db`.`$tbl_dist` |VALUES - | (timestamp'2021-01-01 10:10:10', '1'), - | (timestamp'2022-02-02 10:10:10', '2'), - | (timestamp'2023-03-03 10:10:10', '3'), - | (timestamp'2024-04-04 10:10:10', '4') AS tab(create_time, value) + | (timestamp'2021-01-01 10:10:10', date'2021-01-01', '1'), + | (timestamp'2022-02-02 11:10:10', date'2022-02-02', '2'), + | (timestamp'2023-03-03 12:10:10', date'2023-03-03', '3'), + | (timestamp'2024-04-04 
13:10:10', date'2024-04-04', '4') + | AS tab(create_time, create_date, value) |""".stripMargin ) // check that data is indeed written @@ -72,10 +74,10 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { runClickHouseSQL( s"""INSERT INTO `$db`.`$tbl_dist` |VALUES - | (timestamp'2021-01-01 10:10:10', '1'), - | (timestamp'2022-02-02 10:10:10', '2'), - | (timestamp'2023-03-03 10:10:10', '3'), - | (timestamp'2024-04-04 10:10:10', '4') + | (timestamp'2021-01-01 10:10:10', date'2021-01-01', '1'), + | (timestamp'2022-02-02 11:10:10', date'2022-02-02', '2'), + | (timestamp'2023-03-03 12:10:10', date'2023-03-03', '3'), + | (timestamp'2024-04-04 13:10:10', date'2024-04-04', '4') |""".stripMargin ) checkAnswer( @@ -91,8 +93,19 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { } } - Seq(("xxHash64", Array("value")), ("toYYYYMM", Array("create_time"))).foreach { case (func_name, func_args) => - test(s"shard by $func_name")(runTest(func_name, func_args)) + Seq( + ("toYear", Array("create_date")), + ("toYYYYMM", Array("create_date")), + ("toYYYYMMDD", Array("create_date")), + ("toHour", Array("create_time")), + ("xxHash64", Array("value")), + ("murmurHash2_64", Array("value")), + ("murmurHash2_32", Array("value")), + ("murmurHash3_64", Array("value")), + ("murmurHash3_32", Array("value")) + ).foreach { + case (func_name: String, func_args: Array[String]) => + test(s"shard by $func_name")(runTest(func_name, func_args)) } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index d7116cc9..8a7d1f96 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -108,12 +108,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S def toSparkTransform(expr: Expr): Transform = expr match { case FieldRef(col) => identity(col) - case FuncExpr("toYear", List(FieldRef(col))) => years(col) - case FuncExpr("YEAR", List(FieldRef(col))) => years(col) - case FuncExpr("toYYYYMM", List(FieldRef(col))) => months(col) - case FuncExpr("toYYYYMMDD", List(FieldRef(col))) => days(col) - case FuncExpr("toHour", List(FieldRef(col))) => hours(col) - case FuncExpr("HOUR", List(FieldRef(col))) => hours(col) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) case FuncExpr(funName, List(FieldRef(col))) if functionRegistry.getFuncMappingByCk.contains(funName) => @@ -122,10 +116,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S } def toClickHouse(transform: Transform): Expr = transform match { - case YearsTransform(FieldReference(Seq(col))) => FuncExpr("toYear", List(FieldRef(col))) - case MonthsTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMM", List(FieldRef(col))) - case DaysTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMMDD", List(FieldRef(col))) - case HoursTransform(FieldReference(Seq(col))) => FuncExpr("toHour", List(FieldRef(col))) case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) @@ -138,10 +128,6 @@ class 
ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S secondarySchema: StructType, transform: Transform ): StructField = transform match { - case years: YearsTransform => StructField(years.toString, IntegerType) - case months: MonthsTransform => StructField(months.toString, IntegerType) - case days: DaysTransform => StructField(days.toString, IntegerType) - case hours: HoursTransform => StructField(hours.toString, IntegerType) case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index b625560d..5fd043cd 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -26,6 +26,7 @@ import xenon.clickhouse.Constants._ import xenon.clickhouse.client.NodeClient import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.exception.ClickHouseErrCode._ +import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard import xenon.clickhouse.func.{FunctionRegistry, _} import xenon.clickhouse.spec._ @@ -85,10 +86,8 @@ class ClickHouseCatalog extends TableCatalog val dynamicFunctionRegistry = new DynamicFunctionRegistry val xxHash64ShardFunc = new ClickHouseXxHash64Shard(clusterSpecs) - val monthsFunc = new Months() dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc) - dynamicFunctionRegistry.register("months", monthsFunc) this.functionRegistry = new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) log.info(s"Detect ${clusterSpecs.size} ClickHouse clusters: ${clusterSpecs.map(_.name).mkString(",")}") diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index c10ce864..e6094eaf 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -15,6 +15,7 @@ package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.UnboundFunction +import xenon.clickhouse.func.clickhouse._ import scala.collection.mutable @@ -26,7 +27,7 @@ trait FunctionRegistry extends Serializable { def getFuncMappingBySpark: Map[String, String] - def getFuncMappingByCk: Map[String, String] = getFuncMappingBySpark.map(_.swap) + def getFuncMappingByCk: Map[String, String] } trait ClickhouseEquivFunction { @@ -40,13 +41,23 @@ class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends Fun override def load(name: String): Option[UnboundFunction] = registries.flatMap(_.load(name)).headOption override def getFuncMappingBySpark: Map[String, String] = registries.flatMap(_.getFuncMappingBySpark).toMap + + override def getFuncMappingByCk: Map[String, String] = registries.flatMap(_.getFuncMappingByCk).toMap } object StaticFunctionRegistry extends FunctionRegistry { private val functions = Map[String, UnboundFunction]( "ck_xx_hash64" -> ClickHouseXxHash64, // for compatible - "clickhouse_xxHash64" -> 
ClickHouseXxHash64 + "clickhouse_xxHash64" -> ClickHouseXxHash64, + "clickhouse_murmurHash2_32" -> MurmurHash2_32, + "clickhouse_murmurHash2_64" -> MurmurHash2_64, + "clickhouse_murmurHash3_32" -> MurmurHash3_32, + "clickhouse_murmurHash3_64" -> MurmurHash3_64, + "clickhouse_years" -> Years, + "clickhouse_months" -> Months, + "clickhouse_days" -> Days, + "clickhouse_hours" -> Hours ) override def list: Array[String] = functions.keys.toArray @@ -57,6 +68,11 @@ object StaticFunctionRegistry extends FunctionRegistry { functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) } + + override val getFuncMappingByCk: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k)) + } } class DynamicFunctionRegistry extends FunctionRegistry { @@ -76,4 +92,9 @@ class DynamicFunctionRegistry extends FunctionRegistry { functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) }.toMap + + override def getFuncMappingByCk: Map[String, String] = + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k)) + }.toMap } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala new file mode 100644 index 00000000..9ceca80e --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +import java.time.LocalDate +import java.time.format.DateTimeFormatter + +object Days extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_days" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("toYYYYMMDD") + + override def description: String = s"$name: (date: Date) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, DateType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(DateType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(days: Int): Int = { + val date = LocalDate.ofEpochDay(days) + val formatter = DateTimeFormatter.ofPattern("yyyyMMdd") + date.format(formatter).toInt + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala similarity index 68% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala index d3f40814..77dbe4c2 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala @@ -12,25 +12,28 @@ * limitations under the License. */ -package xenon.clickhouse.func +package xenon.clickhouse.func.clickhouse import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction -import java.sql.Timestamp +import java.sql.{Date, Timestamp} +import java.text.SimpleDateFormat -class Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { +object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { - override def name: String = "months" + override def name: String = "clickhouse_hours" - override def canonicalName: String = s"months" + override def canonicalName: String = s"clickhouse.$name" - override val ckFuncNames: Array[String] = Array("toYYYYMM") + override val ckFuncNames: Array[String] = Array("toHour", "HOUR") override def description: String = s"$name: (time: timestamp) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. $description") } @@ -41,7 +44,8 @@ class Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqu override def isResultNullable: Boolean = false def invoke(time: Long): Int = { - val ts = new Timestamp(time / 1000).toLocalDateTime - ts.getYear * 100 + ts.getMonthValue + val ts = new Timestamp(time / 1000) + val formatter: SimpleDateFormat = new SimpleDateFormat("hh") + formatter.format(ts).toInt } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala new file mode 100644 index 00000000..0be1bc9b --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +import java.time.LocalDate +import java.time.format.DateTimeFormatter + +object Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_months" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("toYYYYMM") + + override def description: String = s"$name: (date: Date) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, DateType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description") + } + + override def inputTypes: Array[DataType] = Array(DateType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(days: Int): Int = { + val date = LocalDate.ofEpochDay(days) + val formatter = DateTimeFormatter.ofPattern("yyyyMM") + date.format(formatter).toInt + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala new file mode 100644 index 00000000..49daaeae --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.ClickhouseEquivFunction + +object MurmurHash2_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash2_64" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash2_64") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + // ignore UInt64 vs Int64 + val data = values.getBytes + MurmurHash2.hash64(data, data.length, 0) + } +} + +object MurmurHash2_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash2_32" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash2_32") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + val data = values.getBytes + val v = MurmurHash2.hash32(data, data.length, 0).toLong + if (v < 0) v + (1L << 32) else v + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala new file mode 100644 index 00000000..db15a8e7 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala @@ -0,0 +1,78 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.commons.codec.digest.MurmurHash3 +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.ClickhouseEquivFunction + +object MurmurHash3_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash3_64" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash3_64") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. 
$description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + // ignore UInt64 vs Int64 + val data = values.getBytes + val hashes = MurmurHash3.hash128x64(data, 0, data.length, 0) + hashes(0) ^ hashes(1) + } +} + +object MurmurHash3_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_murmurHash3_32" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("murmurHash3_32") + + override def description: String = s"$name: (value: string) => hash_value: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") + } + + override def inputTypes: Array[DataType] = Array(StringType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(values: UTF8String): Long = { + val data = values.getBytes + val v = MurmurHash3.hash32x86(data, 0, data.length, 0).toLong + if (v < 0) v + (1L << 32) else v + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala similarity index 96% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala index dab34932..f02af236 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/ClickHouseXxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala @@ -12,12 +12,13 @@ * limitations under the License. */ -package xenon.clickhouse.func +package xenon.clickhouse.func.clickhouse import org.apache.spark.sql.catalyst.expressions.XxHash64Function import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.ClickhouseEquivFunction import xenon.clickhouse.spec.{ClusterSpec, ShardUtils} /** @@ -47,6 +48,7 @@ object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with override def isResultNullable: Boolean = false + // ignore UInt64 vs Int64 def invoke(value: UTF8String): Long = XxHash64Function.hash(value, StringType, 0L) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala new file mode 100644 index 00000000..b3c0a135 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +import java.time.LocalDate +import java.time.format.DateTimeFormatter + +object Years extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { + + override def name: String = "clickhouse_years" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("toYear", "YEAR") + + override def description: String = s"$name: (date: Date) => shard_num: int" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, DateType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) => this + case Array(StructField(_, StringType, _, _)) => this + case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description") + } + + override def inputTypes: Array[DataType] = Array(DateType) + + override def resultType: DataType = IntegerType + + override def isResultNullable: Boolean = false + + def invoke(days: Int): Int = { + val date = LocalDate.ofEpochDay(days) + val formatter = DateTimeFormatter.ofPattern("yyyy") + date.format(formatter).toInt + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index 07733442..3cd43c5e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -17,8 +17,14 @@ package xenon.clickhouse.write import com.clickhouse.client.ClickHouseProtocol import com.clickhouse.data.ClickHouseCompression import org.apache.commons.io.IOUtils -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression, V2ExpressionUtils} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.expressions.{ + BoundReference, + Expression, + SafeProjection, + TransformExpression, + V2ExpressionUtils +} +import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.clickhouse.ExprUtils import org.apache.spark.sql.connector.catalog.functions.ScalarFunction import org.apache.spark.sql.connector.metric.CustomTaskMetric diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala index c7c1cfb3..34254907 100644 --- a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala +++ b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -17,8 +17,9 @@ package org.apache.spark.sql.clickhouse import 
org.apache.spark.sql.util.CaseInsensitiveStringMap import org.scalatest.funsuite.AnyFunSuite import xenon.clickhouse.ClickHouseHelper +import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64 import xenon.clickhouse.func.{ - ClickHouseXxHash64, + ClickhouseEquivFunction, CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry @@ -34,40 +35,31 @@ class FunctionRegistrySuite extends AnyFunSuite { dynamicFunctionRegistry.register("clickhouse_xxHash64", ClickHouseXxHash64) test("check StaticFunctionRegistry mappings") { - assert(staticFunctionRegistry.getFuncMappingBySpark === Map( - "ck_xx_hash64" -> "xxHash64", - "clickhouse_xxHash64" -> "xxHash64" - )) - assert((staticFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "clickhouse_xxHash64" - )) || (staticFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "ck_xx_hash64" - ))) + assert(staticFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) => + staticFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v) + }) + assert(staticFunctionRegistry.getFuncMappingByCk.forall { case (k, v) => + staticFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k) + }) } test("check DynamicFunctionRegistry mappings") { - assert(dynamicFunctionRegistry.getFuncMappingBySpark === Map( - "ck_xx_hash64" -> "xxHash64", - "clickhouse_xxHash64" -> "xxHash64" - )) - assert((dynamicFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "clickhouse_xxHash64" - )) || (dynamicFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "ck_xx_hash64" - ))) + assert(dynamicFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) => + dynamicFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v) + }) + assert(dynamicFunctionRegistry.getFuncMappingByCk.forall { case (k, v) => + dynamicFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k) + }) } test("check CompositeFunctionRegistry mappings") { val compositeFunctionRegistry = new CompositeFunctionRegistry(Array(staticFunctionRegistry, dynamicFunctionRegistry)) - assert(compositeFunctionRegistry.getFuncMappingBySpark === Map( - "ck_xx_hash64" -> "xxHash64", - "clickhouse_xxHash64" -> "xxHash64" - )) - assert((compositeFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "clickhouse_xxHash64" - )) || (compositeFunctionRegistry.getFuncMappingByCk === Map( - "xxHash64" -> "ck_xx_hash64" - ))) + assert(compositeFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) => + compositeFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v) + }) + assert(compositeFunctionRegistry.getFuncMappingByCk.forall { case (k, v) => + compositeFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k) + }) } } From ff243b591c1e411f5cccd31e138d9e1eae722a75 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 19 May 2023 22:09:39 +0800 Subject: [PATCH 03/20] Spark 3.4: Fixup sharding key needs to be mod by cluster weight on local sort --- .../xenon/clickhouse/spec/NodeSpec.scala | 2 + .../spark/sql/clickhouse/ExprUtils.scala | 41 ++++++------ .../clickhouse/func/FunctionRegistry.scala | 3 +- .../clickhouse/func/clickhouse/Pmod.scala | 63 +++++++++++++++++++ .../write/WriteJobDescription.scala | 6 +- 5 files changed, 93 insertions(+), 22 deletions(-) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala diff --git 
a/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala b/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala index 454312df..eb809169 100644 --- a/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala +++ b/clickhouse-core/src/main/scala/xenon/clickhouse/spec/NodeSpec.scala @@ -97,4 +97,6 @@ case class ClusterSpec( override def toString: String = s"cluster: $name, shards: [${shards.mkString(", ")}]" @JsonIgnore @transient override lazy val nodes: Array[NodeSpec] = shards.sorted.flatMap(_.nodes) + + def totalWeight: Int = shards.map(_.weight).sum } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 8a7d1f96..2b2a2cae 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -23,27 +23,37 @@ import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.connector.expressions.Expressions._ import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, _} -import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} +import org.apache.spark.sql.types.{StructField, StructType} import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.expr._ import xenon.clickhouse.func.FunctionRegistry +import xenon.clickhouse.spec.ClusterSpec import scala.util.{Failure, Success, Try} class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { + private def toSplitWithModulo(shardingKey: Expr, cluster: ClusterSpec): FuncExpr = + FuncExpr("positiveModulo", List(shardingKey, StringLiteral(cluster.totalWeight.toString))) + def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray - def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] = - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + def toSparkSplits( + shardingKey: Option[Expr], + partitionKey: Option[List[Expr]], + cluster: Option[ClusterSpec] + ): Array[Transform] = + (shardingKey.map(k => toSplitWithModulo(k, cluster.get)).seq ++ partitionKey.seq.flatten) + .flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], partitionKey: Option[List[Expr]], - sortingKey: Option[List[OrderExpr]] + sortingKey: Option[List[OrderExpr]], + cluster: Option[ClusterSpec] ): Array[SortOrder] = - toSparkSplits(shardingKeyIgnoreRand, partitionKey).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: + toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST @@ -93,25 +103,20 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S ) } - def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkTransform(expr)) match { - case Success(t) => Some(t) + def toSparkTransformOpt(expr: Expr): Option[Transform] = 
Try(toSparkExpression(expr)) match { + case Success(t: Transform) => Some(t) + case Success(_) => None case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow)) } - // Some functions of ClickHouse which match Spark pre-defined Transforms - // - // toYear, YEAR - Converts a date or date with time to a UInt16 (AD) - // toYYYYMM - Converts a date or date with time to a UInt32 (YYYY*100 + MM) - // toYYYYMMDD - Converts a date or date with time to a UInt32 (YYYY*10000 + MM*100 + DD) - // toHour, HOUR - Converts a date with time to a UInt8 (0-23) - - def toSparkTransform(expr: Expr): Transform = expr match { + def toSparkExpression(expr: Expr): V2Expression = expr match { case FieldRef(col) => identity(col) + case StringLiteral(value) => literal(value) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) - case FuncExpr(funName, List(FieldRef(col))) if functionRegistry.getFuncMappingByCk.contains(funName) => - apply(functionRegistry.getFuncMappingByCk(funName), column(col)) + case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => + apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") } @@ -131,7 +136,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) - case t @ ApplyTransform(transformName, _) => + case t @ ApplyTransform(transformName, _) if functionRegistry.load(transformName).isDefined => val resType = functionRegistry.load(transformName).getOrElse(throw new NoSuchFunctionException(transformName)) match { case f: ScalarFunction[_] => f.resultType() diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index e6094eaf..8a7ec436 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -57,7 +57,8 @@ object StaticFunctionRegistry extends FunctionRegistry { "clickhouse_years" -> Years, "clickhouse_months" -> Months, "clickhouse_days" -> Days, - "clickhouse_hours" -> Hours + "clickhouse_hours" -> Hours, + "sharding_pmod" -> Pmod ) override def list: Array[String] = functions.keys.toArray diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala new file mode 100644 index 00000000..e9eafb8d --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import xenon.clickhouse.func.ClickhouseEquivFunction + +object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { + + override def name: String = "sharding_pmod" + + override def canonicalName: String = s"clickhouse.$name" + + override val ckFuncNames: Array[String] = Array("positiveModulo", "positive_modulo", "pmod") + + override def description: String = s"$name: (a: long, b: long) => mod: long" + + override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(a, b) if + (a match { + case StructField(_, LongType, _, _) => true + case StructField(_, IntegerType, _, _) => true + case StructField(_, ShortType, _, _) => true + case StructField(_, ByteType, _, _) => true + case StructField(_, StringType, _, _) => true + case _ => false + }) && + (b match { + case StructField(_, LongType, _, _) => true + case StructField(_, IntegerType, _, _) => true + case StructField(_, ShortType, _, _) => true + case StructField(_, ByteType, _, _) => true + case StructField(_, StringType, _, _) => true + case _ => false + }) => + this + case _ => throw new UnsupportedOperationException(s"Expect 2 integer arguments. 
$description") + } + + override def inputTypes: Array[DataType] = Array(LongType, LongType) + + override def resultType: DataType = LongType + + override def isResultNullable: Boolean = false + + def invoke(a: Long, b: Long): Long = { + val mod = a % b + if (mod < 0) mod + b else mod + } +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index b374c996..81a347ee 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -63,14 +63,14 @@ case class WriteJobDescription( def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) } def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey) + ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) } } From a1d4dce4ddbf54039be9f083da0ba7032f3143d7 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Sat, 20 May 2023 01:54:46 +0800 Subject: [PATCH 04/20] Scala 2.13: Fix Spark 3.4 compile issue --- .../scala/xenon/clickhouse/func/FunctionRegistry.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index 8a7ec436..fd12edc1 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -90,12 +90,12 @@ class DynamicFunctionRegistry extends FunctionRegistry { override def load(name: String): Option[UnboundFunction] = functions.get(name) override def getFuncMappingBySpark: Map[String, String] = - functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).toMap.flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _)) - }.toMap + } override def getFuncMappingByCk: Map[String, String] = - functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) => + functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).toMap.flatMap { case (k, v) => v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k)) - }.toMap + } } From 5ddb98f60e70674fbd5cd2e044472b421ccaccc6 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Mon, 22 May 2023 10:31:32 +0800 Subject: [PATCH 05/20] Spark 3.4: Optimize sharding key handling when shuffle and sort --- .../apache/spark/sql/clickhouse/ExprUtils.scala | 15 ++++++++++----- .../clickhouse/write/WriteJobDescription.scala | 4 ++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git 
a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 2b2a2cae..cbf57630 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -41,11 +41,11 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S def toSparkSplits( shardingKey: Option[Expr], - partitionKey: Option[List[Expr]], - cluster: Option[ClusterSpec] + partitionKey: Option[List[Expr]] ): Array[Transform] = - (shardingKey.map(k => toSplitWithModulo(k, cluster.get)).seq ++ partitionKey.seq.flatten) - .flatten(toSparkTransformOpt).toArray + // no pmod shard key here, because we want to shuffle it more evenly, + // hence spread the load in Spark tasks to multiple Clickhouse nodes + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], @@ -53,7 +53,11 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] ): Array[SortOrder] = - toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: + // pmod shard key here, because we need same cluster number but not same hash value + // to be sorted together and be written as a batch + toSparkSplits(shardingKeyIgnoreRand.map(k => toSplitWithModulo(k, cluster.get)), partitionKey).map( + Expressions.sort(_, SortDirection.ASCENDING) + ) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST @@ -104,6 +108,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S } def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkExpression(expr)) match { + // need this function because spark `Table`'s `partitioning` field should be `Transform` case Success(t: Transform) => Some(t) case Success(_) => None case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 81a347ee..de28ec87 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -63,9 +63,9 @@ case class WriteJobDescription( def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) } def sparkSortOrders: Array[SortOrder] = { From 000638e0c377d7c687eccd297c88fad0da9d05a3 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Mon, 22 May 2023 18:30:02 +0800 Subject: [PATCH 06/20] Spark 3.4: Optimize sharding key handling when shuffle and sort, approach 2 --- 
.../spark/sql/clickhouse/ExprUtils.scala | 21 +++++++++------- .../write/WriteJobDescription.scala | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index cbf57630..a873fc4d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -41,11 +41,18 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S def toSparkSplits( shardingKey: Option[Expr], - partitionKey: Option[List[Expr]] + partitionKey: Option[List[Expr]], + cluster: Option[ClusterSpec] ): Array[Transform] = - // no pmod shard key here, because we want to shuffle it more evenly, - // hence spread the load in Spark tasks to multiple Clickhouse nodes - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + // Pmod by total weight * constant. Note that this key will be further hashed by Spark. Reasons for doing this: + // - Enlarge the range of the modulo to avoid hash collisions when there are only a few shards, hence + // mitigating the data skew they would cause. + // - Still distribute data from one shard to only a subset of executors. If we did not apply the modulo here (and + // instead applied it during sorting in `toSparkSortOrders`), data belonging to shard 1 would be sorted at the + // front for all tasks, putting instant high pressure on shard 1 when the stage starts. + (shardingKey.map(k => + FuncExpr("positiveModulo", List(k, StringLiteral((cluster.get.totalWeight * 10).toString))) + ).seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], @@ -53,11 +60,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] ): Array[SortOrder] = - // pmod shard key here, because we need same cluster number but not same hash value - // to be sorted together and be written as a batch - toSparkSplits(shardingKeyIgnoreRand.map(k => toSplitWithModulo(k, cluster.get)), partitionKey).map( - Expressions.sort(_, SortDirection.ASCENDING) - ) ++: + toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index de28ec87..81a347ee 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -63,9 +63,9 @@ case class WriteJobDescription( def sparkSplits: Array[Transform] = if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey) + ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None) +
ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) } def sparkSortOrders: Array[SortOrder] = { From 59f3bed98e6f4bb66e14761249aa0c747df5e3bd Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 23 May 2023 17:30:41 +0800 Subject: [PATCH 07/20] Spark 3.4: Support variable length arguments for murmurHash (up to 5 string arguments) --- .../ClickHouseClusterHashUDFSuite.scala | 33 ++++++ .../xenon/clickhouse/func/MultiArgsHash.scala | 101 ++++++++++++++++++ .../scala/xenon/clickhouse/func/Util.scala | 52 +++++++++ .../func/clickhouse/MurmurHash2.scala | 60 +++-------- .../func/clickhouse/MurmurHash3.scala | 58 +++------- 5 files changed, 219 insertions(+), 85 deletions(-) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala index 9ef15241..b3556258 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -30,6 +30,15 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) } + def product[A](xs: Seq[Seq[A]]): Seq[Seq[A]] = + xs.toList match { + case Nil => Seq(Seq()) + case head :: tail => for { + h <- head + t <- product(tail) + } yield h +: t + } + def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = { val sparkResult = spark.sql( s"""SELECT @@ -64,4 +73,28 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { } } } + + Seq( + "clickhouse_murmurHash3_64", + "clickhouse_murmurHash3_32", + "clickhouse_murmurHash2_64", + "clickhouse_murmurHash2_32" + ).foreach { funcSparkName => + val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) + test(s"UDF $funcSparkName multiple args") { + val strings = Seq( + "\'spark-clickhouse-connector\'", + "\'Apache Spark\'", + "\'ClickHouse\'", + "\'Yandex\'", + "\'热爱\'", + "\'🇨🇳\'" + ) + val test_5 = strings.combinations(5) + test_5.foreach { seq => + val stringVal = seq.mkString(", ") + runTest(funcSparkName, funcCkName, stringVal) + } + } + } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala new file mode 100644 index 00000000..dc635a27 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala @@ -0,0 +1,101 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package xenon.clickhouse.func + +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction { + trait Base extends ScalarFunction[Long] { + // must not be a private object, nor may its successors be, because Spark would compile them + override def canonicalName: String = s"clickhouse.$name" + override def resultType: DataType = LongType + override def isResultNullable: Boolean = false + } + + object Arg1 extends Base { + override def name: String = s"${funcName}_1" + override def inputTypes: Array[DataType] = Array.fill(1)(StringType) + def invoke(value: UTF8String): Long = invokeBase(value) + } + + object Arg2 extends Base { + override def name: String = s"${funcName}_2" + override def inputTypes: Array[DataType] = Array.fill(2)(StringType) + def invoke(v1: UTF8String, v2: UTF8String): Long = Seq(v1, v2).map(invokeBase).reduce(combineHashes) + } + + object Arg3 extends Base { + override def name: String = s"${funcName}_3" + override def inputTypes: Array[DataType] = Array.fill(3)(StringType) + def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String): Long = + Seq(v1, v2, v3).map(invokeBase).reduce(combineHashes) + } + + object Arg4 extends Base { + override def name: String = s"${funcName}_4" + override def inputTypes: Array[DataType] = Array.fill(4)(StringType) + def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String): Long = + Seq(v1, v2, v3, v4).map(invokeBase).reduce(combineHashes) + } + + object Arg5 extends Base { + override def name: String = s"${funcName}_5" + override def inputTypes: Array[DataType] = Array.fill(5)(StringType) + def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String, v5: UTF8String): Long = + Seq(v1, v2, v3, v4, v5).map(invokeBase).reduce(combineHashes) + } + private def isExceptedType(dt: DataType): Boolean = + dt.isInstanceOf[StringType] + + final override def name: String = funcName + final override def bind(inputType: StructType): BoundFunction = inputType.fields match { + case Array(StructField(_, dt, _, _)) if List(dt).forall(isExceptedType) => this.Arg1 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _) + ) if List(dt1, dt2).forall(isExceptedType) => + this.Arg2 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _), + StructField(_, dt3, _, _) + ) if List(dt1, dt2, dt3).forall(isExceptedType) => + this.Arg3 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _), + StructField(_, dt3, _, _), + StructField(_, dt4, _, _) + ) if List(dt1, dt2, dt3, dt4).forall(isExceptedType) => + this.Arg4 + case Array( + StructField(_, dt1, _, _), + StructField(_, dt2, _, _), + StructField(_, dt3, _, _), + StructField(_, dt4, _, _), + StructField(_, dt5, _, _) + ) if List(dt1, dt2, dt3, dt4, dt5).forall(isExceptedType) => + this.Arg5 + case _ => throw new UnsupportedOperationException(s"Expect up to 5 STRING arguments. $description") + } + + protected def funcName: String + override val ckFuncNames: Array[String] + override def description: String = s"$name: (value: string, ...)
=> hash_value: long" + def invokeBase(value: UTF8String): Long + def combineHashes(v1: Long, v2: Long): Long +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala new file mode 100644 index 00000000..9ba35f10 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func + +object Util { + def intHash64Impl(x: Long): Long = + // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Functions/FunctionsHashing.h#L143 + intHash64(x ^ 0x4cf2d2baae6da887L) + + def intHash64(l: Long): Long = { + // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Common/HashTable/Hash.h#L28 + var x = l + x ^= x >>> 33; + x *= 0xff51afd7ed558ccdL; + x ^= x >>> 33; + x *= 0xc4ceb9fe1a85ec53L; + x ^= x >>> 33; + x + } + + def int32Impl(x: Long): Int = + // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Functions/FunctionsHashing.h#L133 + intHash32(x, 0x75d9543de018bf45L) + + def intHash32(l: Long, salt: Long): Int = { + // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Common/HashTable/Hash.h#L502 + var x = l + + x ^= salt; + x = (~x) + (x << 18) + x = x ^ ((x >>> 31) | (x << 33)) + x = x * 21 + x = x ^ ((x >>> 11) | (x << 53)) + x = x + (x << 6) + x = x ^ ((x >>> 22) | (x << 42)) + x.toInt + } + + def toUInt32Range(v: Long): Long = if (v < 0) v + (1L << 32) else v +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala index 49daaeae..052be5f9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala @@ -14,64 +14,38 @@ package xenon.clickhouse.func.clickhouse -import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.commons.codec.digest.{MurmurHash2, MurmurHash3} import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.ClickhouseEquivFunction +import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} -object MurmurHash2_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - - override def name: String = "clickhouse_murmurHash2_64" - - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash2_64 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L460 + override protected 
def funcName: String = "clickhouse_murmurHash2_64" override val ckFuncNames: Array[String] = Array("murmurHash2_64") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") - } - - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { + override def invokeBase(value: UTF8String): Long = { // ignore UInt64 vs Int64 - val data = values.getBytes + val data = value.getBytes MurmurHash2.hash64(data, data.length, 0) } -} - -object MurmurHash2_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - override def name: String = "clickhouse_murmurHash2_32" + override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 +} - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash2_32 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + override protected def funcName: String = "clickhouse_murmurHash2_32" override val ckFuncNames: Array[String] = Array("murmurHash2_32") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") + override def invokeBase(value: UTF8String): Long = { + val data = value.getBytes + val v = MurmurHash2.hash32(data, data.length, 0) + Util.toUInt32Range(v) } - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { - val data = values.getBytes - val v = MurmurHash2.hash32(data, data.length, 0).toLong - if (v < 0) v + (1L << 32) else v - } + override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala index db15a8e7..f353d1e7 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala @@ -18,61 +18,35 @@ import org.apache.commons.codec.digest.MurmurHash3 import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.ClickhouseEquivFunction +import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} -object MurmurHash3_64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - - override def name: String = "clickhouse_murmurHash3_64" - - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash3_64 extends MultiArgsHash { + // 
https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L543 + override protected def funcName: String = "clickhouse_murmurHash3_64" override val ckFuncNames: Array[String] = Array("murmurHash3_64") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. $description") - } - - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { + override def invokeBase(value: UTF8String): Long = { // ignore UInt64 vs Int64 - val data = values.getBytes + val data = value.getBytes val hashes = MurmurHash3.hash128x64(data, 0, data.length, 0) hashes(0) ^ hashes(1) } -} - -object MurmurHash3_32 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - override def name: String = "clickhouse_murmurHash3_32" + override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 +} - override def canonicalName: String = s"clickhouse.$name" +object MurmurHash3_32 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + override protected def funcName: String = "clickhouse_murmurHash3_32" override val ckFuncNames: Array[String] = Array("murmurHash3_32") - override def description: String = s"$name: (value: string) => hash_value: long" - - override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, StringType, _, _)) => this - case _ => throw new UnsupportedOperationException(s"Expect 1 STRING argument. 
$description") + override def invokeBase(value: UTF8String): Long = { + val data = value.getBytes + val v = MurmurHash3.hash32x86(data, 0, data.length, 0) + Util.toUInt32Range(v) } - override def inputTypes: Array[DataType] = Array(StringType) - - override def resultType: DataType = LongType - - override def isResultNullable: Boolean = false - - def invoke(values: UTF8String): Long = { - val data = values.getBytes - val v = MurmurHash3.hash32x86(data, 0, data.length, 0).toLong - if (v < 0) v + (1L << 32) else v - } + override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) } From af14b3a9fb8706503d332c79fe35a8958de507b2 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Wed, 24 May 2023 13:52:37 +0800 Subject: [PATCH 08/20] Spark 3.4: add CityHash64 --- .../ClickHouseClusterHashUDFSuite.scala | 22 +- .../ClusterShardByTransformSuite.scala | 3 +- .../clickhouse/func/FunctionRegistry.scala | 1 + .../func/clickhouse/CityHash64.scala | 40 ++ .../clickhouse/cityhash/CityHash_v1_0_2.java | 344 ++++++++++++++++++ .../func/clickhouse/cityhash/UInt128.java | 34 ++ 6 files changed, 439 insertions(+), 5 deletions(-) create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java create mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala index b3556258..adf3d9de 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -55,7 +55,10 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { ).head.getString(0) val clickhouseResultJson = om.readTree(clickhouseResultJsonStr) val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText) - assert(sparkHashVal == clickhouseHashVal) + assert( + sparkHashVal == clickhouseHashVal, + s"ck_function: $funcCkName, spark_function: $funcSparkName, args: ($stringVal)" + ) } Seq( @@ -63,11 +66,20 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { "clickhouse_murmurHash3_64", "clickhouse_murmurHash3_32", "clickhouse_murmurHash2_64", - "clickhouse_murmurHash2_32" + "clickhouse_murmurHash2_32", + "clickhouse_cityHash64" ).foreach { funcSparkName => val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) test(s"UDF $funcSparkName") { - Seq("spark-clickhouse-connector", "Apache Spark", "ClickHouse", "Yandex", "热爱", "🇨🇳").foreach { rawStringVal => + Seq( + "spark-clickhouse-connector", + "Apache Spark", + "ClickHouse", + "Yandex", + "热爱", + "在传统的行式数据库系统中,数据按如下顺序存储:", + "🇨🇳" + ).foreach { rawStringVal => val stringVal = s"\'$rawStringVal\'" runTest(funcSparkName, funcCkName, stringVal) } @@ -78,7 +90,8 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { "clickhouse_murmurHash3_64", "clickhouse_murmurHash3_32", "clickhouse_murmurHash2_64", - "clickhouse_murmurHash2_32" + "clickhouse_murmurHash2_32", + "clickhouse_cityHash64" ).foreach { 
funcSparkName => val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName) test(s"UDF $funcSparkName multiple args") { @@ -88,6 +101,7 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { "\'ClickHouse\'", "\'Yandex\'", "\'热爱\'", + "\'在传统的行式数据库系统中,数据按如下顺序存储:\'", "\'🇨🇳\'" ) val test_5 = strings.combinations(5) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index db8a3036..21e984bc 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -102,7 +102,8 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { ("murmurHash2_64", Array("value")), ("murmurHash2_32", Array("value")), ("murmurHash3_64", Array("value")), - ("murmurHash3_32", Array("value")) + ("murmurHash3_32", Array("value")), + ("cityHash64", Array("value")) ).foreach { case (func_name: String, func_args: Array[String]) => test(s"shard by $func_name")(runTest(func_name, func_args)) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index fd12edc1..a509f07e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -54,6 +54,7 @@ object StaticFunctionRegistry extends FunctionRegistry { "clickhouse_murmurHash2_64" -> MurmurHash2_64, "clickhouse_murmurHash3_32" -> MurmurHash3_32, "clickhouse_murmurHash3_64" -> MurmurHash3_64, + "clickhouse_cityHash64" -> CityHash64, "clickhouse_years" -> Years, "clickhouse_months" -> Months, "clickhouse_days" -> Days, diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala new file mode 100644 index 00000000..fa599cbd --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala @@ -0,0 +1,40 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package xenon.clickhouse.func.clickhouse + +import io.netty.buffer.{ByteBuf, Unpooled} +import org.apache.spark.unsafe.types.UTF8String +import xenon.clickhouse.func.MultiArgsHash +import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128} + +object CityHash64 extends MultiArgsHash { + // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L694 + + override protected def funcName: String = "clickhouse_cityHash64" + override val ckFuncNames: Array[String] = Array("cityHash64") + + def convertToByteBuf(array: Array[Byte]): ByteBuf = { + val byteBuf = Unpooled.buffer(array.length).writeBytes(array) + byteBuf + } + + override def invokeBase(value: UTF8String): Long = { + // ignore UInt64 vs Int64 + val data = value.getBytes + CityHash_v1_0_2.CityHash64(convertToByteBuf(data), 0, data.length) + } + + override def combineHashes(v1: Long, v2: Long): Long = CityHash_v1_0_2.Hash128to64(new UInt128(v1, v2)) +} diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java new file mode 100644 index 00000000..df218df3 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java @@ -0,0 +1,344 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package xenon.clickhouse.func.clickhouse.cityhash; + +import io.netty.buffer.ByteBuf; + +// copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/CityHash_v1_0_2.java +// fixed some bugs involving int32 to uint32 conversion +final public class CityHash_v1_0_2 { + + private static final long kMul = 0x9ddfea08eb382d69L; + // Some primes between 2^63 and 2^64 for various uses. + private static final long k0 = 0xc3a5c85c97cb3127L; + private static final long k1 = 0xb492b66fbe98f273L; + private static final long k2 = 0x9ae16a3b2f90404fL; + private static final long k3 = 0xc949d7c7509e6557L; + + private CityHash_v1_0_2() { /* restricted */ } + + private static long Fetch64(ByteBuf p, int index) { + return p.getLongLE(index); + } + + private static int Fetch32(ByteBuf p, int index) { + return p.getIntLE(index); + } + + private static long toUint32(int x) { + return x & 0xFFFFFFFFL; + } + + // Equivalent to Rotate(), but requires the second arg to be non-zero. +// On x86-64, and probably others, it's possible for this to compile +// to a single instruction if both args are already in registers. + private static long RotateByAtLeast1(long val, int shift) { + return (val >>> shift) | (val << (64 - shift)); + } + + private static long ShiftMix(long val) { + return val ^ (val >>> 47); + } + + private static long Uint128Low64(UInt128 x) { + return x.first; + } + + private static long Rotate(long val, int shift) { + return shift == 0 ? 
val : (val >>> shift) | (val << (64 - shift)); + } + + private static long Uint128High64(UInt128 x) { + return x.second; + } + + // Hash 128 input bits down to 64 bits of output. +// This is intended to be a reasonably good hash function. + public static long Hash128to64(UInt128 x) { + // Murmur-inspired hashing. + long a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >>> 47); + long b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >>> 47); + b *= kMul; + return b; + } + + private static long HashLen16(long u, long v) { + return Hash128to64(UInt128.of(u, v)); + } + + private static long HashLen0to16(ByteBuf s, int index, int len) { + if (len > 8) { + long a = Fetch64(s, index); + long b = Fetch64(s, index + len - 8); + return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; + } + if (len >= 4) { + long a = toUint32(Fetch32(s, index)); + return HashLen16(len + (a << 3), toUint32(Fetch32(s, index + len - 4))); + } + if (len > 0) { + byte a = s.getByte(index); + byte b = s.getByte(index + len >>> 1); + byte c = s.getByte(index + len - 1); + int y = (a & 0xFF) + ((b & 0xFF) << 8); + int z = len + ((c & 0xFF) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; + } + + // This probably works well for 16-byte strings as well, but it may be overkill +// in that case. + private static long HashLen17to32(ByteBuf s, int index, int len) { + long a = Fetch64(s, index) * k1; + long b = Fetch64(s, index + 8); + long c = Fetch64(s, index + len - 8) * k2; + long d = Fetch64(s, index + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) - c + len); + } + + // Return a 16-byte hash for 48 bytes. Quick and dirty. +// Callers do best to use "random-looking" values for a and b. + private static UInt128 WeakHashLen32WithSeeds( + long w, long x, long y, long z, long a, long b) { + a += w; + b = Rotate(b + a + z, 21); + long c = a; + a += x; + a += y; + b += Rotate(a, 44); + return UInt128.of(a + z, b + c); + } + + // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. + private static UInt128 WeakHashLen32WithSeeds(ByteBuf s, int index, long a, long b) { + return WeakHashLen32WithSeeds(Fetch64(s, index), + Fetch64(s, index + 8), + Fetch64(s, index + 16), + Fetch64(s, index + 24), + a, + b); + } + + // Return an 8-byte hash for 33 to 64 bytes. + private static long HashLen33to64(ByteBuf s, int index, int len) { + long z = Fetch64(s, index + 24); + long a = Fetch64(s, index) + (len + Fetch64(s, index + len - 16)) * k0; + long b = Rotate(a + z, 52); + long c = Rotate(a, 37); + a += Fetch64(s, index + 8); + c += Rotate(a, 7); + a += Fetch64(s, index + 16); + long vf = a + z; + long vs = b + Rotate(a, 31) + c; + a = Fetch64(s, index + 16) + Fetch64(s, index + len - 32); + z = Fetch64(s, index + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s, index + len - 24); + c += Rotate(a, 7); + a += Fetch64(s, index + len - 16); + long wf = a + z; + long ws = b + Rotate(a, 31) + c; + long r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; + } + + // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in ssize_t. Based on City and Murmur. 
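+    // Note: the 128-bit variants that follow (CityMurmur, CityHash128WithSeed, CityHash128) come with the
+    // ported code; the Spark cityHash64 UDF in this patch only calls CityHash64 and Hash128to64.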
+ private static UInt128 CityMurmur(ByteBuf s, int index, int len, UInt128 seed) { + long a = Uint128Low64(seed); + long b = Uint128High64(seed); + long c; + long d; + int l = len - 16; + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + HashLen0to16(s, index, len); + d = ShiftMix(a + (len >= 8 ? Fetch64(s, index) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s, index + len - 8) + k1, a); + d = HashLen16(b + len, c + Fetch64(s, index + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s, index) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s, index + 8) * k1) * k1; + c *= k1; + d ^= c; + index += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return UInt128.of(a ^ b, HashLen16(b, a)); + } + + public static long CityHash64(ByteBuf s, int index, int len) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, index, len); + } else { + return HashLen17to32(s, index, len); + } + } else if (len <= 64) { + return HashLen33to64(s, index, len); + } + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. + long x = Fetch64(s, index); + long y = Fetch64(s, index + len - 16) ^ k1; + long z = Fetch64(s, index + len - 56) ^ k0; + UInt128 v = WeakHashLen32WithSeeds(s, len - 64, len, y); + UInt128 w = WeakHashLen32WithSeeds(s, len - 32, len * k1, k0); + z += ShiftMix(v.second) * k1; + x = Rotate(z + x, 39) * k1; + y = Rotate(y, 33) * k1; + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. + len = (len - 1) & ~63; + do { + x = Rotate(x + y + v.first + Fetch64(s, index + 16), 37) * k1; + y = Rotate(y + v.second + Fetch64(s, index + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, index, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s, index + 32, z + w.second, y); + // swap + long t = z; + z = x; + x = t; + index += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); + } + + private static long CityHash64WithSeed(ByteBuf s, int index, int len, long seed) { + return CityHash64WithSeeds(s, index, len, k2, seed); + } + + private static long CityHash64WithSeeds(ByteBuf s, int index, int len, + long seed0, long seed1) { + return HashLen16(CityHash64(s, index, len) - seed0, seed1); + } + + private static UInt128 CityHash128WithSeed(ByteBuf s, int index, int len, UInt128 seed) { + if (len < 128) { + return CityMurmur(s, index, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + UInt128 v, w; + long x = Uint128Low64(seed); + long y = Uint128High64(seed); + long z = len * k1; + long vFirst = Rotate(y ^ k1, 49) * k1 + Fetch64(s, index); + long vSecond = Rotate(vFirst, 42) * k1 + Fetch64(s, index + 8); + long wFirst = Rotate(y + z, 35) * k1 + x; + long wSecond = Rotate(x + Fetch64(s, index + 88), 53) * k1; + +// v = UInt128.of(vFirst, vSecond); +// w = UInt128.of(wFirst, wSecond); + + // This is the same inner loop as CityHash64(), manually unrolled. 
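+        // Each pass of the unrolled loop below consumes two 64-byte blocks (128 bytes), matching the
+        // `len >= 128` condition.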
+ do { + x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; + y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; + x ^= wSecond; + y ^= vFirst; + z = Rotate(z ^ wFirst, 33); + v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); + w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); + + vFirst = v.first; + vSecond = v.second; + wFirst = w.first; + wSecond = w.second; + { + long swap = z; + z = x; + x = swap; + } + index += 64; + x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; + y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; + x ^= wSecond; + y ^= vFirst; + z = Rotate(z ^ wFirst, 33); + v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); + w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); + + vFirst = v.first; + vSecond = v.second; + wFirst = w.first; + wSecond = w.second; + { + long swap = z; + z = x; + x = swap; + } + index += 64; + len -= 128; + } while (len >= 128); + y += Rotate(wFirst, 37) * k0 + z; + x += Rotate(vFirst + z, 49) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for (int tail_done = 0; tail_done < len; ) { + tail_done += 32; + y = Rotate(y - x, 42) * k0 + vSecond; + wFirst += Fetch64(s, index + len - tail_done + 16); + x = Rotate(x, 49) * k0 + wFirst; + wFirst += vFirst; + v = WeakHashLen32WithSeeds(s, index + len - tail_done, vFirst, vSecond); + + vFirst = v.first; + vSecond = v.second; + } + // At this point our 48 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 48-byte-to-8-byte hashes to get a 16-byte final result. + x = HashLen16(x, vFirst); + y = HashLen16(y, wFirst); + return UInt128.of(HashLen16(x + vSecond, wSecond) + y, + HashLen16(x + wSecond, y + vSecond)); + } + + public static UInt128 CityHash128(ByteBuf s, int len) { + if (len >= 16) { + return CityHash128WithSeed(s, 16, + len - 16, + UInt128.of(Fetch64(s, 0) ^ k3, + Fetch64(s, 8))); + } else if (len >= 8) { + return CityHash128WithSeed(null, + 0, 0, + UInt128.of(Fetch64(s, 0) ^ (len * k0), + Fetch64(s, len - 8) ^ k1)); + } else { + return CityHash128WithSeed(s, 0, len, UInt128.of(k0, k1)); + } + } +} + diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java new file mode 100644 index 00000000..2ba6c1f7 --- /dev/null +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package xenon.clickhouse.func.clickhouse.cityhash; + +/** + * @author Dmitriy Poluyanov + * @since 15/02/2018 + * copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/UInt128.java + */ +final public class UInt128 { + final public long first; + final public long second; + + public UInt128(long first, long second) { + this.first = first; + this.second = second; + } + + static UInt128 of(long first, long second) { + return new UInt128(first, second); + } +} From 22f191a4701e81b0c5b7f89ffc44fce014dc8f7b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 26 May 2023 11:13:02 +0800 Subject: [PATCH 09/20] Spark 3.4: Optimize sharding key handling when shuffle and sort, approach 3 --- .../spark/sql/clickhouse/ExprUtils.scala | 27 +++++++------------ .../write/WriteJobDescription.scala | 16 +++++++++-- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index a873fc4d..8ec72448 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -33,26 +33,11 @@ import scala.util.{Failure, Success, Try} class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { - private def toSplitWithModulo(shardingKey: Expr, cluster: ClusterSpec): FuncExpr = - FuncExpr("positiveModulo", List(shardingKey, StringLiteral(cluster.totalWeight.toString))) - def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray - def toSparkSplits( - shardingKey: Option[Expr], - partitionKey: Option[List[Expr]], - cluster: Option[ClusterSpec] - ): Array[Transform] = - // Pmod by total weight * constant. Note that this key will be further hashed by spark. Reasons of doing this: - // - Enlarged range of modulo to avoid hash collision of small number of shards, hence mitigate data skew caused - // by this. - // - Still distribute data from one shard to only a subset of executors. If we do not apply modulo (instead we - // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the - // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. 
- (shardingKey.map(k => - FuncExpr("positiveModulo", List(k, StringLiteral((cluster.get.totalWeight * 10).toString))) - ).seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] = + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], @@ -60,7 +45,10 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] ): Array[SortOrder] = - toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: + toSparkSplits( + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)), + partitionKey + ).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST @@ -158,4 +146,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S object ExprUtils { def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry) + + def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr = + FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString))) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 81a347ee..f0d9a5d9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -62,10 +62,22 @@ case class WriteJobDescription( } def sparkSplits: Array[Transform] = + // Pmod by total weight * constant. Note that this key will be further hashed by spark. Reasons of doing this: + // - Enlarged range of modulo to avoid hash collision of small number of shards, hence mitigate data skew caused + // by this. + // - Still distribute data from one shard to only a subset of executors. If we do not apply modulo (instead we + // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the + // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. 
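+    // Illustrative sketch (assumed cluster, not from the change itself): with 4 shards of weight 1 each
+    // (totalWeight = 4) and a table sharded by xxHash64(value), the split key built below is roughly
+    // positiveModulo(xxHash64(value), 40), which Spark hashes once more when it repartitions the write.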
if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, partitionKey, cluster) + ExprUtils(functionRegistry).toSparkSplits( + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + partitionKey + ) } else { - ExprUtils(functionRegistry).toSparkSplits(shardingKeyIgnoreRand, None, cluster) + ExprUtils(functionRegistry).toSparkSplits( + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + None + ) } def sparkSortOrders: Array[SortOrder] = { From ea5ed0e236d3862c6b4a63362ce0496b31faafcc Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 26 May 2023 15:00:47 +0800 Subject: [PATCH 10/20] Spark 3.4 UDF: Amend input type, Make clickhouse function nullable, better spark help text --- .../src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala | 2 +- .../main/scala/xenon/clickhouse/func/clickhouse/Days.scala | 5 +++-- .../main/scala/xenon/clickhouse/func/clickhouse/Hours.scala | 5 +++-- .../main/scala/xenon/clickhouse/func/clickhouse/Months.scala | 5 +++-- .../main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala | 2 ++ .../scala/xenon/clickhouse/func/clickhouse/XxHash64.scala | 2 ++ .../main/scala/xenon/clickhouse/func/clickhouse/Years.scala | 5 +++-- 7 files changed, 17 insertions(+), 9 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala index dc635a27..555001fa 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala @@ -23,7 +23,7 @@ abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunctio // must not be private object, nor do it successors, because spark would compile them override def canonicalName: String = s"clickhouse.$name" override def resultType: DataType = LongType - override def isResultNullable: Boolean = false + override def toString: String = name } object Arg1 extends Base { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala index 9ceca80e..672fd44f 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala @@ -27,14 +27,15 @@ object Days extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqui override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toYYYYMMDD") override def description: String = s"$name: (date: Date) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, DateType, _, _)) => this - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this +// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala index 77dbe4c2..0abe25cb 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala @@ -27,13 +27,14 @@ object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqu override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toHour", "HOUR") override def description: String = s"$name: (time: timestamp) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this + case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. $description") } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala index 0be1bc9b..846dd245 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala @@ -27,14 +27,15 @@ object Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEq override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toYYYYMM") override def description: String = s"$name: (date: Date) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, DateType, _, _)) => this - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this +// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala index e9eafb8d..f7c3e228 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala @@ -24,6 +24,8 @@ object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEqu override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("positiveModulo", "positive_modulo", "pmod") override def description: String = s"$name: (a: long, b: long) => mod: long" diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala index f02af236..241ae9d8 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala @@ -33,6 +33,8 @@ object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("xxHash64") override def description: String = s"$name: (value: string) => hash_value: long" diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala index b3c0a135..4b2e650d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala @@ -27,14 +27,15 @@ object Years extends UnboundFunction with ScalarFunction[Int] with ClickhouseEqu override def canonicalName: String = s"clickhouse.$name" + override def toString: String = name + override val ckFuncNames: Array[String] = Array("toYear", "YEAR") override def description: String = s"$name: (date: Date) => shard_num: int" override def bind(inputType: StructType): BoundFunction = inputType.fields match { case Array(StructField(_, DateType, _, _)) => this - case Array(StructField(_, TimestampType, _, _)) => this - case Array(StructField(_, StringType, _, _)) => this +// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. 
$description") } From a8bdcbf1a58c0f9f16bdbe907b29a28047dce74d Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 30 May 2023 18:42:14 +0800 Subject: [PATCH 11/20] Spark 3.4: Optimize sharding key handling when shuffle and sort, amend approach 3 --- .../scala/xenon/clickhouse/write/WriteJobDescription.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index f0d9a5d9..4fb5afcf 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -70,12 +70,12 @@ case class WriteJobDescription( // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. if (writeOptions.repartitionByPartition) { ExprUtils(functionRegistry).toSparkSplits( - shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), partitionKey ) } else { ExprUtils(functionRegistry).toSparkSplits( - shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 10)), + shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), None ) } From 3dcdd81e421019a5fd00a41052522198f908bdaa Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 2 Jun 2023 10:46:12 +0800 Subject: [PATCH 12/20] Spark 3.4: Change ExprUtils to implicit --- .../ClusterShardByTransformSuite.scala | 2 +- .../spark/sql/clickhouse/ExprUtils.scala | 73 ++++++++++--------- .../xenon/clickhouse/ClickHouseCatalog.scala | 2 +- .../xenon/clickhouse/ClickHouseTable.scala | 4 +- .../clickhouse/write/ClickHouseWriter.scala | 2 +- .../write/WriteJobDescription.scala | 10 ++- 6 files changed, 50 insertions(+), 43 deletions(-) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index 21e984bc..32e4fc5d 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -106,7 +106,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { ("cityHash64", Array("value")) ).foreach { case (func_name: String, func_args: Array[String]) => - test(s"shard by $func_name")(runTest(func_name, func_args)) + test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args)) } } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 8ec72448..55350ebe 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -31,20 +31,24 @@ import xenon.clickhouse.spec.ClusterSpec import scala.util.{Failure, Success, Try} -class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable { +object ExprUtils 
extends SQLConfHelper with Serializable { - def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] = - partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray + def toSparkPartitions(partitionKey: Option[List[Expr]])(implicit + functionRegistry: FunctionRegistry + ): Array[Transform] = + partitionKey.seq.flatten.flatten(toSparkTransformOpt(_)).toArray - def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] = - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray + def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]])(implicit + functionRegistry: FunctionRegistry + ): Array[Transform] = + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_)).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], partitionKey: Option[List[Expr]], sortingKey: Option[List[OrderExpr]], cluster: Option[ClusterSpec] - ): Array[SortOrder] = + )(implicit functionRegistry: FunctionRegistry): Array[SortOrder] = toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)), partitionKey @@ -52,13 +56,15 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST - toSparkTransformOpt(expr).map(trans => Expressions.sort(trans, direction, nullOrder)) + toSparkTransformOpt(expr).map(trans => + Expressions.sort(trans, direction, nullOrder) + ) }.toArray private def loadV2FunctionOpt( name: String, args: Seq[Expression] - ): Option[BoundFunction] = { + )(implicit functionRegistry: FunctionRegistry): Option[BoundFunction] = { def loadFunction(ident: Identifier): UnboundFunction = functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident)) val inputType = StructType(args.zipWithIndex.map { @@ -77,7 +83,10 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S } } - def toCatalyst(v2Expr: V2Expression, fields: Array[StructField]): Expression = + def toCatalyst( + v2Expr: V2Expression, + fields: Array[StructField] + )(implicit functionRegistry: FunctionRegistry): Expression = v2Expr match { case IdentityTransform(ref) => toCatalyst(ref, fields) case ref: NamedReference if ref.fieldNames.length == 1 => @@ -88,9 +97,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S BoundReference(ordinal, field.dataType, field.nullable) case t: Transform => val catalystArgs = t.arguments().map(toCatalyst(_, fields)) - loadV2FunctionOpt(t.name(), catalystArgs).map { bound => - TransformExpression(bound, catalystArgs) - }.getOrElse { + loadV2FunctionOpt(t.name(), catalystArgs).map(bound => TransformExpression(bound, catalystArgs)).getOrElse { throw CHClientException(s"Unsupported expression: $v2Expr") } case _ => throw CHClientException( @@ -98,25 +105,27 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S ) } - def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkExpression(expr)) match { - // need this function because spark `Table`'s `partitioning` field should be `Transform` - case Success(t: Transform) => Some(t) - case Success(_) => None - case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None - case Failure(rethrow) => throw new 
AnalysisException(rethrow.getMessage, cause = Some(rethrow)) - } + def toSparkTransformOpt(expr: Expr)(implicit functionRegistry: FunctionRegistry): Option[Transform] = + Try(toSparkExpression(expr)) match { + // need this function because spark `Table`'s `partitioning` field should be `Transform` + case Success(t: Transform) => Some(t) + case Success(_) => None + case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None + case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow)) + } - def toSparkExpression(expr: Expr): V2Expression = expr match { - case FieldRef(col) => identity(col) - case StringLiteral(value) => literal(value) - case FuncExpr("rand", Nil) => apply("rand") - case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) - case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => - apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) - case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") - } + def toSparkExpression(expr: Expr)(implicit functionRegistry: FunctionRegistry): V2Expression = + expr match { + case FieldRef(col) => identity(col) + case StringLiteral(value) => literal(value) + case FuncExpr("rand", Nil) => apply("rand") + case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) + case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => + apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) + case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") + } - def toClickHouse(transform: Transform): Expr = transform match { + def toClickHouse(transform: Transform)(implicit functionRegistry: FunctionRegistry): Expr = transform match { case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) @@ -128,7 +137,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S primarySchema: StructType, secondarySchema: StructType, transform: Transform - ): StructField = transform match { + )(implicit functionRegistry: FunctionRegistry): StructField = transform match { case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) @@ -142,10 +151,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with S case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket") case other: Transform => throw CHClientException(s"Unsupported transform: $other") } -} - -object ExprUtils { - def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry) def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr = FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString))) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index 5fd043cd..83327fa4 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ 
b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -209,7 +209,7 @@ class ClickHouseCatalog extends TableCatalog val partitionsClause = partitions match { case transforms if transforms.nonEmpty => - transforms.map(ExprUtils(functionRegistry).toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")") + transforms.map(ExprUtils.toClickHouse(_)(functionRegistry).sql).mkString("PARTITION BY (", ", ", ")") case _ => "" } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala index f4e19071..ced6e07e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala @@ -133,11 +133,11 @@ case class ClickHouseTable( private lazy val metadataSchema: StructType = StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField)) - override lazy val partitioning: Array[Transform] = ExprUtils(functionRegistry).toSparkPartitions(partitionKey) + override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey)(functionRegistry) override lazy val partitionSchema: StructType = StructType( partitioning.map(partTransform => - ExprUtils(functionRegistry).inferTransformSchema(schema, metadataSchema, partTransform) + ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)(functionRegistry) ) ) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index 3cd43c5e..c9e14c2d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -63,7 +63,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardExpr: Option[Expression] = writeJob.sparkShardExpr match { case None => None case Some(v2Expr) => - val catalystExpr = ExprUtils(writeJob.functionRegistry).toCatalyst(v2Expr, writeJob.dataSetSchema.fields) + val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields)(writeJob.functionRegistry) catalystExpr match { case BoundReference(_, dataType, _) if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType` diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index 4fb5afcf..bb6cca02 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -41,6 +41,8 @@ case class WriteJobDescription( functionRegistry: FunctionRegistry ) { + implicit val _functionRegistry: FunctionRegistry = functionRegistry + def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match { case dist: DistributedEngineSpec if convert2Local => dist.local_db case _ => tableSpec.database @@ -57,7 +59,7 @@ case class WriteJobDescription( } def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match { - case Some(expr) => ExprUtils(functionRegistry).toSparkTransformOpt(expr) + case Some(expr) => ExprUtils.toSparkTransformOpt(expr) case _ => None } @@ 
-69,12 +71,12 @@ case class WriteJobDescription( // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the // front for all tasks, resulting in instant high pressure for shard 1 when stage starts. if (writeOptions.repartitionByPartition) { - ExprUtils(functionRegistry).toSparkSplits( + ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), partitionKey ) } else { - ExprUtils(functionRegistry).toSparkSplits( + ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), None ) @@ -83,6 +85,6 @@ case class WriteJobDescription( def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) + ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) } } From 386ddb0daa420f82eb3874f319785cba841facc2 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Sun, 25 Jun 2023 13:51:20 +0800 Subject: [PATCH 13/20] Spark 3.4 UDF: clickhouse code reference using tag from commit hash --- .../src/main/scala/xenon/clickhouse/func/Util.scala | 8 ++++---- .../xenon/clickhouse/func/clickhouse/CityHash64.scala | 2 +- .../xenon/clickhouse/func/clickhouse/MurmurHash2.scala | 4 ++-- .../xenon/clickhouse/func/clickhouse/MurmurHash3.scala | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala index 9ba35f10..ac7c331f 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Util.scala @@ -16,11 +16,11 @@ package xenon.clickhouse.func object Util { def intHash64Impl(x: Long): Long = - // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Functions/FunctionsHashing.h#L143 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L140 intHash64(x ^ 0x4cf2d2baae6da887L) def intHash64(l: Long): Long = { - // https://github.com/ClickHouse/ClickHouse/blob/f4c73e94d21c6de0b1af7da3c42c2db6bf97fc73/src/Common/HashTable/Hash.h#L28 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Common/HashTable/Hash.h#L26 var x = l x ^= x >>> 33; x *= 0xff51afd7ed558ccdL; @@ -31,11 +31,11 @@ object Util { } def int32Impl(x: Long): Int = - // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Functions/FunctionsHashing.h#L133 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L133 intHash32(x, 0x75d9543de018bf45L) def intHash32(l: Long, salt: Long): Int = { - // https://github.com/ClickHouse/ClickHouse/blob/a05088ab731f1e625ce5197829f59b765c94474f/src/Common/HashTable/Hash.h#L502 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Common/HashTable/Hash.h#L502 var x = l x ^= salt; diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala index fa599cbd..160d45e9 100644 --- 
a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala @@ -20,7 +20,7 @@ import xenon.clickhouse.func.MultiArgsHash import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128} object CityHash64 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L694 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694 override protected def funcName: String = "clickhouse_cityHash64" override val ckFuncNames: Array[String] = Array("cityHash64") diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala index 052be5f9..f2ff9ed2 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala @@ -21,7 +21,7 @@ import org.apache.spark.unsafe.types.UTF8String import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} object MurmurHash2_64 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L460 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460 override protected def funcName: String = "clickhouse_murmurHash2_64" override val ckFuncNames: Array[String] = Array("murmurHash2_64") @@ -36,7 +36,7 @@ object MurmurHash2_64 extends MultiArgsHash { } object MurmurHash2_32 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 override protected def funcName: String = "clickhouse_murmurHash2_32" override val ckFuncNames: Array[String] = Array("murmurHash2_32") diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala index f353d1e7..1db654c1 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala @@ -21,7 +21,7 @@ import org.apache.spark.unsafe.types.UTF8String import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} object MurmurHash3_64 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L543 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L543 override protected def funcName: String = "clickhouse_murmurHash3_64" override val ckFuncNames: Array[String] = Array("murmurHash3_64") @@ -37,7 +37,7 @@ object MurmurHash3_64 extends MultiArgsHash { } object MurmurHash3_32 extends MultiArgsHash { - // https://github.com/ClickHouse/ClickHouse/blob/a4fe3fbb1f288b4e066eb3781b2c7b9e238a4aa3/src/Functions/FunctionsHashing.h#L519 + // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 
override protected def funcName: String = "clickhouse_murmurHash3_32" override val ckFuncNames: Array[String] = Array("murmurHash3_32") From 286c21faae13f5a9f33ded1f5f3d45755d9a88c3 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Mon, 26 Jun 2023 18:04:44 +0800 Subject: [PATCH 14/20] Spark 3.4 UDF: support varargs for Hash UDFs --- .../xenon/clickhouse/func/MultiArgsHash.scala | 85 ++++--------------- 1 file changed, 17 insertions(+), 68 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala index 555001fa..adc3a382 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala @@ -14,83 +14,32 @@ package xenon.clickhouse.func +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction { - trait Base extends ScalarFunction[Long] { - // must not be private object, nor do it successors, because spark would compile them - override def canonicalName: String = s"clickhouse.$name" - override def resultType: DataType = LongType - override def toString: String = name - } - - object Arg1 extends Base { - override def name: String = s"${funcName}_1" - override def inputTypes: Array[DataType] = Array.fill(1)(StringType) - def invoke(value: UTF8String): Long = invokeBase(value) - } - - object Arg2 extends Base { - override def name: String = s"${funcName}_2" - override def inputTypes: Array[DataType] = Array.fill(2)(StringType) - def invoke(v1: UTF8String, v2: UTF8String): Long = Seq(v1, v2).map(invokeBase).reduce(combineHashes) - } - - object Arg3 extends Base { - override def name: String = s"${funcName}_3" - override def inputTypes: Array[DataType] = Array.fill(3)(StringType) - def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String): Long = - Seq(v1, v2, v3).map(invokeBase).reduce(combineHashes) - } - - object Arg4 extends Base { - override def name: String = s"${funcName}_4" - override def inputTypes: Array[DataType] = Array.fill(4)(StringType) - def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String): Long = - Seq(v1, v2, v3, v4).map(invokeBase).reduce(combineHashes) - } - - object Arg5 extends Base { - override def name: String = s"${funcName}_4" - override def inputTypes: Array[DataType] = Array.fill(5)(StringType) - def invoke(v1: UTF8String, v2: UTF8String, v3: UTF8String, v4: UTF8String, v5: UTF8String): Long = - Seq(v1, v2, v3, v4, v5).map(invokeBase).reduce(combineHashes) - } private def isExceptedType(dt: DataType): Boolean = dt.isInstanceOf[StringType] final override def name: String = funcName - final override def bind(inputType: StructType): BoundFunction = inputType.fields match { - case Array(StructField(_, dt, _, _)) if List(dt).forall(isExceptedType) => this.Arg1 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _) - ) if List(dt1, dt2).forall(isExceptedType) => - this.Arg2 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _), - StructField(_, dt3, _, _) - ) if List(dt1, dt2, dt3).forall(isExceptedType) => - this.Arg3 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _), - StructField(_, 
dt3, _, _), - StructField(_, dt4, _, _) - ) if List(dt1, dt2, dt3, dt4).forall(isExceptedType) => - this.Arg4 - case Array( - StructField(_, dt1, _, _), - StructField(_, dt2, _, _), - StructField(_, dt3, _, _), - StructField(_, dt4, _, _), - StructField(_, dt5, _, _) - ) if List(dt1, dt2, dt3, dt4, dt5).forall(isExceptedType) => - this.Arg5 - case _ => throw new UnsupportedOperationException(s"Expect up to 5 STRING argument. $description") + final override def bind(inputType: StructType): BoundFunction = { + val inputDataTypes = inputType.fields.map(_.dataType) + if (inputDataTypes.forall(isExceptedType)) new ScalarFunction[Long] { + override def inputTypes(): Array[DataType] = inputDataTypes + override def name: String = funcName + override def canonicalName: String = s"clickhouse.$name" + override def resultType: DataType = LongType + override def toString: String = name + override def produceResult(input: InternalRow): Long = { + val inputStrings: Seq[UTF8String] = + input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]] + inputStrings.map(invokeBase).reduce(combineHashes) + } + } + else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description") + } protected def funcName: String From e5809f7b7d77117456864e62d8473d7b11dc2b0b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 27 Jun 2023 10:05:04 +0800 Subject: [PATCH 15/20] Spark 3.4: refactor implicit into normal arg in ExprUtils --- .../spark/sql/clickhouse/ExprUtils.scala | 60 +++++++++++-------- .../xenon/clickhouse/ClickHouseCatalog.scala | 2 +- .../xenon/clickhouse/ClickHouseTable.scala | 4 +- .../clickhouse/write/ClickHouseWriter.scala | 2 +- .../write/WriteJobDescription.scala | 12 ++-- 5 files changed, 46 insertions(+), 34 deletions(-) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 55350ebe..1626267c 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -33,38 +33,44 @@ import scala.util.{Failure, Success, Try} object ExprUtils extends SQLConfHelper with Serializable { - def toSparkPartitions(partitionKey: Option[List[Expr]])(implicit + def toSparkPartitions( + partitionKey: Option[List[Expr]], functionRegistry: FunctionRegistry ): Array[Transform] = - partitionKey.seq.flatten.flatten(toSparkTransformOpt(_)).toArray + partitionKey.seq.flatten.flatten(toSparkTransformOpt(_, functionRegistry)).toArray - def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]])(implicit + def toSparkSplits( + shardingKey: Option[Expr], + partitionKey: Option[List[Expr]], functionRegistry: FunctionRegistry ): Array[Transform] = - (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_)).toArray + (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_, functionRegistry)).toArray def toSparkSortOrders( shardingKeyIgnoreRand: Option[Expr], partitionKey: Option[List[Expr]], sortingKey: Option[List[OrderExpr]], - cluster: Option[ClusterSpec] - )(implicit functionRegistry: FunctionRegistry): Array[SortOrder] = + cluster: Option[ClusterSpec], + functionRegistry: FunctionRegistry + ): Array[SortOrder] = toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)), - partitionKey + partitionKey, + 
functionRegistry ).map(Expressions.sort(_, SortDirection.ASCENDING)) ++: sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) => val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST - toSparkTransformOpt(expr).map(trans => + toSparkTransformOpt(expr, functionRegistry).map(trans => Expressions.sort(trans, direction, nullOrder) ) }.toArray private def loadV2FunctionOpt( name: String, - args: Seq[Expression] - )(implicit functionRegistry: FunctionRegistry): Option[BoundFunction] = { + args: Seq[Expression], + functionRegistry: FunctionRegistry + ): Option[BoundFunction] = { def loadFunction(ident: Identifier): UnboundFunction = functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident)) val inputType = StructType(args.zipWithIndex.map { @@ -85,10 +91,11 @@ object ExprUtils extends SQLConfHelper with Serializable { def toCatalyst( v2Expr: V2Expression, - fields: Array[StructField] - )(implicit functionRegistry: FunctionRegistry): Expression = + fields: Array[StructField], + functionRegistry: FunctionRegistry + ): Expression = v2Expr match { - case IdentityTransform(ref) => toCatalyst(ref, fields) + case IdentityTransform(ref) => toCatalyst(ref, fields, functionRegistry) case ref: NamedReference if ref.fieldNames.length == 1 => val (field, ordinal) = fields .zipWithIndex @@ -96,17 +103,18 @@ object ExprUtils extends SQLConfHelper with Serializable { .getOrElse(throw CHClientException(s"Invalid field reference: $ref")) BoundReference(ordinal, field.dataType, field.nullable) case t: Transform => - val catalystArgs = t.arguments().map(toCatalyst(_, fields)) - loadV2FunctionOpt(t.name(), catalystArgs).map(bound => TransformExpression(bound, catalystArgs)).getOrElse { - throw CHClientException(s"Unsupported expression: $v2Expr") - } + val catalystArgs = t.arguments().map(toCatalyst(_, fields, functionRegistry)) + loadV2FunctionOpt(t.name(), catalystArgs, functionRegistry) + .map(bound => TransformExpression(bound, catalystArgs)).getOrElse { + throw CHClientException(s"Unsupported expression: $v2Expr") + } case _ => throw CHClientException( s"Unsupported expression: $v2Expr" ) } - def toSparkTransformOpt(expr: Expr)(implicit functionRegistry: FunctionRegistry): Option[Transform] = - Try(toSparkExpression(expr)) match { + def toSparkTransformOpt(expr: Expr, functionRegistry: FunctionRegistry): Option[Transform] = + Try(toSparkExpression(expr, functionRegistry)) match { // need this function because spark `Table`'s `partitioning` field should be `Transform` case Success(t: Transform) => Some(t) case Success(_) => None @@ -114,18 +122,21 @@ object ExprUtils extends SQLConfHelper with Serializable { case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow)) } - def toSparkExpression(expr: Expr)(implicit functionRegistry: FunctionRegistry): V2Expression = + def toSparkExpression(expr: Expr, functionRegistry: FunctionRegistry): V2Expression = expr match { case FieldRef(col) => identity(col) case StringLiteral(value) => literal(value) case FuncExpr("rand", Nil) => apply("rand") case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col) case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) => - apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*) + apply(functionRegistry.getFuncMappingByCk(funName), 
args.map(toSparkExpression(_, functionRegistry)): _*) case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported") } - def toClickHouse(transform: Transform)(implicit functionRegistry: FunctionRegistry): Expr = transform match { + def toClickHouse( + transform: Transform, + functionRegistry: FunctionRegistry + ): Expr = transform match { case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe) case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) => FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList) @@ -136,8 +147,9 @@ object ExprUtils extends SQLConfHelper with Serializable { def inferTransformSchema( primarySchema: StructType, secondarySchema: StructType, - transform: Transform - )(implicit functionRegistry: FunctionRegistry): StructField = transform match { + transform: Transform, + functionRegistry: FunctionRegistry + ): StructField = transform match { case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col) .orElse(secondarySchema.find(_.name == col)) .getOrElse(throw CHClientException(s"Invalid partition column: $col")) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index 83327fa4..caff6a50 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -209,7 +209,7 @@ class ClickHouseCatalog extends TableCatalog val partitionsClause = partitions match { case transforms if transforms.nonEmpty => - transforms.map(ExprUtils.toClickHouse(_)(functionRegistry).sql).mkString("PARTITION BY (", ", ", ")") + transforms.map(ExprUtils.toClickHouse(_, functionRegistry).sql).mkString("PARTITION BY (", ", ", ")") case _ => "" } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala index ced6e07e..eda3a1a4 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala @@ -133,11 +133,11 @@ case class ClickHouseTable( private lazy val metadataSchema: StructType = StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField)) - override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey)(functionRegistry) + override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey, functionRegistry) override lazy val partitionSchema: StructType = StructType( partitioning.map(partTransform => - ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)(functionRegistry) + ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform, functionRegistry) ) ) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index c9e14c2d..65a4bc33 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -63,7 +63,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val 
shardExpr: Option[Expression] = writeJob.sparkShardExpr match { case None => None case Some(v2Expr) => - val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields)(writeJob.functionRegistry) + val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields, writeJob.functionRegistry) catalystExpr match { case BoundReference(_, dataType, _) if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType` diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala index bb6cca02..646d6ca5 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala @@ -41,8 +41,6 @@ case class WriteJobDescription( functionRegistry: FunctionRegistry ) { - implicit val _functionRegistry: FunctionRegistry = functionRegistry - def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match { case dist: DistributedEngineSpec if convert2Local => dist.local_db case _ => tableSpec.database @@ -59,7 +57,7 @@ case class WriteJobDescription( } def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match { - case Some(expr) => ExprUtils.toSparkTransformOpt(expr) + case Some(expr) => ExprUtils.toSparkTransformOpt(expr, functionRegistry) case _ => None } @@ -73,18 +71,20 @@ case class WriteJobDescription( if (writeOptions.repartitionByPartition) { ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), - partitionKey + partitionKey, + functionRegistry ) } else { ExprUtils.toSparkSplits( shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)), - None + None, + functionRegistry ) } def sparkSortOrders: Array[SortOrder] = { val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None - ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster) + ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster, functionRegistry) } } From 5ae4f3df1fb53972355e308b2a84ead667d85abc Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 27 Jun 2023 15:50:48 +0800 Subject: [PATCH 16/20] Spark 3.4: Cast type when calling projection, support recursive resolve (cherry picked from commit 936a18af65c37ca2cfad97b63645e579632ff72d) --- .../ClusterShardByTransformSuite.scala | 7 +++- .../spark/sql/clickhouse/ExprUtils.scala | 25 ++++++++++- .../clickhouse/write/ClickHouseWriter.scala | 41 ++++++++----------- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index 32e4fc5d..06b7f9b4 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -94,16 +94,21 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { } Seq( + // wait for SPARK-44180 to be fixed, then add 
implicit cast test cases ("toYear", Array("create_date")), +// ("toYear", Array("create_time")), ("toYYYYMM", Array("create_date")), +// ("toYYYYMM", Array("create_time")), ("toYYYYMMDD", Array("create_date")), +// ("toYYYYMMDD", Array("create_time")), ("toHour", Array("create_time")), ("xxHash64", Array("value")), ("murmurHash2_64", Array("value")), ("murmurHash2_32", Array("value")), ("murmurHash3_64", Array("value")), ("murmurHash3_32", Array("value")), - ("cityHash64", Array("value")) + ("cityHash64", Array("value")), + ("positiveModulo", Array("toYYYYMM(create_date)", "10")) ).foreach { case (func_name: String, func_args: Array[String]) => test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args)) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 1626267c..8c2f6d6d 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -15,9 +15,15 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.NoSuchFunctionException -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, TransformExpression} +import org.apache.spark.sql.catalyst.{expressions, SQLConfHelper} +import org.apache.spark.sql.catalyst.expressions.{ + BoundReference, + Cast, + Expression, + TransformExpression, + V2ExpressionUtils +} import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.IGNORE_UNSUPPORTED_TRANSFORM import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} @@ -89,6 +95,20 @@ object ExprUtils extends SQLConfHelper with Serializable { } } + def resolveTransformCatalyst( + catalystExpr: Expression, + timeZoneId: Option[String] = None + ): Expression = catalystExpr match { + case TransformExpression(function: ScalarFunction[_], args, _) => + val resolvedArgs: Seq[Expression] = args.map(resolveTransformCatalyst(_, timeZoneId)) + val castedArgs: Seq[Expression] = resolvedArgs.zip(function.inputTypes()).map { + case (arg, expectedType) if !arg.dataType.sameType(expectedType) => Cast(arg, expectedType, timeZoneId) + case (arg, _) => arg + } + V2ExpressionUtils.resolveScalarFunction(function, castedArgs) + case other => other + } + def toCatalyst( v2Expr: V2Expression, fields: Array[StructField], @@ -108,6 +128,7 @@ object ExprUtils extends SQLConfHelper with Serializable { .map(bound => TransformExpression(bound, catalystArgs)).getOrElse { throw CHClientException(s"Unsupported expression: $v2Expr") } + case literal: LiteralValue[Any] => expressions.Literal(literal.value) case _ => throw CHClientException( s"Unsupported expression: $v2Expr" ) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala index 65a4bc33..a6b5a5fe 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala @@ -17,16 +17,9 @@ package xenon.clickhouse.write import com.clickhouse.client.ClickHouseProtocol import 
com.clickhouse.data.ClickHouseCompression import org.apache.commons.io.IOUtils -import org.apache.spark.sql.catalyst.expressions.{ - BoundReference, - Expression, - SafeProjection, - TransformExpression, - V2ExpressionUtils -} +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression} import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.clickhouse.ExprUtils -import org.apache.spark.sql.connector.catalog.functions.ScalarFunction import org.apache.spark.sql.connector.metric.CustomTaskMetric import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} import org.apache.spark.sql.types._ @@ -86,23 +79,21 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription) protected lazy val shardProjection: Option[expressions.Projection] = shardExpr .filter(_ => writeJob.writeOptions.convertDistributedToLocal) - .flatMap(expr => - expr match { - case BoundReference(_, _, _) => - Some(SafeProjection.create(Seq(expr))) - case TransformExpression(function, args, _) => - val retType = function.resultType() match { - case ByteType => classOf[Byte] - case ShortType => classOf[Short] - case IntegerType => classOf[Int] - case LongType => classOf[Long] - case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," + - s"sharding field: ${function.resultType()}") - } - val expr = V2ExpressionUtils.resolveScalarFunction(function.asInstanceOf[ScalarFunction[retType.type]], args) - Some(SafeProjection.create(Seq(expr))) - } - ) + .flatMap { + case expr: BoundReference => + Some(SafeProjection.create(Seq(expr))) + case expr @ TransformExpression(function, _, _) => + // result type must be integer class + function.resultType() match { + case ByteType => classOf[Byte] + case ShortType => classOf[Short] + case IntegerType => classOf[Int] + case LongType => classOf[Long] + case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," + + s"sharding field: ${function.resultType()}") + } + Some(SafeProjection.create(Seq(ExprUtils.resolveTransformCatalyst(expr, Some(writeJob.tz.getId))))) + } // put the node select strategy in executor side because we need to calculate shard and don't know the records // util DataWriter#write(InternalRow) invoked. 
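Note on the projection change above: the shard projection now goes through ExprUtils.resolveTransformCatalyst, which recursively resolves nested TransformExpressions and wraps any argument whose type differs from the ScalarFunction's declared input type in a Cast before calling V2ExpressionUtils.resolveScalarFunction. Below is a minimal, self-contained sketch of the same cast-then-project idea against a TIMESTAMP shard column; it is illustrative only (the object name and literal timestamp are made up for the example) and does not reproduce the connector's TransformExpression path.

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast, SafeProjection}
    import org.apache.spark.sql.types.{DateType, TimestampType}

    object CastThenProjectSketch {
      def main(args: Array[String]): Unit = {
        // column 0 is a TIMESTAMP (micros since epoch), but the sharding function wants a DATE,
        // so the argument is wrapped in a Cast before the projection is built
        val createTime = BoundReference(0, TimestampType, nullable = false)
        val asDate     = Cast(createTime, DateType, Some("UTC"))

        val projection = SafeProjection.create(Seq(asDate))
        val row = InternalRow(1609495810000000L) // 2021-01-01 10:10:10 UTC in micros
        println(projection(row).getInt(0))       // DATE is stored as days since epoch
      }
    }
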
From 088bf3dc8da996ba241052ba2a7323aa35865a9b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 14 Jul 2023 17:31:38 +0800 Subject: [PATCH 17/20] Spark 3.4 UDF: change pmod to mod because positiveModulo does not exist in early versions of ClickHouse (cherry picked from commit ea0592d3f6f9262e931141af9868441f6422977b) (cherry picked from commit 8a270a24441fa2d0b5b9ff7426b54e5357c66b92) --- .../cluster/ClusterShardByTransformSuite.scala | 2 +- .../org/apache/spark/sql/clickhouse/ExprUtils.scala | 2 +- .../xenon/clickhouse/func/FunctionRegistry.scala | 2 +- .../func/clickhouse/{Pmod.scala => Mod.scala} | 11 ++++------- 4 files changed, 7 insertions(+), 10 deletions(-) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/{Pmod.scala => Mod.scala} (86%) diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala index 06b7f9b4..e02dad11 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala @@ -108,7 +108,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest { ("murmurHash3_64", Array("value")), ("murmurHash3_32", Array("value")), ("cityHash64", Array("value")), - ("positiveModulo", Array("toYYYYMM(create_date)", "10")) + ("modulo", Array("toYYYYMM(create_date)", "10")) ).foreach { case (func_name: String, func_args: Array[String]) => test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args)) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala index 8c2f6d6d..7ba7ad62 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala @@ -186,5 +186,5 @@ object ExprUtils extends SQLConfHelper with Serializable { } def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr = - FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString))) + FuncExpr("modulo", List(shardingKey, StringLiteral(weight.toString))) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index a509f07e..c6f01110 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -59,7 +59,7 @@ object StaticFunctionRegistry extends FunctionRegistry { "clickhouse_months" -> Months, "clickhouse_days" -> Days, "clickhouse_hours" -> Hours, - "sharding_pmod" -> Pmod + "sharding_mod" -> Mod ) override def list: Array[String] = functions.keys.toArray diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala similarity index 86% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala rename to
spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala index f7c3e228..3e8c5182 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Pmod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala @@ -18,15 +18,15 @@ import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFu import org.apache.spark.sql.types._ import xenon.clickhouse.func.ClickhouseEquivFunction -object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { +object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { - override def name: String = "sharding_pmod" + override def name: String = "sharding_mod" override def canonicalName: String = s"clickhouse.$name" override def toString: String = name - override val ckFuncNames: Array[String] = Array("positiveModulo", "positive_modulo", "pmod") + override val ckFuncNames: Array[String] = Array("modulo", "remainder") override def description: String = s"$name: (a: long, b: long) => mod: long" @@ -58,8 +58,5 @@ object Pmod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEqu override def isResultNullable: Boolean = false - def invoke(a: Long, b: Long): Long = { - val mod = a % b - if (mod < 0) mod + b else mod - } + def invoke(a: Long, b: Long): Long = a % b } From 85a025f659ef7122dfcde8a4d5fa38d7f58a5c2b Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Fri, 14 Jul 2023 17:37:57 +0800 Subject: [PATCH 18/20] Docs: add comment for modulo UDF (cherry picked from commit d2bb743f1be1c27c7e133f3c4bd43a41427eadb2) (cherry picked from commit f4ae4ad42cb2b7f59b672435b0a0a2ef6adb6e5d) --- .../src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala index 3e8c5182..b10f0f7e 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala @@ -26,6 +26,8 @@ object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEqui override def toString: String = name + // remainder is not a Clickhouse function, but modulo will be parsed to remainder in the connector. + // Added remainder as a synonym. 
override val ckFuncNames: Array[String] = Array("modulo", "remainder") override def description: String = s"$name: (a: long, b: long) => mod: long" From 4e201d61cfd9ca6ab3323c2aae6b1555f28c7e78 Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Tue, 25 Jul 2023 18:31:57 +0800 Subject: [PATCH 19/20] Spark 3.4: Adapt to hash function under clickhouse-core --- .../ClickHouseClusterHashUDFSuite.scala | 17 +- .../xenon/clickhouse/ClickHouseCatalog.scala | 3 +- .../func/{clickhouse => }/CityHash64.scala | 22 +- .../func/{clickhouse => }/Days.scala | 3 +- .../clickhouse/func/FunctionRegistry.scala | 1 - .../func/{clickhouse => }/Hours.scala | 5 +- .../func/{clickhouse => }/Mod.scala | 3 +- .../func/{clickhouse => }/Months.scala | 3 +- ...gsHash.scala => MultiStringArgsHash.scala} | 45 ++- .../func/{clickhouse => }/MurmurHash2.scala | 29 +- .../func/{clickhouse => }/MurmurHash3.scala | 30 +- .../func/{clickhouse => }/XxHash64.scala | 3 +- .../func/{clickhouse => }/Years.scala | 3 +- .../clickhouse/cityhash/CityHash_v1_0_2.java | 344 ------------------ .../func/clickhouse/cityhash/UInt128.java | 34 -- .../clickhouse/FunctionRegistrySuite.scala | 8 +- 16 files changed, 60 insertions(+), 493 deletions(-) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/CityHash64.scala (52%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Days.scala (95%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Hours.scala (93%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Mod.scala (96%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Months.scala (95%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{MultiArgsHash.scala => MultiStringArgsHash.scala} (55%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/MurmurHash2.scala (52%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/MurmurHash3.scala (51%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/XxHash64.scala (97%) rename spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/{clickhouse => }/Years.scala (95%) delete mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java delete mode 100644 spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java diff --git a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala index adf3d9de..65f667b2 100644 --- a/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala +++ b/spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala @@ -15,8 +15,12 @@ package org.apache.spark.sql.clickhouse.cluster import org.apache.spark.sql.clickhouse.TestUtils.om -import xenon.clickhouse.func.{CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} -import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard +import xenon.clickhouse.func.{ + ClickHouseXxHash64Shard, + CompositeFunctionRegistry, + DynamicFunctionRegistry, + StaticFunctionRegistry +} import java.lang.{Long => 
JLong} @@ -30,15 +34,6 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest { new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry)) } - def product[A](xs: Seq[Seq[A]]): Seq[Seq[A]] = - xs.toList match { - case Nil => Seq(Seq()) - case head :: tail => for { - h <- head - t <- product(tail) - } yield h +: t - } - def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = { val sparkResult = spark.sql( s"""SELECT diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala index caff6a50..6db307f3 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala @@ -26,8 +26,7 @@ import xenon.clickhouse.Constants._ import xenon.clickhouse.client.NodeClient import xenon.clickhouse.exception.CHClientException import xenon.clickhouse.exception.ClickHouseErrCode._ -import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard -import xenon.clickhouse.func.{FunctionRegistry, _} +import xenon.clickhouse.func.{ClickHouseXxHash64Shard, FunctionRegistry, _} import xenon.clickhouse.spec._ import java.time.ZoneId diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/CityHash64.scala similarity index 52% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/CityHash64.scala index 160d45e9..b78f8ee3 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/CityHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/CityHash64.scala @@ -12,29 +12,15 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func -import io.netty.buffer.{ByteBuf, Unpooled} -import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.MultiArgsHash -import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128} +import xenon.clickhouse.hash -object CityHash64 extends MultiArgsHash { +object CityHash64 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694 override protected def funcName: String = "clickhouse_cityHash64" override val ckFuncNames: Array[String] = Array("cityHash64") - def convertToByteBuf(array: Array[Byte]): ByteBuf = { - val byteBuf = Unpooled.buffer(array.length).writeBytes(array) - byteBuf - } - - override def invokeBase(value: UTF8String): Long = { - // ignore UInt64 vs Int64 - val data = value.getBytes - CityHash_v1_0_2.CityHash64(convertToByteBuf(data), 0, data.length) - } - - override def combineHashes(v1: Long, v2: Long): Long = CityHash_v1_0_2.Hash128to64(new UInt128(v1, v2)) + override def applyHash(input: Array[Any]): Long = hash.CityHash64(input) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Days.scala similarity index 95% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Days.scala index 672fd44f..3008d7fd 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Days.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Days.scala @@ -12,11 +12,10 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction import java.time.LocalDate import java.time.format.DateTimeFormatter diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala index c6f01110..d7856c3c 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala @@ -15,7 +15,6 @@ package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.UnboundFunction -import xenon.clickhouse.func.clickhouse._ import scala.collection.mutable diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Hours.scala similarity index 93% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Hours.scala index 0abe25cb..e88907be 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Hours.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Hours.scala @@ -12,13 +12,12 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction -import java.sql.{Date, Timestamp} +import java.sql.Timestamp import java.text.SimpleDateFormat object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Mod.scala similarity index 96% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Mod.scala index b10f0f7e..69fdedc9 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Mod.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Mod.scala @@ -12,11 +12,10 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction { diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala similarity index 95% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala index 846dd245..13e06d88 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Months.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Months.scala @@ -12,11 +12,10 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction import java.time.LocalDate import java.time.format.DateTimeFormatter diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala similarity index 55% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala index adc3a382..69ce07c1 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala @@ -19,32 +19,41 @@ import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFu import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction { +abstract class MultiStringArgsHash extends UnboundFunction with ClickhouseEquivFunction { + + def applyHash(input: Array[Any]): Long + + protected def funcName: String + + override val ckFuncNames: Array[String] + + override def description: String = s"$name: (value: string, ...) => hash_value: long" + private def isExceptedType(dt: DataType): Boolean = dt.isInstanceOf[StringType] final override def name: String = funcName + final override def bind(inputType: StructType): BoundFunction = { val inputDataTypes = inputType.fields.map(_.dataType) - if (inputDataTypes.forall(isExceptedType)) new ScalarFunction[Long] { - override def inputTypes(): Array[DataType] = inputDataTypes - override def name: String = funcName - override def canonicalName: String = s"clickhouse.$name" - override def resultType: DataType = LongType - override def toString: String = name - override def produceResult(input: InternalRow): Long = { - val inputStrings: Seq[UTF8String] = - input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]] - inputStrings.map(invokeBase).reduce(combineHashes) + if (inputDataTypes.forall(isExceptedType)) { + // need to new a ScalarFunction instance for each bind, + // because we do not know the number of arguments in advance + new ScalarFunction[Long] { + override def inputTypes(): Array[DataType] = inputDataTypes + override def name: String = funcName + override def canonicalName: String = s"clickhouse.$name" + override def resultType: DataType = LongType + override def toString: String = name + override def produceResult(input: InternalRow): Long = { + val inputStrings: Array[Any] = + input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]].toArray + .map(_.getBytes) + applyHash(inputStrings) + } } - } - else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description") + } else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description") } - protected def funcName: String - override val ckFuncNames: Array[String] - override def description: String = s"$name: (value: string, ...) 
=> hash_value: long" - def invokeBase(value: UTF8String): Long - def combineHashes(v1: Long, v2: Long): Long } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash2.scala similarity index 52% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash2.scala index f2ff9ed2..9fac4d60 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash2.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash2.scala @@ -12,40 +12,25 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func -import org.apache.commons.codec.digest.{MurmurHash2, MurmurHash3} -import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} +import xenon.clickhouse.hash +import xenon.clickhouse.hash.HashUtils -object MurmurHash2_64 extends MultiArgsHash { +object MurmurHash2_64 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460 override protected def funcName: String = "clickhouse_murmurHash2_64" override val ckFuncNames: Array[String] = Array("murmurHash2_64") - override def invokeBase(value: UTF8String): Long = { - // ignore UInt64 vs Int64 - val data = value.getBytes - MurmurHash2.hash64(data, data.length, 0) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 + override def applyHash(input: Array[Any]): Long = hash.Murmurhash2_64(input) } -object MurmurHash2_32 extends MultiArgsHash { +object MurmurHash2_32 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 override protected def funcName: String = "clickhouse_murmurHash2_32" override val ckFuncNames: Array[String] = Array("murmurHash2_32") - override def invokeBase(value: UTF8String): Long = { - val data = value.getBytes - val v = MurmurHash2.hash32(data, data.length, 0) - Util.toUInt32Range(v) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) + override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash2_32(input)) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash3.scala similarity index 51% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash3.scala index 1db654c1..848bb3b0 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/MurmurHash3.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MurmurHash3.scala @@ -12,41 +12,25 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func -import org.apache.commons.codec.digest.MurmurHash3 -import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util} +import xenon.clickhouse.hash +import xenon.clickhouse.hash.HashUtils -object MurmurHash3_64 extends MultiArgsHash { +object MurmurHash3_64 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L543 override protected def funcName: String = "clickhouse_murmurHash3_64" override val ckFuncNames: Array[String] = Array("murmurHash3_64") - override def invokeBase(value: UTF8String): Long = { - // ignore UInt64 vs Int64 - val data = value.getBytes - val hashes = MurmurHash3.hash128x64(data, 0, data.length, 0) - hashes(0) ^ hashes(1) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2 + override def applyHash(input: Array[Any]): Long = hash.Murmurhash3_64(input) } -object MurmurHash3_32 extends MultiArgsHash { +object MurmurHash3_32 extends MultiStringArgsHash { // https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519 override protected def funcName: String = "clickhouse_murmurHash3_32" override val ckFuncNames: Array[String] = Array("murmurHash3_32") - override def invokeBase(value: UTF8String): Long = { - val data = value.getBytes - val v = MurmurHash3.hash32x86(data, 0, data.length, 0) - Util.toUInt32Range(v) - } - - override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2) + override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash3_32(input)) } diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/XxHash64.scala similarity index 97% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/XxHash64.scala index 241ae9d8..3c4a5b1a 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/XxHash64.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/XxHash64.scala @@ -12,13 +12,12 @@ * limitations under the License. 
*/ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.catalyst.expressions.XxHash64Function import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import xenon.clickhouse.func.ClickhouseEquivFunction import xenon.clickhouse.spec.{ClusterSpec, ShardUtils} /** diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Years.scala similarity index 95% rename from spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala rename to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Years.scala index 4b2e650d..6bf987fb 100644 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/Years.scala +++ b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/Years.scala @@ -12,11 +12,10 @@ * limitations under the License. */ -package xenon.clickhouse.func.clickhouse +package xenon.clickhouse.func import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction} import org.apache.spark.sql.types._ -import xenon.clickhouse.func.ClickhouseEquivFunction import java.time.LocalDate import java.time.format.DateTimeFormatter diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java deleted file mode 100644 index df218df3..00000000 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/CityHash_v1_0_2.java +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package xenon.clickhouse.func.clickhouse.cityhash; - -import io.netty.buffer.ByteBuf; - -// copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/CityHash_v1_0_2.java -// fixed some bugs involving int32 to uint32 conversion -final public class CityHash_v1_0_2 { - - private static final long kMul = 0x9ddfea08eb382d69L; - // Some primes between 2^63 and 2^64 for various uses. - private static final long k0 = 0xc3a5c85c97cb3127L; - private static final long k1 = 0xb492b66fbe98f273L; - private static final long k2 = 0x9ae16a3b2f90404fL; - private static final long k3 = 0xc949d7c7509e6557L; - - private CityHash_v1_0_2() { /* restricted */ } - - private static long Fetch64(ByteBuf p, int index) { - return p.getLongLE(index); - } - - private static int Fetch32(ByteBuf p, int index) { - return p.getIntLE(index); - } - - private static long toUint32(int x) { - return x & 0xFFFFFFFFL; - } - - // Equivalent to Rotate(), but requires the second arg to be non-zero. 
-// On x86-64, and probably others, it's possible for this to compile -// to a single instruction if both args are already in registers. - private static long RotateByAtLeast1(long val, int shift) { - return (val >>> shift) | (val << (64 - shift)); - } - - private static long ShiftMix(long val) { - return val ^ (val >>> 47); - } - - private static long Uint128Low64(UInt128 x) { - return x.first; - } - - private static long Rotate(long val, int shift) { - return shift == 0 ? val : (val >>> shift) | (val << (64 - shift)); - } - - private static long Uint128High64(UInt128 x) { - return x.second; - } - - // Hash 128 input bits down to 64 bits of output. -// This is intended to be a reasonably good hash function. - public static long Hash128to64(UInt128 x) { - // Murmur-inspired hashing. - long a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; - a ^= (a >>> 47); - long b = (Uint128High64(x) ^ a) * kMul; - b ^= (b >>> 47); - b *= kMul; - return b; - } - - private static long HashLen16(long u, long v) { - return Hash128to64(UInt128.of(u, v)); - } - - private static long HashLen0to16(ByteBuf s, int index, int len) { - if (len > 8) { - long a = Fetch64(s, index); - long b = Fetch64(s, index + len - 8); - return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; - } - if (len >= 4) { - long a = toUint32(Fetch32(s, index)); - return HashLen16(len + (a << 3), toUint32(Fetch32(s, index + len - 4))); - } - if (len > 0) { - byte a = s.getByte(index); - byte b = s.getByte(index + len >>> 1); - byte c = s.getByte(index + len - 1); - int y = (a & 0xFF) + ((b & 0xFF) << 8); - int z = len + ((c & 0xFF) << 2); - return ShiftMix(y * k2 ^ z * k3) * k2; - } - return k2; - } - - // This probably works well for 16-byte strings as well, but it may be overkill -// in that case. - private static long HashLen17to32(ByteBuf s, int index, int len) { - long a = Fetch64(s, index) * k1; - long b = Fetch64(s, index + 8); - long c = Fetch64(s, index + len - 8) * k2; - long d = Fetch64(s, index + len - 16) * k0; - return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, - a + Rotate(b ^ k3, 20) - c + len); - } - - // Return a 16-byte hash for 48 bytes. Quick and dirty. -// Callers do best to use "random-looking" values for a and b. - private static UInt128 WeakHashLen32WithSeeds( - long w, long x, long y, long z, long a, long b) { - a += w; - b = Rotate(b + a + z, 21); - long c = a; - a += x; - a += y; - b += Rotate(a, 44); - return UInt128.of(a + z, b + c); - } - - // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. - private static UInt128 WeakHashLen32WithSeeds(ByteBuf s, int index, long a, long b) { - return WeakHashLen32WithSeeds(Fetch64(s, index), - Fetch64(s, index + 8), - Fetch64(s, index + 16), - Fetch64(s, index + 24), - a, - b); - } - - // Return an 8-byte hash for 33 to 64 bytes. 
- private static long HashLen33to64(ByteBuf s, int index, int len) { - long z = Fetch64(s, index + 24); - long a = Fetch64(s, index) + (len + Fetch64(s, index + len - 16)) * k0; - long b = Rotate(a + z, 52); - long c = Rotate(a, 37); - a += Fetch64(s, index + 8); - c += Rotate(a, 7); - a += Fetch64(s, index + 16); - long vf = a + z; - long vs = b + Rotate(a, 31) + c; - a = Fetch64(s, index + 16) + Fetch64(s, index + len - 32); - z = Fetch64(s, index + len - 8); - b = Rotate(a + z, 52); - c = Rotate(a, 37); - a += Fetch64(s, index + len - 24); - c += Rotate(a, 7); - a += Fetch64(s, index + len - 16); - long wf = a + z; - long ws = b + Rotate(a, 31) + c; - long r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); - return ShiftMix(r * k0 + vs) * k2; - } - - // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings -// of any length representable in ssize_t. Based on City and Murmur. - private static UInt128 CityMurmur(ByteBuf s, int index, int len, UInt128 seed) { - long a = Uint128Low64(seed); - long b = Uint128High64(seed); - long c; - long d; - int l = len - 16; - if (l <= 0) { // len <= 16 - a = ShiftMix(a * k1) * k1; - c = b * k1 + HashLen0to16(s, index, len); - d = ShiftMix(a + (len >= 8 ? Fetch64(s, index) : c)); - } else { // len > 16 - c = HashLen16(Fetch64(s, index + len - 8) + k1, a); - d = HashLen16(b + len, c + Fetch64(s, index + len - 16)); - a += d; - do { - a ^= ShiftMix(Fetch64(s, index) * k1) * k1; - a *= k1; - b ^= a; - c ^= ShiftMix(Fetch64(s, index + 8) * k1) * k1; - c *= k1; - d ^= c; - index += 16; - l -= 16; - } while (l > 0); - } - a = HashLen16(a, c); - b = HashLen16(d, b); - return UInt128.of(a ^ b, HashLen16(b, a)); - } - - public static long CityHash64(ByteBuf s, int index, int len) { - if (len <= 32) { - if (len <= 16) { - return HashLen0to16(s, index, len); - } else { - return HashLen17to32(s, index, len); - } - } else if (len <= 64) { - return HashLen33to64(s, index, len); - } - - // For strings over 64 bytes we hash the end first, and then as we - // loop we keep 56 bytes of state: v, w, x, y, and z. - long x = Fetch64(s, index); - long y = Fetch64(s, index + len - 16) ^ k1; - long z = Fetch64(s, index + len - 56) ^ k0; - UInt128 v = WeakHashLen32WithSeeds(s, len - 64, len, y); - UInt128 w = WeakHashLen32WithSeeds(s, len - 32, len * k1, k0); - z += ShiftMix(v.second) * k1; - x = Rotate(z + x, 39) * k1; - y = Rotate(y, 33) * k1; - - // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
- len = (len - 1) & ~63; - do { - x = Rotate(x + y + v.first + Fetch64(s, index + 16), 37) * k1; - y = Rotate(y + v.second + Fetch64(s, index + 48), 42) * k1; - x ^= w.second; - y ^= v.first; - z = Rotate(z ^ w.first, 33); - v = WeakHashLen32WithSeeds(s, index, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s, index + 32, z + w.second, y); - // swap - long t = z; - z = x; - x = t; - index += 64; - len -= 64; - } while (len != 0); - return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, - HashLen16(v.second, w.second) + x); - } - - private static long CityHash64WithSeed(ByteBuf s, int index, int len, long seed) { - return CityHash64WithSeeds(s, index, len, k2, seed); - } - - private static long CityHash64WithSeeds(ByteBuf s, int index, int len, - long seed0, long seed1) { - return HashLen16(CityHash64(s, index, len) - seed0, seed1); - } - - private static UInt128 CityHash128WithSeed(ByteBuf s, int index, int len, UInt128 seed) { - if (len < 128) { - return CityMurmur(s, index, len, seed); - } - - // We expect len >= 128 to be the common case. Keep 56 bytes of state: - // v, w, x, y, and z. - UInt128 v, w; - long x = Uint128Low64(seed); - long y = Uint128High64(seed); - long z = len * k1; - long vFirst = Rotate(y ^ k1, 49) * k1 + Fetch64(s, index); - long vSecond = Rotate(vFirst, 42) * k1 + Fetch64(s, index + 8); - long wFirst = Rotate(y + z, 35) * k1 + x; - long wSecond = Rotate(x + Fetch64(s, index + 88), 53) * k1; - -// v = UInt128.of(vFirst, vSecond); -// w = UInt128.of(wFirst, wSecond); - - // This is the same inner loop as CityHash64(), manually unrolled. - do { - x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; - y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; - x ^= wSecond; - y ^= vFirst; - z = Rotate(z ^ wFirst, 33); - v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); - w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); - - vFirst = v.first; - vSecond = v.second; - wFirst = w.first; - wSecond = w.second; - { - long swap = z; - z = x; - x = swap; - } - index += 64; - x = Rotate(x + y + vFirst + Fetch64(s, index + 16), 37) * k1; - y = Rotate(y + vSecond + Fetch64(s, index + 48), 42) * k1; - x ^= wSecond; - y ^= vFirst; - z = Rotate(z ^ wFirst, 33); - v = WeakHashLen32WithSeeds(s, index, vSecond * k1, x + wFirst); - w = WeakHashLen32WithSeeds(s, index + 32, z + wSecond, y); - - vFirst = v.first; - vSecond = v.second; - wFirst = w.first; - wSecond = w.second; - { - long swap = z; - z = x; - x = swap; - } - index += 64; - len -= 128; - } while (len >= 128); - y += Rotate(wFirst, 37) * k0 + z; - x += Rotate(vFirst + z, 49) * k0; - // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. - for (int tail_done = 0; tail_done < len; ) { - tail_done += 32; - y = Rotate(y - x, 42) * k0 + vSecond; - wFirst += Fetch64(s, index + len - tail_done + 16); - x = Rotate(x, 49) * k0 + wFirst; - wFirst += vFirst; - v = WeakHashLen32WithSeeds(s, index + len - tail_done, vFirst, vSecond); - - vFirst = v.first; - vSecond = v.second; - } - // At this point our 48 bytes of state should contain more than - // enough information for a strong 128-bit hash. We use two - // different 48-byte-to-8-byte hashes to get a 16-byte final result. 
- x = HashLen16(x, vFirst); - y = HashLen16(y, wFirst); - return UInt128.of(HashLen16(x + vSecond, wSecond) + y, - HashLen16(x + wSecond, y + vSecond)); - } - - public static UInt128 CityHash128(ByteBuf s, int len) { - if (len >= 16) { - return CityHash128WithSeed(s, 16, - len - 16, - UInt128.of(Fetch64(s, 0) ^ k3, - Fetch64(s, 8))); - } else if (len >= 8) { - return CityHash128WithSeed(null, - 0, 0, - UInt128.of(Fetch64(s, 0) ^ (len * k0), - Fetch64(s, len - 8) ^ k1)); - } else { - return CityHash128WithSeed(s, 0, len, UInt128.of(k0, k1)); - } - } -} - diff --git a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java b/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java deleted file mode 100644 index 2ba6c1f7..00000000 --- a/spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/clickhouse/cityhash/UInt128.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package xenon.clickhouse.func.clickhouse.cityhash; - -/** - * @author Dmitriy Poluyanov - * @since 15/02/2018 - * copy from https://github.com/dpoluyanov/achord/blob/master/src/main/java/com/github/mangelion/achord/UInt128.java - */ -final public class UInt128 { - final public long first; - final public long second; - - public UInt128(long first, long second) { - this.first = first; - this.second = second; - } - - static UInt128 of(long first, long second) { - return new UInt128(first, second); - } -} diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala index 34254907..d241e87b 100644 --- a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala +++ b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -17,13 +17,7 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.scalatest.funsuite.AnyFunSuite import xenon.clickhouse.ClickHouseHelper -import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64 -import xenon.clickhouse.func.{ - ClickhouseEquivFunction, - CompositeFunctionRegistry, - DynamicFunctionRegistry, - StaticFunctionRegistry -} +import xenon.clickhouse.func.{ClickHouseXxHash64, ClickhouseEquivFunction, CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} import scala.collection.JavaConverters._ From 085b3adf28bc0916943841085ef61b4d705c023a Mon Sep 17 00:00:00 2001 From: Xinyuan Yang Date: Wed, 26 Jul 2023 12:29:11 +0800 Subject: [PATCH 20/20] fix style --- .../spark/sql/clickhouse/FunctionRegistrySuite.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala 
b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala index d241e87b..33369cb1 100644 --- a/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala +++ b/spark-3.4/clickhouse-spark/src/test/scala/org/apache/spark/sql/clickhouse/FunctionRegistrySuite.scala @@ -17,7 +17,13 @@ package org.apache.spark.sql.clickhouse import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.scalatest.funsuite.AnyFunSuite import xenon.clickhouse.ClickHouseHelper -import xenon.clickhouse.func.{ClickHouseXxHash64, ClickhouseEquivFunction, CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry} +import xenon.clickhouse.func.{ + ClickHouseXxHash64, + ClickhouseEquivFunction, + CompositeFunctionRegistry, + DynamicFunctionRegistry, + StaticFunctionRegistry +} import scala.collection.JavaConverters._
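
Closing note on the sharding change in patches 17/18 above: sharding_mod now computes a plain remainder (a % b), matching ClickHouse's modulo (and its alias remainder), whose result takes the sign of the dividend, whereas the old sharding_pmod always returned a non-negative value. A small standalone Scala illustration of the difference (not connector code; names are made up for the example):

    object ModuloSemanticsSketch {
      // what sharding_mod does now: Scala/Java remainder, sign follows the dividend,
      // which lines up with ClickHouse's modulo()
      def mod(a: Long, b: Long): Long = a % b

      // what the old sharding_pmod did: always non-negative
      def pmod(a: Long, b: Long): Long = {
        val m = a % b
        if (m < 0) m + b else m
      }

      def main(args: Array[String]): Unit = {
        println(mod(-7, 5))  // -2
        println(pmod(-7, 5)) // 3
      }
    }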