Skip to content

Spark 3.4: Enhance FunctionRegistry to support more hash functions #268

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.clickhouse.cluster

import org.apache.spark.sql.clickhouse.TestUtils.om
import xenon.clickhouse.func._
import java.lang.{Long => JLong}

class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest {
  // Used only to resolve Spark-side function names to their ClickHouse
  // equivalents; it is never installed into a real session catalog.
  val dummyRegistry: CompositeFunctionRegistry = {
    val dynamicRegistry = new DynamicFunctionRegistry
    val shardFunc = new ClickHouseXxHash64Shard(Seq.empty)
    dynamicRegistry.register("ck_xx_hash64_shard", shardFunc) // for compatible
    dynamicRegistry.register("clickhouse_shard_xxHash64", shardFunc)
    new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicRegistry))
  }

  /**
   * Evaluates the Spark UDF and the native ClickHouse function over the same
   * literal argument list and asserts both produce the same unsigned hash.
   *
   * @param sparkFuncName Spark-side registered function name
   * @param ckFuncName    native ClickHouse function name
   * @param stringVal     already-quoted SQL literal(s), e.g. `'a', 'b'`
   */
  def runTest(sparkFuncName: String, ckFuncName: String, stringVal: String): Unit = {
    val sparkRows = spark.sql(s"SELECT $sparkFuncName($stringVal) AS hash_value").collect
    assert(sparkRows.length == 1)
    val sparkHash = sparkRows.head.getAs[Long]("hash_value")

    val ckResultJsonStr = runClickHouseSQL(
      s"SELECT $ckFuncName($stringVal) AS hash_value "
    ).head.getString(0)
    // ClickHouse reports unsigned 64-bit values as text; parse as unsigned so
    // it compares bit-for-bit against Spark's signed Long.
    val ckHash = JLong.parseUnsignedLong(om.readTree(ckResultJsonStr).get("hash_value").asText)
    assert(
      sparkHash == ckHash,
      s"ck_function: $ckFuncName, spark_function: $sparkFuncName, args: ($stringVal)"
    )
  }

  // Shared fixture: ASCII, CJK, and surrogate-pair inputs, pre-quoted as SQL literals.
  private val quotedInputs: Seq[String] = Seq(
    "spark-clickhouse-connector",
    "Apache Spark",
    "ClickHouse",
    "Yandex",
    "热爱",
    "在传统的行式数据库系统中,数据按如下顺序存储:",
    "🇨🇳"
  ).map(s => s"'$s'")

  // Single-argument parity tests for every supported hash UDF.
  Seq(
    "clickhouse_xxHash64",
    "clickhouse_murmurHash3_64",
    "clickhouse_murmurHash3_32",
    "clickhouse_murmurHash2_64",
    "clickhouse_murmurHash2_32",
    "clickhouse_cityHash64"
  ).foreach { sparkFuncName =>
    val ckFuncName = dummyRegistry.sparkToClickHouseFunc(sparkFuncName)
    test(s"UDF $sparkFuncName") {
      quotedInputs.foreach(runTest(sparkFuncName, ckFuncName, _))
    }
  }

  // Multi-argument parity tests (xxHash64 excluded: single-argument only).
  Seq(
    "clickhouse_murmurHash3_64",
    "clickhouse_murmurHash3_32",
    "clickhouse_murmurHash2_64",
    "clickhouse_murmurHash2_32",
    "clickhouse_cityHash64"
  ).foreach { sparkFuncName =>
    val ckFuncName = dummyRegistry.sparkToClickHouseFunc(sparkFuncName)
    test(s"UDF $sparkFuncName multiple args") {
      quotedInputs.combinations(5).foreach { combo =>
        runTest(sparkFuncName, ckFuncName, combo.mkString(", "))
      }
    }
  }
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import xenon.clickhouse.hash

// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694
object CityHash64 extends MultiStringArgsHash {

  // Spark-side registration name.
  override protected val funcName: String = "clickhouse_cityHash64"

  // Equivalent native ClickHouse function name(s).
  override val ckFuncNames: Array[String] = Array("cityHash64")

  // Delegates to the shared CityHash64 implementation over the raw argument bytes.
  override def applyHash(input: Array[Any]): Long = hash.CityHash64(input)
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,57 @@ import org.apache.spark.sql.connector.catalog.functions.UnboundFunction

import scala.collection.mutable

/**
 * Registry of Spark UDFs and their ClickHouse-equivalent function names.
 *
 * NOTE(review): extends Serializable — presumably so instances can be captured
 * in Spark closures and shipped to executors; confirm against callers.
 */
trait FunctionRegistry extends Serializable {

  /** All Spark-side function names known to this registry. */
  def list: Array[String]

  /** Looks up a function by its Spark-side name; `None` if unknown. */
  def load(name: String): Option[UnboundFunction]

  /** Spark function name -> ClickHouse function name. */
  def sparkToClickHouseFunc: Map[String, String]

  /** ClickHouse function name -> Spark function name. */
  def clickHouseToSparkFunc: Map[String, String]
}

/**
 * Mixin for functions that have one or more native ClickHouse equivalents;
 * registries use it to build the Spark-name <-> ClickHouse-name mappings.
 */
trait ClickhouseEquivFunction {
  // The native ClickHouse function name(s) this Spark function corresponds to.
  val ckFuncNames: Array[String]
}

/**
 * Aggregates several registries into one. Earlier registries take precedence
 * over later ones when the same name is defined more than once.
 */
class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends FunctionRegistry {

  override def list: Array[String] = registries.flatMap(_.list)

  // `orElse` evaluates its argument by name, so lookup short-circuits on the
  // first registry that knows the name instead of eagerly loading from every
  // registry and discarding all but the head.
  override def load(name: String): Option[UnboundFunction] =
    registries.foldLeft(Option.empty[UnboundFunction])((found, r) => found.orElse(r.load(name)))

  // Reverse before toMap so duplicate keys resolve to the EARLIER registry,
  // consistent with the precedence used by `load` (a plain flatMap + toMap
  // would let later registries overwrite earlier ones).
  override def sparkToClickHouseFunc: Map[String, String] =
    registries.reverse.flatMap(_.sparkToClickHouseFunc).toMap

  override def clickHouseToSparkFunc: Map[String, String] =
    registries.reverse.flatMap(_.clickHouseToSparkFunc).toMap
}

/**
 * Immutable registry of the built-in ClickHouse hash UDFs.
 *
 * Note: the source capture contained both the pre- and post-diff lines for the
 * `clickhouse_xxHash64` entry; this is the reconstructed post-diff map.
 */
object StaticFunctionRegistry extends FunctionRegistry {

  private val functions = Map[String, UnboundFunction](
    "ck_xx_hash64" -> ClickHouseXxHash64, // legacy alias kept for backward compatibility
    "clickhouse_xxHash64" -> ClickHouseXxHash64,
    "clickhouse_murmurHash2_32" -> MurmurHash2_32,
    "clickhouse_murmurHash2_64" -> MurmurHash2_64,
    "clickhouse_murmurHash3_32" -> MurmurHash3_32,
    "clickhouse_murmurHash3_64" -> MurmurHash3_64,
    "clickhouse_cityHash64" -> CityHash64
  )

  override def list: Array[String] = functions.keys.toArray

  override def load(name: String): Option[UnboundFunction] = functions.get(name)

  // One Spark function may map to several ClickHouse aliases, hence flatMap
  // over ckFuncNames. Functions without a ClickHouse equivalent are skipped.
  override val sparkToClickHouseFunc: Map[String, String] =
    functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) =>
      v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _))
    }

  // Inverse mapping: every ClickHouse alias points back at its Spark name.
  override val clickHouseToSparkFunc: Map[String, String] =
    functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) =>
      v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k))
    }
}

class DynamicFunctionRegistry extends FunctionRegistry {
Expand All @@ -56,4 +83,14 @@ class DynamicFunctionRegistry extends FunctionRegistry {
override def list: Array[String] = functions.keys.toArray

override def load(name: String): Option[UnboundFunction] = functions.get(name)

// Spark name -> each ClickHouse alias; entries without a ClickHouse
// equivalent contribute nothing. Snapshot the mutable map via toMap first.
override def sparkToClickHouseFunc: Map[String, String] =
  functions.toMap.flatMap {
    case (sparkName, equiv: ClickhouseEquivFunction) =>
      equiv.ckFuncNames.toSeq.map(ckName => sparkName -> ckName)
    case _ => Seq.empty
  }

// Inverse of sparkToClickHouseFunc: ClickHouse alias -> Spark name.
override def clickHouseToSparkFunc: Map[String, String] =
  functions.toMap.flatMap {
    case (sparkName, equiv: ClickhouseEquivFunction) =>
      equiv.ckFuncNames.toSeq.map(ckName => ckName -> sparkName)
    case _ => Seq.empty
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/**
 * Base class for ClickHouse-compatible hash UDFs that accept a variable
 * number of STRING arguments and return a signed 64-bit hash.
 */
abstract class MultiStringArgsHash extends UnboundFunction with ClickhouseEquivFunction {

  /** Applies the concrete hash over the UTF-8 bytes of each argument. */
  def applyHash(input: Array[Any]): Long

  /** Spark-side registration name of the concrete function. */
  protected def funcName: String

  /** Native ClickHouse function name(s), see ClickhouseEquivFunction. */
  override val ckFuncNames: Array[String]

  override def description: String = s"$name: (value: string, ...) => hash_value: long"

  // Renamed from the original typo `isExceptedType` (private, so safe).
  private def isExpectedType(dt: DataType): Boolean =
    dt.isInstanceOf[StringType]

  final override def name: String = funcName

  final override def bind(inputType: StructType): BoundFunction = {
    val inputDataTypes = inputType.fields.map(_.dataType)
    if (inputDataTypes.forall(isExpectedType)) {
      // need to new a ScalarFunction instance for each bind,
      // because we do not know the number of arguments in advance
      new ScalarFunction[Long] {
        override def inputTypes(): Array[DataType] = inputDataTypes
        override def name: String = funcName
        override def canonicalName: String = s"clickhouse.$name"
        override def resultType: DataType = LongType
        override def toString: String = name
        override def produceResult(input: InternalRow): Long = {
          val inputStrings = new Array[Any](input.numFields)
          // Plain `while` (per https://github.com/databricks/scala-style-guide
          // #traversal-and-zipwithindex) for this performance-sensitive loop.
          // The original `do { ... } while` ran the body once even for a
          // zero-field row, reading index 0 out of bounds.
          var i = 0
          while (i < input.numFields) {
            // NOTE(review): getUTF8String returns null for SQL NULL — callers
            // presumably never pass NULL here; confirm before relying on it.
            inputStrings(i) = input.getUTF8String(i).getBytes
            i += 1
          }
          applyHash(inputStrings)
        }
      }
    } else {
      throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description")
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import xenon.clickhouse.hash
import xenon.clickhouse.hash.HashUtils

// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460
object MurmurHash2_64 extends MultiStringArgsHash {

  // Spark-side registration name.
  override protected val funcName: String = "clickhouse_murmurHash2_64"

  // Equivalent native ClickHouse function name.
  override val ckFuncNames: Array[String] = Array("murmurHash2_64")

  // 64-bit result is returned as-is.
  override def applyHash(input: Array[Any]): Long = hash.Murmurhash2_64(input)
}

// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519
object MurmurHash2_32 extends MultiStringArgsHash {

  // Spark-side registration name.
  override protected val funcName: String = "clickhouse_murmurHash2_32"

  // Equivalent native ClickHouse function name.
  override val ckFuncNames: Array[String] = Array("murmurHash2_32")

  // 32-bit hash widened via HashUtils.toUInt32 to match ClickHouse's unsigned result.
  override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash2_32(input))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import xenon.clickhouse.hash
import xenon.clickhouse.hash.HashUtils

// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L543
object MurmurHash3_64 extends MultiStringArgsHash {

  // Spark-side registration name.
  override protected val funcName: String = "clickhouse_murmurHash3_64"

  // Equivalent native ClickHouse function name.
  override val ckFuncNames: Array[String] = Array("murmurHash3_64")

  // 64-bit result is returned as-is.
  override def applyHash(input: Array[Any]): Long = hash.Murmurhash3_64(input)
}

// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519
// NOTE(review): the line anchor above duplicates the murmurHash2_32 reference;
// verify it actually points at murmurHash3_32 in FunctionsHashing.h.
object MurmurHash3_32 extends MultiStringArgsHash {

  // Spark-side registration name.
  override protected val funcName: String = "clickhouse_murmurHash3_32"

  // Equivalent native ClickHouse function name.
  override val ckFuncNames: Array[String] = Array("murmurHash3_32")

  // 32-bit hash widened via HashUtils.toUInt32 to match ClickHouse's unsigned result.
  override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash3_32(input))
}
Loading