From 050101f1a775fd46d0f74a939e5b5aa949663c3a Mon Sep 17 00:00:00 2001 From: chenliang Date: Wed, 9 Dec 2020 20:52:51 +0800 Subject: [PATCH] [SPARK-33721][SQL] Support using Hive built-in functions by configuration --- .../apache/spark/sql/internal/SQLConf.scala | 22 ++++++++++++++ .../sql/hive/HiveSessionStateBuilder.scala | 29 ++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index bc62213bdb740..44520d2c5de11 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2973,6 +2973,24 @@ object SQLConf { .booleanConf .createWithDefault(false) + val USE_HIVE_BUILD_IN_FUNCTIONS_ENABLED = + buildConf("spark.sql.hive.buildin.functions.enabled") + .internal() + .doc("When true, Spark will register Hive built-in functions like unix_timestamp, to_date," + + " datediff and collect_set instead of Spark's own built-in functions.") + .booleanConf + .createWithDefault(false) + + val HIVE_BUILD_IN_FUNCTIONS_LIST = + buildConf("spark.sql.hive.buildin.functions.list") + .internal() + .doc("Configures a list of Hive built-in functions; here is an example of the format: " + + " org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp:unix_timestamp;" + + " org.apache.hadoop.hive.ql.udf.generic.GenericUDFDate:to_date. This configuration only " + s" has an effect when '${USE_HIVE_BUILD_IN_FUNCTIONS_ENABLED.key}' is set to true.") + .stringConf + .createWithDefault("") + /** * Holds information about keys that have been deprecated. 
* @@ -3627,6 +3645,10 @@ class SQLConf extends Serializable with Logging { def charVarcharAsString: Boolean = getConf(SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING) + def useHiveBuildInFunctionsEnabled: Boolean = getConf(SQLConf.USE_HIVE_BUILD_IN_FUNCTIONS_ENABLED) + + def hiveBuildINFunctionsList: String = getConf(SQLConf.HIVE_BUILD_IN_FUNCTIONS_LIST) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index da37b61688951..1f631993b2e95 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.hive import org.apache.spark.annotation.Unstable import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.{Analyzer, ResolveSessionCatalog} -import org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener +import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, ExternalCatalogWithListener} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlanner @@ -66,6 +67,32 @@ class HiveSessionStateBuilder( sqlParser, resourceLoader) parentState.foreach(_.catalog.copyStateTo(catalog)) + + // Prefer Hive built-in functions over Spark's own when enabled. 
+ if (conf.useHiveBuildInFunctionsEnabled) { + catalog.registerFunction(CatalogFunction(FunctionIdentifier("unix_timestamp", None), + "org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp", Seq.empty), true) + catalog.registerFunction(CatalogFunction(FunctionIdentifier("to_date", None), + "org.apache.hadoop.hive.ql.udf.generic.GenericUDFDate", Seq.empty), true) + catalog.registerFunction(CatalogFunction(FunctionIdentifier("collect_set", None), + "org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectSet", Seq.empty), true) + catalog.registerFunction(CatalogFunction(FunctionIdentifier("datediff", None), + "org.apache.hadoop.hive.ql.udf.generic.GenericUDFDateDiff", Seq.empty), true) + + // Register additional Hive built-in functions supplied through configuration + val hiveBuildINFunctionsList = conf.hiveBuildINFunctionsList + if (!hiveBuildINFunctionsList.isEmpty) { + hiveBuildINFunctionsList.split(";").foreach(oneUdf => { + val parts = oneUdf.split(":") + if (parts.length == 2) { + catalog.registerFunction(CatalogFunction(FunctionIdentifier(parts(0), None), + parts(1), Seq.empty), true) + } + }) + } + } + + catalog }