apache · gatorsmile · Jul 13, 2017 · Jul 13, 2017 · Jul 17, 2017 · Jul 18, 2017
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
@@ -28,7 +28,7 @@
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.readwriter import DataFrameReader
 from pyspark.sql.streaming import DataStreamReader
-from pyspark.sql.types import IntegerType, Row, StringType
+from pyspark.sql.types import DoubleType, IntegerType, Row, StringType
 from pyspark.sql.utils import install_exception_handler
 
 __all__ = ["SQLContext", "HiveContext", "UDFRegistration"]
@@ -208,29 +208,37 @@ def registerFunction(self, name, f, returnType=StringType()):
 
     @ignore_unicode_prefix
     @since(2.1)
-    def registerJavaFunction(self, name, javaClassName, returnType=None):
+    def registerJavaFunction(self, name, javaClassName, returnType=None, deterministic=True):
         """Register a java UDF so it can be used in SQL statements.
 
         In addition to a name and the function itself, the return type can be optionally specified.
         When the return type is not specified we would infer it via reflection.
-        :param name:  name of the UDF
-        :param javaClassName: fully qualified name of java class
-        :param returnType: a :class:`pyspark.sql.types.DataType` object
+
+        :param name: name of the UDF.
+        :param javaClassName: fully qualified name of java class.
+        :param returnType: a :class:`pyspark.sql.types.DataType` object.
+        :param deterministic: a flag indicating if the UDF is deterministic. A deterministic UDF
+            returns the same result each time it is invoked with a particular input.
 
         >>> sqlContext.registerJavaFunction("javaStringLength",
         ...   "test.org.apache.spark.sql.JavaStringLength", IntegerType())
         >>> sqlContext.sql("SELECT javaStringLength('test')").collect()
-        [Row(UDF(test)=4)]
+        [Row(UDF:javaStringLength(test)=4)]
         >>> sqlContext.registerJavaFunction("javaStringLength2",
         ...   "test.org.apache.spark.sql.JavaStringLength")
         >>> sqlContext.sql("SELECT javaStringLength2('test')").collect()
-        [Row(UDF(test)=4)]
+        [Row(UDF:javaStringLength2(test)=4)]
+        >>> sqlContext.registerJavaFunction("javaRand",
+        ...   "test.org.apache.spark.sql.JavaRandUDF", DoubleType(), deterministic=False)
+        >>> sqlContext.sql("SELECT javaRand(3)").collect()  # doctest: +SKIP
+        [Row(UDF:javaRand(3)=3.12345)]
 
         """
         jdt = None
         if returnType is not None:
             jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json())
-        self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt)
+        self.sparkSession._jsparkSession.udf().registerJava(
+            name, javaClassName, jdt, deterministic)
 
     @ignore_unicode_prefix
     @since(2.3)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -1950,7 +1950,7 @@ class Analyzer(
 
       case p => p transformExpressionsUp {
 
-        case udf @ ScalaUDF(func, _, inputs, _, _, _) =>
+        case udf @ ScalaUDF(func, _, inputs, _, _, _, _) =>
           val parameterTypes = ScalaReflection.getParameterTypes(func)
           assert(parameterTypes.length == inputs.length)
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -24,7 +24,6 @@ import org.apache.spark.sql.types.DataType
 
 /**
  * User-defined function.
- * Note that the user-defined functions must be deterministic.
  * @param function  The user defined scala function to run.
  *                  Note that if you use primitive parameters, you are not able to check if it is
  *                  null or not, and the UDF will return null for you if the primitive input is
@@ -35,18 +34,23 @@ import org.apache.spark.sql.types.DataType
  *                    not want to perform coercion, simply use "Nil". Note that it would've been
  *                    better to use Option of Seq[DataType] so we can use "None" as the case for no
  *                    type coercion. However, that would require more refactoring of the codebase.
- * @param udfName   The user-specified name of this UDF.
+ * @param udfName  The user-specified name of this UDF.
  * @param nullable  True if the UDF can return null value.
+ * @param udfDeterministic  True if the UDF is deterministic. A deterministic UDF returns the same
+ *                          result each time it is invoked with a particular input.
  */
 case class ScalaUDF(
     function: AnyRef,
     dataType: DataType,
     children: Seq[Expression],
     inputTypes: Seq[DataType] = Nil,
     udfName: Option[String] = None,
-    nullable: Boolean = true)
+    nullable: Boolean = true,
+    udfDeterministic: Boolean = true)
   extends Expression with ImplicitCastInputTypes with NonSQLExpression {
 
+  override def deterministic: Boolean = udfDeterministic && children.forall(_.deterministic)
+
   override def toString: String =
     s"${udfName.map(name => s"UDF:$name").getOrElse("UDF")}(${children.mkString(", ")})"