Skip to content

Commit 2c405a1

Browse files
committed
addressed review comments
1 parent e96742e commit 2c405a1

2 files changed

Lines changed: 15 additions & 21 deletions

File tree

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@ package org.apache.spark.sql.catalyst.expressions
2020
import java.nio.charset.StandardCharsets
2121

2222
import org.apache.commons.codec.digest.DigestUtils
23-
2423
import org.apache.spark.SparkFunSuite
2524
import org.apache.spark.sql.{RandomDataGenerator, Row}
2625
import org.apache.spark.sql.catalyst.encoders.{ExamplePointUDT, RowEncoder}
26+
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
2727
import org.apache.spark.sql.types._
28+
import org.apache.spark.unsafe.types.UTF8String
2829

2930
class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
3031

@@ -124,6 +125,19 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
124125
new StructType().add("array", arrayOfString).add("map", mapOfString))
125126
.add("structOfUDT", structOfUDT))
126127

128+
test("SPARK-18207: Compute hash for a lot of String expressions") {
  // Regression test: hashing a very wide row used to emit one huge generated
  // method that exceeded the JVM's 64KB bytecode limit per method, making
  // codegen fail with a compilation exception.
  val N = 1000
  val wideRow = new GenericInternalRow(
    (1 to N).map(i => UTF8String.fromString(i.toString)).toArray[Any])
  val schema = StructType((1 to N).map(i => StructField("", StringType)))

  // One bound reference per column of the wide schema.
  val exprs = schema.fields.zipWithIndex.map { case (f, i) =>
    BoundReference(i, f.dataType, nullable = true)
  }
  val hashExpr = Murmur3Hash(exprs, 42)

  // Generating the projection must not throw (the generated code must be
  // split into methods small enough for the JVM), and the generated code
  // must agree with the interpreted evaluation path — previously this test
  // only checked that generation succeeded, never that the result was right.
  val plan = GenerateMutableProjection.generate(Seq(hashExpr))
  val interpretedHash = hashExpr.eval(wideRow)
  assert(plan(wideRow).getInt(0) == interpretedHash)
}
140+
127141
private def testHash(inputSchema: StructType): Unit = {
128142
val inputGenerator = RandomDataGenerator.forType(inputSchema, nullable = false).get
129143
val encoder = RowEncoder(inputSchema)

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1728,23 +1728,3 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
17281728
val df = spark.createDataFrame(spark.sparkContext.makeRDD(rows), schema)
17291729
assert(df.filter($"array1" === $"array2").count() == 1)
17301730
}
1731-
1732-
test("SPARK-18207: Compute hash for wider table") {
1733-
import org.apache.spark.sql.types.{StructType, StringType}
1734-
1735-
val COLMAX = 1000
1736-
val schema: StructType = (1 to COLMAX)
1737-
.foldLeft(new StructType())((s, i) => s.add(s"g$i", StringType, nullable = true))
1738-
val rdds = spark.sparkContext.parallelize(Seq(Row.fromSeq((1 to COLMAX).map(_.toString))))
1739-
val wideDF = spark.createDataFrame(rdds, schema)
1740-
1741-
val widePlus = wideDF.withColumn("d_rank", lit(1))
1742-
widePlus.createOrReplaceTempView("wide_plus")
1743-
val widePlus2 = widePlus.withColumn("d_rank", lit(0))
1744-
widePlus2.createOrReplaceTempView("wide_plus2")
1745-
1746-
// HashAggregate operation in this SQL union operator involves computation of hash for a row
1747-
val df = spark.sqlContext.sql("select * from wide_plus union select * from wide_plus2")
1748-
df.count
1749-
}
1750-
}

0 commit comments

Comments (0)