Skip to content

Commit 48e143d

Browse files
mmolimar authored and MaxGekk committed
Adding tests
1 parent 465ed7a commit 48e143d

4 files changed

Lines changed: 66 additions & 4 deletions

File tree

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,10 @@ abstract class CSVDataSource extends Serializable {
9191
}
9292

9393
row.zipWithIndex.map { case (value, index) =>
94-
if (value == null || value.isEmpty || value == options.nullValue) {
95-
// When there are empty strings or the values set in `nullValue`, put the
96-
// index as the suffix.
94+
if (value == null || value.isEmpty || value == options.nullValue ||
95+
value == options.emptyValueInRead) {
96+
// When there are empty strings or the values set in `nullValue` or in `emptyValue`,
97+
// put the index as the suffix.
9798
s"_c$index"
9899
} else if (!caseSensitive && duplicates.contains(value.toLowerCase)) {
99100
// When there are case-insensitive duplicates, put the index as the suffix.

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ private[csv] object CSVInferSchema {
7979
* point checking if it is an Int, as the final type must be Double or higher.
8080
*/
8181
def inferField(typeSoFar: DataType, field: String, options: CSVOptions): DataType = {
82-
if (field == null || field.isEmpty || field == options.nullValue) {
82+
if (field == null || field.isEmpty || field == options.nullValue ||
83+
field == options.emptyValueInRead) {
8384
typeSoFar
8485
} else {
8586
typeSoFar match {

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,20 @@ class CSVInferSchemaSuite extends SparkFunSuite {
105105
assert(CSVInferSchema.inferField(DecimalType(1, 1), "\\N", options) == DecimalType(1, 1))
106106
}
107107

108+
test("Empty fields are handled properly when an emptyValue is specified") {
109+
var options = new CSVOptions(Map("emptyValue" -> "empty"), false, "GMT")
110+
assert(CSVInferSchema.inferField(NullType, "empty", options) == NullType)
111+
assert(CSVInferSchema.inferField(StringType, "empty", options) == StringType)
112+
assert(CSVInferSchema.inferField(LongType, "empty", options) == LongType)
113+
114+
options = new CSVOptions(Map("emptyValue" -> "\\N"), false, "GMT")
115+
assert(CSVInferSchema.inferField(IntegerType, "\\N", options) == IntegerType)
116+
assert(CSVInferSchema.inferField(DoubleType, "\\N", options) == DoubleType)
117+
assert(CSVInferSchema.inferField(TimestampType, "\\N", options) == TimestampType)
118+
assert(CSVInferSchema.inferField(BooleanType, "\\N", options) == BooleanType)
119+
assert(CSVInferSchema.inferField(DecimalType(1, 1), "\\N", options) == DecimalType(1, 1))
120+
}
121+
108122
test("Merging Nulltypes should yield Nulltype.") {
109123
val mergedNullTypes = CSVInferSchema.mergeRowTypes(Array(NullType), Array(NullType))
110124
assert(mergedNullTypes.deep == Array(NullType).deep)

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,6 +1440,52 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
14401440
}
14411441
}
14421442

1443+
test("SPARK-25241: An empty string should not be coerced to null when emptyValue is passed.") {
1444+
val litNull: String = null
1445+
val df = Seq(
1446+
(1, "John Doe"),
1447+
(2, ""),
1448+
(3, "-"),
1449+
(4, litNull)
1450+
).toDF("id", "name")
1451+
1452+
// Checks for new behavior where a null is not coerced to an empty string when `emptyValue` is
1453+
// set to anything but an empty string literal.
1454+
withTempPath { path =>
1455+
df.write
1456+
.option("emptyValue", "-")
1457+
.csv(path.getAbsolutePath)
1458+
val computed = spark.read
1459+
.option("emptyValue", "-")
1460+
.schema(df.schema)
1461+
.csv(path.getAbsolutePath)
1462+
val expected = Seq(
1463+
(1, "John Doe"),
1464+
(2, "-"),
1465+
(3, "-"),
1466+
(4, "-")
1467+
).toDF("id", "name")
1468+
1469+
checkAnswer(computed, expected)
1470+
}
1471+
// Keeps the old behavior where an empty string is coerced to null when emptyValue is not passed.
1472+
withTempPath { path =>
1473+
df.write
1474+
.csv(path.getAbsolutePath)
1475+
val computed = spark.read
1476+
.schema(df.schema)
1477+
.csv(path.getAbsolutePath)
1478+
val expected = Seq(
1479+
(1, "John Doe"),
1480+
(2, litNull),
1481+
(3, "-"),
1482+
(4, litNull)
1483+
).toDF("id", "name")
1484+
1485+
checkAnswer(computed, expected)
1486+
}
1487+
}
1488+
14431489
test("SPARK-24329: skip lines with comments, and one or multiple whitespaces") {
14441490
val schema = new StructType().add("colA", StringType)
14451491
val ds = spark

0 commit comments

Comments (0)