Changes from 10 commits
37 changes: 13 additions & 24 deletions src/main/scala/com/databricks/spark/csv/CsvParser.scala
@@ -15,7 +15,6 @@
*/
package com.databricks.spark.csv


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.types.StructType
@@ -117,12 +116,12 @@ class CsvParser extends Serializable {
this
}

/** Returns a Schema RDD for the given CSV path. */
@throws[RuntimeException]
def csvFile(sqlContext: SQLContext, path: String): DataFrame = {
val relation: CsvRelation = CsvRelation(
() => TextFile.withCharset(sqlContext.sparkContext, path, charset),
Some(path),
/** Returns a CsvRelation instance based on the current state of this CSV parser. */
private[csv] def csvRelation(sqlContext: SQLContext, csvRDD: RDD[String],
Member:
This is just my personal thought. Could we maybe do this refactoring in a separate PR?

Contributor Author:
Okay. Would make more sense.
path: Option[String]): CsvRelation = {
CsvRelation(
() => csvRDD,
path,
useHeader,
delimiter,
quote,
@@ -137,27 +136,17 @@
inferSchema,
codec,
nullValue)(sqlContext)
}
/** Returns a Schema RDD for the given CSV path. */
@throws[RuntimeException]
def csvFile(sqlContext: SQLContext, path: String): DataFrame = {
val relation: CsvRelation = csvRelation(sqlContext,
TextFile.withCharset(sqlContext.sparkContext, path, charset), Some(path))
sqlContext.baseRelationToDataFrame(relation)
}

def csvRdd(sqlContext: SQLContext, csvRDD: RDD[String]): DataFrame = {
val relation: CsvRelation = CsvRelation(
() => csvRDD,
None,
useHeader,
delimiter,
quote,
escape,
comment,
parseMode,
parserLib,
ignoreLeadingWhiteSpace,
ignoreTrailingWhiteSpace,
treatEmptyValuesAsNulls,
schema,
inferSchema,
codec,
nullValue)(sqlContext)
val relation: CsvRelation = csvRelation(sqlContext, csvRDD, None)
sqlContext.baseRelationToDataFrame(relation)
}
}
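
For context, both public entry points now funnel through the shared csvRelation helper. A minimal usage sketch (sqlContext is assumed to be an existing SQLContext; the path is illustrative):

import com.databricks.spark.csv.CsvParser

val parser = new CsvParser().withUseHeader(true).withInferSchema(true)

// Read from a path; the relation is built via the shared csvRelation helper.
val df = parser.csvFile(sqlContext, "src/test/resources/simple.csv")

// Or hand in an RDD[String] directly; same relation, just with no path attached.
val rdd = sqlContext.sparkContext.textFile("src/test/resources/simple.csv")
val df2 = parser.csvRdd(sqlContext, rdd)
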
15 changes: 8 additions & 7 deletions src/main/scala/com/databricks/spark/csv/util/InferSchema.scala
@@ -42,7 +42,11 @@ private[csv] object InferSchema {
mergeRowTypes)

val structFields = header.zip(rootTypes).map { case (thisHeader, rootType) =>
StructField(thisHeader, rootType, nullable = true)
val dType = rootType match {
case z: NullType => StringType
case other => other
}
StructField(thisHeader, dType, nullable = true)
}

StructType(structFields)
@@ -62,11 +66,7 @@
first: Array[DataType],
second: Array[DataType]): Array[DataType] = {
first.zipAll(second, NullType, NullType).map { case ((a, b)) =>
val tpe = findTightestCommonType(a, b).getOrElse(StringType)
tpe match {
case _: NullType => StringType
case other => other
}
findTightestCommonType(a, b).getOrElse(NullType)
}
}

@@ -93,7 +93,6 @@
}
}


private def tryParseInteger(field: String): DataType = if ((allCatch opt field.toInt).isDefined) {
IntegerType
} else {
@@ -152,6 +151,8 @@
case (t1, t2) if t1 == t2 => Some(t1)
case (NullType, t1) => Some(t1)
case (t1, NullType) => Some(t1)
case (StringType, t2) => Some(StringType)
case (t1, StringType) => Some(StringType)

// Promote numeric types to the highest of the two and all numeric types to unlimited decimal
case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) =>
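
Taken together, the two hunks above move the NullType-to-StringType fallback out of per-row merging and into final schema construction, while findTightestCommonType gains explicit StringType cases. Below is a self-contained sketch of the resulting merge semantics; the method names mirror the diff, but numericPrecedence is an assumed ordering modeled on Spark's numeric widening, not copied from this file.

import org.apache.spark.sql.types._

object InferSchemaSketch {
  // Assumed widening order; the real object may define this differently.
  private val numericPrecedence: IndexedSeq[DataType] =
    IndexedSeq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType)

  def findTightestCommonType(t1: DataType, t2: DataType): Option[DataType] =
    (t1, t2) match {
      case (a, b) if a == b => Some(a)
      case (NullType, t) => Some(t) // NullType survives merging instead of forcing StringType
      case (t, NullType) => Some(t)
      case (StringType, _) => Some(StringType) // anything merged with String stays String
      case (_, StringType) => Some(StringType)
      case (a, b) if Seq(a, b).forall(numericPrecedence.contains) =>
        Some(numericPrecedence(Seq(a, b).map(t => numericPrecedence.indexOf(t)).max))
      case _ => None
    }

  def mergeRowTypes(first: Array[DataType], second: Array[DataType]): Array[DataType] =
    first.zipAll(second, NullType, NullType).map { case (a, b) =>
      findTightestCommonType(a, b).getOrElse(NullType)
    }

  def main(args: Array[String]): Unit = {
    // Each column is IntegerType in one row and NullType (empty cell) in the other.
    val merged = mergeRowTypes(Array(IntegerType, NullType), Array(NullType, IntegerType))
    println(merged.mkString(", ")) // IntegerType, IntegerType
  }
}

Only a column that is still NullType after all rows are merged falls back to StringType, in the StructField construction shown in the first hunk.
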
5 changes: 5 additions & 0 deletions src/test/resources/simple.csv
@@ -0,0 +1,5 @@
A,B,C,D
1,,,
,1,,
,,1,
,,,1
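
Each column of this fixture holds a single integer and three empty cells. Under the old merge rules, two empty cells (NullType) merged to StringType, which then pulled the whole column to StringType; with NullType preserved, inference should now yield the schema sketched below (mirroring the assertion added to InferSchemaSuite; nullable = true matches the StructField construction in InferSchema):

import org.apache.spark.sql.types._

val expected = StructType(Seq(
  StructField("A", IntegerType, nullable = true),
  StructField("B", IntegerType, nullable = true),
  StructField("C", IntegerType, nullable = true),
  StructField("D", IntegerType, nullable = true)))
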
src/test/scala/com/databricks/spark/csv/util/InferSchemaSuite.scala
@@ -2,8 +2,31 @@ package com.databricks.spark.csv.util

import org.apache.spark.sql.types._
import org.scalatest.FunSuite
import org.scalatest.BeforeAndAfterAll
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import com.databricks.spark.csv.CsvParser
import com.databricks.spark.csv.CsvRelation

class InferSchemaSuite extends FunSuite {
class InferSchemaSuite extends FunSuite with BeforeAndAfterAll {

private val simpleDatasetFile = "src/test/resources/simple.csv"
private val utf8Charset = "utf-8"
private var sqlContext: SQLContext = _

override def beforeAll(): Unit =
Member:
These can be removed once you move your end-to-end test to CsvSuite.scala. This suite is a unit test and does not need to create a SQLContext.
{
super.beforeAll()
sqlContext = new SQLContext(new SparkContext("local[2]", "InferSchemaSuite"))
}

override def afterAll(): Unit = {
try {
sqlContext.sparkContext.stop()
} finally {
super.afterAll()
}
}

test("String fields types are inferred correctly from null types") {
assert(InferSchema.inferField(NullType, "") == NullType)
Expand Down Expand Up @@ -40,6 +63,14 @@ class InferSchemaSuite extends FunSuite {
assert(InferSchema.inferField(LongType, "2015-08 14:49:00") == StringType)
}

test("Merging Nulltypes should yeild Nulltype.")
{
Member:
Ditto.
assert(
Member:
Nit: the indent is off:

assert(
  InferSchema.mergeRowTypes(Array(NullType),
  Array(NullType)).deep == Array(NullType).deep)
InferSchema.mergeRowTypes(Array(NullType),
Array(NullType)).deep == Array(NullType).deep)

}

test("Type arrays are merged to highest common type") {
assert(
InferSchema.mergeRowTypes(Array(StringType),
Expand All @@ -52,4 +83,14 @@ class InferSchemaSuite extends FunSuite {
Array(LongType)).deep == Array(DoubleType).deep)
}

test("Type/Schema inference works as expected for the simple parse dataset.")
Member:
Hm... shouldn't this go to CsvSuite, and remove BeforeAndAfterAll, beforeAll() and afterAll(), since this is an end-to-end test?

Contributor Author:
I agree that CsvSuite performs all end-to-end tests, but since there was a dedicated suite for schema inference, I preferred to put the schema tests there. Do you see any issues with that?

Member:
No, I think it is okay. I just said this because I have seen suites added in this way. For example, #235 and #224.

Contributor Author:
Reverted the refactoring part; the tests I kept as is. If more people feel CsvSuite would be the right place for these tests, I will make that change.
{
val df = new CsvParser().withUseHeader(true).withInferSchema(true)
.csvFile(sqlContext, simpleDatasetFile)
assert(
df.schema.fields.map{field => field.dataType}.deep ==
Member:
Nit: use two spaces of indent for this line. The next line's indent is fine.

Array(IntegerType, IntegerType, IntegerType, IntegerType).deep
)

}
}