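Fix JSON parsers: decode Text and InputStream records explicitly as UTF-8 instead of passing raw bytes to Jackson (fixes handling of invalid JSON with leading null bytes)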

brkyvz · brkyvz · commit 1d998fb909e6 · 2018-01-17T17:05:33.000-08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala
@@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends Serializable {
   }
 
   def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
-    jsonFactory.createParser(record.getBytes, 0, record.getLength)
+    val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength)
+    jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
   }
 
   def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = {
-    jsonFactory.createParser(record)
+    jsonFactory.createParser(new InputStreamReader(record, "UTF-8"))
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
@@ -28,6 +28,8 @@ import org.apache.spark.sql.types._
 class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
   override val dataSourceName: String = "json"
 
+  private val badJson = "\u0000\u0000\u0000A\u0001AAA"
+
   // JSON does not write data of NullType and does not play well with BinaryType.
   override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
     case _: NullType => false
@@ -105,4 +107,36 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
       )
     }
   }
+
+  test("invalid json with leading nulls - from file (multiLine=true)") {
+    import testImplicits._
+    withTempDir { tempDir =>
+      val path = tempDir.getAbsolutePath
+      Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
+      val expected = s"""$badJson\n{"a":1}\n"""
+      val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)
+      val df =
+        spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path)
+      checkAnswer(df, Row(null, expected))
+    }
+  }
+
+  test("invalid json with leading nulls - from file (multiLine=false)") {
+    import testImplicits._
+    withTempDir { tempDir =>
+      val path = tempDir.getAbsolutePath
+      Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
+      val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)
+      val df =
+        spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path)
+      checkAnswer(df, Seq(Row(1, null), Row(null, badJson)))
+    }
+  }
+
+  test("invalid json with leading nulls - from dataset") {
+    import testImplicits._
+    checkAnswer(
+      spark.read.json(Seq(badJson).toDS()),
+      Row(badJson))
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends Serializable {`
`40`	`40`	`}`
`41`	`41`
`42`	`42`	`def text(jsonFactory: JsonFactory, record: Text): JsonParser = {`
`43`		`- jsonFactory.createParser(record.getBytes, 0, record.getLength)`
	`43`	`+ val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength)`
	`44`	`+ jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))`
`44`	`45`	`}`
`45`	`46`
`46`	`47`	`def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = {`
`47`		`- jsonFactory.createParser(record)`
	`48`	`+ jsonFactory.createParser(new InputStreamReader(record, "UTF-8"))`
`48`	`49`	`}`
`49`	`50`	`}`