Add missed other default case when parsing/inferring XML documents

HyukjinKwon · HyukjinKwon · commit b79b1a99493d · 2016-09-02T19:44:59.000+09:00
This PR adds support for skipping multiple whitespace events around a comment. This should have been added earlier but was missed. `XMLStreamConstants.COMMENT` is always skipped [here](https://github.com/databricks/spark-xml/blob/master/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala#L51-L52) and [here](https://github.com/databricks/spark-xml/blob/master/src/main/scala/com/databricks/spark/xml/util/InferSchema.scala#L85-L86), but it is possible for a `COMMENT` to appear between two runs of whitespace. In this case, `factory.setProperty(XMLInputFactory.IS_COALESCING, true)` does not coalesce the two whitespace runs, because the comment event separates them. In more detail, ```xml <a> <!-- comment --> <b>...</b> </a> ``` in this case, `<!-- comment -->` is surrounded by whitespace. This produces the events below: ```bash XMLStreamConstants.CHARACTERS # whitespace XMLStreamConstants.COMMENT # comment XMLStreamConstants.CHARACTERS # whitespace XMLStreamConstants.START_ELEMENT # <b> ``` The current code always filters out `XMLStreamConstants.COMMENT`, so it ends up with ```bash XMLStreamConstants.CHARACTERS # whitespace XMLStreamConstants.CHARACTERS # whitespace XMLStreamConstants.START_ELEMENT # <b> ``` which does not happen in normal cases, because multiple consecutive `XMLStreamConstants.CHARACTERS` events are coalesced into a single one as below: ```bash XMLStreamConstants.CHARACTERS # whitespace XMLStreamConstants.START_ELEMENT # <b> ``` Author: hyukjinkwon <gurwls223@gmail.com> Closes #166 from HyukjinKwon/missed-other-cases.
diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala
@@ -112,6 +112,7 @@ private[xml] object StaxXmlParser {
           case _: EndElement if data.isEmpty => null
           case _: EndElement if options.treatEmptyValuesAsNulls => null
           case _: EndElement => data
+          case _ => convertField(parser, dataType, options)
         }
 
       case (c: Characters, ArrayType(st, _)) =>
diff --git a/src/main/scala/com/databricks/spark/xml/util/InferSchema.scala b/src/main/scala/com/databricks/spark/xml/util/InferSchema.scala
@@ -143,6 +143,7 @@ private[xml] object InferSchema {
           case _: EndElement if data.isEmpty => NullType
           case _: EndElement if options.treatEmptyValuesAsNulls => NullType
           case _: EndElement => StringType
+          case _ => inferField(parser, options)
         }
       case c: Characters if !c.isWhiteSpace =>
         // This means data exists
diff --git a/src/test/resources/null-nested-struct.xml b/src/test/resources/null-nested-struct.xml
@@ -2,6 +2,7 @@
 <root>
 	<item>
 		<b>
+			<!-- nested comments -->
 			<es>
 				<e>1</e>
 			</es>
@@ -10,6 +11,9 @@
 	<item>
         <!-- Issue 117 - This is where an empty Row would be produced instead of null -->
 		<b>
+
+			<!-- nested comments -->
+
 			<es></es>
 		</b>
 	</item>

Original file line number	Diff line number	Diff line change
`@@ -112,6 +112,7 @@ private[xml] object StaxXmlParser {`
`112`	`112`	`case _: EndElement if data.isEmpty => null`
`113`	`113`	`case _: EndElement if options.treatEmptyValuesAsNulls => null`
`114`	`114`	`case _: EndElement => data`
	`115`	`+ case _ => convertField(parser, dataType, options)`
`115`	`116`	`}`
`116`	`117`
`117`	`118`	`case (c: Characters, ArrayType(st, _)) =>`
Original file line number	Diff line number	Diff line change
`@@ -143,6 +143,7 @@ private[xml] object InferSchema {`
`143`	`143`	`case _: EndElement if data.isEmpty => NullType`
`144`	`144`	`case _: EndElement if options.treatEmptyValuesAsNulls => NullType`
`145`	`145`	`case _: EndElement => StringType`
	`146`	`+ case _ => inferField(parser, options)`
`146`	`147`	`}`
`147`	`148`	`case c: Characters if !c.isWhiteSpace =>`
`148`	`149`	`// This means data exists`