[SPARK-24391][SQL] Support arrays of any types by from_json #21439
Changes from 3 commits
@@ -523,6 +523,8 @@ case class JsonToStructs(
   // can generate incorrect files if values are missing in columns declared as non-nullable.
   val nullableSchema = if (forceNullableSchema) schema.asNullable else schema

+  val unpackArray: Boolean = options.get("unpackArray").map(_.toBoolean).getOrElse(false)
+
   override def nullable: Boolean = true

   // Used in `FunctionRegistry`

@@ -548,7 +550,9 @@ case class JsonToStructs(
   forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA))

   override def checkInputDataTypes(): TypeCheckResult = nullableSchema match {
-    case _: StructType | ArrayType(_: StructType, _) | _: MapType =>
+    case ArrayType(_: StructType, _) if unpackArray =>
+      super.checkInputDataTypes()
+    case _: StructType | _: ArrayType | _: MapType =>
       super.checkInputDataTypes()
     case _ => TypeCheckResult.TypeCheckFailure(
       s"Input schema ${nullableSchema.simpleString} must be a struct or an array of structs.")

@@ -557,7 +561,8 @@ case class JsonToStructs(
   @transient
   lazy val rowSchema = nullableSchema match {
     case st: StructType => st
-    case ArrayType(st: StructType, _) => st
+    case ArrayType(st: StructType, _) if unpackArray => st
+    case at: ArrayType => at
     case mt: MapType => mt
   }

@@ -566,8 +571,10 @@ case class JsonToStructs(
   lazy val converter = nullableSchema match {
     case _: StructType =>
       (rows: Seq[InternalRow]) => if (rows.length == 1) rows.head else null
-    case ArrayType(_: StructType, _) =>
+    case ArrayType(_: StructType, _) if unpackArray =>
       (rows: Seq[InternalRow]) => new GenericArrayData(rows)
+    case _: ArrayType =>
+      (rows: Seq[InternalRow]) => rows.head.getArray(0)
     case _: MapType =>
       (rows: Seq[InternalRow]) => rows.head.getMap(0)
   }
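The asymmetry between the two array cases in the converter above may not be obvious from the diff alone. For unpacked arrays of structs, the parser yields one `InternalRow` per array element and the converter wraps them into a `GenericArrayData`; for the new generic array case, the parser (see the `JacksonParser` change below) returns a single row whose field 0 already holds the parsed array, so the converter only unwraps it. A minimal, purely illustrative sketch of that wrap/unwrap round trip using the same catalyst classes (not code from this PR):

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}

object ArrayRootRoundTrip {
  def main(args: Array[String]): Unit = {
    // What makeArrayRootConverter produces for an input like "[1, 2, 3]":
    // the parsed array wrapped in a single one-field row.
    val parsed: ArrayData = new GenericArrayData(Array(1, 2, 3))
    val rows: Seq[InternalRow] = Seq(InternalRow(parsed))

    // What the `case _: ArrayType` converter then does: pull the array back
    // out of field 0 of that single row.
    val result: ArrayData = rows.head.getArray(0)
    assert(result eq parsed)
  }
}
```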
@@ -61,6 +61,7 @@ class JacksonParser(
     dt match {
       case st: StructType => makeStructRootConverter(st)
       case mt: MapType => makeMapRootConverter(mt)
+      case at: ArrayType => makeArrayRootConverter(at)
Member
This change accepts the JSON datasource form that the master can't parse? If so, I think we need tests in …

Member (Author)
Right, it accepts arrays of any types, compared to the master, which accepts arrays of structs only.
I added a few tests to …

Member
You've already added tests in …

Member (Author)
Sorry, I didn't catch that you meant a specific class. Just in case, what is the reason for adding tests to …

Member
Ah, ok. You touched the …
     }
   }

@@ -101,6 +102,13 @@ class JacksonParser(
     }
   }

+  private def makeArrayRootConverter(at: ArrayType): JsonParser => Seq[InternalRow] = {
+    val elemConverter = makeConverter(at.elementType)
+    (parser: JsonParser) => parseJsonToken[Seq[InternalRow]](parser, at) {
+      case START_ARRAY => Seq(InternalRow(convertArray(parser, elemConverter)))
Member
In line 87: should we also follow this?

Member (Author)
The code in line 87 returns … In the case when the schema is …
+    }
+  }
+
   /**
    * Create a converter which converts the JSON documents held by the `JsonParser`
    * to a value according to a desired schema.
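For readers less familiar with the Jackson streaming API that `parseJsonToken` and `convertArray` are built on, here is a small standalone sketch (plain Jackson, no Spark) of what consuming a root-level JSON array token by token looks like. The object name and structure are illustrative only, not code from this PR:

```scala
import com.fasterxml.jackson.core.{JsonFactory, JsonToken}
import scala.collection.mutable.ArrayBuffer

object RootArrayTokens {
  def main(args: Array[String]): Unit = {
    val parser = new JsonFactory().createParser("[1, 2, 3]")

    // The first token of "[1, 2, 3]" is START_ARRAY, the case the new
    // root converter matches on.
    assert(parser.nextToken() == JsonToken.START_ARRAY)

    // Read element values until END_ARRAY, analogous to convertArray
    // applying the element converter to each entry.
    val values = ArrayBuffer.empty[Int]
    while (parser.nextToken() != JsonToken.END_ARRAY) {
      values += parser.getIntValue
    }
    parser.close()

    println(values.toList) // List(1, 2, 3)
  }
}
```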
@@ -423,7 +423,9 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with
     val input = """{"a": 1}"""
     val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
     val output = InternalRow(1) :: Nil
-    checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output)
+    checkEvaluation(
+      JsonToStructs(schema, Map("unpackArray" -> "true"), Literal(input), gmtId, true),
+      output)
   }

   test("from_json - input=empty array, schema=array, output=empty array") {

@@ -437,7 +439,9 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with
     val input = "{ }"
     val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
     val output = InternalRow(null) :: Nil
-    checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output)
+    checkEvaluation(
+      JsonToStructs(schema, Map("unpackArray" -> "true"), Literal(input), gmtId, true),
+      output)
   }

   test("from_json - input=array of single object, schema=struct, output=single row") {
@@ -136,12 +136,11 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
   test("from_json invalid schema") {
     val df = Seq("""{"a" 1}""").toDS()
     val schema = ArrayType(StringType)
-    val message = intercept[AnalysisException] {
-      df.select(from_json($"value", schema))
-    }.getMessage
-
-    assert(message.contains(
-      "Input schema array<string> must be a struct or an array of structs."))
+
+    checkAnswer(
+      df.select(from_json($"value", schema)),
+      Seq(Row(null))
+    )
   }

   test("from_json array support") {
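The rewritten test above captures the user-visible change: a non-struct array schema such as array<string> is no longer rejected at analysis time, and unparseable input simply yields null. A hedged, spark-shell-style sketch of that behavior, assuming this patch is applied (names and output formatting are illustrative):

```scala
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{ArrayType, StringType}

// Assumes a spark-shell or test session where spark.implicits._ is in scope.
val df = Seq("""["x", "y"]""", """{"a" 1}""").toDF("value")

// Before this patch: AnalysisException
//   "Input schema array<string> must be a struct or an array of structs."
// After this patch: the first row parses to ["x", "y"]; the malformed row is null.
df.select(from_json($"value", ArrayType(StringType)) as "parsed").show()
```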
@@ -392,4 +391,72 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
     checkAnswer(Seq("""{"{"f": 1}": "a"}""").toDS().select(from_json($"value", schema)),
       Row(null))
   }
+
+  test("from_json - array of primitive types") {
+    val df = Seq("[1, 2, 3]").toDF("a")
+    val schema = new ArrayType(IntegerType, false)
+
+    checkAnswer(df.select(from_json($"a", schema)), Seq(Row(Array(1, 2, 3))))
+  }
+
+  test("from_json - array of primitive types - malformed row") {
+    val df = Seq("[1, 2 3]").toDF("a")
+    val schema = new ArrayType(IntegerType, false)
+
+    checkAnswer(df.select(from_json($"a", schema)), Seq(Row(null)))
+  }
+
+  test("from_json - array of arrays") {
+    val jsonDF = Seq("[[1], [2, 3], [4, 5, 6]]").toDF("a")
+    val schema = new ArrayType(ArrayType(IntegerType, false), false)
+    jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable")
+
+    checkAnswer(
+      sql("select json[0][0], json[1][1], json[2][2] from jsonTable"),
+      Seq(Row(1, 3, 6)))
+  }
+
+  test("from_json - array of arrays - malformed row") {
+    val jsonDF = Seq("[[1], [2, 3], 4, 5, 6]]").toDF("a")
+    val schema = new ArrayType(ArrayType(IntegerType, false), false)
+    jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable")
+
+    checkAnswer(sql("select json[0] from jsonTable"), Seq(Row(null)))
+  }
+
+  test("from_json - array of structs") {
+    val jsonDF = Seq("""[{"a":1}, {"a":2}, {"a":3}]""").toDF("a")
+    val schema = new ArrayType(new StructType().add("a", IntegerType), false)
+    jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable")
+
+    checkAnswer(
+      sql("select json[0], json[1], json[2] from jsonTable"),
+      Seq(Row(Row(1), Row(2), Row(3))))
+  }
+
+  test("from_json - array of structs - malformed row") {
+    val jsonDF = Seq("""[{"a":1}, {"a:2}, {"a":3}]""").toDF("a")
+    val schema = new ArrayType(new StructType().add("a", IntegerType), false)
+    jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable")
+
+    checkAnswer(sql("select json[0], json[1] from jsonTable"), Seq(Row(null, null)))
+  }
+
+  test("from_json - array of maps") {
+    val jsonDF = Seq("""[{"a":1}, {"b":2}]""").toDF("a")
+    val schema = new ArrayType(MapType(StringType, IntegerType, false), false)
+    jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable")
+
+    checkAnswer(
+      sql("""select json[0], json[1] from jsonTable"""),
+      Seq(Row(Map("a" -> 1), Map("b" -> 2))))
+  }
+
+  test("from_json - array of maps - malformed row") {
+    val jsonDF = Seq("""[{"a":1} "b":2}]""").toDF("a")
+    val schema = new ArrayType(MapType(StringType, IntegerType, false), false)
+    jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable")
+
+    checkAnswer(sql("""select json[0] from jsonTable"""), Seq(Row(null)))
+  }
 }
Review comments:

private? (This is not related to this PR though; nullableSchema can also be private?)
Can you make the option unpackArray case-insensitive?
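One way the option lookup could be made case-insensitive, sketched with catalyst's `CaseInsensitiveMap`; this illustrates the reviewer's suggestion and is not code from this PR:

```scala
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

object UnpackArrayOption {
  def main(args: Array[String]): Unit = {
    // Stand-in for the expression's `options` map; key casing varies by caller.
    val options: Map[String, String] = Map("UNPACKARRAY" -> "true")

    // Wrapping the map makes "unpackArray", "unpackarray", "UNPACKARRAY", ...
    // all resolve to the same entry.
    val unpackArray: Boolean =
      CaseInsensitiveMap(options).get("unpackArray").exists(_.toBoolean)

    assert(unpackArray)
  }
}
```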
If we add this new option here, I feel we'd better document it somewhere (e.g., sql/functions.scala).
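A rough sketch of the kind of note the reviewer is asking for, attached to the existing options-taking overload in sql/functions.scala; the wording and the stated default are assumptions, not text from this PR:

```scala
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.DataType

object FromJsonDocSketch {
  /**
   * (Scala-specific) Parses a column containing a JSON string into a value of the
   * specified schema (a struct, a map, or an array).
   *
   * @param options options that control how the JSON is parsed. For an
   *                array-of-struct schema, `unpackArray` (assumed default: false)
   *                parses each top-level array element into a separate struct row,
   *                the behavior this PR gates behind the option.
   */
  def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = ???
}
```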