-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-22000][SQL] Address missing Upcast in JavaTypeInference.deserializerFor #23854
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
acd5d55
443e74f
d869bba
60ca088
9eeb391
d4c2060
9c040e2
2007452
4d564ab
e01bfe6
8cbad26
7ff01db
24a1b19
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -115,6 +115,37 @@ public void testBeanWithMapFieldsDeserialization() { | |
| Assert.assertEquals(records, MAP_RECORDS); | ||
| } | ||
|
|
||
| private static final List<RecordSpark22000> RECORDS_SPARK_22000 = new ArrayList<>(); | ||
|
|
||
| static { | ||
| RECORDS_SPARK_22000.add(new RecordSpark22000("1", "[email protected]", 2, 11)); | ||
| RECORDS_SPARK_22000.add(new RecordSpark22000("2", "[email protected]", 3, 12)); | ||
| RECORDS_SPARK_22000.add(new RecordSpark22000("3", "[email protected]", 4, 13)); | ||
| RECORDS_SPARK_22000.add(new RecordSpark22000("4", "[email protected]", 5, 14)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testSpark22000() { | ||
| // Here we try to convert the type of 'ref' field, from integer to string. | ||
| // Before applying SPARK-22000, Spark called toString() against variable which type might be primitive. | ||
| // SPARK-22000 it calls String.valueOf() which finally calls toString() but handles boxing | ||
| // if the type is primitive. | ||
| Encoder<RecordSpark22000> encoder = Encoders.bean(RecordSpark22000.class); | ||
|
|
||
| Dataset<RecordSpark22000> dataset = spark | ||
| .read() | ||
| .format("csv") | ||
| .option("header", "true") | ||
| .option("mode", "DROPMALFORMED") | ||
| .schema("ref int, userId string, x int, y int") | ||
|
||
| .load("src/test/resources/test-data/spark-22000.csv") | ||
|
||
| .as(encoder); | ||
|
|
||
| List<RecordSpark22000> records = dataset.collectAsList(); | ||
|
|
||
| Assert.assertEquals(records, RECORDS_SPARK_22000); | ||
| } | ||
|
|
||
| public static class ArrayRecord { | ||
|
|
||
| private int id; | ||
|
|
@@ -252,4 +283,73 @@ public String toString() { | |
| return String.format("[%d,%d]", startTime, endTime); | ||
| } | ||
| } | ||
|
|
||
| public static class RecordSpark22000 { | ||
|
||
| private String ref; | ||
| private String userId; | ||
| private int x; | ||
| private int y; | ||
|
|
||
| public RecordSpark22000() { } | ||
|
|
||
| RecordSpark22000(String ref, String userId, int x, int y) { | ||
| this.ref = ref; | ||
| this.userId = userId; | ||
| this.x = x; | ||
| this.y = y; | ||
| } | ||
|
|
||
| public String getRef() { | ||
| return ref; | ||
| } | ||
|
|
||
| public void setRef(String ref) { | ||
| this.ref = ref; | ||
| } | ||
|
|
||
| public String getUserId() { | ||
| return userId; | ||
| } | ||
|
|
||
| public void setUserId(String userId) { | ||
| this.userId = userId; | ||
| } | ||
|
|
||
| public int getX() { | ||
| return x; | ||
| } | ||
|
|
||
| public void setX(int x) { | ||
| this.x = x; | ||
| } | ||
|
|
||
| public int getY() { | ||
| return y; | ||
| } | ||
|
|
||
| public void setY(int y) { | ||
| this.y = y; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) return true; | ||
| if (o == null || getClass() != o.getClass()) return false; | ||
| RecordSpark22000 that = (RecordSpark22000) o; | ||
| return x == that.x && | ||
| y == that.y && | ||
| Objects.equals(ref, that.ref) && | ||
| Objects.equals(userId, that.userId); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(ref, userId, x, y); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this need toString? I understand hashCode and equals
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It will help to compare expected and actual when test fails. Otherwise they would've seen as |
||
| return String.format("ref='%s', userId='%s', x=%d, y=%d", ref, userId, x, y); | ||
| } | ||
HeartSaVioR marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| ref,userId,x,y | ||
| 1,[email protected],2,11 | ||
| 2,[email protected],3,12 | ||
| 3,[email protected],4,13 | ||
| 4,[email protected],5,14 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `ScalaReflection` does the same thing — do we have a problem there too?

AFAIK the `path` should be a string-type column, and it's always safe to call `UTF8String.toString`. My gut feeling is, we missed adding an `Upcast` somewhere in `JavaTypeInference`.

Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The sample code in JIRA issue tried to bind IntegerType column to String field in Java bean, which looks to break your expectation. (I guess ScalaReflection would not encounter this case.)
Spark doesn't throw error for this case though - actually Spark would show undefined behaviors, compilation failures on generated code, even might be possible to throw runtime exception.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In Scala, we can also do this and Spark will add an `Upcast` — e.g. `spark.range(1).as[String].collect` works fine.

I did a quick search, and `JavaTypeInference` has no `Upcast`. We should fix it and follow `ScalaReflection`.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah OK. I'll check and address it. Maybe it would be a separate PR if it doesn't fix the new test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah your suggestion seems to work nicely! I left comment to ask which approach to choose: please compare both approach and comment. Thanks!