-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-26456][SQL] Cast date/timestamp to string by Date/TimestampFormatter #23391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 24 commits
e09c972
697688a
fcffc12
f0a9fe7
5f0b0a3
17a32a3
9d90d3f
038ad80
f6308f6
9f85ac6
d348c07
36c9f9a
4580c25
56bdae4
8541602
ecf0e89
c3066e1
eed40d7
94cad6a
0de72c8
1156291
5ae58a3
78c3961
2397401
206c955
483e95b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -88,11 +88,18 @@ class LegacyFallbackDateFormatter( | |
| } | ||
|
|
||
| object DateFormatter { | ||
| val defaultPattern: String = "yyyy-MM-dd" | ||
| val defaultLocale: Locale = Locale.US | ||
|
|
||
| def apply(format: String, locale: Locale): DateFormatter = { | ||
| if (SQLConf.get.legacyTimeParserEnabled) { | ||
| new LegacyFallbackDateFormatter(format, locale) | ||
| } else { | ||
| new Iso8601DateFormatter(format, locale) | ||
| } | ||
| } | ||
|
|
||
| def apply(format: String): DateFormatter = apply(format, defaultLocale) | ||
|
|
||
| def apply(): DateFormatter = apply(defaultPattern) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both formatters seems to use thread safe implementations. You could consider just returning cached instances here.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the moment, both formatters are created per partition at least not per row. Do you think it makes sense to cache them?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, lets leave it for now. |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,7 +42,7 @@ sealed trait TimestampFormatter extends Serializable { | |
| @throws(classOf[ParseException]) | ||
| @throws(classOf[DateTimeParseException]) | ||
| @throws(classOf[DateTimeException]) | ||
| def parse(s: String): Long // returns microseconds since epoch | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why remove this comment?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because it duplicates the method description above. |
||
| def parse(s: String): Long | ||
| def format(us: Long): String | ||
| } | ||
|
|
||
|
|
@@ -105,11 +105,22 @@ class LegacyFallbackTimestampFormatter( | |
| } | ||
|
|
||
| object TimestampFormatter { | ||
| val defaultPattern: String = "yyyy-MM-dd HH:mm:ss" | ||
| val defaultLocale: Locale = Locale.US | ||
|
|
||
| def apply(format: String, timeZone: TimeZone, locale: Locale): TimestampFormatter = { | ||
| if (SQLConf.get.legacyTimeParserEnabled) { | ||
| new LegacyFallbackTimestampFormatter(format, timeZone, locale) | ||
| } else { | ||
| new Iso8601TimestampFormatter(format, timeZone, locale) | ||
| } | ||
| } | ||
|
|
||
| def apply(format: String, timeZone: TimeZone): TimestampFormatter = { | ||
| apply(format, timeZone, defaultLocale) | ||
| } | ||
|
|
||
| def apply(timeZone: TimeZone): TimestampFormatter = { | ||
| apply(defaultPattern, timeZone, defaultLocale) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,7 @@ import java.nio.charset.StandardCharsets | |
| import java.sql.{Date, Timestamp} | ||
|
|
||
| import org.apache.spark.sql.Row | ||
| import org.apache.spark.sql.catalyst.util.DateTimeUtils | ||
| import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} | ||
| import org.apache.spark.sql.execution.command.{DescribeTableCommand, ExecutedCommandExec, ShowTablesCommand} | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types._ | ||
|
|
@@ -77,6 +77,10 @@ object HiveResult { | |
| TimestampType, | ||
| BinaryType) | ||
|
|
||
| private lazy val dateFormatter = DateFormatter() | ||
| private lazy val timestampFormatter = TimestampFormatter( | ||
| DateTimeUtils.getTimeZone(SQLConf.get.sessionLocalTimeZone)) | ||
|
|
||
| /** Hive outputs fields of structs slightly differently than top level attributes. */ | ||
| private def toHiveStructString(a: (Any, DataType)): String = a match { | ||
| case (struct: Row, StructType(fields)) => | ||
|
|
@@ -111,11 +115,9 @@ object HiveResult { | |
| toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType)) | ||
| }.toSeq.sorted.mkString("{", ",", "}") | ||
| case (null, _) => "NULL" | ||
| case (d: Date, DateType) => | ||
| DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d)) | ||
| case (d: Date, DateType) => dateFormatter.format(DateTimeUtils.fromJavaDate(d)) | ||
| case (t: Timestamp, TimestampType) => | ||
| val timeZone = DateTimeUtils.getTimeZone(SQLConf.get.sessionLocalTimeZone) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @MaxGekk @cloud-fan By moving the retrieval of the timezone from here into a lazy val in the object, it will be initialized only once — by the first session that uses it. Another session with a different sessionLocalTimeZone set will get results in the wrong timezone.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @juliuszsompolski Thank you for the bug report. I will fix the issue. I think it is ok to create formatters in place because they can be pulled from caches.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good point!
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here is the PR #28024 |
||
| DateTimeUtils.timestampToString(DateTimeUtils.fromJavaTimestamp(t), timeZone) | ||
| DateTimeUtils.timestampToString(timestampFormatter, DateTimeUtils.fromJavaTimestamp(t)) | ||
| case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) | ||
| case (decimal: java.math.BigDecimal, DecimalType()) => formatDecimal(decimal) | ||
| case (interval, CalendarIntervalType) => interval.toString | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When would you need this? You could argue that since we are moving to Spark 3.0 we don't need to care as much about legacy.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this PR, date and timestamp patterns are fixed, and we shouldn't see any behavior changes, but
`DateFormatter`/`TimestampFormatter` are used from the CSV/JSON datasources and from functions where the user can set arbitrary patterns. Unfortunately, the patterns supported by `SimpleDateFormat` and `DateTimeFormatter` are not exactly the same. Also, there are other differences in their behavior: https://github.com/apache/spark/pull/23358/files#diff-3f19ec3d15dcd8cd42bb25dde1c5c1a9R42 What I have learned from other PRs is that if I introduce a behavior change, I should leave users an opportunity to go back to the previous behavior. Later, the old behavior can be deprecated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am just saying that we are going to break stuff anyway. If the legacy behavior is somewhat unreasonable, then we should consider not supporting it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Getting rid of the flag in the PR is slightly out of its scope, I believe. I would prefer to open a ticket and leave that to somebody who is much more brave.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ticket to remove the flag: https://issues.apache.org/jira/browse/SPARK-26503
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For clarification, I don't think we should treat the previous behaviour unreasonable ... I am okay with considering to remove that legacy configuration regarding that we're going ahead for 3.0, it causes some overhead about maintenance, and it blocks some features.
Also, for clarification, it's kind of a breaking change. Think about the case where CSV code depended on timestamps being inferred, and suddenly they become strings after an upgrade. Moreover, this behaviour was documented in 2.x (by referring to
`SimpleDateFormat`).