Skip to content

Commit 16db32f

Browse files
committed
address comments
1 parent 7613d54 commit 16db32f

File tree

4 files changed

+28
-4
lines changed

4 files changed

+28
-4
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ class UnivocityParser(
182182
case NonFatal(e) =>
183183
// If parsing fails, then try the way used in 2.0 and 1.x for backwards
184184
// compatibility.
185-
val str = UTF8String.fromString(datum)
185+
val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(datum))
186186
DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse(throw e)
187187
}
188188
}
@@ -195,7 +195,7 @@ class UnivocityParser(
195195
case NonFatal(e) =>
196196
// If parsing fails, then try the way used in 2.0 and 1.x for backwards
197197
// compatibility.
198-
val str = UTF8String.fromString(datum)
198+
val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(datum))
199199
DateTimeUtils.stringToDate(str, options.zoneId).getOrElse(throw e)
200200
}
201201
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ class JacksonParser(
235235
case NonFatal(e) =>
236236
// If parsing fails, then try the way used in 2.0 and 1.x for backwards
237237
// compatibility.
238-
val str = UTF8String.fromString(parser.getText)
238+
val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(parser.getText))
239239
DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse(throw e)
240240
}
241241

@@ -252,7 +252,7 @@ class JacksonParser(
252252
case NonFatal(e) =>
253253
// If parsing fails, then try the way used in 2.0 and 1.x for backwards
254254
// compatibility.
255-
val str = UTF8String.fromString(parser.getText)
255+
val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(parser.getText))
256256
DateTimeUtils.stringToDate(str, options.zoneId).getOrElse {
257257
// In Spark 1.5.0, we store the data as the number of days since the epoch in a string.
258258
// So, we just convert it to Int.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,21 @@ object DateTimeUtils {
163163
instantToMicros(localDateTime.atZone(zoneId).toInstant)
164164
}
165165

166+
// Called by the JSON/CSV parsers to clean up a legacy timestamp string by removing the first
167+
// occurrence of "GMT"; a string that does not contain "GMT" is returned unchanged.
168+
def cleanLegacyTimestampStr(s: String): String = {
169+
// indexOf yields -1 when "GMT" is absent, otherwise the index of its first occurrence.
val indexOfGMT = s.indexOf("GMT")
170+
if (indexOfGMT != -1) {
171+
// ISO8601 with a non-standard time zone specifier (e.g. 2000-01-01T00:00GMT+01:00)
172+
val s0 = s.substring(0, indexOfGMT)
173+
// Skip the 3 characters of "GMT" itself and keep everything that follows it.
val s1 = s.substring(indexOfGMT + 3)
174+
// The example above is mapped to 2000-01-01T00:00+01:00
175+
s0 + s1
176+
} else {
177+
s
178+
}
179+
}
180+
166181
/**
167182
* Trim and parse a given UTF8 date string to a corresponding [[Long]] value.
168183
* The return type is [[Option]] in order to distinguish between 0L and null. The following

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,15 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
341341
days(2020, 1, 12, 0, 0, 0))
342342
assert(parser.makeConverter("t", DateType).apply("2020-1-12 xyz") ==
343343
days(2020, 1, 12, 0, 0, 0))
344+
// The legacy format ignores the "GMT" in the string
345+
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45GMT") ==
346+
date(2020, 1, 12, 12, 3, 45, 0))
347+
assert(parser.makeConverter("t", TimestampType).apply("GMT2020-1-12 12:3:45") ==
348+
date(2020, 1, 12, 12, 3, 45, 0))
349+
assert(parser.makeConverter("t", DateType).apply("2020-1-12GMT") ==
350+
days(2020, 1, 12, 0, 0, 0))
351+
assert(parser.makeConverter("t", DateType).apply("GMT2020-1-12") ==
352+
days(2020, 1, 12, 0, 0, 0))
344353
}
345354

346355
val options = new CSVOptions(Map.empty[String, String], false, "UTC")

0 commit comments

Comments
 (0)