@@ -238,59 +238,55 @@ private[csv] object CSVTypeCast {
       nullable: Boolean = true,
       options: CSVOptions = CSVOptions()): Any = {
 
-    castType match {
-      case _: ByteType => if (datum == options.nullValue && nullable) null else datum.toByte
-      case _: ShortType => if (datum == options.nullValue && nullable) null else datum.toShort
-      case _: IntegerType => if (datum == options.nullValue && nullable) null else datum.toInt
-      case _: LongType => if (datum == options.nullValue && nullable) null else datum.toLong
-      case _: FloatType =>
-        if (datum == options.nullValue && nullable) {
-          null
-        } else if (datum == options.nanValue) {
-          Float.NaN
-        } else if (datum == options.negativeInf) {
-          Float.NegativeInfinity
-        } else if (datum == options.positiveInf) {
-          Float.PositiveInfinity
-        } else {
-          Try(datum.toFloat)
-            .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
-        }
-      case _: DoubleType =>
-        if (datum == options.nullValue && nullable) {
-          null
-        } else if (datum == options.nanValue) {
-          Double.NaN
-        } else if (datum == options.negativeInf) {
-          Double.NegativeInfinity
-        } else if (datum == options.positiveInf) {
-          Double.PositiveInfinity
-        } else {
-          Try(datum.toDouble)
-            .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
-        }
-      case _: BooleanType => datum.toBoolean
-      case dt: DecimalType =>
-        if (datum == options.nullValue && nullable) {
-          null
-        } else {
+    if (datum == options.nullValue && nullable && (!castType.isInstanceOf[StringType])) {
[Review thread on this line]

Member: Do you mind if I ask why StringType is excluded?

Contributor: It'd be great to document why string type is ignored here.

Contributor Author (@lw-lin), replying to "... why StringType is excluded?":
Hi @HyukjinKwon, it's just to keep consistency with what we did in spark-csv for 1.6. Actually I don't have a strong preference here -- maybe we should not ignore StringType? @rxin, could you share some thoughts? Thanks!

+      null
+    } else {
+      castType match {
+        case _: ByteType => datum.toByte
+        case _: ShortType => datum.toShort
+        case _: IntegerType => datum.toInt
+        case _: LongType => datum.toLong
+        case _: FloatType =>
+          if (datum == options.nanValue) {
[Review thread on this branch]

Member: Can these nested if-else statements be a match statement? Or is there some overhead to it that is too significant?

Contributor Author (@lw-lin, Sep 16, 2016): Yea, they should be a match statement -- let me update this, thanks!

(See the sketch of a match-based version after this diff hunk.)
+            Float.NaN
+          } else if (datum == options.negativeInf) {
+            Float.NegativeInfinity
+          } else if (datum == options.positiveInf) {
+            Float.PositiveInfinity
+          } else {
+            Try(datum.toFloat)
+              .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
+          }
+        case _: DoubleType =>
+          if (datum == options.nanValue) {
+            Double.NaN
+          } else if (datum == options.negativeInf) {
+            Double.NegativeInfinity
+          } else if (datum == options.positiveInf) {
+            Double.PositiveInfinity
+          } else {
+            Try(datum.toDouble)
+              .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
+          }
+        case _: BooleanType => datum.toBoolean
+        case dt: DecimalType =>
           val value = new BigDecimal(datum.replaceAll(",", ""))
           Decimal(value, dt.precision, dt.scale)
-        }
-      case _: TimestampType if options.dateFormat != null =>
-        // This one will lose microseconds parts.
-        // See https://issues.apache.org/jira/browse/SPARK-10681.
-        options.dateFormat.parse(datum).getTime * 1000L
-      case _: TimestampType =>
-        // This one will lose microseconds parts.
-        // See https://issues.apache.org/jira/browse/SPARK-10681.
-        DateTimeUtils.stringToTime(datum).getTime * 1000L
-      case _: DateType if options.dateFormat != null =>
-        DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime)
-      case _: DateType =>
-        DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(datum).getTime)
-      case _: StringType => UTF8String.fromString(datum)
-      case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
+        case _: TimestampType if options.dateFormat != null =>
+          // This one will lose microseconds parts.
+          // See https://issues.apache.org/jira/browse/SPARK-10681.
+          options.dateFormat.parse(datum).getTime * 1000L
+        case _: TimestampType =>
+          // This one will lose microseconds parts.
+          // See https://issues.apache.org/jira/browse/SPARK-10681.
+          DateTimeUtils.stringToTime(datum).getTime * 1000L
+        case _: DateType if options.dateFormat != null =>
+          DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime)
+        case _: DateType =>
+          DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(datum).getTime)
+        case _: StringType => UTF8String.fromString(datum)
+        case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
+      }
     }
   }
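As a rough illustration of the refactor agreed on in the review thread above, the FloatType branch could be expressed as a single match instead of a nested if-else chain. This is only a sketch under assumptions: `FloatParseOptions` and `parseFloat` are made-up stand-ins for the relevant `CSVOptions` fields and the float branch (default special values assumed to be "NaN", "-Inf", and "Inf"); it is not the code that ultimately landed in the PR.

```scala
import java.text.NumberFormat
import java.util.Locale
import scala.util.Try

// Stand-in for the CSVOptions fields used by the float branch (illustrative only).
case class FloatParseOptions(
    nanValue: String = "NaN",
    negativeInf: String = "-Inf",
    positiveInf: String = "Inf")

// The nested if-else rewritten as a match with guards: special values first,
// then a plain toFloat, falling back to locale-aware NumberFormat parsing.
def parseFloat(datum: String, options: FloatParseOptions = FloatParseOptions()): Float =
  datum match {
    case d if d == options.nanValue => Float.NaN
    case d if d == options.negativeInf => Float.NegativeInfinity
    case d if d == options.positiveInf => Float.PositiveInfinity
    case d =>
      Try(d.toFloat)
        .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(d).floatValue())
  }
```

For example, `parseFloat("Inf")` yields `Float.PositiveInfinity`, while a locale-formatted value such as "1,234.5" that `toFloat` rejects falls through to the `NumberFormat` path (under an English default locale). The DoubleType branch would be analogous.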

@@ -68,7 +68,31 @@ class CSVTypeCastSuite extends SparkFunSuite {
   }
 
   test("Nullable types are handled") {
-    assert(CSVTypeCast.castTo("", IntegerType, nullable = true, CSVOptions()) == null)
+    assertNull(
+      CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", BooleanType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", TimestampType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", DateType, nullable = true, CSVOptions("nullValue", "-")))
+
+    // special treatment for StringType
+    assert(
+      CSVTypeCast.castTo("-", StringType, nullable = true, CSVOptions("nullValue", "-")) ===
+        UTF8String.fromString("-"))
   }
 
   test("String type should always return the same as the input") {
@@ -165,20 +189,4 @@ class CSVTypeCastSuite extends SparkFunSuite {
     assert(doubleVal2 == Double.PositiveInfinity)
   }
 
test("Type-specific null values are used for casting") {
assertNull(
CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
}
 }