Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* from values being read should be skipped.</li>
* <li>`ignoreTrailingWhiteSpace` (default `false`): defines whether or not trailing
* whitespaces from values being read should be skipped.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value. Since
Copy link
Member

@HyukjinKwon HyukjinKwon Aug 18, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, there would be the same documentation in readwriter.py. I guess we should fix them too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And streaming.py as well if I remember correctly.

Copy link
Contributor Author

@lw-lin lw-lin Aug 19, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh thanks! Indeed there are two occurrences (one in readwriter.py / one in streaming.py) needs fixing. I'll fix them.

* 2.0.1, this applies to all supported types including the string type.</li>
* <li>`nanValue` (default `NaN`): sets the string representation of a non-number" value.</li>
* <li>`positiveInf` (default `Inf`): sets the string representation of a positive infinity
* value.</li>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,59 +238,55 @@ private[csv] object CSVTypeCast {
nullable: Boolean = true,
options: CSVOptions = CSVOptions()): Any = {

castType match {
case _: ByteType => if (datum == options.nullValue && nullable) null else datum.toByte
case _: ShortType => if (datum == options.nullValue && nullable) null else datum.toShort
case _: IntegerType => if (datum == options.nullValue && nullable) null else datum.toInt
case _: LongType => if (datum == options.nullValue && nullable) null else datum.toLong
case _: FloatType =>
if (datum == options.nullValue && nullable) {
null
} else if (datum == options.nanValue) {
Float.NaN
} else if (datum == options.negativeInf) {
Float.NegativeInfinity
} else if (datum == options.positiveInf) {
Float.PositiveInfinity
} else {
Try(datum.toFloat)
.getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
}
case _: DoubleType =>
if (datum == options.nullValue && nullable) {
null
} else if (datum == options.nanValue) {
Double.NaN
} else if (datum == options.negativeInf) {
Double.NegativeInfinity
} else if (datum == options.positiveInf) {
Double.PositiveInfinity
} else {
Try(datum.toDouble)
.getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
}
case _: BooleanType => datum.toBoolean
case dt: DecimalType =>
if (datum == options.nullValue && nullable) {
null
} else {
if (datum == options.nullValue && nullable) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possibly worth checking if (nullable && ...) to avoid the comparison if it's not nullable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea let me do that. thanks.

null
} else {
castType match {
case _: ByteType => datum.toByte
case _: ShortType => datum.toShort
case _: IntegerType => datum.toInt
case _: LongType => datum.toLong
case _: FloatType =>
if (datum == options.nanValue) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can these nested if-else statements be a match statement? or is there some overhead to it that is too significant?

Copy link
Contributor Author

@lw-lin lw-lin Sep 16, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea they should be a match statement -- let me update this, thanks!

Float.NaN
} else if (datum == options.negativeInf) {
Float.NegativeInfinity
} else if (datum == options.positiveInf) {
Float.PositiveInfinity
} else {
Try(datum.toFloat)
.getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
}
case _: DoubleType =>
if (datum == options.nanValue) {
Double.NaN
} else if (datum == options.negativeInf) {
Double.NegativeInfinity
} else if (datum == options.positiveInf) {
Double.PositiveInfinity
} else {
Try(datum.toDouble)
.getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
}
case _: BooleanType => datum.toBoolean
case dt: DecimalType =>
val value = new BigDecimal(datum.replaceAll(",", ""))
Decimal(value, dt.precision, dt.scale)
}
case _: TimestampType if options.dateFormat != null =>
// This one will lose microseconds parts.
// See https://issues.apache.org/jira/browse/SPARK-10681.
options.dateFormat.parse(datum).getTime * 1000L
case _: TimestampType =>
// This one will lose microseconds parts.
// See https://issues.apache.org/jira/browse/SPARK-10681.
DateTimeUtils.stringToTime(datum).getTime * 1000L
case _: DateType if options.dateFormat != null =>
DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime)
case _: DateType =>
DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(datum).getTime)
case _: StringType => UTF8String.fromString(datum)
case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
case _: TimestampType if options.dateFormat != null =>
// This one will lose microseconds parts.
// See https://issues.apache.org/jira/browse/SPARK-10681.
options.dateFormat.parse(datum).getTime * 1000L
case _: TimestampType =>
// This one will lose microseconds parts.
// See https://issues.apache.org/jira/browse/SPARK-10681.
DateTimeUtils.stringToTime(datum).getTime * 1000L
case _: DateType if options.dateFormat != null =>
DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime)
case _: DateType =>
DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(datum).getTime)
case _: StringType => UTF8String.fromString(datum)
case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
* from values being read should be skipped.</li>
* <li>`ignoreTrailingWhiteSpace` (default `false`): defines whether or not trailing
* whitespaces from values being read should be skipped.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value. Since
* 2.0.1, this applies to all supported types including the string type.</li>
* <li>`nanValue` (default `NaN`): sets the string representation of a non-number" value.</li>
* <li>`positiveInf` (default `Inf`): sets the string representation of a positive infinity
* value.</li>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {

verifyCars(cars, withHeader = true, checkValues = false)
val results = cars.collect()
assert(results(0).toSeq === Array(2012, "Tesla", "S", "null", "null"))
assert(results(0).toSeq === Array(2012, "Tesla", "S", null, null))
assert(results(2).toSeq === Array(null, "Chevy", "Volt", null, null))
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,48 @@ class CSVTypeCastSuite extends SparkFunSuite {
}

test("Nullable types are handled") {
assert(CSVTypeCast.castTo("", IntegerType, nullable = true, CSVOptions()) == null)
assertNull(
CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", BooleanType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", TimestampType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DateType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", StringType, nullable = true, CSVOptions("nullValue", "-")))
}

test("String type should always return the same as the input") {
test("String type should also respect `nullValue`") {
assert(
CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions()) ==
UTF8String.fromString(""))
null)
assert(
CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions()) ==
UTF8String.fromString(""))

assert(
CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions("nullValue", "null")) ==
UTF8String.fromString(""))
assert(
CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions("nullValue", "null")) ==
UTF8String.fromString(""))

assert(
CSVTypeCast.castTo(null, StringType, nullable = true, CSVOptions("nullValue", "null")) ==
null)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can use assertNull as you just did above?

Copy link
Contributor Author

@lw-lin lw-lin Aug 5, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh thanks!
I did this intentionally so that the diff is minimal and clear to reviewers. Maybe let's see what others think and I'm glad to change this if necessary. :)
nvm, this indeed should be assertNull

}

test("Throws exception for empty string with non null type") {
Expand Down Expand Up @@ -165,20 +197,4 @@ class CSVTypeCastSuite extends SparkFunSuite {
assert(doubleVal2 == Double.PositiveInfinity)
}

test("Type-specific null values are used for casting") {
assertNull(
CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
}
}