Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1336,8 +1336,8 @@ def replace(self, to_replace, value=None, subset=None):
Value to be replaced.
If the value is a dict, then `value` is ignored and `to_replace` must be a
mapping between a value and a replacement.
:param value: int, long, float, string, or list.
The replacement value must be an int, long, float, or string. If `value` is a
:param value: int, long, float, string, list or None.
The replacement value must be an int, long, float, string or None. If `value` is a
list, `value` should be of the same length and type as `to_replace`.
If `value` is a scalar and `to_replace` is a sequence, then `value` is
used as a replacement for each item in `to_replace`.
Expand All @@ -1356,6 +1356,16 @@ def replace(self, to_replace, value=None, subset=None):
|null| null| null|
+----+------+-----+

>>> df4.na.replace('Alice', None).show()
+----+------+----+
| age|height|name|
+----+------+----+
| 10| 80|null|
| 5| null| Bob|
|null| null| Tom|
|null| null|null|
+----+------+----+

>>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
Expand Down Expand Up @@ -1391,9 +1401,10 @@ def all_of_(xs):
"to_replace should be a float, int, long, string, list, tuple, or dict. "
"Got {0}".format(type(to_replace)))

if not isinstance(value, valid_types) and not isinstance(to_replace, dict):
if not isinstance(value, valid_types) and value is not None \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So slightly jet lagged style question: would this be clearer if we just add type(None) to L1398? I know PEP8 says we should only use is is not for checking if something is none rather than depending on the implicit conversion to boolean -- but since really checking the type here we aren't really in danger of that. (This is just a suggestion to make it easier to read - if others think its easier to read this way thats fine :)). @davies ?

and not isinstance(to_replace, dict):
raise ValueError("If to_replace is not a dict, value should be "
"a float, int, long, string, list, or tuple. "
"a float, int, long, string, list, tuple or None. "
"Got {0}".format(type(value)))

if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):
Expand All @@ -1409,7 +1420,7 @@ def all_of_(xs):
if isinstance(to_replace, (float, int, long, basestring)):
to_replace = [to_replace]

if isinstance(value, (float, int, long, basestring)):
if isinstance(value, (float, int, long, basestring)) or value is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

value = [value for _ in range(len(to_replace))]

if isinstance(to_replace, dict):
Expand All @@ -1423,7 +1434,9 @@ def all_of_(xs):
subset = [subset]

# Verify we were not passed in mixed type generics."
if not any(all_of_type(rep_dict.keys()) and all_of_type(rep_dict.values())
if not any(all_of_type(rep_dict.keys())
and (all_of_type(rep_dict.values())
or list(rep_dict.values()).count(None) == len(rep_dict))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the Scala code null is allowed in to be the replacement value for some of the elements but not in the Python code. Is this intentional? If so we should document it clearly and expand on the error message bellow (otherwise we should make it more flexible).

for all_of_type in [all_of_bool, all_of_str, all_of_numeric]):
raise ValueError("Mixed type replacements are not supported")

Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1851,6 +1851,11 @@ def test_replace(self):
.replace(False, True).first())
self.assertTupleEqual(row, (True, True))

# replace with None
row = self.spark.createDataFrame(
[(u'Alice', 10, 80.0)], schema).replace((10, 80), None).first()
self.assertTupleEqual(row, (u'Alice', None, None))

# should fail if subset is not list, tuple or None
with self.assertRaises(ValueError):
self.spark.createDataFrame(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,10 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {

/**
* (Scala-specific) Replaces values matching keys in `replacement` map.
* Key and value of `replacement` map must have the same type, and
* can only be doubles , strings or booleans.
* Key and value of `replacement` map must satisfy one of:
* 1. keys are String, values are mix of String and null
* 2. keys are Boolean, values are mix of Boolean and null
* 3. keys are Double, values are either all Double or all null
*
* {{{
* // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight".
Expand All @@ -342,8 +344,10 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
return df
}

// replacementMap is either Map[String, String] or Map[Double, Double] or Map[Boolean,Boolean]
// replacementMap is either Map[String, String], Map[Double, Double], Map[Boolean,Boolean]
// or value being null
val replacementMap: Map[_, _] = replacement.head._2 match {
case null => replacement
case v: String => replacement
case v: Boolean => replacement
case _ => replacement.map { case (k, v) => (convertToDouble(k), convertToDouble(v)) }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,16 +222,16 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSQLContext {
assert(out(4) === Row("Amy", null, null))
assert(out(5) === Row(null, null, null))

// Replace only the age column
val out1 = input.na.replace("age", Map(
16 -> 61,
60 -> 6,
164.3 -> 461.3 // Alice is really tall
// Replace only the name column
val out1 = input.na.replace("name", Map(
"Bob" -> "Bravo",
"Alice" -> "Jessie",
"David" -> null
)).collect()

assert(out1(0) === Row("Bob", 61, 176.5))
assert(out1(1) === Row("Alice", null, 164.3))
assert(out1(2) === Row("David", 6, null))
assert(out1(0) === Row("Bravo", 16, 176.5))
assert(out1(1) === Row("Jessie", null, 164.3))
assert(out1(2) === Row(null, 60, null))
assert(out1(3).get(2).asInstanceOf[Double].isNaN)
assert(out1(4) === Row("Amy", null, null))
assert(out1(5) === Row(null, null, null))
Expand Down