Skip to content

Commit d1d0ee4

Browse files
davies authored and JoshRosen committed
[SPARK-3103] [PySpark] fix saveAsTextFile() with utf-8
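Bugfix: saveAsTextFile() raised an exception when an RDD contained non-ASCII byte strings, because calling encode("utf-8") on a Python 2 str first decodes it implicitly with the ASCII codec. Only unicode objects should be encoded as "utf-8"; byte strings are now written as-is. Author: Davies Liu <davies.liu@gmail.com>. Closes #2018 from davies/fix_utf8 and squashes the following commits: 4db7967 [Davies Liu] fix saveAsTextFile() with utf-8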
1 parent 3a5962f commit d1d0ee4

2 files changed

Lines changed: 12 additions & 1 deletion

File tree

python/pyspark/rdd.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1191,7 +1191,9 @@ def func(split, iterator):
11911191
for x in iterator:
11921192
if not isinstance(x, basestring):
11931193
x = unicode(x)
1194-
yield x.encode("utf-8")
1194+
if isinstance(x, unicode):
1195+
x = x.encode("utf-8")
1196+
yield x
11951197
keyed = self.mapPartitionsWithIndex(func)
11961198
keyed._bypass_serializer = True
11971199
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)

python/pyspark/tests.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -256,6 +256,15 @@ def test_save_as_textfile_with_unicode(self):
256256
raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
257257
self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
258258

259+
def test_save_as_textfile_with_utf8(self):
260+
x = u"\u00A1Hola, mundo!"
261+
data = self.sc.parallelize([x.encode("utf-8")])
262+
tempFile = tempfile.NamedTemporaryFile(delete=True)
263+
tempFile.close()
264+
data.saveAsTextFile(tempFile.name)
265+
raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
266+
self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
267+
259268
def test_transforming_cartesian_result(self):
260269
# Regression test for SPARK-1034
261270
rdd1 = self.sc.parallelize([1, 2])

0 commit comments

Comments
 (0)