
Commit d34590c
[SPARK-31441][PYSPARK][SQL][2.4] Support duplicated column names for toPandas with arrow execution
### What changes were proposed in this pull request?

This backports #28210, adding support for duplicated column names in `toPandas` with Arrow execution.

### Why are the changes needed?

When `toPandas()` is executed with Arrow execution enabled, it fails if the column names contain duplicates:

```py
>>> spark.sql("select 1 v, 1 v").toPandas()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/path/to/lib/python3.7/site-packages/pyspark/sql/dataframe.py", line 2132, in toPandas
    pdf = table.to_pandas()
  File "pyarrow/array.pxi", line 441, in pyarrow.lib._PandasConvertible.to_pandas
  File "pyarrow/table.pxi", line 1367, in pyarrow.lib.Table._to_pandas
  File "/path/to/lib/python3.7/site-packages/pyarrow/pandas_compat.py", line 653, in table_to_blockmanager
    columns = _deserialize_column_index(table, all_columns, column_indexes)
  File "/path/to/lib/python3.7/site-packages/pyarrow/pandas_compat.py", line 704, in _deserialize_column_index
    columns = _flatten_single_level_multiindex(columns)
  File "/path/to/lib/python3.7/site-packages/pyarrow/pandas_compat.py", line 937, in _flatten_single_level_multiindex
    raise ValueError('Found non-unique column index')
ValueError: Found non-unique column index
```

### Does this PR introduce any user-facing change?

Yes. Previously the call above raised the error shown; after this PR, it returns the expected result:

```py
>>> spark.sql("select 1 v, 1 v").toPandas()
   v  v
0  1  1
```

### How was this patch tested?

Added and modified related tests.

Closes #28221 from ueshin/issues/SPARK-31441/2.4/to_pandas.

Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Takuya UESHIN <[email protected]>
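A practical note on the new behavior: once duplicated column names reach pandas, label-based selection returns every matching column, so positional access is needed to pick just one. A quick illustration (ordinary pandas behavior, not specific to this patch; `spark` is an active SparkSession with Arrow execution enabled):

```py
>>> pdf = spark.sql("select 1 v, 1 v").toPandas()
>>> pdf['v']        # label-based selection matches both 'v' columns
   v  v
0  1  1
>>> pdf.iloc[:, 0]  # positional selection picks a single column
0    1
Name: v, dtype: int32
```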
1 parent 49abdc4 commit d34590c

2 files changed: +26 -7 lines changed

python/pyspark/sql/dataframe.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -2127,10 +2127,14 @@ def toPandas(self):
                 from pyspark.sql.types import _check_dataframe_convert_date, \
                     _check_dataframe_localize_timestamps
                 import pyarrow
-                batches = self._collectAsArrow()
+                # Rename columns to avoid duplicated column names.
+                tmp_column_names = ['col_{}'.format(i) for i in range(len(self.columns))]
+                batches = self.toDF(*tmp_column_names)._collectAsArrow()
                 if len(batches) > 0:
                     table = pyarrow.Table.from_batches(batches)
                     pdf = table.to_pandas()
+                    # Rename back to the original column names.
+                    pdf.columns = self.columns
                     pdf = _check_dataframe_convert_date(pdf, self.schema)
                     return _check_dataframe_localize_timestamps(pdf, timezone)
                 else:
```
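The core of the fix is a rename-and-restore round trip: the pyarrow versions targeted by Spark 2.4 reject conversion when the column index would be non-unique, so the patch collects the Arrow batches under unique temporary names (via `toDF`) and reassigns the original names on the resulting pandas DataFrame. A minimal sketch of the same trick at the pyarrow level, with a toy table standing in for the collected batches (illustrative only, not the patch's actual code path):

```py
import pyarrow as pa

# A table with duplicated column names, as produced by "select 1 v, 1 v".
# Older pyarrow releases raise "ValueError: Found non-unique column index"
# when converting such a table to pandas directly.
table = pa.Table.from_arrays([pa.array([1]), pa.array([1])], names=['v', 'v'])

# Convert under unique temporary names ...
original_names = table.column_names
tmp_names = ['col_{}'.format(i) for i in range(len(original_names))]
pdf = table.rename_columns(tmp_names).to_pandas()

# ... then restore the original (possibly duplicated) names.
pdf.columns = original_names
print(pdf)
#    v  v
# 0  1  1
```

Renaming on the Spark side before collection, as the patch does, means the Arrow batches themselves never carry duplicated names, so the conversion succeeds regardless of which pyarrow version performs it.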

python/pyspark/sql/tests.py

Lines changed: 21 additions & 6 deletions
```diff
@@ -3296,6 +3296,19 @@ def test_to_pandas(self):
         self.assertEquals(types[4], np.object)  # datetime.date
         self.assertEquals(types[5], 'datetime64[ns]')
 
+    @unittest.skipIf(not _have_pandas, _pandas_requirement_message)
+    def test_to_pandas_with_duplicated_column_names(self):
+        import numpy as np
+
+        sql = "select 1 v, 1 v"
+        for arrowEnabled in [False, True]:
+            with self.sql_conf({"spark.sql.execution.arrow.enabled": arrowEnabled}):
+                df = self.spark.sql(sql)
+                pdf = df.toPandas()
+                types = pdf.dtypes
+                self.assertEquals(types.iloc[0], np.int32)
+                self.assertEquals(types.iloc[1], np.int32)
+
     @unittest.skipIf(not _have_pandas, _pandas_requirement_message)
     def test_to_pandas_on_cross_join(self):
         import numpy as np
@@ -3307,12 +3320,14 @@ def test_to_pandas_on_cross_join(self):
             select explode(sequence(1, 3)) v
           ) t2
         """
-        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
-            df = self.spark.sql(sql)
-            pdf = df.toPandas()
-            types = pdf.dtypes
-            self.assertEquals(types.iloc[0], np.int32)
-            self.assertEquals(types.iloc[1], np.int32)
+        for arrowEnabled in [False, True]:
+            with self.sql_conf({"spark.sql.crossJoin.enabled": True,
+                                "spark.sql.execution.arrow.enabled": arrowEnabled}):
+                df = self.spark.sql(sql)
+                pdf = df.toPandas()
+                types = pdf.dtypes
+                self.assertEquals(types.iloc[0], np.int32)
+                self.assertEquals(types.iloc[1], np.int32)
 
     @unittest.skipIf(_have_pandas, "Required Pandas was found.")
     def test_to_pandas_required_pandas_not_found(self):
```
