Skip to content

Commit c3c45cb

Browse files
ueshin
authored and cloud-fan committed
[SPARK-25540][SQL][PYSPARK] Make HiveContext in PySpark behave as the same as Scala.
## What changes were proposed in this pull request? In Scala, `HiveContext` sets a config `spark.sql.catalogImplementation` of the given `SparkContext` and then passes to `SparkSession.builder`. The `HiveContext` in PySpark should behave as the same as Scala. ## How was this patch tested? Existing tests. Closes #22552 from ueshin/issues/SPARK-25540/hive_context. Authored-by: Takuya UESHIN <ueshin@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent d0990e3 commit c3c45cb

File tree

2 files changed

+16
-6
lines changed

2 files changed

+16
-6
lines changed

python/pyspark/sql/context.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -485,7 +485,8 @@ def __init__(self, sparkContext, jhiveContext=None):
485485
"SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
486486
DeprecationWarning)
487487
if jhiveContext is None:
488-
sparkSession = SparkSession.builder.enableHiveSupport().getOrCreate()
488+
sparkContext._conf.set("spark.sql.catalogImplementation", "hive")
489+
sparkSession = SparkSession.builder._sparkContext(sparkContext).getOrCreate()
489490
else:
490491
sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession())
491492
SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)

python/pyspark/sql/session.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ class Builder(object):
8383

8484
_lock = RLock()
8585
_options = {}
86+
_sc = None
8687

8788
@since(2.0)
8889
def config(self, key=None, value=None, conf=None):
@@ -139,6 +140,11 @@ def enableHiveSupport(self):
139140
"""
140141
return self.config("spark.sql.catalogImplementation", "hive")
141142

143+
def _sparkContext(self, sc):
144+
with self._lock:
145+
self._sc = sc
146+
return self
147+
142148
@since(2.0)
143149
def getOrCreate(self):
144150
"""Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
@@ -167,11 +173,14 @@ def getOrCreate(self):
167173
from pyspark.conf import SparkConf
168174
session = SparkSession._instantiatedSession
169175
if session is None or session._sc._jsc is None:
170-
sparkConf = SparkConf()
171-
for key, value in self._options.items():
172-
sparkConf.set(key, value)
173-
sc = SparkContext.getOrCreate(sparkConf)
174-
# This SparkContext may be an existing one.
176+
if self._sc is not None:
177+
sc = self._sc
178+
else:
179+
sparkConf = SparkConf()
180+
for key, value in self._options.items():
181+
sparkConf.set(key, value)
182+
sc = SparkContext.getOrCreate(sparkConf)
183+
# This SparkContext may be an existing one.
175184
for key, value in self._options.items():
176185
# we need to propagate the confs
177186
# before we create the SparkSession. Otherwise, confs like

0 commit comments

Comments (0)