Skip to content

Commit 3e5d77c

Browse files
WIP: giant and messy WIP.
1 parent a92ed0c commit 3e5d77c

4 files changed

Lines changed: 72 additions & 7 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ class Projection(expressions: Seq[Expression]) extends (Row => Row) {
4545
* that schema.
4646
*
4747
* In contrast to a normal projection, a MutableProjection reuses the same underlying row object
48-
* each time an input row is added. This significatly reduces the cost of calcuating the
49-
* projection, but means that it is not safe
48+
* each time an input row is added. This significantly reduces the cost of calculating the
49+
* projection, but means that it is not safe ...?
5050
*/
5151
case class MutableProjection(expressions: Seq[Expression]) extends (Row => Row) {
5252
def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) =

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.sql.execution
1919

20+
import org.apache.hadoop.fs.FileSystem
2021
import org.apache.spark.sql.{SQLContext, execution}
2122
import org.apache.spark.sql.catalyst.expressions._
2223
import org.apache.spark.sql.catalyst.planning._
@@ -38,6 +39,26 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
3839

3940
def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
4041

42+
// case HashFilteredJoin(
43+
// Inner,
44+
// leftKeys,
45+
// rightKeys,
46+
// condition,
47+
// left,
48+
// right @ PhysicalOperation(_, _, b: MetastoreRelation))
49+
// if tableRawSizeBelowThreshold(left) =>
50+
// // TODO: these will be used
51+
//// import org.apache.hadoop.fs.ContentSummary
52+
//// import org.apache.hadoop.fs.FileSystem
53+
//// import org.apache.hadoop.fs.Path
54+
//
55+
// FileSystem.get()
56+
//
57+
// val hashJoin =
58+
// execution.BroadcastHashJoin(
59+
// leftKeys, rightKeys, BuildRight, planLater(left), planLater(right))(sparkContext)
60+
// condition.map(Filter(_, hashJoin)).getOrElse(hashJoin) :: Nil
61+
4162
case HashFilteredJoin(
4263
Inner,
4364
leftKeys,
@@ -129,8 +150,25 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
129150
}
130151
}
131152

153+
// // FIXME(zongheng): WIP
154+
// object AutoBroadcastHashJoin extends Strategy {
155+
// def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
156+
// case logical.Join(left, right, joinType, condition) =>
157+
//
158+
// execution.BroadcastHashJoin()
159+
//
160+
// execution.BroadcastNestedLoopJoin(
161+
// planLater(left), planLater(right), joinType, condition)(sparkContext) :: Nil
162+
// case _ => Nil
163+
// }
164+
// }
165+
132166
object BroadcastNestedLoopJoin extends Strategy {
133167
def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
168+
169+
// FIXME: WIP -- auto broadcast hash join
170+
case logical.Join
171+
134172
case logical.Join(left, right, joinType, condition) =>
135173
execution.BroadcastNestedLoopJoin(
136174
planLater(left), planLater(right), joinType, condition)(sparkContext) :: Nil

sql/core/src/main/scala/org/apache/spark/sql/execution/joins.scala

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ trait HashJoin {
109109
/**
110110
* Searches the streamed iterator for the next row that has at least one match in hashtable.
111111
*
112-
* @return true if the search is successful, and false the streamed iterator runs out of
112+
* @return true if the search is successful, and false if the streamed iterator runs out of
113113
* tuples.
114114
*/
115115
private final def fetchNext(): Boolean = {
@@ -136,7 +136,7 @@ trait HashJoin {
136136

137137
/**
138138
* :: DeveloperApi ::
139-
* Performs and inner hash join of two child relations by first shuffling the data using the join
139+
* Performs an inner hash join of two child relations by first shuffling the data using the join
140140
* keys.
141141
*/
142142
@DeveloperApi
@@ -163,9 +163,10 @@ case class ShuffledHashJoin(
163163

164164
/**
165165
* :: DeveloperApi ::
166-
* Performs an inner hash join of two child relations. When the operator is constructed, a Spark
167-
* job is asynchronously started to calculate the values for the broadcasted relation. This data
168-
* is then placed in a Spark broadcast variable. The streamed relation is not shuffled.
166+
* Performs an inner hash join of two child relations. When the output RDD of this operator is
167+
* being constructed, a Spark job is asynchronously started to calculate the values for the
168+
* broadcasted relation. This data is then placed in a Spark broadcast variable. The streamed
169+
* relation is not shuffled.
169170
*/
170171
@DeveloperApi
171172
case class BroadcastHashJoin(

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717

1818
package org.apache.spark.sql.hive
1919

20+
import org.apache.hadoop.fs.FileSystem
21+
22+
import org.apache.spark.sql
2023
import org.apache.spark.sql.SQLContext
2124
import org.apache.spark.sql.catalyst.expressions._
2225
import org.apache.spark.sql.catalyst.planning._
@@ -32,6 +35,29 @@ private[hive] trait HiveStrategies {
3235

3336
val hiveContext: HiveContext
3437

38+
// FIXME(zongheng): WIP
39+
object HashJoin extends Strategy {
40+
def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
41+
case HashFilteredJoin(
42+
Inner,
43+
leftKeys,
44+
rightKeys,
45+
condition,
46+
left,
47+
right @ PhysicalOperation(_, _, b: MetastoreRelation)) =>
48+
49+
val path = b.hiveQlTable.getPath
50+
val fs = path.getFileSystem(hiveContext.hiveconf)
51+
val size = fs.getContentSummary(path).getLength // TODO: in bytes?
52+
53+
54+
val hashJoin =
55+
sql.execution.BroadcastHashJoin(
56+
leftKeys, rightKeys, BuildRight, planLater(left), planLater(right))(sparkContext)
57+
condition.map(Filter(_, hashJoin)).getOrElse(hashJoin) :: Nil
58+
}
59+
}
60+
3561
object Scripts extends Strategy {
3662
def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
3763
case logical.ScriptTransformation(input, script, output, child) =>

0 commit comments

Comments (0)