Add pivot to dataframe api

aray · aray · commit 599e9e0b9bd4 · 2015-07-31T10:20:58.000-05:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -72,6 +72,7 @@ class Analyzer(
       ResolveRelations ::
       ResolveReferences ::
       ResolveGroupingAnalytics ::
+      ResolvePivot ::
       ResolveSortReferences ::
       ResolveGenerate ::
       ResolveFunctions ::
@@ -166,6 +167,10 @@ class Analyzer(
         if g.child.resolved && g.aggregations.exists(_.isInstanceOf[UnresolvedAlias]) =>
         g.withNewAggs(assignAliases(g.aggregations))
 
+      case Pivot(groupByExprs, pivotColumn, pivotValues, aggregate, child)
+        if child.resolved && groupByExprs.exists(_.isInstanceOf[UnresolvedAlias]) =>
+        Pivot(assignAliases(groupByExprs), pivotColumn, pivotValues, aggregate, child)
+
       case Project(projectList, child)
         if child.resolved && projectList.exists(_.isInstanceOf[UnresolvedAlias]) =>
         Project(assignAliases(projectList), child)
@@ -249,6 +254,27 @@ class Analyzer(
     }
   }
 
+  /** Rewrites a [[Pivot]] into an [[Aggregate]] with one conditional aggregate per pivot value. */
+  object ResolvePivot extends Rule[LogicalPlan] {
+    def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      // Also wait for the aggregate: an unresolved function is not yet an AggregateExpression.
+      case p: Pivot if !p.childrenResolved || !p.aggregate.resolved => p
+      case Pivot(groupByExprs, pivotColumn, pivotValues, aggregate, child) => aggregate match {
+        case agg: UnaryExpression if agg.isInstanceOf[AggregateExpression] =>
+          val pivotAggregates = pivotValues.map { value =>
+            val onlyMatching = If(EqualTo(pivotColumn, Literal(value)), agg.child, Literal(null))
+            Alias(agg.withNewChildren(Seq(onlyMatching)), value)() // column named after the value
+          }
+          val groupingKeys = groupByExprs.map {
+            case UnresolvedAlias(e) => e // group on the wrapped expression, not the placeholder
+            case e => e
+          }
+          Aggregate(groupingKeys, groupByExprs ++ pivotAggregates, child)
+        case other => throw new AnalysisException(s"$other is not an aggregate expression")
+      }
+    }
+  }
+
   /**
    * Replaces [[UnresolvedRelation]]s with concrete relations from the catalog.
    */
@@ -924,6 +950,7 @@ class Analyzer(
     override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case p: Project => p
       case f: Filter => f
+      case p: Pivot => p
 
       // todo: It's hard to write a general rule to pull out nondeterministic expressions
       // from LogicalPlan, currently we only do it for UnaryNode which has same output
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -373,6 +373,16 @@ case class Rollup(
     this.copy(aggregations = aggs)
 }
 
+case class Pivot(
+    groupByExprs: Seq[NamedExpression], // grouping columns; emitted first in the output
+    pivotColumn: Expression, // expression whose values select the pivot column
+    pivotValues: Seq[String], // each value names one extra output column
+    aggregate: Expression, // its data type is the type of every pivot column
+    child: LogicalPlan) extends UnaryNode {
+  override def output: Seq[Attribute] = groupByExprs.map(_.toAttribute) ++
+    pivotValues.map(name => AttributeReference(name, aggregate.dataType)())
+}
+
 case class Limit(limitExpr: Expression, child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -918,6 +918,41 @@ class DataFrame private[sql](
     GroupedData(this, colNames.map(colName => resolve(colName)), GroupedData.CubeType)
   }
 
+  /**
+   * (Scala-specific) Pivots a column of the current [[DataFrame]] and performs the specified
+   * aggregation for each requested pivot value.
+   * {{{
+   *   // Compute the sum of earnings for each year by course with each course as a separate column.
+   *   df.pivot(Seq($"year"), $"course", Seq("dotNET", "Java"), sum($"earnings"))
+   * }}}
+   * @param groupBy Columns to group by.
+   * @param pivotColumn Column to pivot.
+   * @param pivotValues Values of pivotColumn that will be translated to columns in the output data
+   *                    frame.
+   * @param aggregate Aggregate expression to perform for each combination of groupBy and
+   *                  pivotValues.
+   * @group dfops
+   * @since 1.5.0
+   */
+  def pivot(
+      groupBy: Seq[Column],
+      pivotColumn: Column,
+      pivotValues: Seq[String],
+      aggregate: Column): DataFrame = {
+
+    val namedGroupBy: Seq[NamedExpression] = groupBy.map { column =>
+      column.expr match {
+        // Resolving an UnresolvedAttribute strips the intermediate Alias of an ExtractValue chain,
+        // so wrap it in UnresolvedAlias and let the analyzer alias it again after resolution.
+        case attribute: UnresolvedAttribute => UnresolvedAlias(attribute)
+        case named: NamedExpression => named
+        case unnamed: Expression => Alias(unnamed, unnamed.prettyString)()
+      }
+    }
+    new DataFrame(sqlContext,
+      Pivot(namedGroupBy, pivotColumn.expr, pivotValues, aggregate.expr, this.logicalPlan))
+  }
+
   /**
    * (Scala-specific) Aggregates on the entire [[DataFrame]] without groups.
    * {{{
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.TestData._
+import org.apache.spark.sql.functions._
+
+class DataFramePivotSuite extends QueryTest {
+  // Exercises DataFrame.pivot against the courseSales fixture imported from TestData.
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
+  test("pivot courses") {
+    checkAnswer(
+      courseSales.pivot(Seq($"year"), $"course", Seq("dotNET", "Java"), sum($"earnings")),
+      Row(2012, 15000.0, 20000.0) :: Row(2013, 48000.0, 30000.0) :: Nil
+    )
+  }
+
+  test("pivot year") {
+    // The course column is compared verbatim, so use the fixture's spelling ("dotNET").
+    checkAnswer(
+      courseSales.pivot(Seq($"course"), $"year", Seq("2012", "2013"), sum($"earnings")),
+      Row("dotNET", 15000.0, 48000.0) :: Row("Java", 20000.0, 30000.0) :: Nil
+    )
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -194,4 +194,13 @@ object TestData {
         :: ComplexData(Map("2" -> 2), TestData(2, "2"), Seq(2), false)
         :: Nil).toDF()
   complexData.registerTempTable("complexData")
+
+  case class CourseSales(course: String, year: Int, earnings: Double) // row type of courseSales
+  val courseSales = TestSQLContext.sparkContext.parallelize(Seq(
+    CourseSales("dotNET", 2012, 10000),
+    CourseSales("Java", 2012, 20000),
+    CourseSales("dotNET", 2012, 5000), // second dotNET/2012 row, so that group sums two rows
+    CourseSales("dotNET", 2013, 48000),
+    CourseSales("Java", 2013, 30000))).toDF()
+  courseSales.registerTempTable("courseSales") // also exposed to SQL under the same name
 }