use 2D array for summary table

actuaryzhang · actuaryzhang · commit a16cbee4e86c · 2017-07-17T15:48:22.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -115,17 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper
     }
 
     val rCoefficients: Array[Double] = if (summary.isNormalSolver) {
-      val rCoefficientStandardErrors =
-        summary.summaryTable.select("StdError").collect.map(_.getDouble(0))
-
-      val rTValues =
-        summary.summaryTable.select("TValue").collect.map(_.getDouble(0))
-
-      val rPValues =
-        summary.summaryTable.select("PValue").collect.map(_.getDouble(0))
-
-      summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) ++
-        rCoefficientStandardErrors ++ rTValues ++ rPValues
+      summary.coefficientMatrix.map(_._2) ++
+        summary.coefficientMatrix.map(_._3) ++
+        summary.coefficientMatrix.map(_._4) ++
+        summary.coefficientMatrix.map(_._5)
     } else {
       if (glm.getFitIntercept) {
         Array(glm.intercept) ++ glm.coefficients.toArray
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -20,6 +20,9 @@ package org.apache.spark.ml.regression
 import java.util.Locale
 
 import breeze.stats.{distributions => dist}
+
+import org.apache.commons.lang3.StringUtils
+
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
@@ -34,7 +37,7 @@ import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession}
+import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
 
@@ -1211,8 +1214,7 @@ class GeneralizedLinearRegressionSummary private[regression] (
    * Name of features. If the name cannot be retrieved from attributes,
    * set default names to feature column name with numbered suffix "_0", "_1", and so on.
    */
-  @Since("2.2.0")
-  lazy val featureNames: Array[String] = {
+  private[ml] lazy val featureNames: Array[String] = {
     val featureAttrs = AttributeGroup.fromStructField(
       dataset.schema(model.getFeaturesCol)).attributes
     if (featureAttrs == None) {
@@ -1479,31 +1481,165 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
   }
 
   /**
-   * Summary table with feature name, coefficient, standard error,
+   * Coefficient matrix with feature name, coefficient, standard error,
    * tValue and pValue.
    */
-  @Since("2.2.0")
-  lazy val summaryTable: DataFrame = {
+  @Since("2.3.0")
+  lazy val coefficientMatrix: Array[(String, Double, Double, Double, Double)] = {
     if (isNormalSolver) {
       var featureNamesLocal = featureNames
       var coefficients = model.coefficients.toArray
       var idx = Array.range(0, coefficients.length)
       if (model.getFitIntercept) {
-        featureNamesLocal = featureNamesLocal :+ Intercept
+        featureNamesLocal = featureNamesLocal :+ "(Intercept)"
         coefficients = coefficients :+ model.intercept
         // Reorder so that intercept comes first
         idx = (coefficients.length - 1) +: idx
       }
-      val result = for (i <- idx.toSeq) yield
+      val result = for (i <- idx) yield
         (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i),
         tValues(i), pValues(i))
-
-      val spark = SparkSession.builder().getOrCreate()
-      import spark.implicits._
-      result.toDF("Feature", "Coefficient", "StdError", "TValue", "PValue").repartition(1)
+      result
     } else {
       throw new UnsupportedOperationException(
         "No summary table available for this GeneralizedLinearRegressionModel")
     }
   }
+
+  private def round(x: Double, digit: Int): String = {
+    BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString()
+  }
+
+  private[regression] def showString(_numRows: Int, truncate: Int = 20,
+                                     numDigits: Int = 3): String = {
+    val numRows = _numRows.max(1)
+    val data = coefficientMatrix.take(numRows)
+    val hasMoreData = coefficientMatrix.size > numRows
+
+    val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue")
+    val numCols = colNames.size
+
+    val rows = colNames +: data.map( row => {
+      val mrow = for (cell <- row.productIterator) yield {
+        val str = cell match {
+          case s: String => s
+          case n: Double => round(n, numDigits).toString
+        }
+        if (truncate > 0 && str.length > truncate) {
+          // do not append an ellipsis when the truncation width is less than 4 characters.
+          if (truncate < 4) str.substring(0, truncate)
+          else str.substring(0, truncate - 3) + "..."
+        } else {
+          str
+        }
+      }
+      mrow.toArray
+    })
+
+    val sb = new StringBuilder
+    val colWidths = Array.fill(numCols)(3)
+
+    // Compute the width of each column
+    for (row <- rows) {
+      for ((cell, i) <- row.zipWithIndex) {
+        colWidths(i) = math.max(colWidths(i), cell.length)
+      }
+    }
+
+    // Create the separator line; addString also appends it to sb as the table's top border
+    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+    // column names
+    rows.head.zipWithIndex.map { case (cell, i) =>
+      if (truncate > 0) {
+        StringUtils.leftPad(cell, colWidths(i))
+      } else {
+        StringUtils.rightPad(cell, colWidths(i))
+      }
+    }.addString(sb, "|", "|", "|\n")
+    sb.append(sep)
+
+    // data
+    rows.tail.map {
+      _.zipWithIndex.map { case (cell, i) =>
+        if (truncate > 0) {
+          StringUtils.leftPad(cell.toString, colWidths(i))
+        } else {
+          StringUtils.rightPad(cell.toString, colWidths(i))
+        }
+      }.addString(sb, "|", "|", "|\n")
+    }
+
+    // For Data that has more than "numRows" records
+    if (hasMoreData) {
+      sb.append("...\n")
+      sb.append(sep)
+      val rowsString = if (numRows == 1) "row" else "rows"
+      sb.append(s"only showing top $numRows $rowsString\n")
+    } else {
+      sb.append(sep)
+    }
+
+    sb.append("\n")
+    sb.append(s"(Dispersion parameter for ${family.name} family taken to be " +
+      round(dispersion, numDigits) + ")")
+
+    sb.append("\n")
+    val nd = "Null deviance: " + round(nullDeviance, numDigits) +
+      s" on $degreesOfFreedom degrees of freedom"
+    val rd = "Residual deviance: " + round(deviance, numDigits) +
+      s" on $residualDegreeOfFreedom degrees of freedom"
+    val l = math.max(nd.length, rd.length)
+    sb.append(StringUtils.leftPad(nd, l))
+    sb.append("\n")
+    sb.append(StringUtils.leftPad(rd, l))
+
+    if (family.name != "tweedie") {
+      sb.append("\n")
+      sb.append(s"AIC: " + round(aic, numDigits))
+    }
+
+    sb.toString()
+  }
+
+  /**
+   * Displays the summary of a GeneralizedLinearModel fit.
+   *
+   * @since 2.3.0
+   */
+  def show(): Unit = {
+    val numRows = coefficientMatrix.size
+    show(numRows, true, 3)
+  }
+
+  /**
+   * Displays the top numRows rows of the summary of a GeneralizedLinearModel fit.
+   *
+   * @param numRows Number of rows to show
+   *
+   * @since 2.3.0
+   */
+  @Since("2.3.0")
+  def show(numRows: Int): Unit = {
+    show(numRows, true, 3)
+  }
+
+  /**
+   * Displays the summary of a GeneralizedLinearModel fit. If `truncate` is true, strings longer
+   * than 20 characters will be truncated and all cells will be right-aligned.
+   *
+   * @param numRows Number of rows to show
+   * @param truncate Whether to truncate long strings. If true, strings longer than 20 characters
+   *              will be truncated and all cells will be right-aligned
+   * @param numDigits Number of decimal places used to round numerical values.
+   *
+   * @since 2.3.0
+   */
+  // scalastyle:off println
+  def show(numRows: Int, truncate: Boolean, numDigits: Int): Unit = if (truncate) {
+    println(showString(numRows, truncate = 20, numDigits))
+  } else {
+    println(showString(numRows, truncate = 0, numDigits))
+  }
+  // scalastyle:on println
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -1524,7 +1524,6 @@ class GeneralizedLinearRegressionSuite
       .fit(datasetGaussianIdentity.as[LabeledPoint])
   }
 
-
   test("glm summary: feature name") {
     // dataset1 with no attribute
     val dataset1 = Seq(
@@ -1557,7 +1556,7 @@ class GeneralizedLinearRegressionSuite
     }
   }
 
-  test("glm summary: summaryTable") {
+  test("glm summary: coefficient matrix") {
     /*
       R code:
 
@@ -1587,31 +1586,21 @@ class GeneralizedLinearRegressionSuite
       Vectors.dense(0.7903, 0.2258, 0.4677))
     val expectedStdError = Seq(Vectors.dense(1.724, 0.3787),
       Vectors.dense(4.0129, 2.1153, 0.5815))
-    val expectedTValue = Seq(Vectors.dense(0.1673, 1.4205),
-      Vectors.dense(0.1969, 0.1067, 0.8043))
-    val expectedPValue = Seq(Vectors.dense(0.8778, 0.2506),
-      Vectors.dense(0.8621, 0.9247, 0.5056))
 
     var idx = 0
     for (fitIntercept <- Seq(false, true)) {
       val trainer = new GeneralizedLinearRegression()
         .setFamily("gaussian")
         .setFitIntercept(fitIntercept)
       val model = trainer.fit(dataset)
-      val summaryTable = model.summary.summaryTable
+      val coefficientMatrix = model.summary.coefficientMatrix
 
-      summaryTable.select("Feature").collect.map(_.getString(0))
-        .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2,
+      coefficientMatrix.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2,
         "Feature name mismatch in summaryTable") }
-      assert(Vectors.dense(summaryTable.select("Coefficient").collect.map(_.getDouble(0)))
+      assert(Vectors.dense(coefficientMatrix.map(_._2))
         ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable")
-      assert(Vectors.dense(summaryTable.select("StdError").collect.map(_.getDouble(0)))
+      assert(Vectors.dense(coefficientMatrix.map(_._3))
         ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable")
-      assert(Vectors.dense(summaryTable.select("TValue").collect.map(_.getDouble(0)))
-        ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable")
-      assert(Vectors.dense(summaryTable.select("PValue").collect.map(_.getDouble(0)))
-        ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable")
-
       idx += 1
     }
   }