apache · actuaryzhang · Jan 27, 2017 · Jan 28, 2017 · Jan 28, 2017 · Jan 28, 2017
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
@@ -21,7 +21,8 @@ Suggests:
     rmarkdown,
     testthat,
     e1071,
-    survival
+    survival,
+    statmod
 Collate:
     'schema.R'
     'generics.R'

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
@@ -84,6 +84,12 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' # can also read back the saved model and print
 #' savedModel <- read.ml(path)
 #' summary(savedModel)
+#'
+#' # fit tweedie model
+#' library(statmod)
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width,
+#'   family = tweedie(var.power = 1.2, link.power = 0))
+#' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
 #' @seealso \link{glm}, \link{read.ml}
@@ -101,6 +107,16 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               stop("'family' not recognized")
             }
 
+            # recover the tweedie powers: if f(mu) = mu^p then log(f(e)) = p
+            if (tolower(family$family) == "tweedie") {
+              variancePower <- log(family$variance(exp(1)))
+              linkPower <- log(family$linkfun(exp(1)))
+            } else {
+              # placeholders only; the JVM wrapper reads the powers just for tweedie
+              variancePower <- 0.0
+              linkPower <- 1.0
+            }
+
             formula <- paste(deparse(formula), collapse = "")
             if (is.null(weightCol)) {
               weightCol <- ""
@@ -109,7 +125,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
-                                tol, as.integer(maxIter), as.character(weightCol), regParam)
+                                tol, as.integer(maxIter), as.character(weightCol), regParam,
+                                as.double(variancePower), as.double(linkPower))
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -124,7 +141,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{Gamma}, and \code{poisson}.
+#'               \code{poisson}, \code{Gamma}, and \code{tweedie} (\code{statmod} package).
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
@@ -170,9 +187,10 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
             deviance <- callJMethod(jobj, "rDeviance")
             df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
             df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
-            aic <- callJMethod(jobj, "rAic")
             iter <- callJMethod(jobj, "rNumIterations")
             family <- callJMethod(jobj, "rFamily")
+            aic <- callJMethod(jobj, "rAic")
+            if (family == "tweedie" && aic == 0) aic <- NA
             deviance.resid <- if (is.loaded) {
               NULL
             } else {

diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -77,6 +77,18 @@ test_that("spark.glm and predict", {
   out <- capture.output(print(summary(model)))
   expect_true(any(grepl("Dispersion parameter for gamma family", out)))
 
+  # tweedie family
+  require(statmod)
+  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+                     family = tweedie(var.power = 1.2, link.power = 1.0))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
+  rVals <- suppressWarnings(predict(
+    glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+        family = tweedie(var.power = 1.2, link.power = 1.0)), iris))
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
   # Test stats::predict is working
   x <- rnorm(15)
   y <- x + rnorm(15)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -71,7 +71,9 @@ private[r] object GeneralizedLinearRegressionWrapper
       tol: Double,
       maxIter: Int,
       weightCol: String,
-      regParam: Double): GeneralizedLinearRegressionWrapper = {
+      regParam: Double,
+      variancePower: Double,
+      linkPower: Double): GeneralizedLinearRegressionWrapper = {
     val rFormula = new RFormula().setFormula(formula)
     checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
@@ -81,15 +83,20 @@ private[r] object GeneralizedLinearRegressionWrapper
       .attributes.get
     val features = featureAttrs.map(_.name.get)
     // assemble and fit the pipeline
-    val glr = new GeneralizedLinearRegression()
+    var glr = new GeneralizedLinearRegression()
       .setFamily(family)
-      .setLink(link)
       .setFitIntercept(rFormula.hasIntercept)
       .setTol(tol)
       .setMaxIter(maxIter)
       .setWeightCol(weightCol)
       .setRegParam(regParam)
       .setFeaturesCol(rFormula.getFeaturesCol)
+    // set variancePower and linkPower if family is tweedie; otherwise, set link function
+    if (family.toLowerCase == "tweedie") {
+      glr = glr.setVariancePower(variancePower).setLinkPower(linkPower)
+    } else {
+      glr = glr.setLink(link)
+    }
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))
       .fit(data)
@@ -143,7 +150,12 @@ private[r] object GeneralizedLinearRegressionWrapper
     val rDeviance: Double = summary.deviance
     val rResidualDegreeOfFreedomNull: Long = summary.residualDegreeOfFreedomNull
     val rResidualDegreeOfFreedom: Long = summary.residualDegreeOfFreedom
-    val rAic: Double = summary.aic
+    val rAic: Double = if (family.toLowerCase == "tweedie" &&
+      !Array(0.0, 1.0, 2.0).contains(variancePower)) {
+      0.0
+    } else {
+      summary.aic
+    }
     val rNumIterations: Int = summary.numIterations
 
     new GeneralizedLinearRegressionWrapper(pipeline, rFeatures, rCoefficients, rDispersion,