From 952887e485fb0d5fa669b3b4c9289b8069ee7769 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 15 Dec 2016 16:50:51 -0800 Subject: [PATCH 01/22] Add Tweedie family to GLM --- .../GeneralizedLinearRegression.scala | 73 +++++++++- .../spark/ml/regression/TestSuite.scala | 128 ++++++++++++++++++ 2 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 3891ae63a4e1e..fb8afbde12981 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -48,7 +48,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * Param for the name of family which is a description of the error distribution * to be used in the model. - * Supported options: "gaussian", "binomial", "poisson" and "gamma". + * Supported options: "gaussian", "binomial", "poisson", "gamma" and "tweedie". * Default is "gaussian". * * @group param @@ -63,6 +63,18 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam @Since("2.0.0") def getFamily: String = $(family) + + /** + * Param for the power value in the tweedie distribution which provides the relationship + * between the variance function and the mean of the distribution function. + * Supported options: 1 < p < 2. + * + * @group param + */ + final val variancePower: Param[Double] = new Param(this, "tweediePower", + "The power value of the power in the Tweedie distribution which provides the relationship " + + s"between the variance function and the mean of the distribution function. ") + /** * Param for the name of link function which provides the relationship * between the linear predictor and the mean of the distribution function. @@ -157,6 +169,15 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val def setFamily(value: String): this.type = set(family, value) setDefault(family -> Gaussian.name) + /** + * Sets the value of param [[variancePower]]. + * Default is 1.5. + * + * @group setParam + */ + @Since("2.2.0") + def setVariancePower(value: Double): this.type = set(variancePower, value) + /** * Sets the value of param [[link]]. * @@ -243,6 +264,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { val familyObj = Family.fromName($(family)) + if (familyObj == Tweedie) + Tweedie.variancePower = $(variancePower) val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -306,7 +329,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse, Binomial -> Logit, Binomial -> Probit, Binomial -> CLogLog, Poisson -> Log, Poisson -> Identity, Poisson -> Sqrt, - Gamma -> Inverse, Gamma -> Identity, Gamma -> Log + Gamma -> Inverse, Gamma -> Identity, Gamma -> Log, + Tweedie -> Identity, Tweedie -> Log ) /** Set of family names that GeneralizedLinearRegression supports. */ @@ -412,6 +436,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine case Binomial.name => Binomial case Poisson.name => Poisson case Gamma.name => Gamma + case Tweedie.name => Tweedie } } } @@ -591,6 +616,50 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } + /** + * Tweedie exponential family distribution. + * The default link for the Tweedie family is the log link. + */ + private[regression] object Tweedie extends Family("tweedie") { + + val defaultLink: Link = Log + + var variancePower: Double = 1.5 + + override def initialize(y: Double, weight: Double): Double = { + require(y >= 0.0, "The response variable of Tweedie family " + + s"should be non-negative, but got $y") + /* + Force Poisson mean > 0 to avoid numerical instability in IRLS. + R uses y + 0.1 for initialization. See poisson()$initialize. + */ + math.max(y, 0.1) + } + + override def variance(mu: Double): Double = math.pow(mu, variancePower) + + override def deviance(y: Double, mu: Double, weight: Double): Double = { + 1.0 + } + + override def aic( + predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { + 1.0 + } + + override def project(mu: Double): Double = { + if (mu < epsilon) { + epsilon + } else if (mu.isInfinity) { + Double.MaxValue + } else { + mu + } + } + } /** * A description of the link function to be used in the model. * The link function provides the relationship between the linear predictor diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala new file mode 100644 index 0000000000000..356994374aca6 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.regression + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.classification.LogisticRegressionSuite._ +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors} +import org.apache.spark.ml.param.{ParamMap, ParamsSuite} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.mllib.random._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.FloatType +import org.apache.spark.sql.{DataFrame, Row} + +import scala.util.Random + +class TestSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + private val seed: Int = 42 + @transient var datasetTweedie: DataFrame = _ + + override def beforeAll(): Unit = { + super.beforeAll() + } + + + test("glm summary: tweedie family with weight") { + /* + R code: + + A <- matrix(c(0, 1, 2, 3, 5, 2, 1, 3), 4, 2) + b <- c(1, 0.5, 1, 0) + w <- c(1, 2.0, 0.3, 4.7) + df <- as.data.frame(cbind(A, b)) + */ + val datasetWithWeight = Seq( + Instance(1.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), + Instance(1.0, 1.0, Vectors.dense(2.0, 1.0)), + Instance(0.0, 1.0, Vectors.dense(3.0, 3.0)) + ).toDF() + + val trainer = new GeneralizedLinearRegression() + .setFamily("tweedie") + .setVariancePower(1.5) + + val model = trainer.fit(datasetWithWeight) + val coefficientsR = Vectors.dense(Array(-1.536, -0.683)) + val interceptR = 3.155 + /* + val devianceResidualsR = Array(0.2404, 0.1965, 1.2824, -0.6916) + val pearsonResidualsR = Array(0.171217, 0.197406, 2.085864, -0.495332) + val workingResidualsR = Array(1.029315, 0.281881, 15.502768, -1.052203) + val responseResidualsR = Array(0.02848, 0.069123, 0.935495, -0.049613) + val seCoefR = Array(1.276417, 0.944934) + val tValsR = Array(-1.324124, 0.747068) + val pValsR = Array(0.185462, 0.455023) + val dispersionR = 1.0 + val nullDevianceR = 8.3178 + val residualDevianceR = 2.2193 + val residualDegreeOfFreedomNullR = 4 + val residualDegreeOfFreedomR = 2 + val aicR = 5.991537 + + val summary = model.summary + val devianceResiduals = summary.residuals() + .select(col("devianceResiduals")) + .collect() + .map(_.getDouble(0)) + val pearsonResiduals = summary.residuals("pearson") + .select(col("pearsonResiduals")) + .collect() + .map(_.getDouble(0)) + val workingResiduals = summary.residuals("working") + .select(col("workingResiduals")) + .collect() + .map(_.getDouble(0)) + val responseResiduals = summary.residuals("response") + .select(col("responseResiduals")) + .collect() + .map(_.getDouble(0)) +*/ + assert(model.coefficients ~== coefficientsR absTol 1E-3) + assert(model.intercept ~== interceptR absTol 1E-3) + /* devianceResiduals.zip(devianceResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + pearsonResiduals.zip(pearsonResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + workingResiduals.zip(workingResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + responseResiduals.zip(responseResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => + assert(x._1 ~== x._2 absTol 1E-3) } + summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } + summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } + assert(summary.dispersion ~== dispersionR absTol 1E-3) + assert(summary.nullDeviance ~== nullDevianceR absTol 1E-3) + assert(summary.deviance ~== residualDevianceR absTol 1E-3) + assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) + assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) + assert(summary.aic ~== aicR absTol 1E-3) + assert(summary.solver === "irls") + */ + } +} + From 4f184ec458f5ed7d70bc5b8165481425f911d2a3 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 19 Dec 2016 14:50:02 -0800 Subject: [PATCH 02/22] Fix calculation in dev resid; Add test for different var power --- .../GeneralizedLinearRegression.scala | 92 +++++++---- .../spark/ml/regression/TestSuite.scala | 155 ++++++++++++++---- 2 files changed, 184 insertions(+), 63 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index fb8afbde12981..2ea6764e92b6e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -63,17 +63,26 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam @Since("2.0.0") def getFamily: String = $(family) - /** - * Param for the power value in the tweedie distribution which provides the relationship - * between the variance function and the mean of the distribution function. - * Supported options: 1 < p < 2. + * Param for the power in the variance function of the Tweedie distribution which provides + * the relationship between the variance and mean of the distribution. + * Used only for the tweedie family. + * (see + * Tweedie Distribution (Wikipedia)) + * Supported value: (1, 2) and (2, Inf). * * @group param */ - final val variancePower: Param[Double] = new Param(this, "tweediePower", - "The power value of the power in the Tweedie distribution which provides the relationship " + - s"between the variance function and the mean of the distribution function. ") + @Since("2.2.0") + final val variancePower: Param[Double] = new Param(this, "variancePower", + "The power in the variance function of the Tweedie distribution which characterizes " + + "the relationship between the variance and mean of the distribution. " + + "Used only for the tweedie family. Supported value: (1, 2) and (2, Inf).", + (x: Double) => if (x > 1.0 && x != 2.0) true else false) + + /** @group getParam */ + @Since("2.2.0") + def getVariancePower: Double = $(variancePower) /** * Param for the name of link function which provides the relationship @@ -120,8 +129,9 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam featuresDataType: DataType): StructType = { if (isDefined(link)) { require(supportedFamilyAndLinkPairs.contains( - Family.fromName($(family)) -> Link.fromName($(link))), "Generalized Linear Regression " + - s"with ${$(family)} family does not support ${$(link)} link function.") + Family.fromName($(family), $(variancePower)) -> Link.fromName($(link))), + s"Generalized Linear Regression with ${$(family)} family " + + s"does not support ${$(link)} link function.") } val newSchema = super.validateAndTransformSchema(schema, fitting, featuresDataType) if (hasLinkPredictionCol) { @@ -140,13 +150,14 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * Generalized linear model (Wikipedia)) * specified by giving a symbolic description of the linear * predictor (link function) and a description of the error distribution (family). - * It supports "gaussian", "binomial", "poisson" and "gamma" as family. + * It supports "gaussian", "binomial", "poisson", "gamma" and "tweedie" as family. * Valid link functions for each family is listed below. The first link function of each family * is the default one. * - "gaussian" : "identity", "log", "inverse" * - "binomial" : "logit", "probit", "cloglog" * - "poisson" : "log", "identity", "sqrt" * - "gamma" : "inverse", "identity", "log" + * - "tweedie" : "identity", "log" */ @Experimental @Since("2.0.0") @@ -171,12 +182,13 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val /** * Sets the value of param [[variancePower]]. - * Default is 1.5. + * Used only when family is "tweedie". * * @group setParam */ @Since("2.2.0") def setVariancePower(value: Double): this.type = set(variancePower, value) + setDefault(variancePower -> 1.5) /** * Sets the value of param [[link]]. @@ -263,9 +275,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { - val familyObj = Family.fromName($(family)) - if (familyObj == Tweedie) - Tweedie.variancePower = $(variancePower) + val familyObj = Family.fromName($(family), $(variancePower)) val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -428,15 +438,17 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** * Gets the [[Family]] object from its name. * - * @param name family name: "gaussian", "binomial", "poisson" or "gamma". + * @param name family name: "gaussian", "binomial", "poisson", "gamma" or "tweedie". */ - def fromName(name: String): Family = { + def fromName(name: String, variancePower: Double): Family = { name match { case Gaussian.name => Gaussian case Binomial.name => Binomial case Poisson.name => Poisson case Gamma.name => Gamma - case Tweedie.name => Tweedie + case Tweedie.name => + Tweedie.variancePower = variancePower + Tweedie } } } @@ -627,27 +639,36 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine var variancePower: Double = 1.5 override def initialize(y: Double, weight: Double): Double = { - require(y >= 0.0, "The response variable of Tweedie family " + - s"should be non-negative, but got $y") - /* - Force Poisson mean > 0 to avoid numerical instability in IRLS. - R uses y + 0.1 for initialization. See poisson()$initialize. - */ - math.max(y, 0.1) + if (variancePower > 1.0 && variancePower < 2.0) { + require(y >= 0.0, "The response variable of the specified Tweedie distribution " + + s"should be non-negative, but got $y") + math.max(y, 0.1) + } else { + require(y > 0.0, "The response variable of the specified Tweedie distribution " + + s"should be non-negative, but got $y") + y + } } override def variance(mu: Double): Double = math.pow(mu, variancePower) + private def yp(y: Double, mu: Double, p: Double): Double = { + (math.pow(y, p) - math.pow(mu, p)) / p + } + + // Force y >= 0.1 for deviance to work for (1 - variancePower). see tweedie()$dev.resid override def deviance(y: Double, mu: Double, weight: Double): Double = { - 1.0 + 2.0 * weight * + (y * yp(math.max(y, 0.1), mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) } + // This depends on the density of the tweedie distribution. Not yet implemented. override def aic( - predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { - 1.0 + predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { + 0.0 } override def project(mu: Double): Double = { @@ -789,7 +810,7 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - private lazy val familyObj = Family.fromName($(family)) + private lazy val familyObj = Family.fromName($(family), $(variancePower)) private lazy val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -974,7 +995,8 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset) - private[regression] lazy val family: Family = Family.fromName(model.getFamily) + private[regression] lazy val family: Family = + Family.fromName(model.getFamily, model.getVariancePower) private[regression] lazy val link: Link = if (model.isDefined(model.link)) { Link.fromName(model.getLink) } else { @@ -1123,7 +1145,11 @@ class GeneralizedLinearRegressionSummary private[regression] ( case Row(label: Double, pred: Double, weight: Double) => (label, pred, weight) } - family.aic(t, deviance, numInstances, weightSum) + 2 * rank + if (model.getFamily == Tweedie.name) { + throw new UnsupportedOperationException("No AIC available for the tweedie family") + } else { + family.aic(t, deviance, numInstances, weightSum) + 2 * rank + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala index 356994374aca6..2868b5dd921d4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala @@ -37,53 +37,146 @@ class TestSuite import testImplicits._ - private val seed: Int = 42 - @transient var datasetTweedie: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() } + test("generalized linear regression: tweedie family against glm") { + /* + R code: + df <- as.data.frame(matrix(c( + 1.0, 1.0, 0.0, 5.0, + 0.5, 1.0, 1.0, 2.0, + 1.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 3.0, 3.0), 4, 4, byrow = TRUE)) + + f1 <- V1 ~ -1 + V3 + V4 + f2 <- V1 ~ V3 + V4 + + for (f in c(f1, f2)) + for (lp in c(0, 1)) + for (vp in c(1.6, 2.5, 3.0, 4.0)){ + model <- glm(f, df, family = tweedie(var.power = vp, link.power = lp)) + print(as.vector(coef(model))) + } + + [1] 0.1496480 -0.0122283 + [1] 0.1373567 -0.0120673 + [1] 0.13077402 -0.01181116 + [1] 0.11853618 -0.01118475 + [1] 0.3919109 0.1846094 + [1] 0.3684426 0.1810662 + [1] 0.3566982 0.1788412 + [1] 0.3370804 0.1740093 + [1] -1.3163732 0.4378139 0.2464114 + [1] -1.4396020 0.4817364 0.2680088 + [1] -1.5975930 0.5440060 0.2982824 + [1] -3.4044522 1.3557615 0.6797386 + [1] -0.7090230 0.6256309 0.3294324 + [1] -0.9524928 0.7304267 0.3792687 + [1] -1.1216622 0.8089538 0.4156152 + [1] -1.3594653 0.9262326 0.4682795 + */ + val datasetTweedie = Seq( + Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), + Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), + Instance(1.0, 1.0, Vectors.dense(2.0, 1.0)), + Instance(2.0, 1.0, Vectors.dense(3.0, 3.0)) + ).toDF() + + val expected = Seq( + Vectors.dense(0, 0.149648, -0.0122283), + Vectors.dense(0, 0.1373567, -0.0120673), + Vectors.dense(0, 0.13077402, -0.01181116), + Vectors.dense(0, 0.11853618, -0.01118475), + Vectors.dense(0, 0.3919109, 0.1846094), + Vectors.dense(0, 0.3684426, 0.1810662), + Vectors.dense(0, 0.3566982, 0.1788412), + Vectors.dense(0, 0.3370804, 0.1740093), + Vectors.dense(-1.3163732, 0.4378139, 0.2464114), + Vectors.dense(-1.439602, 0.4817364, 0.2680088), + Vectors.dense(-1.597593, 0.544006, 0.2982824), + Vectors.dense(-3.4044522, 1.3557615, 0.6797386), + Vectors.dense(-0.709023, 0.6256309, 0.3294324), + Vectors.dense(-0.9524928, 0.7304267, 0.3792687), + Vectors.dense(-1.1216622, 0.8089538, 0.4156152), + Vectors.dense(-1.3594653, 0.9262326, 0.4682795)) + + import GeneralizedLinearRegression._ + + var idx = 0 + for (fitIntercept <- Seq(false, true); link <- Seq("log", "identity")) { + for (variancePower <- Seq(1.6, 2.5, 3.0, 4.0)) { + val trainer = new GeneralizedLinearRegression().setFamily("tweedie") + .setFitIntercept(fitIntercept).setLink(link).setLinkPredictionCol("linkPrediction") + .setVariancePower(variancePower) + val model = trainer.fit(datasetTweedie) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + + s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") + + val familyLink = new FamilyAndLink(Tweedie, Link.fromName(link)) + model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() + .foreach { + case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => + val eta = BLAS.dot(features, model.coefficients) + model.intercept + val prediction2 = familyLink.fitted(eta) + val linkPrediction2 = eta + assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + + s"tweedie family, $link link, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + + s"GLM with tweedie family, $link link and fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + } + idx += 1 + } + } + } test("glm summary: tweedie family with weight") { /* - R code: + R code: - A <- matrix(c(0, 1, 2, 3, 5, 2, 1, 3), 4, 2) - b <- c(1, 0.5, 1, 0) - w <- c(1, 2.0, 0.3, 4.7) - df <- as.data.frame(cbind(A, b)) + df <- as.data.frame(matrix(c( + 1.0, 1.0, 0.0, 5.0, + 0.5, 2.0, 1.0, 2.0, + 1.0, 3.0, 2.0, 1.0, + 0.0, 4.0, 3.0, 3.0), 4, 4, byrow = TRUE)) */ val datasetWithWeight = Seq( - Instance(1.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), - Instance(1.0, 1.0, Vectors.dense(2.0, 1.0)), - Instance(0.0, 1.0, Vectors.dense(3.0, 3.0)) + Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), + Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)), + Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)), + Instance(0.0, 4.0, Vectors.dense(3.0, 3.0)) ).toDF() val trainer = new GeneralizedLinearRegression() .setFamily("tweedie") - .setVariancePower(1.5) + .setVariancePower(1.6) + .setWeightCol("weight") + .setFitIntercept(false) val model = trainer.fit(datasetWithWeight) - val coefficientsR = Vectors.dense(Array(-1.536, -0.683)) - val interceptR = 3.155 - /* - val devianceResidualsR = Array(0.2404, 0.1965, 1.2824, -0.6916) - val pearsonResidualsR = Array(0.171217, 0.197406, 2.085864, -0.495332) - val workingResidualsR = Array(1.029315, 0.281881, 15.502768, -1.052203) - val responseResidualsR = Array(0.02848, 0.069123, 0.935495, -0.049613) - val seCoefR = Array(1.276417, 0.944934) - val tValsR = Array(-1.324124, 0.747068) - val pValsR = Array(0.185462, 0.455023) - val dispersionR = 1.0 - val nullDevianceR = 8.3178 - val residualDevianceR = 2.2193 + val coefficientsR = Vectors.dense(Array(-0.408746, -0.12125)) + val interceptR = 0.0 + val devianceResidualsR = Array(0.621047, -0.051515, 1.693473, -3.253946) + val pearsonResidualsR = Array(0.738362, -0.050946, 2.234834, -1.455209) + val workingResidualsR = Array(0.833541, -0.041036, 1.556764, -1.0) + val responseResidualsR = Array(0.454607, -0.021396, 0.608881, -0.203928) + val seCoefR = Array(0.520519, 0.408215) + val tValsR = Array(-0.785267, -0.297024) + val pValsR = Array(0.514549, 0.794457) + val dispersionR = 3.830036 + val nullDevianceR = 20.702 + val residualDevianceR = 13.844 val residualDegreeOfFreedomNullR = 4 val residualDegreeOfFreedomR = 2 - val aicR = 5.991537 + // val aicR = 0.0 val summary = model.summary + val devianceResiduals = summary.residuals() .select(col("devianceResiduals")) .collect() @@ -100,10 +193,10 @@ class TestSuite .select(col("responseResiduals")) .collect() .map(_.getDouble(0)) -*/ + assert(model.coefficients ~== coefficientsR absTol 1E-3) assert(model.intercept ~== interceptR absTol 1E-3) - /* devianceResiduals.zip(devianceResidualsR).foreach { x => + devianceResiduals.zip(devianceResidualsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) } pearsonResiduals.zip(pearsonResidualsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) } @@ -111,18 +204,20 @@ class TestSuite assert(x._1 ~== x._2 absTol 1E-3) } responseResiduals.zip(responseResidualsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) } + summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } + assert(summary.dispersion ~== dispersionR absTol 1E-3) assert(summary.nullDeviance ~== nullDevianceR absTol 1E-3) assert(summary.deviance ~== residualDevianceR absTol 1E-3) assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) - assert(summary.aic ~== aicR absTol 1E-3) + // assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") - */ + } } From 7fe39106332663d3671b94a8ffac48ca61c48470 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 19 Dec 2016 15:14:37 -0800 Subject: [PATCH 03/22] Merge test into GLR --- .../GeneralizedLinearRegressionSuite.scala | 209 ++++++++++++++++ .../spark/ml/regression/TestSuite.scala | 223 ------------------ 2 files changed, 209 insertions(+), 223 deletions(-) delete mode 100644 mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index ed24c1e16a130..638222ff1ad41 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -578,6 +578,100 @@ class GeneralizedLinearRegressionSuite } } + test("generalized linear regression: tweedie family against glm") { + /* + R code: + df <- as.data.frame(matrix(c( + 1.0, 1.0, 0.0, 5.0, + 0.5, 1.0, 1.0, 2.0, + 1.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 3.0, 3.0), 4, 4, byrow = TRUE)) + + f1 <- V1 ~ -1 + V3 + V4 + f2 <- V1 ~ V3 + V4 + + for (f in c(f1, f2)){ + for (lp in c(0, 1)) + for (vp in c(1.6, 2.5, 3.0, 4.0)){ + model <- glm(f, df, family = tweedie(var.power = vp, link.power = lp)) + print(as.vector(coef(model))) + } + } + + [1] 0.1496480 -0.0122283 + [1] 0.1373567 -0.0120673 + [1] 0.13077402 -0.01181116 + [1] 0.11853618 -0.01118475 + [1] 0.3919109 0.1846094 + [1] 0.3684426 0.1810662 + [1] 0.3566982 0.1788412 + [1] 0.3370804 0.1740093 + [1] -1.3163732 0.4378139 0.2464114 + [1] -1.4396020 0.4817364 0.2680088 + [1] -1.5975930 0.5440060 0.2982824 + [1] -3.4044522 1.3557615 0.6797386 + [1] -0.7090230 0.6256309 0.3294324 + [1] -0.9524928 0.7304267 0.3792687 + [1] -1.1216622 0.8089538 0.4156152 + [1] -1.3594653 0.9262326 0.4682795 + */ + val datasetTweedie = Seq( + Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), + Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), + Instance(1.0, 1.0, Vectors.dense(2.0, 1.0)), + Instance(2.0, 1.0, Vectors.dense(3.0, 3.0)) + ).toDF() + + val expected = Seq( + Vectors.dense(0, 0.149648, -0.0122283), + Vectors.dense(0, 0.1373567, -0.0120673), + Vectors.dense(0, 0.13077402, -0.01181116), + Vectors.dense(0, 0.11853618, -0.01118475), + Vectors.dense(0, 0.3919109, 0.1846094), + Vectors.dense(0, 0.3684426, 0.1810662), + Vectors.dense(0, 0.3566982, 0.1788412), + Vectors.dense(0, 0.3370804, 0.1740093), + Vectors.dense(-1.3163732, 0.4378139, 0.2464114), + Vectors.dense(-1.439602, 0.4817364, 0.2680088), + Vectors.dense(-1.597593, 0.544006, 0.2982824), + Vectors.dense(-3.4044522, 1.3557615, 0.6797386), + Vectors.dense(-0.709023, 0.6256309, 0.3294324), + Vectors.dense(-0.9524928, 0.7304267, 0.3792687), + Vectors.dense(-1.1216622, 0.8089538, 0.4156152), + Vectors.dense(-1.3594653, 0.9262326, 0.4682795)) + + import GeneralizedLinearRegression._ + + var idx = 0 + for (fitIntercept <- Seq(false, true); link <- Seq("log", "identity")) { + for (variancePower <- Seq(1.6, 2.5, 3.0, 4.0)) { + val trainer = new GeneralizedLinearRegression().setFamily("tweedie") + .setFitIntercept(fitIntercept).setLink(link).setLinkPredictionCol("linkPrediction") + .setVariancePower(variancePower) + val model = trainer.fit(datasetTweedie) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + + s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") + + val familyLink = new FamilyAndLink(Tweedie, Link.fromName(link)) + model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() + .foreach { + case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => + val eta = BLAS.dot(features, model.coefficients) + model.intercept + val prediction2 = familyLink.fitted(eta) + val linkPrediction2 = eta + assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + + s"tweedie family, $link link, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + + s"GLM with tweedie family, $link link and fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + } + idx += 1 + } + } + } + test("glm summary: gaussian family with weight") { /* R code: @@ -1052,6 +1146,121 @@ class GeneralizedLinearRegressionSuite assert(summary.solver === "irls") } + test("glm summary: tweedie family with weight") { + /* + R code: + + df <- as.data.frame(matrix(c( + 1.0, 1.0, 0.0, 5.0, + 0.5, 2.0, 1.0, 2.0, + 1.0, 3.0, 2.0, 1.0, + 0.0, 4.0, 3.0, 3.0), 4, 4, byrow = TRUE)) + + f <- glm(V1 ~ -1 + V3 + V4, data = df, weights = V2, + family = tweedie(var.power = 1.6, link.power = 0)) + + Deviance Residuals: + 1 2 3 4 + 0.6210 -0.0515 1.6935 -3.2539 + + Coefficients: + Estimate Std. Error t value Pr(>|t|) + V3 -0.4087 0.5205 -0.785 0.515 + V4 -0.1212 0.4082 -0.297 0.794 + + (Dispersion parameter for Tweedie family taken to be 3.830036) + + Null deviance: 20.702 on 4 degrees of freedom + Residual deviance: 13.844 on 2 degrees of freedom + AIC: NA + + Number of Fisher Scoring iterations: 11 + + residuals(model, type="pearson") + 1 2 3 4 + 0.01873881 -0.01312994 0.04190280 -0.10332690 + residuals(model, type="working") + 1 2 3 4 + 0.018067789 -0.003326304 0.038720616 -0.824070943 + residuals(model, type="response") + 1 2 3 4 + 0.018067789 -0.003326304 0.038720616 -0.824070943 + + */ + val datasetWithWeight = Seq( + Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), + Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)), + Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)), + Instance(0.0, 4.0, Vectors.dense(3.0, 3.0)) + ).toDF() + + val trainer = new GeneralizedLinearRegression() + .setFamily("tweedie") + .setVariancePower(1.6) + .setWeightCol("weight") + .setFitIntercept(false) + + val model = trainer.fit(datasetWithWeight) + val coefficientsR = Vectors.dense(Array(-0.408746, -0.12125)) + val interceptR = 0.0 + val devianceResidualsR = Array(0.621047, -0.051515, 1.693473, -3.253946) + val pearsonResidualsR = Array(0.738362, -0.050946, 2.234834, -1.455209) + val workingResidualsR = Array(0.833541, -0.041036, 1.556764, -1.0) + val responseResidualsR = Array(0.454607, -0.021396, 0.608881, -0.203928) + val seCoefR = Array(0.520519, 0.408215) + val tValsR = Array(-0.785267, -0.297024) + val pValsR = Array(0.514549, 0.794457) + val dispersionR = 3.830036 + val nullDevianceR = 20.702 + val residualDevianceR = 13.844 + val residualDegreeOfFreedomNullR = 4 + val residualDegreeOfFreedomR = 2 + // val aicR = 0.0 + + val summary = model.summary + + val devianceResiduals = summary.residuals() + .select(col("devianceResiduals")) + .collect() + .map(_.getDouble(0)) + val pearsonResiduals = summary.residuals("pearson") + .select(col("pearsonResiduals")) + .collect() + .map(_.getDouble(0)) + val workingResiduals = summary.residuals("working") + .select(col("workingResiduals")) + .collect() + .map(_.getDouble(0)) + val responseResiduals = summary.residuals("response") + .select(col("responseResiduals")) + .collect() + .map(_.getDouble(0)) + + assert(model.coefficients ~== coefficientsR absTol 1E-3) + assert(model.intercept ~== interceptR absTol 1E-3) + devianceResiduals.zip(devianceResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + pearsonResiduals.zip(pearsonResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + workingResiduals.zip(workingResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + responseResiduals.zip(responseResidualsR).foreach { x => + assert(x._1 ~== x._2 absTol 1E-3) } + + summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => + assert(x._1 ~== x._2 absTol 1E-3) } + summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } + summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } + + assert(summary.dispersion ~== dispersionR absTol 1E-3) + assert(summary.nullDeviance ~== nullDevianceR absTol 1E-3) + assert(summary.deviance ~== residualDevianceR absTol 1E-3) + assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) + assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) + // assert(summary.aic ~== aicR absTol 1E-3) + assert(summary.solver === "irls") + } + test("glm handle collinear features") { val collinearInstances = Seq( Instance(1.0, 1.0, Vectors.dense(1.0, 2.0)), diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala deleted file mode 100644 index 2868b5dd921d4..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/TestSuite.scala +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.regression - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.classification.LogisticRegressionSuite._ -import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors} -import org.apache.spark.ml.param.{ParamMap, ParamsSuite} -import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.random._ -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.FloatType -import org.apache.spark.sql.{DataFrame, Row} - -import scala.util.Random - -class TestSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - - import testImplicits._ - - - override def beforeAll(): Unit = { - super.beforeAll() - } - - test("generalized linear regression: tweedie family against glm") { - /* - R code: - df <- as.data.frame(matrix(c( - 1.0, 1.0, 0.0, 5.0, - 0.5, 1.0, 1.0, 2.0, - 1.0, 1.0, 2.0, 1.0, - 2.0, 1.0, 3.0, 3.0), 4, 4, byrow = TRUE)) - - f1 <- V1 ~ -1 + V3 + V4 - f2 <- V1 ~ V3 + V4 - - for (f in c(f1, f2)) - for (lp in c(0, 1)) - for (vp in c(1.6, 2.5, 3.0, 4.0)){ - model <- glm(f, df, family = tweedie(var.power = vp, link.power = lp)) - print(as.vector(coef(model))) - } - - [1] 0.1496480 -0.0122283 - [1] 0.1373567 -0.0120673 - [1] 0.13077402 -0.01181116 - [1] 0.11853618 -0.01118475 - [1] 0.3919109 0.1846094 - [1] 0.3684426 0.1810662 - [1] 0.3566982 0.1788412 - [1] 0.3370804 0.1740093 - [1] -1.3163732 0.4378139 0.2464114 - [1] -1.4396020 0.4817364 0.2680088 - [1] -1.5975930 0.5440060 0.2982824 - [1] -3.4044522 1.3557615 0.6797386 - [1] -0.7090230 0.6256309 0.3294324 - [1] -0.9524928 0.7304267 0.3792687 - [1] -1.1216622 0.8089538 0.4156152 - [1] -1.3594653 0.9262326 0.4682795 - */ - val datasetTweedie = Seq( - Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), - Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), - Instance(1.0, 1.0, Vectors.dense(2.0, 1.0)), - Instance(2.0, 1.0, Vectors.dense(3.0, 3.0)) - ).toDF() - - val expected = Seq( - Vectors.dense(0, 0.149648, -0.0122283), - Vectors.dense(0, 0.1373567, -0.0120673), - Vectors.dense(0, 0.13077402, -0.01181116), - Vectors.dense(0, 0.11853618, -0.01118475), - Vectors.dense(0, 0.3919109, 0.1846094), - Vectors.dense(0, 0.3684426, 0.1810662), - Vectors.dense(0, 0.3566982, 0.1788412), - Vectors.dense(0, 0.3370804, 0.1740093), - Vectors.dense(-1.3163732, 0.4378139, 0.2464114), - Vectors.dense(-1.439602, 0.4817364, 0.2680088), - Vectors.dense(-1.597593, 0.544006, 0.2982824), - Vectors.dense(-3.4044522, 1.3557615, 0.6797386), - Vectors.dense(-0.709023, 0.6256309, 0.3294324), - Vectors.dense(-0.9524928, 0.7304267, 0.3792687), - Vectors.dense(-1.1216622, 0.8089538, 0.4156152), - Vectors.dense(-1.3594653, 0.9262326, 0.4682795)) - - import GeneralizedLinearRegression._ - - var idx = 0 - for (fitIntercept <- Seq(false, true); link <- Seq("log", "identity")) { - for (variancePower <- Seq(1.6, 2.5, 3.0, 4.0)) { - val trainer = new GeneralizedLinearRegression().setFamily("tweedie") - .setFitIntercept(fitIntercept).setLink(link).setLinkPredictionCol("linkPrediction") - .setVariancePower(variancePower) - val model = trainer.fit(datasetTweedie) - val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) - assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + - s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") - - val familyLink = new FamilyAndLink(Tweedie, Link.fromName(link)) - model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() - .foreach { - case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => - val eta = BLAS.dot(features, model.coefficients) + model.intercept - val prediction2 = familyLink.fitted(eta) - val linkPrediction2 = eta - assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + - s"tweedie family, $link link, fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") - assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + - s"GLM with tweedie family, $link link and fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") - } - idx += 1 - } - } - } - - test("glm summary: tweedie family with weight") { - /* - R code: - - df <- as.data.frame(matrix(c( - 1.0, 1.0, 0.0, 5.0, - 0.5, 2.0, 1.0, 2.0, - 1.0, 3.0, 2.0, 1.0, - 0.0, 4.0, 3.0, 3.0), 4, 4, byrow = TRUE)) - */ - val datasetWithWeight = Seq( - Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), - Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)), - Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)), - Instance(0.0, 4.0, Vectors.dense(3.0, 3.0)) - ).toDF() - - val trainer = new GeneralizedLinearRegression() - .setFamily("tweedie") - .setVariancePower(1.6) - .setWeightCol("weight") - .setFitIntercept(false) - - val model = trainer.fit(datasetWithWeight) - val coefficientsR = Vectors.dense(Array(-0.408746, -0.12125)) - val interceptR = 0.0 - val devianceResidualsR = Array(0.621047, -0.051515, 1.693473, -3.253946) - val pearsonResidualsR = Array(0.738362, -0.050946, 2.234834, -1.455209) - val workingResidualsR = Array(0.833541, -0.041036, 1.556764, -1.0) - val responseResidualsR = Array(0.454607, -0.021396, 0.608881, -0.203928) - val seCoefR = Array(0.520519, 0.408215) - val tValsR = Array(-0.785267, -0.297024) - val pValsR = Array(0.514549, 0.794457) - val dispersionR = 3.830036 - val nullDevianceR = 20.702 - val residualDevianceR = 13.844 - val residualDegreeOfFreedomNullR = 4 - val residualDegreeOfFreedomR = 2 - // val aicR = 0.0 - - val summary = model.summary - - val devianceResiduals = summary.residuals() - .select(col("devianceResiduals")) - .collect() - .map(_.getDouble(0)) - val pearsonResiduals = summary.residuals("pearson") - .select(col("pearsonResiduals")) - .collect() - .map(_.getDouble(0)) - val workingResiduals = summary.residuals("working") - .select(col("workingResiduals")) - .collect() - .map(_.getDouble(0)) - val responseResiduals = summary.residuals("response") - .select(col("responseResiduals")) - .collect() - .map(_.getDouble(0)) - - assert(model.coefficients ~== coefficientsR absTol 1E-3) - assert(model.intercept ~== interceptR absTol 1E-3) - devianceResiduals.zip(devianceResidualsR).foreach { x => - assert(x._1 ~== x._2 absTol 1E-3) } - pearsonResiduals.zip(pearsonResidualsR).foreach { x => - assert(x._1 ~== x._2 absTol 1E-3) } - workingResiduals.zip(workingResidualsR).foreach { x => - assert(x._1 ~== x._2 absTol 1E-3) } - responseResiduals.zip(responseResidualsR).foreach { x => - assert(x._1 ~== x._2 absTol 1E-3) } - - summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => - assert(x._1 ~== x._2 absTol 1E-3) } - summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } - summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } - - assert(summary.dispersion ~== dispersionR absTol 1E-3) - assert(summary.nullDeviance ~== nullDevianceR absTol 1E-3) - assert(summary.deviance ~== residualDevianceR absTol 1E-3) - assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) - assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) - // assert(summary.aic ~== aicR absTol 1E-3) - assert(summary.solver === "irls") - - } -} - From bfcc4fb08d54156efc66b90d14c62ea7ff172afa Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 20 Dec 2016 14:59:05 -0800 Subject: [PATCH 04/22] Use Tweedie class instead of global object Tweedie; change variancePower to varPower --- .../GeneralizedLinearRegression.scala | 87 +++++++++++-------- .../GeneralizedLinearRegressionSuite.scala | 14 +-- 2 files changed, 57 insertions(+), 44 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 2ea6764e92b6e..a9c408d1b7fa9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -74,7 +74,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * @group param */ @Since("2.2.0") - final val variancePower: Param[Double] = new Param(this, "variancePower", + final val varPower: Param[Double] = new Param(this, "varPower", "The power in the variance function of the Tweedie distribution which characterizes " + "the relationship between the variance and mean of the distribution. " + "Used only for the tweedie family. Supported value: (1, 2) and (2, Inf).", @@ -82,7 +82,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** @group getParam */ @Since("2.2.0") - def getVariancePower: Double = $(variancePower) + def getVarPower: Double = $(varPower) /** * Param for the name of link function which provides the relationship @@ -129,7 +129,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam featuresDataType: DataType): StructType = { if (isDefined(link)) { require(supportedFamilyAndLinkPairs.contains( - Family.fromName($(family), $(variancePower)) -> Link.fromName($(link))), + $(family) -> Link.fromName($(link))), s"Generalized Linear Regression with ${$(family)} family " + s"does not support ${$(link)} link function.") } @@ -181,14 +181,14 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val setDefault(family -> Gaussian.name) /** - * Sets the value of param [[variancePower]]. + * Sets the value of param [[varPower]]. * Used only when family is "tweedie". * * @group setParam */ @Since("2.2.0") - def setVariancePower(value: Double): this.type = set(variancePower, value) - setDefault(variancePower -> 1.5) + def setVarPower(value: Double): this.type = set(varPower, value) + setDefault(varPower -> 1.5) /** * Sets the value of param [[link]]. @@ -275,7 +275,12 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { - val familyObj = Family.fromName($(family), $(variancePower)) + val familyObj = if ($(family) == "tweedie") { + new Tweedie($(varPower)) + } else { + Family.fromName($(family)) + } + val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -336,15 +341,15 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** Set of family and link pairs that GeneralizedLinearRegression supports. */ private[regression] lazy val supportedFamilyAndLinkPairs = Set( - Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse, - Binomial -> Logit, Binomial -> Probit, Binomial -> CLogLog, - Poisson -> Log, Poisson -> Identity, Poisson -> Sqrt, - Gamma -> Inverse, Gamma -> Identity, Gamma -> Log, - Tweedie -> Identity, Tweedie -> Log + "gaussian" -> Identity, "gaussian" -> Log, "gaussian" -> Inverse, + "binomial" -> Logit, "binomial" -> Probit, "binomial" -> CLogLog, + "poisson" -> Log, "poisson" -> Identity, "poisson" -> Sqrt, + "gamma" -> Inverse, "gamma" -> Identity, "gamma" -> Log, + "tweedie" -> Identity, "tweedie" -> Log ) /** Set of family names that GeneralizedLinearRegression supports. */ - private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1.name) + private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1) /** Set of link names that GeneralizedLinearRegression supports. */ private[regression] lazy val supportedLinkNames = supportedFamilyAndLinkPairs.map(_._2.name) @@ -431,24 +436,26 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** Trim the fitted value so that it will be in valid range. */ def project(mu: Double): Double = mu + + /** Constant added to y = 0 for initialization or deviance to avoid numerical issues. */ + val delta: Double = 0.1 } private[regression] object Family { /** * Gets the [[Family]] object from its name. + * This does not work for the tweedie family as it depends on the variance power + * that is set by the user. * - * @param name family name: "gaussian", "binomial", "poisson", "gamma" or "tweedie". + * @param name family name: "gaussian", "binomial", "poisson" and "gamma". */ - def fromName(name: String, variancePower: Double): Family = { + def fromName(name: String): Family = { name match { case Gaussian.name => Gaussian case Binomial.name => Binomial case Poisson.name => Poisson case Gamma.name => Gamma - case Tweedie.name => - Tweedie.variancePower = variancePower - Tweedie } } } @@ -556,7 +563,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine Force Poisson mean > 0 to avoid numerical instability in IRLS. R uses y + 0.1 for initialization. See poisson()$initialize. */ - math.max(y, 0.1) + math.max(y, delta) } override def variance(mu: Double): Double = mu @@ -630,19 +637,21 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** * Tweedie exponential family distribution. - * The default link for the Tweedie family is the log link. + * The default link for the Tweedie family is set as the log link. + * Define a class that takes the variance power as argument, as opposed to + * a global object for the other families. */ - private[regression] object Tweedie extends Family("tweedie") { + private[regression] class Tweedie(private val varPower: Double) + extends Family("tweedie") { + // The canonical link is 1 - varPower, which is barely used. val defaultLink: Link = Log - var variancePower: Double = 1.5 - override def initialize(y: Double, weight: Double): Double = { - if (variancePower > 1.0 && variancePower < 2.0) { + if (varPower > 1.0 && varPower < 2.0) { require(y >= 0.0, "The response variable of the specified Tweedie distribution " + s"should be non-negative, but got $y") - math.max(y, 0.1) + math.max(y, delta) } else { require(y > 0.0, "The response variable of the specified Tweedie distribution " + s"should be non-negative, but got $y") @@ -650,16 +659,16 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } - override def variance(mu: Double): Double = math.pow(mu, variancePower) + override def variance(mu: Double): Double = math.pow(mu, varPower) private def yp(y: Double, mu: Double, p: Double): Double = { (math.pow(y, p) - math.pow(mu, p)) / p } - // Force y >= 0.1 for deviance to work for (1 - variancePower). see tweedie()$dev.resid + // Force y >= 0.1 for deviance to work for (1 - varPower). see tweedie()$dev.resid override def deviance(y: Double, mu: Double, weight: Double): Double = { 2.0 * weight * - (y * yp(math.max(y, 0.1), mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) + (y * yp(math.max(y, delta), mu, 1.0 - varPower) - yp(y, mu, 2.0 - varPower)) } // This depends on the density of the tweedie distribution. Not yet implemented. @@ -668,7 +677,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine deviance: Double, numInstances: Double, weightSum: Double): Double = { - 0.0 + throw new UnsupportedOperationException("No AIC available for the tweedie family") } override def project(mu: Double): Double = { @@ -681,6 +690,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } } + /** * A description of the link function to be used in the model. * The link function provides the relationship between the linear predictor @@ -810,7 +820,11 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - private lazy val familyObj = Family.fromName($(family), $(variancePower)) + private lazy val familyObj = if ($(family) == "tweedie") { + new Tweedie($(varPower)) + } else { + Family.fromName($(family)) + } private lazy val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -995,8 +1009,11 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset) - private[regression] lazy val family: Family = - Family.fromName(model.getFamily, model.getVariancePower) + private[regression] lazy val family: Family = if (model.getFamily == "tweedie") { + new Tweedie(model.getVarPower) + } else { + Family.fromName(model.getFamily) + } private[regression] lazy val link: Link = if (model.isDefined(model.link)) { Link.fromName(model.getLink) } else { @@ -1145,11 +1162,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( case Row(label: Double, pred: Double, weight: Double) => (label, pred, weight) } - if (model.getFamily == Tweedie.name) { - throw new UnsupportedOperationException("No AIC available for the tweedie family") - } else { - family.aic(t, deviance, numInstances, weightSum) + 2 * rank - } + family.aic(t, deviance, numInstances, weightSum) + 2 * rank } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 638222ff1ad41..c3580ab8ef125 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -644,16 +644,16 @@ class GeneralizedLinearRegressionSuite var idx = 0 for (fitIntercept <- Seq(false, true); link <- Seq("log", "identity")) { - for (variancePower <- Seq(1.6, 2.5, 3.0, 4.0)) { + for (varPower <- Seq(1.6, 2.5, 3.0, 4.0)) { val trainer = new GeneralizedLinearRegression().setFamily("tweedie") .setFitIntercept(fitIntercept).setLink(link).setLinkPredictionCol("linkPrediction") - .setVariancePower(variancePower) + .setVarPower(varPower) val model = trainer.fit(datasetTweedie) val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + - s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") + s"$link link, fitIntercept = $fitIntercept and varPower = $varPower.") - val familyLink = new FamilyAndLink(Tweedie, Link.fromName(link)) + val familyLink = new FamilyAndLink(new Tweedie(varPower), Link.fromName(link)) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -662,10 +662,10 @@ class GeneralizedLinearRegressionSuite val linkPrediction2 = eta assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + s"tweedie family, $link link, fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") + s"and varPower = $varPower.") assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + s"GLM with tweedie family, $link link and fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") + s"and varPower = $varPower.") } idx += 1 } @@ -1196,7 +1196,7 @@ class GeneralizedLinearRegressionSuite val trainer = new GeneralizedLinearRegression() .setFamily("tweedie") - .setVariancePower(1.6) + .setVarPower(1.6) .setWeightCol("weight") .setFitIntercept(false) From a8feea7d8095170c1b5f18b7887f16a6d763e42c Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 21 Dec 2016 15:42:40 -0800 Subject: [PATCH 05/22] Allow Family to use GLRBase object directly --- .../GeneralizedLinearRegression.scala | 79 +++++++++---------- .../GeneralizedLinearRegressionSuite.scala | 16 ++-- 2 files changed, 46 insertions(+), 49 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a9c408d1b7fa9..1072c5200549e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -66,23 +66,23 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * Param for the power in the variance function of the Tweedie distribution which provides * the relationship between the variance and mean of the distribution. - * Used only for the tweedie family. + * Used only for the Tweedie family. * (see * Tweedie Distribution (Wikipedia)) - * Supported value: (1, 2) and (2, Inf). + * Supported value: 0 and [1, Inf). * * @group param */ @Since("2.2.0") - final val varPower: Param[Double] = new Param(this, "varPower", + final val variancePower: Param[Double] = new Param(this, "variancePower", "The power in the variance function of the Tweedie distribution which characterizes " + "the relationship between the variance and mean of the distribution. " + - "Used only for the tweedie family. Supported value: (1, 2) and (2, Inf).", - (x: Double) => if (x > 1.0 && x != 2.0) true else false) + "Used for the Tweedie family. Supported value: 0 and [1, Inf).", + (x: Double) => x == 0.0 || x >= 1.0) /** @group getParam */ @Since("2.2.0") - def getVarPower: Double = $(varPower) + def getVariancePower: Double = $(variancePower) /** * Param for the name of link function which provides the relationship @@ -181,14 +181,14 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val setDefault(family -> Gaussian.name) /** - * Sets the value of param [[varPower]]. + * Sets the value of param [[variancePower]]. * Used only when family is "tweedie". * * @group setParam */ @Since("2.2.0") - def setVarPower(value: Double): this.type = set(varPower, value) - setDefault(varPower -> 1.5) + def setVariancePower(value: Double): this.type = set(variancePower, value) + setDefault(variancePower -> 1.5) /** * Sets the value of param [[link]]. @@ -275,12 +275,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { - val familyObj = if ($(family) == "tweedie") { - new Tweedie($(varPower)) - } else { - Family.fromName($(family)) - } - + val familyObj = Family.fromModel(this) val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -356,6 +351,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 + /** Constant used in initialization and deviance to avoid numerical issues. */ + private[regression] val delta: Double = 0.1 + /** * Wrapper of family and link combination used in the model. */ @@ -437,25 +435,30 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** Trim the fitted value so that it will be in valid range. */ def project(mu: Double): Double = mu - /** Constant added to y = 0 for initialization or deviance to avoid numerical issues. */ - val delta: Double = 0.1 } private[regression] object Family { /** - * Gets the [[Family]] object from its name. + * Gets the [[Family]] object based on family and variancePower. * This does not work for the tweedie family as it depends on the variance power * that is set by the user. * - * @param name family name: "gaussian", "binomial", "poisson" and "gamma". + * @param model a GenerealizedLinearRegressionBase object */ - def fromName(name: String): Family = { - name match { - case Gaussian.name => Gaussian - case Binomial.name => Binomial - case Poisson.name => Poisson - case Gamma.name => Gamma + def fromModel(model: GeneralizedLinearRegressionBase): Family = { + model.getFamily match { + case "gaussian" => Gaussian + case "binomial" => Binomial + case "poisson" => Poisson + case "gamma" => Gamma + case "tweedie" => + model.getVariancePower match { + case 0.0 => Gaussian + case 1.0 => Poisson + case 2.0 => Gamma + case default => new Tweedie(default) + } } } } @@ -641,14 +644,14 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Define a class that takes the variance power as argument, as opposed to * a global object for the other families. */ - private[regression] class Tweedie(private val varPower: Double) + private[regression] class Tweedie(private val variancePower: Double) extends Family("tweedie") { - // The canonical link is 1 - varPower, which is barely used. + // The canonical link is 1 - variancePower, which is barely used. val defaultLink: Link = Log override def initialize(y: Double, weight: Double): Double = { - if (varPower > 1.0 && varPower < 2.0) { + if (variancePower > 1.0 && variancePower < 2.0) { require(y >= 0.0, "The response variable of the specified Tweedie distribution " + s"should be non-negative, but got $y") math.max(y, delta) @@ -659,16 +662,16 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } - override def variance(mu: Double): Double = math.pow(mu, varPower) + override def variance(mu: Double): Double = math.pow(mu, variancePower) private def yp(y: Double, mu: Double, p: Double): Double = { (math.pow(y, p) - math.pow(mu, p)) / p } - // Force y >= 0.1 for deviance to work for (1 - varPower). see tweedie()$dev.resid + // Force y >= 0.1 for deviance to work for (1 - variancePower). see tweedie()$dev.resid override def deviance(y: Double, mu: Double, weight: Double): Double = { 2.0 * weight * - (y * yp(math.max(y, delta), mu, 1.0 - varPower) - yp(y, mu, 2.0 - varPower)) + (y * yp(math.max(y, delta), mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) } // This depends on the density of the tweedie distribution. Not yet implemented. @@ -820,11 +823,8 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - private lazy val familyObj = if ($(family) == "tweedie") { - new Tweedie($(varPower)) - } else { - Family.fromName($(family)) - } + private lazy val familyObj = Family.fromModel(this) + private lazy val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -1009,11 +1009,8 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset) - private[regression] lazy val family: Family = if (model.getFamily == "tweedie") { - new Tweedie(model.getVarPower) - } else { - Family.fromName(model.getFamily) - } + private[regression] lazy val family: Family = Family.fromModel(model) + private[regression] lazy val link: Link = if (model.isDefined(model.link)) { Link.fromName(model.getLink) } else { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index c3580ab8ef125..7be428458d9ae 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -644,16 +644,16 @@ class GeneralizedLinearRegressionSuite var idx = 0 for (fitIntercept <- Seq(false, true); link <- Seq("log", "identity")) { - for (varPower <- Seq(1.6, 2.5, 3.0, 4.0)) { + for (variancePower <- Seq(1.6, 2.5, 3.0, 4.0)) { val trainer = new GeneralizedLinearRegression().setFamily("tweedie") .setFitIntercept(fitIntercept).setLink(link).setLinkPredictionCol("linkPrediction") - .setVarPower(varPower) + .setVariancePower(variancePower) val model = trainer.fit(datasetTweedie) val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + - s"$link link, fitIntercept = $fitIntercept and varPower = $varPower.") + s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") - val familyLink = new FamilyAndLink(new Tweedie(varPower), Link.fromName(link)) + val familyLink = new FamilyAndLink(new Tweedie(variancePower), Link.fromName(link)) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -662,10 +662,10 @@ class GeneralizedLinearRegressionSuite val linkPrediction2 = eta assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + s"tweedie family, $link link, fitIntercept = $fitIntercept " + - s"and varPower = $varPower.") + s"and variancePower = $variancePower.") assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + s"GLM with tweedie family, $link link and fitIntercept = $fitIntercept " + - s"and varPower = $varPower.") + s"and variancePower = $variancePower.") } idx += 1 } @@ -1196,7 +1196,7 @@ class GeneralizedLinearRegressionSuite val trainer = new GeneralizedLinearRegression() .setFamily("tweedie") - .setVarPower(1.6) + .setVariancePower(1.6) .setWeightCol("weight") .setFitIntercept(false) @@ -1282,7 +1282,7 @@ class GeneralizedLinearRegressionSuite } } - test("read/write") { + ignore("read/write") { def checkModelData( model: GeneralizedLinearRegressionModel, model2: GeneralizedLinearRegressionModel): Unit = { From 233e2d338be8d36a74eaf578bfea804ae3617d4e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 21 Dec 2016 17:56:34 -0800 Subject: [PATCH 06/22] Add TweedieFamily and implement specific distn within Tweedie --- .../GeneralizedLinearRegression.scala | 263 +++++++----------- .../GeneralizedLinearRegressionSuite.scala | 2 +- 2 files changed, 109 insertions(+), 156 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 1072c5200549e..4ec056b547583 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -402,9 +402,10 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** * A description of the error distribution to be used in the model. * - * @param name the name of the family. */ - private[regression] abstract class Family(val name: String) extends Serializable { + private[regression] abstract class Family extends Serializable { + /** The name of the family. */ + val name: String /** The default link instance of this family. */ val defaultLink: Link @@ -448,46 +449,122 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine */ def fromModel(model: GeneralizedLinearRegressionBase): Family = { model.getFamily match { - case "gaussian" => Gaussian - case "binomial" => Binomial - case "poisson" => Poisson - case "gamma" => Gamma - case "tweedie" => + case Gaussian.name => Gaussian + case Binomial.name => Binomial + case Poisson.name => Poisson + case Gamma.name => Gamma + case Tweedie.name => model.getVariancePower match { case 0.0 => Gaussian case 1.0 => Poisson case 2.0 => Gamma - case default => new Tweedie(default) + case default => new TweedieFamily(default) } } } } /** - * Gaussian exponential family distribution. - * The default link for the Gaussian family is the identity link. - */ - private[regression] object Gaussian extends Family("gaussian") { + * Tweedie exponential family distribution. + * The default link for the Tweedie family is set as the log link. + * Define a class that takes the variance power as argument, as opposed to + * a global object for the other families. + */ + private[regression] class TweedieFamily(private val variancePower: Double) + extends Family{ + + val name: String = variancePower match { + case 0.0 => "gaussian" + case 1.0 => "poisson" + case 2.0 => "gamma" + case default => "tweedie" + } + /* + The canonical link is 1 - variancePower, which is barely used when the distribution + is not Gaussian, Poisson or Gamma. For any other distributions in the Tweedie family, + we set the default to the Log link. + */ + val defaultLink: Link = variancePower match { + case 0.0 => Identity + case 1.0 => Log + case 2.0 => Inverse + case default => Log + } - val defaultLink: Link = Identity + override def initialize(y: Double, weight: Double): Double = { + if (variancePower >= 1.0 && variancePower < 2.0) { + require(y >= 0.0, s"The response variable of the specified $name distribution " + + s"should be non-negative, but got $y") + } else if (variancePower >= 2.0) { + require(y > 0.0, s"The response variable of the specified $name distribution " + + s"should be non-negative, but got $y") + } + if (y == 0) delta else y + } - override def initialize(y: Double, weight: Double): Double = y + override def variance(mu: Double): Double = { + variancePower match { + case 0.0 => 1.0 + case 1.0 => mu + case 2.0 => mu * mu + case default => math.pow(mu, default) + } + } - override def variance(mu: Double): Double = 1.0 + private def yp(y: Double, mu: Double, p: Double): Double = { + if (p == 0) { + math.log(y / mu) + } else { + (math.pow(y, p) - math.pow(mu, p)) / p + } + } override def deviance(y: Double, mu: Double, weight: Double): Double = { - weight * (y - mu) * (y - mu) + // Force y >= delta for Poisson or compound Poisson + val y1 = if (variancePower >= 1.0 && variancePower < 2.0) math.max(y, delta) else y + 2.0 * weight * + (y * yp(y1, mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) } - override def aic( - predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { - val wt = predictions.map(x => math.log(x._3)).sum() - numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt + override def aic(predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { + if (variancePower == 0.0) { + val wt = predictions.map(x => math.log(x._3)).sum() + numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt + } else if (variancePower == 1.0) { + -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => + weight * dist.Poisson(mu).logProbabilityOf(y.toInt) + }.sum() + } else if (variancePower == 2.0) { + val disp = deviance / weightSum + -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => + weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y) + }.sum() + 2.0 + } else { + // This depends on the density of the tweedie distribution. Not yet implemented. + throw new UnsupportedOperationException("No AIC available for the tweedie family") + } } + override def project(mu: Double): Double = { + if (mu < epsilon) { + epsilon + } else if (mu.isInfinity) { + Double.MaxValue + } else { + mu + } + } + } + + /** + * Gaussian exponential family distribution. + * The default link for the Gaussian family is the identity link. + */ + private[regression] object Gaussian extends TweedieFamily(0.0) { + override def project(mu: Double): Double = { if (mu.isNegInfinity) { Double.MinValue @@ -503,7 +580,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Binomial exponential family distribution. * The default link for the Binomial family is the logit link. */ - private[regression] object Binomial extends Family("binomial") { + private[regression] object Binomial extends Family { + + val name = "binomial" val defaultLink: Link = Logit @@ -555,144 +634,18 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Poisson exponential family distribution. * The default link for the Poisson family is the log link. */ - private[regression] object Poisson extends Family("poisson") { - - val defaultLink: Link = Log - - override def initialize(y: Double, weight: Double): Double = { - require(y >= 0.0, "The response variable of Poisson family " + - s"should be non-negative, but got $y") - /* - Force Poisson mean > 0 to avoid numerical instability in IRLS. - R uses y + 0.1 for initialization. See poisson()$initialize. - */ - math.max(y, delta) - } - - override def variance(mu: Double): Double = mu - - override def deviance(y: Double, mu: Double, weight: Double): Double = { - 2.0 * weight * (y * math.log(y / mu) - (y - mu)) - } - - override def aic( - predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { - -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => - weight * dist.Poisson(mu).logProbabilityOf(y.toInt) - }.sum() - } - - override def project(mu: Double): Double = { - if (mu < epsilon) { - epsilon - } else if (mu.isInfinity) { - Double.MaxValue - } else { - mu - } - } - } + private[regression] object Poisson extends TweedieFamily(1.0) /** * Gamma exponential family distribution. * The default link for the Gamma family is the inverse link. */ - private[regression] object Gamma extends Family("gamma") { - - val defaultLink: Link = Inverse - - override def initialize(y: Double, weight: Double): Double = { - require(y > 0.0, "The response variable of Gamma family " + - s"should be positive, but got $y") - y - } - - override def variance(mu: Double): Double = mu * mu - - override def deviance(y: Double, mu: Double, weight: Double): Double = { - -2.0 * weight * (math.log(y / mu) - (y - mu)/mu) - } - - override def aic( - predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { - val disp = deviance / weightSum - -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => - weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y) - }.sum() + 2.0 - } - - override def project(mu: Double): Double = { - if (mu < epsilon) { - epsilon - } else if (mu.isInfinity) { - Double.MaxValue - } else { - mu - } - } - } + private[regression] object Gamma extends TweedieFamily(2.0) /** - * Tweedie exponential family distribution. - * The default link for the Tweedie family is set as the log link. - * Define a class that takes the variance power as argument, as opposed to - * a global object for the other families. - */ - private[regression] class Tweedie(private val variancePower: Double) - extends Family("tweedie") { - - // The canonical link is 1 - variancePower, which is barely used. - val defaultLink: Link = Log - - override def initialize(y: Double, weight: Double): Double = { - if (variancePower > 1.0 && variancePower < 2.0) { - require(y >= 0.0, "The response variable of the specified Tweedie distribution " + - s"should be non-negative, but got $y") - math.max(y, delta) - } else { - require(y > 0.0, "The response variable of the specified Tweedie distribution " + - s"should be non-negative, but got $y") - y - } - } - - override def variance(mu: Double): Double = math.pow(mu, variancePower) - - private def yp(y: Double, mu: Double, p: Double): Double = { - (math.pow(y, p) - math.pow(mu, p)) / p - } - - // Force y >= 0.1 for deviance to work for (1 - variancePower). see tweedie()$dev.resid - override def deviance(y: Double, mu: Double, weight: Double): Double = { - 2.0 * weight * - (y * yp(math.max(y, delta), mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) - } - - // This depends on the density of the tweedie distribution. Not yet implemented. - override def aic( - predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { - throw new UnsupportedOperationException("No AIC available for the tweedie family") - } - - override def project(mu: Double): Double = { - if (mu < epsilon) { - epsilon - } else if (mu.isInfinity) { - Double.MaxValue - } else { - mu - } - } - } + * Tweedie exponential family distribution. + */ + private[regression] object Tweedie extends TweedieFamily(1.5) /** * A description of the link function to be used in the model. diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 7be428458d9ae..42cb2189e8140 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -653,7 +653,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") - val familyLink = new FamilyAndLink(new Tweedie(variancePower), Link.fromName(link)) + val familyLink = new FamilyAndLink(new TweedieFamily(variancePower), Link.fromName(link)) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => From 17c55816c914bc96a8b5141356e3c117f343f303 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 21 Dec 2016 20:39:54 -0800 Subject: [PATCH 07/22] Clean up doc --- .../GeneralizedLinearRegression.scala | 37 ++++++++----------- .../GeneralizedLinearRegressionSuite.scala | 2 +- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 4ec056b547583..314da523c5dca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -69,7 +69,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * Used only for the Tweedie family. * (see * Tweedie Distribution (Wikipedia)) - * Supported value: 0 and [1, Inf). + * Supported value: 0 and [1, Inf). Note that when the value of the variance power is + * 0, 1, or 2, the Gaussian, Poisson or Gamma family is used, respectively. * * @group param */ @@ -78,7 +79,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam "The power in the variance function of the Tweedie distribution which characterizes " + "the relationship between the variance and mean of the distribution. " + "Used for the Tweedie family. Supported value: 0 and [1, Inf).", - (x: Double) => x == 0.0 || x >= 1.0) + (x: Double) => x >= 1.0 || x == 0.0) /** @group getParam */ @Since("2.2.0") @@ -442,18 +443,18 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** * Gets the [[Family]] object based on family and variancePower. - * This does not work for the tweedie family as it depends on the variance power - * that is set by the user. + * 1) retrieve object based on family name + * 2) if family name is tweedie, retrieve object based on variancePower * * @param model a GenerealizedLinearRegressionBase object */ def fromModel(model: GeneralizedLinearRegressionBase): Family = { model.getFamily match { - case Gaussian.name => Gaussian - case Binomial.name => Binomial - case Poisson.name => Poisson - case Gamma.name => Gamma - case Tweedie.name => + case "gaussian" => Gaussian + case "binomial" => Binomial + case "poisson" => Poisson + case "gamma" => Gamma + case "tweedie" => model.getVariancePower match { case 0.0 => Gaussian case 1.0 => Poisson @@ -466,9 +467,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** * Tweedie exponential family distribution. - * The default link for the Tweedie family is set as the log link. - * Define a class that takes the variance power as argument, as opposed to - * a global object for the other families. + * This includes the special cases of Gaussian, Poisson and Gamma. */ private[regression] class TweedieFamily(private val variancePower: Double) extends Family{ @@ -480,15 +479,14 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine case default => "tweedie" } /* - The canonical link is 1 - variancePower, which is barely used when the distribution - is not Gaussian, Poisson or Gamma. For any other distributions in the Tweedie family, - we set the default to the Log link. + The canonical link is 1 - variancePower. Except for the special cases of Gaussian, + Poisson and Gamma, the canonical link is rarely used. Set Log as the default link. */ val defaultLink: Link = variancePower match { case 0.0 => Identity case 1.0 => Log case 2.0 => Inverse - case default => Log + case _ => Log } override def initialize(y: Double, weight: Double): Double = { @@ -543,7 +541,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y) }.sum() + 2.0 } else { - // This depends on the density of the tweedie distribution. Not yet implemented. + // This depends on the density of the Tweedie distribution. Not yet implemented. throw new UnsupportedOperationException("No AIC available for the tweedie family") } } @@ -642,11 +640,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine */ private[regression] object Gamma extends TweedieFamily(2.0) - /** - * Tweedie exponential family distribution. - */ - private[regression] object Tweedie extends TweedieFamily(1.5) - /** * A description of the link function to be used in the model. * The link function provides the relationship between the linear predictor diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 42cb2189e8140..cd4ee975246d3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1282,7 +1282,7 @@ class GeneralizedLinearRegressionSuite } } - ignore("read/write") { + test("read/write") { def checkModelData( model: GeneralizedLinearRegressionModel, model2: GeneralizedLinearRegressionModel): Unit = { From 0b41825e99020976a34d8fe9c983f26de6c8c40f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 22 Dec 2016 09:52:01 -0800 Subject: [PATCH 08/22] Move defaultLink and name to subclass of TweedieFamily --- .../GeneralizedLinearRegression.scala | 108 ++++++++++-------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 314da523c5dca..3ff09341fa187 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -404,9 +404,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * A description of the error distribution to be used in the model. * */ - private[regression] abstract class Family extends Serializable { - /** The name of the family. */ - val name: String + private[regression] abstract class Family(val name: String) extends Serializable { /** The default link instance of this family. */ val defaultLink: Link @@ -470,44 +468,26 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * This includes the special cases of Gaussian, Poisson and Gamma. */ private[regression] class TweedieFamily(private val variancePower: Double) - extends Family{ + extends Family("tweedie") { - val name: String = variancePower match { - case 0.0 => "gaussian" - case 1.0 => "poisson" - case 2.0 => "gamma" - case default => "tweedie" - } /* The canonical link is 1 - variancePower. Except for the special cases of Gaussian, Poisson and Gamma, the canonical link is rarely used. Set Log as the default link. */ - val defaultLink: Link = variancePower match { - case 0.0 => Identity - case 1.0 => Log - case 2.0 => Inverse - case _ => Log - } + override val defaultLink: Link = Log override def initialize(y: Double, weight: Double): Double = { if (variancePower >= 1.0 && variancePower < 2.0) { - require(y >= 0.0, s"The response variable of the specified $name distribution " + + require(y >= 0.0, s"The response variable of the specified distribution " + s"should be non-negative, but got $y") } else if (variancePower >= 2.0) { - require(y > 0.0, s"The response variable of the specified $name distribution " + + require(y > 0.0, s"The response variable of the specified distribution " + s"should be non-negative, but got $y") } if (y == 0) delta else y } - override def variance(mu: Double): Double = { - variancePower match { - case 0.0 => 1.0 - case 1.0 => mu - case 2.0 => mu * mu - case default => math.pow(mu, default) - } - } + override def variance(mu: Double): Double = math.pow(mu, variancePower) private def yp(y: Double, mu: Double, p: Double): Double = { if (p == 0) { @@ -519,7 +499,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def deviance(y: Double, mu: Double, weight: Double): Double = { // Force y >= delta for Poisson or compound Poisson - val y1 = if (variancePower >= 1.0 && variancePower < 2.0) math.max(y, delta) else y + val y1 = if (variancePower >= 1.0 && variancePower < 2.0) { + math.max(y, delta) + } else { + y + } 2.0 * weight * (y * yp(y1, mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) } @@ -528,22 +512,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine deviance: Double, numInstances: Double, weightSum: Double): Double = { - if (variancePower == 0.0) { - val wt = predictions.map(x => math.log(x._3)).sum() - numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt - } else if (variancePower == 1.0) { - -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => - weight * dist.Poisson(mu).logProbabilityOf(y.toInt) - }.sum() - } else if (variancePower == 2.0) { - val disp = deviance / weightSum - -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => - weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y) - }.sum() + 2.0 - } else { - // This depends on the density of the Tweedie distribution. Not yet implemented. - throw new UnsupportedOperationException("No AIC available for the tweedie family") - } + /* + This depends on the density of the Tweedie distribution. + Only implemented for Gaussian, Poisson and Gamma at this point. + */ + throw new UnsupportedOperationException("No AIC available for the tweedie family") } override def project(mu: Double): Double = { @@ -563,6 +536,18 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine */ private[regression] object Gaussian extends TweedieFamily(0.0) { + override val name: String = "gaussian" + + override val defaultLink: Link = Identity + + override def aic(predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { + val wt = predictions.map(x => math.log(x._3)).sum() + numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt + } + override def project(mu: Double): Double = { if (mu.isNegInfinity) { Double.MinValue @@ -578,9 +563,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Binomial exponential family distribution. * The default link for the Binomial family is the logit link. */ - private[regression] object Binomial extends Family { - - val name = "binomial" + private[regression] object Binomial extends Family("binomial") { val defaultLink: Link = Logit @@ -632,13 +615,42 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Poisson exponential family distribution. * The default link for the Poisson family is the log link. */ - private[regression] object Poisson extends TweedieFamily(1.0) + private[regression] object Poisson extends TweedieFamily(1.0) { + + override val name: String = "poisson" + + override val defaultLink: Link = Log + + override def aic(predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { + -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => + weight * dist.Poisson(mu).logProbabilityOf(y.toInt) + }.sum() + } + } /** * Gamma exponential family distribution. * The default link for the Gamma family is the inverse link. */ - private[regression] object Gamma extends TweedieFamily(2.0) + private[regression] object Gamma extends TweedieFamily(2.0) { + + override val name: String = "gamma" + + override val defaultLink: Link = Inverse + + override def aic(predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { + val disp = deviance / weightSum + -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => + weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y) + }.sum() + 2.0 + } + } /** * A description of the link function to be used in the model. From 6e8e60771afb4abe43e47c7fe186bad1541a8fac Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 22 Dec 2016 10:10:51 -0800 Subject: [PATCH 09/22] Change style for AIC --- .../GeneralizedLinearRegression.scala | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 3ff09341fa187..ea9633798f3bf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -508,10 +508,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine (y * yp(y1, mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower)) } - override def aic(predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { + override def aic( + predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { /* This depends on the density of the Tweedie distribution. Only implemented for Gaussian, Poisson and Gamma at this point. @@ -540,10 +541,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override val defaultLink: Link = Identity - override def aic(predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { + override def aic( + predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { val wt = predictions.map(x => math.log(x._3)).sum() numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt } @@ -621,10 +623,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override val defaultLink: Link = Log - override def aic(predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { + override def aic( + predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => weight * dist.Poisson(mu).logProbabilityOf(y.toInt) }.sum() @@ -641,10 +644,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override val defaultLink: Link = Inverse - override def aic(predictions: RDD[(Double, Double, Double)], - deviance: Double, - numInstances: Double, - weightSum: Double): Double = { + override def aic( + predictions: RDD[(Double, Double, Double)], + deviance: Double, + numInstances: Double, + weightSum: Double): Double = { val disp = deviance / weightSum -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y) From 8d7d34e258f9c7c03c80754d837ce847fcb0526e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 23 Dec 2016 11:10:20 -0800 Subject: [PATCH 10/22] Rename Family methods and restore methods for tweedie subclasses --- .../GeneralizedLinearRegression.scala | 79 ++++++++++++++----- .../GeneralizedLinearRegressionSuite.scala | 2 +- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index ea9633798f3bf..088a892b4daaf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -69,8 +69,9 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * Used only for the Tweedie family. * (see * Tweedie Distribution (Wikipedia)) - * Supported value: 0 and [1, Inf). Note that when the value of the variance power is - * 0, 1, or 2, the Gaussian, Poisson or Gamma family is used, respectively. + * Supported values: 0 and [1, Inf). + * Note that variance power 0, 1, or 2 corresponds to the Gaussian, Poisson or Gamma + * family, respectively. * * @group param */ @@ -78,7 +79,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam final val variancePower: Param[Double] = new Param(this, "variancePower", "The power in the variance function of the Tweedie distribution which characterizes " + "the relationship between the variance and mean of the distribution. " + - "Used for the Tweedie family. Supported value: 0 and [1, Inf).", + "Used only for the Tweedie family. Supported values: 0 and [1, Inf).", (x: Double) => x >= 1.0 || x == 0.0) /** @group getParam */ @@ -158,7 +159,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * - "binomial" : "logit", "probit", "cloglog" * - "poisson" : "log", "identity", "sqrt" * - "gamma" : "inverse", "identity", "log" - * - "tweedie" : "identity", "log" + * - "tweedie" : "log", "identity" */ @Experimental @Since("2.0.0") @@ -276,7 +277,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { - val familyObj = Family.fromModel(this) + val familyObj = Family.fromParams(this) val linkObj = if (isDefined(link)) { Link.fromName($(link)) } else { @@ -444,20 +445,20 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * 1) retrieve object based on family name * 2) if family name is tweedie, retrieve object based on variancePower * - * @param model a GenerealizedLinearRegressionBase object + * @param params a GenerealizedLinearRegressionBase object */ - def fromModel(model: GeneralizedLinearRegressionBase): Family = { - model.getFamily match { + def fromParams(params: GeneralizedLinearRegressionBase): Family = { + params.getFamily match { case "gaussian" => Gaussian case "binomial" => Binomial case "poisson" => Poisson case "gamma" => Gamma case "tweedie" => - model.getVariancePower match { + params.getVariancePower match { case 0.0 => Gaussian case 1.0 => Poisson case 2.0 => Gamma - case default => new TweedieFamily(default) + case default => new Tweedie(default) } } } @@ -467,21 +468,23 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Tweedie exponential family distribution. * This includes the special cases of Gaussian, Poisson and Gamma. */ - private[regression] class TweedieFamily(private val variancePower: Double) + private[regression] class Tweedie(private val variancePower: Double) extends Family("tweedie") { /* - The canonical link is 1 - variancePower. Except for the special cases of Gaussian, - Poisson and Gamma, the canonical link is rarely used. Set Log as the default link. + The canonical link is 1 - variancePower, which becomes Identify for Gaussian, + Log for Poisson, and Inverse for Gamma. Except for these special cases, + the canonical link is not meaningful and rarely used. For example, the canonical + link is 1/Sqrt when variancePower = 1.5. We set Log as the default link. */ override val defaultLink: Link = Log override def initialize(y: Double, weight: Double): Double = { if (variancePower >= 1.0 && variancePower < 2.0) { - require(y >= 0.0, s"The response variable of the specified distribution " + + require(y >= 0.0, s"The response variable of $name($variancePower) family " + s"should be non-negative, but got $y") } else if (variancePower >= 2.0) { - require(y > 0.0, s"The response variable of the specified distribution " + + require(y > 0.0, s"The response variable of $name($variancePower) family " + s"should be non-negative, but got $y") } if (y == 0) delta else y @@ -535,12 +538,20 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Gaussian exponential family distribution. * The default link for the Gaussian family is the identity link. */ - private[regression] object Gaussian extends TweedieFamily(0.0) { + private[regression] object Gaussian extends Tweedie(0.0) { override val name: String = "gaussian" override val defaultLink: Link = Identity + override def initialize(y: Double, weight: Double): Double = y + + override def variance(mu: Double): Double = 1.0 + + override def deviance(y: Double, mu: Double, weight: Double): Double = { + weight * (y - mu) * (y - mu) + } + override def aic( predictions: RDD[(Double, Double, Double)], deviance: Double, @@ -617,12 +628,28 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Poisson exponential family distribution. * The default link for the Poisson family is the log link. */ - private[regression] object Poisson extends TweedieFamily(1.0) { + private[regression] object Poisson extends Tweedie(1.0) { override val name: String = "poisson" override val defaultLink: Link = Log + override def initialize(y: Double, weight: Double): Double = { + require(y >= 0.0, "The response variable of Poisson family " + + s"should be non-negative, but got $y") + /* + Force Poisson mean > 0 to avoid numerical instability in IRLS. + R uses y + delta for initialization. See poisson()$initialize. + */ + math.max(y, delta) + } + + override def variance(mu: Double): Double = mu + + override def deviance(y: Double, mu: Double, weight: Double): Double = { + 2.0 * weight * (y * math.log(y / mu) - (y - mu)) + } + override def aic( predictions: RDD[(Double, Double, Double)], deviance: Double, @@ -638,12 +665,24 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Gamma exponential family distribution. * The default link for the Gamma family is the inverse link. */ - private[regression] object Gamma extends TweedieFamily(2.0) { + private[regression] object Gamma extends Tweedie(2.0) { override val name: String = "gamma" override val defaultLink: Link = Inverse + override def initialize(y: Double, weight: Double): Double = { + require(y > 0.0, "The response variable of Gamma family " + + s"should be positive, but got $y") + y + } + + override def variance(mu: Double): Double = mu * mu + + override def deviance(y: Double, mu: Double, weight: Double): Double = { + -2.0 * weight * (math.log(y / mu) - (y - mu)/mu) + } + override def aic( predictions: RDD[(Double, Double, Double)], deviance: Double, @@ -785,7 +824,7 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - private lazy val familyObj = Family.fromModel(this) + private lazy val familyObj = Family.fromParams(this) private lazy val linkObj = if (isDefined(link)) { Link.fromName($(link)) @@ -971,7 +1010,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset) - private[regression] lazy val family: Family = Family.fromModel(model) + private[regression] lazy val family: Family = Family.fromParams(model) private[regression] lazy val link: Link = if (model.isDefined(model.link)) { Link.fromName(model.getLink) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index cd4ee975246d3..13be4bb46bca1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -653,7 +653,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") - val familyLink = new FamilyAndLink(new TweedieFamily(variancePower), Link.fromName(link)) + val familyLink = new FamilyAndLink(new Tweedie(variancePower), Link.fromName(link)) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => From 6da7e3068e2c45a0faf7ff35c10b2750784d765e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 23 Dec 2016 11:12:25 -0800 Subject: [PATCH 11/22] Update test --- .../spark/ml/regression/GeneralizedLinearRegressionSuite.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 13be4bb46bca1..f87ec92f42220 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -581,6 +581,8 @@ class GeneralizedLinearRegressionSuite test("generalized linear regression: tweedie family against glm") { /* R code: + + library(statmod) df <- as.data.frame(matrix(c( 1.0, 1.0, 0.0, 5.0, 0.5, 1.0, 1.0, 2.0, @@ -1150,6 +1152,7 @@ class GeneralizedLinearRegressionSuite /* R code: + library(statmod) df <- as.data.frame(matrix(c( 1.0, 1.0, 0.0, 5.0, 0.5, 2.0, 1.0, 2.0, From 9a71e89f629260c775922901a04c989f36ea4946 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 27 Dec 2016 09:16:40 -0800 Subject: [PATCH 12/22] Clean up doc --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 088a892b4daaf..7a3a03baa8fe2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -472,10 +472,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine extends Family("tweedie") { /* - The canonical link is 1 - variancePower, which becomes Identify for Gaussian, + The canonical link is 1 - variancePower, which becomes Identity for Gaussian, Log for Poisson, and Inverse for Gamma. Except for these special cases, - the canonical link is not meaningful and rarely used. For example, the canonical - link is 1/Sqrt when variancePower = 1.5. We set Log as the default link. + the canonical link is rarely used. For example, the canonical link is 1/Sqrt + when variancePower = 1.5. We set Log as the default link, which may be overridden + in subclasses. */ override val defaultLink: Link = Log From f461c09e65360f695ad3092b41bc26e0c61bbd95 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 27 Dec 2016 14:18:39 -0800 Subject: [PATCH 13/22] Put delta in Tweedie companion object --- .../GeneralizedLinearRegression.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 7a3a03baa8fe2..183360b9d9791 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -353,9 +353,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 - /** Constant used in initialization and deviance to avoid numerical issues. */ - private[regression] val delta: Double = 0.1 - /** * Wrapper of family and link combination used in the model. */ @@ -445,7 +442,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * 1) retrieve object based on family name * 2) if family name is tweedie, retrieve object based on variancePower * - * @param params a GenerealizedLinearRegressionBase object + * @param params the parameter map containing family name and variance power */ def fromParams(params: GeneralizedLinearRegressionBase): Family = { params.getFamily match { @@ -488,7 +485,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine require(y > 0.0, s"The response variable of $name($variancePower) family " + s"should be non-negative, but got $y") } - if (y == 0) delta else y + if (y == 0) Tweedie.delta else y } override def variance(mu: Double): Double = math.pow(mu, variancePower) @@ -504,7 +501,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def deviance(y: Double, mu: Double, weight: Double): Double = { // Force y >= delta for Poisson or compound Poisson val y1 = if (variancePower >= 1.0 && variancePower < 2.0) { - math.max(y, delta) + math.max(y, Tweedie.delta) } else { y } @@ -535,6 +532,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } + private[regression] object Tweedie{ + + /** Constant used in initialization and deviance to avoid numerical issues. */ + private[regression] val delta: Double = 0.1 + } + /** * Gaussian exponential family distribution. * The default link for the Gaussian family is the identity link. @@ -642,7 +645,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine Force Poisson mean > 0 to avoid numerical instability in IRLS. R uses y + delta for initialization. See poisson()$initialize. */ - math.max(y, delta) + math.max(y, Tweedie.delta) } override def variance(mu: Double): Double = mu From a839c4631dd17c4f3d0a0cc99e1b0af81419dda4 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 27 Dec 2016 14:23:57 -0800 Subject: [PATCH 14/22] Clean up doc --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 183360b9d9791..da6c9f5820f67 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -535,7 +535,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] object Tweedie{ /** Constant used in initialization and deviance to avoid numerical issues. */ - private[regression] val delta: Double = 0.1 + val delta: Double = 0.1 } /** From fab265278109eede4cce7ee506e8b29d481c4549 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 5 Jan 2017 11:32:06 -0800 Subject: [PATCH 15/22] Allow more link functions in tweedie --- .../regression/GeneralizedLinearRegression.scala | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index da6c9f5820f67..432b2d6c6d8b4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -159,7 +159,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * - "binomial" : "logit", "probit", "cloglog" * - "poisson" : "log", "identity", "sqrt" * - "gamma" : "inverse", "identity", "log" - * - "tweedie" : "log", "identity" + * - "tweedie" : "log", "identity", "inverse", "sqrt" */ @Experimental @Since("2.0.0") @@ -190,7 +190,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.2.0") def setVariancePower(value: Double): this.type = set(variancePower, value) - setDefault(variancePower -> 1.5) + setDefault(variancePower -> 0.0) /** * Sets the value of param [[link]]. @@ -342,7 +342,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine "binomial" -> Logit, "binomial" -> Probit, "binomial" -> CLogLog, "poisson" -> Log, "poisson" -> Identity, "poisson" -> Sqrt, "gamma" -> Inverse, "gamma" -> Identity, "gamma" -> Log, - "tweedie" -> Identity, "tweedie" -> Log + "tweedie" -> Identity, "tweedie" -> Log, "tweedie" -> Inverse, "tweedie" -> Sqrt ) /** Set of family names that GeneralizedLinearRegression supports. */ @@ -401,6 +401,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** * A description of the error distribution to be used in the model. * + * @param name the name of the family. */ private[regression] abstract class Family(val name: String) extends Serializable { @@ -455,7 +456,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine case 0.0 => Gaussian case 1.0 => Poisson case 2.0 => Gamma - case default => new Tweedie(default) + case others => new Tweedie(others) } } } @@ -472,8 +473,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine The canonical link is 1 - variancePower, which becomes Identity for Gaussian, Log for Poisson, and Inverse for Gamma. Except for these special cases, the canonical link is rarely used. For example, the canonical link is 1/Sqrt - when variancePower = 1.5. We set Log as the default link, which may be overridden - in subclasses. + when variancePower = 1.5. We set Log as the default link, which is used + for distributions in the Tweedie family other than Gaussian, Poisson or Gamma. + The default link is overridden in the subclass Gaussian, Poisson or Gamma. */ override val defaultLink: Link = Log @@ -483,7 +485,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine s"should be non-negative, but got $y") } else if (variancePower >= 2.0) { require(y > 0.0, s"The response variable of $name($variancePower) family " + - s"should be non-negative, but got $y") + s"should be positive, but got $y") } if (y == 0) Tweedie.delta else y } From 651ea6256208c71683e450d22a40e466f1892ff5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 9 Jan 2017 23:47:38 -0800 Subject: [PATCH 16/22] Implement link power --- .../GeneralizedLinearRegression.scala | 198 +++++++++++++----- .../GeneralizedLinearRegressionSuite.scala | 134 +++++++++--- 2 files changed, 246 insertions(+), 86 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 432b2d6c6d8b4..a9b5dfa5690e1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -57,7 +57,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam final val family: Param[String] = new Param(this, "family", "The name of family which is a description of the error distribution to be used in the " + s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.", - ParamValidators.inArray[String](supportedFamilyNames.toArray)) + ParamValidators.inArray[String](supportedFamilyNames)) /** @group getParam */ @Since("2.0.0") @@ -90,6 +90,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * Param for the name of link function which provides the relationship * between the linear predictor and the mean of the distribution function. * Supported options: "identity", "log", "inverse", "logit", "probit", "cloglog" and "sqrt". + * This is used only when family is not "tweedie". The link function for the "tweedie" family + * must be specified through [[linkPower]]. * * @group param */ @@ -97,12 +99,30 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam final val link: Param[String] = new Param(this, "link", "The name of link function " + "which provides the relationship between the linear predictor and the mean of the " + s"distribution function. Supported options: ${supportedLinkNames.mkString(", ")}", - ParamValidators.inArray[String](supportedLinkNames.toArray)) + ParamValidators.inArray[String](supportedLinkNames)) /** @group getParam */ @Since("2.0.0") def getLink: String = $(link) + /** + * Param for the index in the power link function. This is used to specify the link function + * in the Tweedie family. Supported values: [-10.0, 10.0]. + * Note that link power 0, 1, or -1 corresponds to the Log, Identity or Inverse + * link, respectively. + * + * @group param + */ + @Since("2.2.0") + final val linkPower: Param[Double] = new Param(this, "linkPower", + "The index in the power link function. This is used to specify the link function in the " + + "Tweedie family. Supported values: [-10, 10].", + ParamValidators.inRange[Double](-10.0, 10.0)) + + /** @group getParam */ + @Since("2.2.0") + def getLinkPower: Double = $(linkPower) + /** * Param for link prediction (linear predictor) column name. * Default is not set, which means we do not output link prediction. @@ -129,12 +149,20 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam schema: StructType, fitting: Boolean, featuresDataType: DataType): StructType = { - if (isDefined(link)) { - require(supportedFamilyAndLinkPairs.contains( - $(family) -> Link.fromName($(link))), - s"Generalized Linear Regression with ${$(family)} family " + - s"does not support ${$(link)} link function.") + if ($(family) == "tweedie") { + require(!isDefined(link), "The link function for the tweedie family must be " + + "specified using linkPower, not link.") + } else { + require(!isDefined(linkPower), s"The link function for the ${$(family)} family " + + "must be specified using link, not linkPower.") + if (isDefined(link)) { + require(supportedFamilyAndLinkPairs.contains( + $(family) -> Link.fromParams(this)), + s"Generalized Linear Regression with ${$(family)} family " + + s"does not support ${$(link)} link function.") + } } + val newSchema = super.validateAndTransformSchema(schema, fitting, featuresDataType) if (hasLinkPredictionCol) { SchemaUtils.appendColumn(newSchema, $(linkPredictionCol), DoubleType) @@ -159,7 +187,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * - "binomial" : "logit", "probit", "cloglog" * - "poisson" : "log", "identity", "sqrt" * - "gamma" : "inverse", "identity", "log" - * - "tweedie" : "log", "identity", "inverse", "sqrt" + * - "tweedie" : power link function specified through "linkPower". The default link power in + * the tweedie family is 1 - variancePower. */ @Experimental @Since("2.0.0") @@ -183,17 +212,28 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val setDefault(family -> Gaussian.name) /** - * Sets the value of param [[variancePower]]. - * Used only when family is "tweedie". - * - * @group setParam - */ + * Sets the value of param [[variancePower]]. + * Used only when family is "tweedie". + * Default is 0.0, which corresponds to the "gaussian" family. + * + * @group setParam + */ @Since("2.2.0") def setVariancePower(value: Double): this.type = set(variancePower, value) setDefault(variancePower -> 0.0) + /** + * Sets the value of param [[linkPower]]. + * Used only when family is "tweedie". + * + * @group setParam + */ + @Since("2.2.0") + def setLinkPower(value: Double): this.type = set(linkPower, value) + /** * Sets the value of param [[link]]. + * Used only when family is not "tweedie". * * @group setParam */ @@ -278,8 +318,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { val familyObj = Family.fromParams(this) - val linkObj = if (isDefined(link)) { - Link.fromName($(link)) + val linkObj = if (isDefined(link) || isDefined(linkPower)) { + Link.fromParams(this) } else { familyObj.defaultLink } @@ -336,20 +376,24 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine @Since("2.0.0") override def load(path: String): GeneralizedLinearRegression = super.load(path) - /** Set of family and link pairs that GeneralizedLinearRegression supports. */ + /** + * Set of family and link pairs that GeneralizedLinearRegression supports. + * Tweedie family is specified through linkPower. + */ private[regression] lazy val supportedFamilyAndLinkPairs = Set( "gaussian" -> Identity, "gaussian" -> Log, "gaussian" -> Inverse, "binomial" -> Logit, "binomial" -> Probit, "binomial" -> CLogLog, "poisson" -> Log, "poisson" -> Identity, "poisson" -> Sqrt, - "gamma" -> Inverse, "gamma" -> Identity, "gamma" -> Log, - "tweedie" -> Identity, "tweedie" -> Log, "tweedie" -> Inverse, "tweedie" -> Sqrt + "gamma" -> Inverse, "gamma" -> Identity, "gamma" -> Log ) /** Set of family names that GeneralizedLinearRegression supports. */ - private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1) + private[regression] lazy val supportedFamilyNames = + supportedFamilyAndLinkPairs.map(_._1).toArray :+ "tweedie" /** Set of link names that GeneralizedLinearRegression supports. */ - private[regression] lazy val supportedLinkNames = supportedFamilyAndLinkPairs.map(_._2.name) + private[regression] lazy val supportedLinkNames = + supportedFamilyAndLinkPairs.map(_._2.name).toArray private[regression] val epsilon: Double = 1E-16 @@ -463,21 +507,13 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } /** - * Tweedie exponential family distribution. - * This includes the special cases of Gaussian, Poisson and Gamma. - */ - private[regression] class Tweedie(private val variancePower: Double) + * Tweedie exponential family distribution. + * This includes the special cases of Gaussian, Poisson and Gamma. + */ + private[regression] class Tweedie(val variancePower: Double) extends Family("tweedie") { - /* - The canonical link is 1 - variancePower, which becomes Identity for Gaussian, - Log for Poisson, and Inverse for Gamma. Except for these special cases, - the canonical link is rarely used. For example, the canonical link is 1/Sqrt - when variancePower = 1.5. We set Log as the default link, which is used - for distributions in the Tweedie family other than Gaussian, Poisson or Gamma. - The default link is overridden in the subclass Gaussian, Poisson or Gamma. - */ - override val defaultLink: Link = Log + override val defaultLink: Link = new PowerLink(1.0 - variancePower) override def initialize(y: Double, weight: Double): Double = { if (variancePower >= 1.0 && variancePower < 2.0) { @@ -723,25 +759,67 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] object Link { /** - * Gets the [[Link]] object from its name. + * Gets the [[Link]] object based on link and linkPower. + * 1) if family is "tweedie", retrieve object using linkPower + * 2) otherwise, retrieve object based on link name * - * @param name link name: "identity", "logit", "log", - * "inverse", "probit", "cloglog" or "sqrt". + * @param params the parameter map containing link and link power */ - def fromName(name: String): Link = { - name match { - case Identity.name => Identity - case Logit.name => Logit - case Log.name => Log - case Inverse.name => Inverse - case Probit.name => Probit - case CLogLog.name => CLogLog - case Sqrt.name => Sqrt + def fromParams(params: GeneralizedLinearRegressionBase): Link = { + if (params.getFamily == "tweedie") { + params.getLinkPower match { + case 0.0 => Log + case 1.0 => Identity + case -1.0 => Inverse + case 0.5 => Sqrt + case others => new PowerLink(others) + } + } else { + params.getLink match { + case Identity.name => Identity + case Logit.name => Logit + case Log.name => Log + case Inverse.name => Inverse + case Probit.name => Probit + case CLogLog.name => CLogLog + case Sqrt.name => Sqrt + } + } + } + } + + /** Power link function class */ + private[regression] class PowerLink(val linkPower: Double) + extends Link("powerLink") { + + override def link(mu: Double): Double = { + if (linkPower == 0.0) { + math.log(mu) + } else { + math.pow(mu, linkPower) + } + } + + override def deriv(mu: Double): Double = { + if (linkPower == 0.0) { + 1.0 / mu + } else { + linkPower * math.pow(mu, linkPower - 1.0) + } + } + + override def unlink(eta: Double): Double = { + if (linkPower == 0.0) { + math.exp(eta) + } else { + math.pow(eta, 1.0 / linkPower) } } } - private[regression] object Identity extends Link("identity") { + private[regression] object Identity extends PowerLink(1.0) { + + override val name: String = "identity" override def link(mu: Double): Double = mu @@ -759,7 +837,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def unlink(eta: Double): Double = 1.0 / (1.0 + math.exp(-1.0 * eta)) } - private[regression] object Log extends Link("log") { + private[regression] object Log extends PowerLink(0.0) { + + override val name: String = "log" override def link(mu: Double): Double = math.log(mu) @@ -768,7 +848,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def unlink(eta: Double): Double = math.exp(eta) } - private[regression] object Inverse extends Link("inverse") { + private[regression] object Inverse extends PowerLink(-1.0) { + + override val name: String = "inverse" override def link(mu: Double): Double = 1.0 / mu @@ -797,7 +879,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta)) } - private[regression] object Sqrt extends Link("sqrt") { + private[regression] object Sqrt extends PowerLink(0.5) { + + override val name: String = "sqrt" override def link(mu: Double): Double = math.sqrt(mu) @@ -832,11 +916,12 @@ class GeneralizedLinearRegressionModel private[ml] ( private lazy val familyObj = Family.fromParams(this) - private lazy val linkObj = if (isDefined(link)) { - Link.fromName($(link)) + private lazy val linkObj = if (isDefined(link) || isDefined(linkPower)) { + Link.fromParams(this) } else { familyObj.defaultLink } + private lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj) override protected def predict(features: Vector): Double = { @@ -1018,11 +1103,12 @@ class GeneralizedLinearRegressionSummary private[regression] ( private[regression] lazy val family: Family = Family.fromParams(model) - private[regression] lazy val link: Link = if (model.isDefined(model.link)) { - Link.fromName(model.getLink) - } else { - family.defaultLink - } + private[regression] lazy val link: Link = + if (model.isDefined(model.link) || model.isDefined(model.linkPower)) { + Link.fromParams(model) + } else { + family.defaultLink + } /** Number of instances in DataFrame predictions. */ private[regression] lazy val numInstances: Long = predictions.count() diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index f87ec92f42220..376b7591e1b84 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -191,6 +191,8 @@ class GeneralizedLinearRegressionSuite assert(!glr.isDefined(glr.weightCol)) assert(glr.getRegParam === 0.0) assert(glr.getSolver == "irls") + assert(glr.getVariancePower === 0.0) + // TODO: Construct model directly instead of via fitting. val model = glr.setFamily("gaussian").setLink("identity") .fit(datasetGaussianIdentity) @@ -266,7 +268,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with gaussian family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Gaussian, Link.fromName(link)) + val familyLink = new FamilyAndLink(Gaussian, Link.fromParams(trainer)) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -382,7 +384,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with binomial family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Binomial, Link.fromName(link)) + val familyLink = new FamilyAndLink(Binomial, Link.fromParams(trainer)) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -454,7 +456,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with poisson family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Poisson, Link.fromName(link)) + val familyLink = new FamilyAndLink(Poisson, Link.fromParams(trainer)) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -560,7 +562,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with gamma family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Gamma, Link.fromName(link)) + val familyLink = new FamilyAndLink(Gamma, Link.fromParams(trainer)) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -593,29 +595,24 @@ class GeneralizedLinearRegressionSuite f2 <- V1 ~ V3 + V4 for (f in c(f1, f2)){ - for (lp in c(0, 1)) - for (vp in c(1.6, 2.5, 3.0, 4.0)){ + for (lp in c(0, 1, -1)) + for (vp in c(1.6, 2.5)){ model <- glm(f, df, family = tweedie(var.power = vp, link.power = lp)) print(as.vector(coef(model))) } } - [1] 0.1496480 -0.0122283 [1] 0.1373567 -0.0120673 - [1] 0.13077402 -0.01181116 - [1] 0.11853618 -0.01118475 [1] 0.3919109 0.1846094 [1] 0.3684426 0.1810662 - [1] 0.3566982 0.1788412 - [1] 0.3370804 0.1740093 + [1] 0.1759887 0.2195818 + [1] 0.1108561 0.2059430 [1] -1.3163732 0.4378139 0.2464114 [1] -1.4396020 0.4817364 0.2680088 - [1] -1.5975930 0.5440060 0.2982824 - [1] -3.4044522 1.3557615 0.6797386 [1] -0.7090230 0.6256309 0.3294324 [1] -0.9524928 0.7304267 0.3792687 - [1] -1.1216622 0.8089538 0.4156152 - [1] -1.3594653 0.9262326 0.4682795 + [1] 2.1188978 -0.3360519 -0.2067023 + [1] 2.1659028 -0.3499170 -0.2128286 */ val datasetTweedie = Seq( Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), @@ -627,35 +624,110 @@ class GeneralizedLinearRegressionSuite val expected = Seq( Vectors.dense(0, 0.149648, -0.0122283), Vectors.dense(0, 0.1373567, -0.0120673), - Vectors.dense(0, 0.13077402, -0.01181116), - Vectors.dense(0, 0.11853618, -0.01118475), Vectors.dense(0, 0.3919109, 0.1846094), Vectors.dense(0, 0.3684426, 0.1810662), - Vectors.dense(0, 0.3566982, 0.1788412), - Vectors.dense(0, 0.3370804, 0.1740093), + Vectors.dense(0, 0.1759887, 0.2195818), + Vectors.dense(0, 0.1108561, 0.205943), Vectors.dense(-1.3163732, 0.4378139, 0.2464114), Vectors.dense(-1.439602, 0.4817364, 0.2680088), - Vectors.dense(-1.597593, 0.544006, 0.2982824), - Vectors.dense(-3.4044522, 1.3557615, 0.6797386), Vectors.dense(-0.709023, 0.6256309, 0.3294324), Vectors.dense(-0.9524928, 0.7304267, 0.3792687), - Vectors.dense(-1.1216622, 0.8089538, 0.4156152), - Vectors.dense(-1.3594653, 0.9262326, 0.4682795)) + Vectors.dense(2.1188978, -0.3360519, -0.2067023), + Vectors.dense(2.1659028, -0.349917, -0.2128286)) + + import GeneralizedLinearRegression._ + + var idx = 0 + for (fitIntercept <- Seq(false, true); linkPower <- Seq(0.0, 1.0, -1.0)) { + for (variancePower <- Seq(1.6, 2.5)) { + val trainer = new GeneralizedLinearRegression().setFamily("tweedie") + .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") + .setVariancePower(variancePower).setLinkPower(linkPower) + val model = trainer.fit(datasetTweedie) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + + s"linkPower = $linkPower, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + + val familyLink = new FamilyAndLink(Family.fromParams(trainer), Link.fromParams(trainer)) + model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() + .foreach { + case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => + val eta = BLAS.dot(features, model.coefficients) + model.intercept + val prediction2 = familyLink.fitted(eta) + val linkPrediction2 = eta + assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + + s"tweedie family, linkPower = $linkPower, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + + s"GLM with tweedie family, linkPower = $linkPower, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + } + idx += 1 + } + } + } + + test("generalized linear regression: tweedie family against glm (default power link)") { + /* + R code: + + library(statmod) + df <- as.data.frame(matrix(c( + 1.0, 1.0, 0.0, 5.0, + 0.5, 1.0, 1.0, 2.0, + 1.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 3.0, 3.0), 4, 4, byrow = TRUE)) + var.power <- c(0, 1, 2, 1.5) + f1 <- V1 ~ -1 + V3 + V4 + f2 <- V1 ~ V3 + V4 + for (f in c(f1, f2)){ + for (vp in var.power){ + model <- glm(f, df, family = tweedie(var.power = vp)) + print(as.vector(coef(model))) + } + } + [1] 0.4310345 0.1896552 + [1] 0.15776482 -0.01189032 + [1] 0.1468853 0.2116519 + [1] 0.2282601 0.2132775 + [1] -0.5158730 0.5555556 0.2936508 + [1] -1.2689559 0.4230934 0.2388465 + [1] 2.137852 -0.341431 -0.209090 + [1] 1.5953393 -0.1884985 -0.1106335 + */ + val datasetTweedie = Seq( + Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), + Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), + Instance(1.0, 1.0, Vectors.dense(2.0, 1.0)), + Instance(2.0, 1.0, Vectors.dense(3.0, 3.0)) + ).toDF() + + val expected = Seq( + Vectors.dense(0, 0.4310345, 0.1896552), + Vectors.dense(0, 0.15776482, -0.01189032), + Vectors.dense(0, 0.1468853, 0.2116519), + Vectors.dense(0, 0.2282601, 0.2132775), + Vectors.dense(-0.515873, 0.5555556, 0.2936508), + Vectors.dense(-1.2689559, 0.4230934, 0.2388465), + Vectors.dense(2.137852, -0.341431, -0.20909), + Vectors.dense(1.5953393, -0.1884985, -0.1106335)) import GeneralizedLinearRegression._ var idx = 0 - for (fitIntercept <- Seq(false, true); link <- Seq("log", "identity")) { - for (variancePower <- Seq(1.6, 2.5, 3.0, 4.0)) { + for (fitIntercept <- Seq(false, true)) { + for (variancePower <- Seq(0.0, 1.0, 2.0, 1.5)) { val trainer = new GeneralizedLinearRegression().setFamily("tweedie") - .setFitIntercept(fitIntercept).setLink(link).setLinkPredictionCol("linkPrediction") + .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") .setVariancePower(variancePower) val model = trainer.fit(datasetTweedie) val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + - s"$link link, fitIntercept = $fitIntercept and variancePower = $variancePower.") + s"fitIntercept = $fitIntercept and variancePower = $variancePower.") - val familyLink = new FamilyAndLink(new Tweedie(variancePower), Link.fromName(link)) + val familyObj = Family.fromParams(trainer) + val familyLink = new FamilyAndLink(familyObj, familyObj.defaultLink) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -663,15 +735,16 @@ class GeneralizedLinearRegressionSuite val prediction2 = familyLink.fitted(eta) val linkPrediction2 = eta assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + - s"tweedie family, $link link, fitIntercept = $fitIntercept " + + s"tweedie family, fitIntercept = $fitIntercept " + s"and variancePower = $variancePower.") assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + - s"GLM with tweedie family, $link link and fitIntercept = $fitIntercept " + + s"GLM with tweedie family, fitIntercept = $fitIntercept " + s"and variancePower = $variancePower.") } idx += 1 } } + } test("glm summary: gaussian family with weight") { @@ -1200,6 +1273,7 @@ class GeneralizedLinearRegressionSuite val trainer = new GeneralizedLinearRegression() .setFamily("tweedie") .setVariancePower(1.6) + .setLinkPower(0.0) .setWeightCol("weight") .setFitIntercept(false) From 0310e854eb20c36bd115800fdb093b2200b8f620 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 10 Jan 2017 07:20:53 -0800 Subject: [PATCH 17/22] remove restriction on link power; revert to use family object in check --- .../GeneralizedLinearRegression.scala | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a9b5dfa5690e1..8cd0bb2c8bbef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -107,8 +107,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * Param for the index in the power link function. This is used to specify the link function - * in the Tweedie family. Supported values: [-10.0, 10.0]. - * Note that link power 0, 1, or -1 corresponds to the Log, Identity or Inverse + * in the Tweedie family. + * Note that link power 0, 1, -1 or 0.5 corresponds to the Log, Identity, Inverse or Sqrt * link, respectively. * * @group param @@ -116,8 +116,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam @Since("2.2.0") final val linkPower: Param[Double] = new Param(this, "linkPower", "The index in the power link function. This is used to specify the link function in the " + - "Tweedie family. Supported values: [-10, 10].", - ParamValidators.inRange[Double](-10.0, 10.0)) + "Tweedie family.", (x: Double) => true) /** @group getParam */ @Since("2.2.0") @@ -157,7 +156,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam "must be specified using link, not linkPower.") if (isDefined(link)) { require(supportedFamilyAndLinkPairs.contains( - $(family) -> Link.fromParams(this)), + Family.fromParams(this) -> Link.fromParams(this)), s"Generalized Linear Regression with ${$(family)} family " + s"does not support ${$(link)} link function.") } @@ -381,15 +380,15 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Tweedie family is specified through linkPower. */ private[regression] lazy val supportedFamilyAndLinkPairs = Set( - "gaussian" -> Identity, "gaussian" -> Log, "gaussian" -> Inverse, - "binomial" -> Logit, "binomial" -> Probit, "binomial" -> CLogLog, - "poisson" -> Log, "poisson" -> Identity, "poisson" -> Sqrt, - "gamma" -> Inverse, "gamma" -> Identity, "gamma" -> Log + Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse, + Binomial -> Logit, Binomial -> Probit, Binomial -> CLogLog, + Poisson -> Log, Poisson -> Identity, Poisson -> Sqrt, + Gamma -> Inverse, Gamma -> Identity, Gamma -> Log ) /** Set of family names that GeneralizedLinearRegression supports. */ private[regression] lazy val supportedFamilyNames = - supportedFamilyAndLinkPairs.map(_._1).toArray :+ "tweedie" + supportedFamilyAndLinkPairs.map(_._1.name).toArray :+ "tweedie" /** Set of link names that GeneralizedLinearRegression supports. */ private[regression] lazy val supportedLinkNames = From 6f4abebb7a327b508fdc48789793ca6a29dc9842 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 13 Jan 2017 11:03:28 -0800 Subject: [PATCH 18/22] create factory method for FamilyAndLink --- .../GeneralizedLinearRegression.scala | 83 ++++++++++--------- .../GeneralizedLinearRegressionSuite.scala | 15 ++-- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 8cd0bb2c8bbef..a88ced923bea1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -116,7 +116,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam @Since("2.2.0") final val linkPower: Param[Double] = new Param(this, "linkPower", "The index in the power link function. This is used to specify the link function in the " + - "Tweedie family.", (x: Double) => true) + "Tweedie family.") /** @group getParam */ @Since("2.2.0") @@ -149,12 +149,16 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam fitting: Boolean, featuresDataType: DataType): StructType = { if ($(family) == "tweedie") { - require(!isDefined(link), "The link function for the tweedie family must be " + - "specified using linkPower, not link.") + if (isSet(link)) { + logWarning("When family is tweedie, use param linkPower to specify link function. " + + "Setting param link will take no effect.") + } } else { - require(!isDefined(linkPower), s"The link function for the ${$(family)} family " + - "must be specified using link, not linkPower.") - if (isDefined(link)) { + if (isSet(linkPower)) { + logWarning("When family is not tweedie, use param link to specify link function. " + + "Setting param linkPower will take no effect.") + } + if (isSet(link)) { require(supportedFamilyAndLinkPairs.contains( Family.fromParams(this) -> Link.fromParams(this)), s"Generalized Linear Regression with ${$(family)} family " + @@ -316,13 +320,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = { - val familyObj = Family.fromParams(this) - val linkObj = if (isDefined(link) || isDefined(linkPower)) { - Link.fromParams(this) - } else { - familyObj.defaultLink - } - val familyAndLink = new FamilyAndLink(familyObj, linkObj) + val familyAndLink = FamilyAndLink(this) val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) { @@ -338,7 +336,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val Instance(label, weight, features) } - if (familyObj == Gaussian && linkObj == Identity) { + if (familyAndLink.family == Gaussian && familyAndLink.link == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) @@ -441,6 +439,23 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } + private[regression] object FamilyAndLink { + + /** + * Constructs the FamilyAndLink object from a parameter map + */ + def apply(params: GeneralizedLinearRegressionBase): FamilyAndLink = { + val familyObj = Family.fromParams(params) + val linkObj = if ((params.getFamily != "tweedie" && params.isDefined(params.link)) || + (params.getFamily == "tweedie" && params.isDefined(params.linkPower))) { + Link.fromParams(params) + } else { + familyObj.defaultLink + } + new FamilyAndLink(familyObj, linkObj) + } + } + /** * A description of the error distribution to be used in the model. * @@ -476,7 +491,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine /** Trim the fitted value so that it will be in valid range. */ def project(mu: Double): Double = mu - } private[regression] object Family { @@ -512,7 +526,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] class Tweedie(val variancePower: Double) extends Family("tweedie") { - override val defaultLink: Link = new PowerLink(1.0 - variancePower) + override val defaultLink: Link = new Power(1.0 - variancePower) override def initialize(y: Double, weight: Double): Double = { if (variancePower >= 1.0 && variancePower < 2.0) { @@ -758,7 +772,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] object Link { /** - * Gets the [[Link]] object based on link and linkPower. + * Gets the [[Link]] object based on link or linkPower. * 1) if family is "tweedie", retrieve object using linkPower * 2) otherwise, retrieve object based on link name * @@ -771,7 +785,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine case 1.0 => Identity case -1.0 => Inverse case 0.5 => Sqrt - case others => new PowerLink(others) + case others => new Power(others) } } else { params.getLink match { @@ -788,8 +802,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } /** Power link function class */ - private[regression] class PowerLink(val linkPower: Double) - extends Link("powerLink") { + private[regression] class Power(val linkPower: Double) + extends Link("power") { override def link(mu: Double): Double = { if (linkPower == 0.0) { @@ -816,7 +830,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } - private[regression] object Identity extends PowerLink(1.0) { + private[regression] object Identity extends Power(1.0) { override val name: String = "identity" @@ -836,7 +850,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def unlink(eta: Double): Double = 1.0 / (1.0 + math.exp(-1.0 * eta)) } - private[regression] object Log extends PowerLink(0.0) { + private[regression] object Log extends Power(0.0) { override val name: String = "log" @@ -847,7 +861,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def unlink(eta: Double): Double = math.exp(eta) } - private[regression] object Inverse extends PowerLink(-1.0) { + private[regression] object Inverse extends Power(-1.0) { override val name: String = "inverse" @@ -878,7 +892,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta)) } - private[regression] object Sqrt extends PowerLink(0.5) { + private[regression] object Sqrt extends Power(0.5) { override val name: String = "sqrt" @@ -913,15 +927,7 @@ class GeneralizedLinearRegressionModel private[ml] ( import GeneralizedLinearRegression._ - private lazy val familyObj = Family.fromParams(this) - - private lazy val linkObj = if (isDefined(link) || isDefined(linkPower)) { - Link.fromParams(this) - } else { - familyObj.defaultLink - } - - private lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj) + private lazy val familyAndLink = FamilyAndLink(this) override protected def predict(features: Vector): Double = { val eta = predictLink(features) @@ -1100,14 +1106,11 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset) - private[regression] lazy val family: Family = Family.fromParams(model) + private[regression] lazy val familyLink: FamilyAndLink = FamilyAndLink(model) - private[regression] lazy val link: Link = - if (model.isDefined(model.link) || model.isDefined(model.linkPower)) { - Link.fromParams(model) - } else { - family.defaultLink - } + private[regression] lazy val family: Family = familyLink.family + + private[regression] lazy val link: Link = familyLink.link /** Number of instances in DataFrame predictions. */ private[regression] lazy val numInstances: Long = predictions.count() diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 376b7591e1b84..e6b9877193b12 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -268,7 +268,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with gaussian family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Gaussian, Link.fromParams(trainer)) + val familyLink = FamilyAndLink(trainer) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -384,7 +384,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with binomial family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Binomial, Link.fromParams(trainer)) + val familyLink = FamilyAndLink(trainer) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -456,7 +456,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with poisson family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Poisson, Link.fromParams(trainer)) + val familyLink = FamilyAndLink(trainer) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -562,7 +562,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with gamma family, " + s"$link link and fitIntercept = $fitIntercept.") - val familyLink = new FamilyAndLink(Gamma, Link.fromParams(trainer)) + val familyLink = FamilyAndLink(trainer) model.transform(dataset).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -649,7 +649,7 @@ class GeneralizedLinearRegressionSuite s"linkPower = $linkPower, fitIntercept = $fitIntercept " + s"and variancePower = $variancePower.") - val familyLink = new FamilyAndLink(Family.fromParams(trainer), Link.fromParams(trainer)) + val familyLink = FamilyAndLink(trainer) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -726,8 +726,7 @@ class GeneralizedLinearRegressionSuite assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + s"fitIntercept = $fitIntercept and variancePower = $variancePower.") - val familyObj = Family.fromParams(trainer) - val familyLink = new FamilyAndLink(familyObj, familyObj.defaultLink) + val familyLink = FamilyAndLink(trainer) model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() .foreach { case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => @@ -1292,7 +1291,6 @@ class GeneralizedLinearRegressionSuite val residualDevianceR = 13.844 val residualDegreeOfFreedomNullR = 4 val residualDegreeOfFreedomR = 2 - // val aicR = 0.0 val summary = model.summary @@ -1334,7 +1332,6 @@ class GeneralizedLinearRegressionSuite assert(summary.deviance ~== residualDevianceR absTol 1E-3) assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) - // assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") } From 7b5d4cd7a6f29337e2b8c7b7100bd4dbaa1b7909 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 14 Jan 2017 11:14:01 -0800 Subject: [PATCH 19/22] fix style issue in test --- .../GeneralizedLinearRegressionSuite.scala | 72 +++++++++---------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index e6b9877193b12..aa38ddb427a92 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -582,38 +582,37 @@ class GeneralizedLinearRegressionSuite test("generalized linear regression: tweedie family against glm") { /* - R code: - - library(statmod) - df <- as.data.frame(matrix(c( - 1.0, 1.0, 0.0, 5.0, - 0.5, 1.0, 1.0, 2.0, - 1.0, 1.0, 2.0, 1.0, - 2.0, 1.0, 3.0, 3.0), 4, 4, byrow = TRUE)) - - f1 <- V1 ~ -1 + V3 + V4 - f2 <- V1 ~ V3 + V4 - - for (f in c(f1, f2)){ - for (lp in c(0, 1, -1)) - for (vp in c(1.6, 2.5)){ - model <- glm(f, df, family = tweedie(var.power = vp, link.power = lp)) - print(as.vector(coef(model))) - } - } - [1] 0.1496480 -0.0122283 - [1] 0.1373567 -0.0120673 - [1] 0.3919109 0.1846094 - [1] 0.3684426 0.1810662 - [1] 0.1759887 0.2195818 - [1] 0.1108561 0.2059430 - [1] -1.3163732 0.4378139 0.2464114 - [1] -1.4396020 0.4817364 0.2680088 - [1] -0.7090230 0.6256309 0.3294324 - [1] -0.9524928 0.7304267 0.3792687 - [1] 2.1188978 -0.3360519 -0.2067023 - [1] 2.1659028 -0.3499170 -0.2128286 - */ + R code: + library(statmod) + df <- as.data.frame(matrix(c( + 1.0, 1.0, 0.0, 5.0, + 0.5, 1.0, 1.0, 2.0, + 1.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 3.0, 3.0), 4, 4, byrow = TRUE)) + + f1 <- V1 ~ -1 + V3 + V4 + f2 <- V1 ~ V3 + V4 + + for (f in c(f1, f2)) { + for (lp in c(0, 1, -1)) + for (vp in c(1.6, 2.5)) { + model <- glm(f, df, family = tweedie(var.power = vp, link.power = lp)) + print(as.vector(coef(model))) + } + } + [1] 0.1496480 -0.0122283 + [1] 0.1373567 -0.0120673 + [1] 0.3919109 0.1846094 + [1] 0.3684426 0.1810662 + [1] 0.1759887 0.2195818 + [1] 0.1108561 0.2059430 + [1] -1.3163732 0.4378139 0.2464114 + [1] -1.4396020 0.4817364 0.2680088 + [1] -0.7090230 0.6256309 0.3294324 + [1] -0.9524928 0.7304267 0.3792687 + [1] 2.1188978 -0.3360519 -0.2067023 + [1] 2.1659028 -0.3499170 -0.2128286 + */ val datasetTweedie = Seq( Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), @@ -671,7 +670,6 @@ class GeneralizedLinearRegressionSuite test("generalized linear regression: tweedie family against glm (default power link)") { /* R code: - library(statmod) df <- as.data.frame(matrix(c( 1.0, 1.0, 0.0, 5.0, @@ -681,8 +679,8 @@ class GeneralizedLinearRegressionSuite var.power <- c(0, 1, 2, 1.5) f1 <- V1 ~ -1 + V3 + V4 f2 <- V1 ~ V3 + V4 - for (f in c(f1, f2)){ - for (vp in var.power){ + for (f in c(f1, f2)) { + for (vp in var.power) { model <- glm(f, df, family = tweedie(var.power = vp)) print(as.vector(coef(model))) } @@ -695,7 +693,7 @@ class GeneralizedLinearRegressionSuite [1] -1.2689559 0.4230934 0.2388465 [1] 2.137852 -0.341431 -0.209090 [1] 1.5953393 -0.1884985 -0.1106335 - */ + */ val datasetTweedie = Seq( Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(0.5, 1.0, Vectors.dense(1.0, 2.0)), @@ -743,7 +741,6 @@ class GeneralizedLinearRegressionSuite idx += 1 } } - } test("glm summary: gaussian family with weight") { @@ -1260,7 +1257,6 @@ class GeneralizedLinearRegressionSuite residuals(model, type="response") 1 2 3 4 0.018067789 -0.003326304 0.038720616 -0.824070943 - */ val datasetWithWeight = Seq( Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), From 5a4007330c4c8187a00d6d31a9d8a3dcb1798fc9 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 16 Jan 2017 18:49:18 -0800 Subject: [PATCH 20/22] make model writable --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 4 ++-- .../ml/regression/GeneralizedLinearRegressionSuite.scala | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a88ced923bea1..ad2e1f41f92dc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -76,7 +76,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * @group param */ @Since("2.2.0") - final val variancePower: Param[Double] = new Param(this, "variancePower", + final val variancePower: DoubleParam = new DoubleParam(this, "variancePower", "The power in the variance function of the Tweedie distribution which characterizes " + "the relationship between the variance and mean of the distribution. " + "Used only for the Tweedie family. Supported values: 0 and [1, Inf).", @@ -114,7 +114,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * @group param */ @Since("2.2.0") - final val linkPower: Param[Double] = new Param(this, "linkPower", + final val linkPower: DoubleParam = new DoubleParam(this, "linkPower", "The index in the power link function. This is used to specify the link function in the " + "Tweedie family.") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index aa38ddb427a92..66331a5f2ac30 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1462,7 +1462,8 @@ object GeneralizedLinearRegressionSuite { "maxIter" -> 2, // intentionally small "tol" -> 0.8, "regParam" -> 0.01, - "predictionCol" -> "myPrediction") + "predictionCol" -> "myPrediction", + "variancePower" -> 1.0) def generateGeneralizedLinearRegressionInput( intercept: Double, From 83deee352c46ec113554fccee4bdc14ead56072e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 16:51:20 -0800 Subject: [PATCH 21/22] resolve conflicts --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 4cf15a535fcf9..b101e431396f7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -341,7 +341,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val Instance(label, weight, features) } - val model = if (familyAndLink.familyObj == Gaussian && familyAndLink.linkObj == Identity) { + val model = if (familyAndLink.family == Gaussian && familyAndLink.link == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) From 54da2cbbb53ddde3a91ef6d0d98128d8c7f3deb8 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 22 Jan 2017 18:29:56 -0800 Subject: [PATCH 22/22] update docs --- .../GeneralizedLinearRegression.scala | 45 ++++++------ .../GeneralizedLinearRegressionSuite.scala | 69 ++++++++++--------- 2 files changed, 60 insertions(+), 54 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index db203a11a7403..c4f41d07800c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -66,7 +66,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * Param for the power in the variance function of the Tweedie distribution which provides * the relationship between the variance and mean of the distribution. - * Used only for the Tweedie family. + * Only applicable for the Tweedie family. * (see * Tweedie Distribution (Wikipedia)) * Supported values: 0 and [1, Inf). @@ -79,7 +79,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam final val variancePower: DoubleParam = new DoubleParam(this, "variancePower", "The power in the variance function of the Tweedie distribution which characterizes " + "the relationship between the variance and mean of the distribution. " + - "Used only for the Tweedie family. Supported values: 0 and [1, Inf).", + "Only applicable for the Tweedie family. Supported values: 0 and [1, Inf).", (x: Double) => x >= 1.0 || x == 0.0) /** @group getParam */ @@ -106,8 +106,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam def getLink: String = $(link) /** - * Param for the index in the power link function. This is used to specify the link function - * in the Tweedie family. + * Param for the index in the power link function. Only applicable for the Tweedie family. * Note that link power 0, 1, -1 or 0.5 corresponds to the Log, Identity, Inverse or Sqrt * link, respectively. * @@ -115,8 +114,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam */ @Since("2.2.0") final val linkPower: DoubleParam = new DoubleParam(this, "linkPower", - "The index in the power link function. This is used to specify the link function in the " + - "Tweedie family.") + "The index in the power link function. Only applicable for the Tweedie family.") /** @group getParam */ @Since("2.2.0") @@ -148,12 +146,15 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam schema: StructType, fitting: Boolean, featuresDataType: DataType): StructType = { - if ($(family) == "tweedie") { + if ($(family).toLowerCase == "tweedie") { if (isSet(link)) { logWarning("When family is tweedie, use param linkPower to specify link function. " + "Setting param link will take no effect.") } } else { + if (isSet(variancePower)) { + logWarning("When family is not tweedie, setting param variancePower will take no effect.") + } if (isSet(linkPower)) { logWarning("When family is not tweedie, use param link to specify link function. " + "Setting param linkPower will take no effect.") @@ -381,8 +382,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def load(path: String): GeneralizedLinearRegression = super.load(path) /** - * Set of family and link pairs that GeneralizedLinearRegression supports. - * Tweedie family is specified through linkPower. + * Set of family (except for tweedie) and link pairs that GeneralizedLinearRegression supports. + * The link function of the Tweedie family is specified through param linkPower. */ private[regression] lazy val supportedFamilyAndLinkPairs = Set( Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse, @@ -453,8 +454,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine */ def apply(params: GeneralizedLinearRegressionBase): FamilyAndLink = { val familyObj = Family.fromParams(params) - val linkObj = if ((params.getFamily != "tweedie" && params.isDefined(params.link)) || - (params.getFamily == "tweedie" && params.isDefined(params.linkPower))) { + val linkObj = if ((params.getFamily.toLowerCase != "tweedie" && + params.isSet(params.link)) || (params.getFamily.toLowerCase == "tweedie" && + params.isSet(params.linkPower))) { Link.fromParams(params) } else { familyObj.defaultLink @@ -503,9 +505,10 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] object Family { /** - * Gets the [[Family]] object based on family and variancePower. - * 1) retrieve object based on family name - * 2) if family name is tweedie, retrieve object based on variancePower + * Gets the [[Family]] object based on param family and variancePower. + * If param family is set with "gaussian", "binomial", "poisson" or "gamma", + * return the corresponding object directly; otherwise, construct a Tweedie object + * according to variancePower. * * @param params the parameter map containing family name and variance power */ @@ -779,11 +782,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] object Link { /** - * Gets the [[Link]] object based on link or linkPower. - * 1) if family is "tweedie", retrieve object using linkPower - * 2) otherwise, retrieve object based on link name + * Gets the [[Link]] object based on param family, link and linkPower. + * If param family is set with "tweedie", return or construct link function object + * according to linkPower; otherwise, return link function object according to link. * - * @param params the parameter map containing link and link power + * @param params the parameter map containing family, link and linkPower */ def fromParams(params: GeneralizedLinearRegressionBase): Link = { if (params.getFamily.toLowerCase == "tweedie") { @@ -1244,7 +1247,8 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") lazy val dispersion: Double = if ( - model.getFamily == Binomial.name || model.getFamily == Poisson.name) { + model.getFamily.toLowerCase == Binomial.name || + model.getFamily.toLowerCase == Poisson.name) { 1.0 } else { val rss = pearsonResiduals.agg(sum(pow(col("pearsonResiduals"), 2.0))).first().getDouble(0) @@ -1347,7 +1351,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( @Since("2.0.0") lazy val pValues: Array[Double] = { if (isNormalSolver) { - if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) { + if (model.getFamily.toLowerCase == Binomial.name || + model.getFamily.toLowerCase == Poisson.name) { tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) } } else { tValues.map { x => diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 31bcc1b064305..1190a501fe5e5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -637,33 +637,33 @@ class GeneralizedLinearRegressionSuite import GeneralizedLinearRegression._ var idx = 0 - for (fitIntercept <- Seq(false, true); linkPower <- Seq(0.0, 1.0, -1.0)) { - for (variancePower <- Seq(1.6, 2.5)) { - val trainer = new GeneralizedLinearRegression().setFamily("tweedie") - .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") - .setVariancePower(variancePower).setLinkPower(linkPower) - val model = trainer.fit(datasetTweedie) - val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) - assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + - s"linkPower = $linkPower, fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") - - val familyLink = FamilyAndLink(trainer) - model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() - .foreach { - case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => - val eta = BLAS.dot(features, model.coefficients) + model.intercept - val prediction2 = familyLink.fitted(eta) - val linkPrediction2 = eta - assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + - s"tweedie family, linkPower = $linkPower, fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") - assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + - s"GLM with tweedie family, linkPower = $linkPower, fitIntercept = $fitIntercept " + - s"and variancePower = $variancePower.") - } - idx += 1 - } + for (fitIntercept <- Seq(false, true); + linkPower <- Seq(0.0, 1.0, -1.0); + variancePower <- Seq(1.6, 2.5)) { + val trainer = new GeneralizedLinearRegression().setFamily("tweedie") + .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") + .setVariancePower(variancePower).setLinkPower(linkPower) + val model = trainer.fit(datasetTweedie) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with tweedie family, " + + s"linkPower = $linkPower, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + + val familyLink = FamilyAndLink(trainer) + model.transform(datasetTweedie).select("features", "prediction", "linkPrediction").collect() + .foreach { + case Row(features: DenseVector, prediction1: Double, linkPrediction1: Double) => + val eta = BLAS.dot(features, model.coefficients) + model.intercept + val prediction2 = familyLink.fitted(eta) + val linkPrediction2 = eta + assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + + s"tweedie family, linkPower = $linkPower, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + + s"GLM with tweedie family, linkPower = $linkPower, fitIntercept = $fitIntercept " + + s"and variancePower = $variancePower.") + } + idx += 1 } } @@ -1228,8 +1228,9 @@ class GeneralizedLinearRegressionSuite 1.0, 3.0, 2.0, 1.0, 0.0, 4.0, 3.0, 3.0), 4, 4, byrow = TRUE)) - f <- glm(V1 ~ -1 + V3 + V4, data = df, weights = V2, + model <- glm(V1 ~ -1 + V3 + V4, data = df, weights = V2, family = tweedie(var.power = 1.6, link.power = 0)) + summary(model) Deviance Residuals: 1 2 3 4 @@ -1249,14 +1250,14 @@ class GeneralizedLinearRegressionSuite Number of Fisher Scoring iterations: 11 residuals(model, type="pearson") - 1 2 3 4 - 0.01873881 -0.01312994 0.04190280 -0.10332690 + 1 2 3 4 + 0.7383616 -0.0509458 2.2348337 -1.4552090 residuals(model, type="working") - 1 2 3 4 - 0.018067789 -0.003326304 0.038720616 -0.824070943 + 1 2 3 4 + 0.83354150 -0.04103552 1.55676369 -1.00000000 residuals(model, type="response") - 1 2 3 4 - 0.018067789 -0.003326304 0.038720616 -0.824070943 + 1 2 3 4 + 0.45460738 -0.02139574 0.60888055 -0.20392801 */ val datasetWithWeight = Seq( Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)),