Skip to content
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
952887e
Add Tweedie family to GLM
actuaryzhang Dec 16, 2016
4f184ec
Fix calculation in dev resid; Add test for different var power
actuaryzhang Dec 19, 2016
7fe3910
Merge test into GLR
actuaryzhang Dec 19, 2016
bfcc4fb
Use Tweedie class instead of global object Tweedie; change variancePo…
actuaryzhang Dec 20, 2016
a8feea7
Allow Family to use GLRBase object directly
actuaryzhang Dec 21, 2016
233e2d3
Add TweedieFamily and implement specific distn within Tweedie
actuaryzhang Dec 22, 2016
17c5581
Clean up doc
actuaryzhang Dec 22, 2016
0b41825
Move defaultLink and name to subclass of TweedieFamily
actuaryzhang Dec 22, 2016
6e8e607
Change style for AIC
actuaryzhang Dec 22, 2016
8d7d34e
Rename Family methods and restore methods for tweedie subclasses
actuaryzhang Dec 23, 2016
6da7e30
Update test
actuaryzhang Dec 23, 2016
9a71e89
Clean up doc
actuaryzhang Dec 27, 2016
f461c09
Put delta in Tweedie companion object
actuaryzhang Dec 27, 2016
a839c46
Clean up doc
actuaryzhang Dec 27, 2016
fab2652
Allow more link functions in tweedie
actuaryzhang Jan 5, 2017
651ea62
Implement link power
actuaryzhang Jan 10, 2017
0310e85
remove restriction on link power; revert to use family object in check
actuaryzhang Jan 10, 2017
6f4abeb
create factory method for FamilyAndLink
actuaryzhang Jan 13, 2017
7b5d4cd
fix style issue in test
actuaryzhang Jan 14, 2017
5a40073
make model writable
actuaryzhang Jan 17, 2017
a6fe665
Merge branch 'master' into tweedie
actuaryzhang Jan 18, 2017
83deee3
resolve conflicts
actuaryzhang Jan 18, 2017
995e88f
Merge branch 'master' into tweedie
actuaryzhang Jan 23, 2017
54da2cb
update docs
actuaryzhang Jan 23, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
/**
* Param for the name of family which is a description of the error distribution
* to be used in the model.
* Supported options: "gaussian", "binomial", "poisson" and "gamma".
* Supported options: "gaussian", "binomial", "poisson", "gamma" and "tweedie".
* Default is "gaussian".
*
* @group param
Expand All @@ -63,6 +63,28 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
@Since("2.0.0")
def getFamily: String = $(family)

/**
* Param for the power in the variance function of the Tweedie distribution which provides
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nits: Param -> parameter, tweedie -> Tweedie (two lines below).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed "tweedie", but the other docs have been using "Param".

* the relationship between the variance and mean of the distribution.
* Used only for the Tweedie family.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: would 'Only applicable for the "tweedie" family.' be better?

* (see <a href="https://en.wikipedia.org/wiki/Tweedie_distribution">
* Tweedie Distribution (Wikipedia)</a>)
* Supported values: 0 and [1, Inf). Note that when the value of the variance power is
* 0, 1, or 2, the Gaussian, Poisson or Gamma family is used, respectively.
*
* @group param
*/
@Since("2.2.0")
final val variancePower: Param[Double] = new Param(this, "variancePower",
"The power in the variance function of the Tweedie distribution which characterizes " +
"the relationship between the variance and mean of the distribution. " +
"Used for the Tweedie family. Supported value: 0 and [1, Inf).",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would "Used only for" be clearer?

(x: Double) => x >= 1.0 || x == 0.0)

/** @group getParam */
@Since("2.2.0")
def getVariancePower: Double = $(variancePower)

/**
* Param for the name of link function which provides the relationship
* between the linear predictor and the mean of the distribution function.
Expand Down Expand Up @@ -108,8 +130,9 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
featuresDataType: DataType): StructType = {
if (isDefined(link)) {
require(supportedFamilyAndLinkPairs.contains(
Family.fromName($(family)) -> Link.fromName($(link))), "Generalized Linear Regression " +
s"with ${$(family)} family does not support ${$(link)} link function.")
$(family) -> Link.fromName($(link))),
s"Generalized Linear Regression with ${$(family)} family " +
s"does not support ${$(link)} link function.")
}
val newSchema = super.validateAndTransformSchema(schema, fitting, featuresDataType)
if (hasLinkPredictionCol) {
Expand All @@ -128,13 +151,14 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
* Generalized linear model (Wikipedia)</a>)
* specified by giving a symbolic description of the linear
* predictor (link function) and a description of the error distribution (family).
* It supports "gaussian", "binomial", "poisson" and "gamma" as family.
* It supports "gaussian", "binomial", "poisson", "gamma" and "tweedie" as family.
* Valid link functions for each family are listed below. The first link function of each family
* is the default one.
* - "gaussian" : "identity", "log", "inverse"
* - "binomial" : "logit", "probit", "cloglog"
* - "poisson" : "log", "identity", "sqrt"
* - "gamma" : "inverse", "identity", "log"
* - "tweedie" : "identity", "log"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

- "tweedie" : "log", "identity", see L155: the first link function of each family is the default one.

*/
@Experimental
@Since("2.0.0")
Expand All @@ -157,6 +181,16 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
def setFamily(value: String): this.type = set(family, value)
setDefault(family -> Gaussian.name)

/**
* Sets the value of param [[variancePower]].
* Used only when family is "tweedie".
*
* @group setParam
*/
@Since("2.2.0")
def setVariancePower(value: Double): this.type = set(variancePower, value)
setDefault(variancePower -> 1.5)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why set the default value to 1.5? AFAIK, R sets the default variancePower to 0, which means the gaussian family, with identity as the default link function.

glm(formula = "b ~ .", family = tweedie, data = df, weights = w)

produces the same model with

glm(formula = "b ~ .", family = gaussian, data = df, weights = w)

h2o.glm has default values consistent with R; should we stay consistent with them?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Changed the default variancePower to 0.0, which will use Gaussian (with the default identity link).


/**
* Sets the value of param [[link]].
*
Expand Down Expand Up @@ -242,7 +276,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value)

override protected def train(dataset: Dataset[_]): GeneralizedLinearRegressionModel = {
val familyObj = Family.fromName($(family))
val familyObj = Family.fromModel(this)
val linkObj = if (isDefined(link)) {
Link.fromName($(link))
} else {
Expand Down Expand Up @@ -303,20 +337,24 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine

/** Set of family and link pairs that GeneralizedLinearRegression supports. */
private[regression] lazy val supportedFamilyAndLinkPairs = Set(
Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse,
Binomial -> Logit, Binomial -> Probit, Binomial -> CLogLog,
Poisson -> Log, Poisson -> Identity, Poisson -> Sqrt,
Gamma -> Inverse, Gamma -> Identity, Gamma -> Log
"gaussian" -> Identity, "gaussian" -> Log, "gaussian" -> Inverse,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

String is error-prone, I think we can construct a member object for Tweedie whose variancePower is the default value(1.5).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yanboliang The member object (in GeneralizedLinearRegressionBase) won't be accessible in Family, right? The method Family.fromName($(family)) uses global objects like Poisson, Gamma etc. To use Family.fromName, I need to create a Tweedie global object. Then we are back to the issue that @srowen pointed out of setting variancePower of the global object. Please advise. Thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yanboliang Could you help me understand the issue caused by using a string? If I use an object, then I have to create a Tweedie object that is not used anywhere else. I also have to write two methods in Family: one that returns the global Tweedie object (where the variancePower is preset) and one that returns a TweedieFamily object created using the user-specified variancePower. I hope we are fine using a string since there are only a few values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you are right. It needs an extra global object to avoid being error-prone, which may be a little expensive. I'm OK with using a string.

"binomial" -> Logit, "binomial" -> Probit, "binomial" -> CLogLog,
"poisson" -> Log, "poisson" -> Identity, "poisson" -> Sqrt,
"gamma" -> Inverse, "gamma" -> Identity, "gamma" -> Log,
"tweedie" -> Identity, "tweedie" -> Log
)

/** Set of family names that GeneralizedLinearRegression supports. */
private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1.name)
private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1)

/** Set of link names that GeneralizedLinearRegression supports. */
private[regression] lazy val supportedLinkNames = supportedFamilyAndLinkPairs.map(_._2.name)

private[regression] val epsilon: Double = 1E-16

/** Constant used in initialization and deviance to avoid numerical issues. */
private[regression] val delta: Double = 0.1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should still be in an object IMHO; it's a constant right? epsilon really should be too. It's not a big deal but not quite right.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are already in the GeneralizedLinearRegression object, aren't they? Or do you mean creating a new object say Constant that stores these two constants, and using them like Constant.delta?

Since delta is only used in the TweedieFamily class, I can also move it there. Let me know what is best. Thanks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm OK to put it here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not a companion object for TweedieFamily though? that just seems easy and more correct


/**
* Wrapper of family and link combination used in the model.
*/
Expand Down Expand Up @@ -365,7 +403,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
/**
* A description of the error distribution to be used in the model.
*
* @param name the name of the family.
*/
private[regression] abstract class Family(val name: String) extends Serializable {

Expand Down Expand Up @@ -397,41 +434,113 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine

/** Trim the fitted value so that it will be in valid range. */
def project(mu: Double): Double = mu

}

private[regression] object Family {

/**
* Gets the [[Family]] object from its name.
* Gets the [[Family]] object based on family and variancePower.
* 1) retrieve object based on family name
* 2) if family name is tweedie, retrieve object based on variancePower
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the following document be better?

Gets the Family object based on param family and variancePower.
If param family was set with "gaussian", "binomial", "poisson" or "gamma", return the corresponding object directly; otherwise, construct a Tweedie object according to variancePower.

*
* @param name family name: "gaussian", "binomial", "poisson" or "gamma".
* @param model a GeneralizedLinearRegressionBase object
*/
def fromName(name: String): Family = {
name match {
case Gaussian.name => Gaussian
case Binomial.name => Binomial
case Poisson.name => Poisson
case Gamma.name => Gamma
def fromModel(model: GeneralizedLinearRegressionBase): Family = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to fromParams, we extract family and variancePower from the Params which is the superclass of GLR estimator and model. And actually we use this function for both estimator(L279) and model(L974).

model.getFamily match {
case "gaussian" => Gaussian
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Revert to Gaussian.name here and below, which is less error-prone.

case "binomial" => Binomial
case "poisson" => Poisson
case "gamma" => Gamma
case "tweedie" =>
model.getVariancePower match {
case 0.0 => Gaussian
case 1.0 => Poisson
case 2.0 => Gamma
case default => new TweedieFamily(default)
}
}
}
}

/**
* Gaussian exponential family distribution.
* The default link for the Gaussian family is the identity link.
*/
private[regression] object Gaussian extends Family("gaussian") {
* Tweedie exponential family distribution.
* This includes the special cases of Gaussian, Poisson and Gamma.
*/
private[regression] class TweedieFamily(private val variancePower: Double)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TweedieFamily -> Tweedie, we don't add suffix for other family class/object.

extends Family("tweedie") {

/*
The canonical link is 1 - variancePower. Except for the special cases of Gaussian,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The canonical link is 1 - variancePower, could you clarify this to make it more clear?

Poisson and Gamma, the canonical link is rarely used. Set Log as the default link.
*/
override val defaultLink: Link = Log
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my above comments for default link.


val defaultLink: Link = Identity
override def initialize(y: Double, weight: Double): Double = {
if (variancePower >= 1.0 && variancePower < 2.0) {
require(y >= 0.0, s"The response variable of the specified distribution " +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The response variable of $name family

s"should be non-negative, but got $y")
} else if (variancePower >= 2.0) {
require(y > 0.0, s"The response variable of the specified distribution " +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto.

s"should be non-negative, but got $y")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

y > 0.0 means positive rather than non-negative.

}
if (y == 0) delta else y
}

override def initialize(y: Double, weight: Double): Double = y
/** Variance function of the family: `mu` raised to the power `variancePower`. */
override def variance(mu: Double): Double = math.pow(mu, variancePower)

override def variance(mu: Double): Double = 1.0
/**
 * Power-difference helper: returns `log(y / mu)` when `p` is zero and
 * `(y^p - mu^p) / p` for any other `p`.
 *
 * @param y first operand
 * @param mu second operand
 * @param p exponent applied to both operands
 */
private def yp(y: Double, mu: Double, p: Double): Double = {
  if (p != 0) {
    val powerDiff = math.pow(y, p) - math.pow(mu, p)
    powerDiff / p
  } else {
    // p == 0 is handled separately because the general formula would divide by zero.
    math.log(y / mu)
  }
}

override def deviance(y: Double, mu: Double, weight: Double): Double = {
weight * (y - mu) * (y - mu)
// Force y >= delta for Poisson or compound Poisson
val y1 = if (variancePower >= 1.0 && variancePower < 2.0) {
math.max(y, delta)
} else {
y
}
2.0 * weight *
(y * yp(y1, mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower))
}

/**
 * AIC is not available for the general Tweedie family: it depends on the density of
 * the Tweedie distribution, and is only implemented for Gaussian, Poisson and Gamma
 * at this point.
 *
 * This method always throws `UnsupportedOperationException`; none of its arguments
 * are inspected.
 */
override def aic(
    predictions: RDD[(Double, Double, Double)],
    deviance: Double,
    numInstances: Double,
    weightSum: Double): Double =
  throw new UnsupportedOperationException("No AIC available for the tweedie family")

/**
 * Trims the fitted value: values below `epsilon` become `epsilon`, infinite values
 * become `Double.MaxValue`, and everything else is returned unchanged.
 */
override def project(mu: Double): Double = mu match {
  // The lower-bound check runs first, so negative infinity is mapped to epsilon.
  case tooSmall if tooSmall < epsilon => epsilon
  case unbounded if unbounded.isInfinity => Double.MaxValue
  case _ => mu
}
}

/**
* Gaussian exponential family distribution.
* The default link for the Gaussian family is the identity link.
*/
private[regression] object Gaussian extends TweedieFamily(0.0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should keep the concrete implementation of variance, deviance and aic for Gaussian, Poisson and Gamma, the main reasons are:

  • These functions were called very frequently, the concrete implementation in subclasses should be more efficient.
  • It's helpful to locate errors or bugs.


override val name: String = "gaussian"

override val defaultLink: Link = Identity

override def aic(
predictions: RDD[(Double, Double, Double)],
deviance: Double,
Expand Down Expand Up @@ -508,25 +617,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
* Poisson exponential family distribution.
* The default link for the Poisson family is the log link.
*/
private[regression] object Poisson extends Family("poisson") {

val defaultLink: Link = Log
private[regression] object Poisson extends TweedieFamily(1.0) {

override def initialize(y: Double, weight: Double): Double = {
require(y >= 0.0, "The response variable of Poisson family " +
s"should be non-negative, but got $y")
/*
Force Poisson mean > 0 to avoid numerical instability in IRLS.
R uses y + 0.1 for initialization. See poisson()$initialize.
*/
math.max(y, 0.1)
}
override val name: String = "poisson"

override def variance(mu: Double): Double = mu

override def deviance(y: Double, mu: Double, weight: Double): Double = {
2.0 * weight * (y * math.log(y / mu) - (y - mu))
}
override val defaultLink: Link = Log

override def aic(
predictions: RDD[(Double, Double, Double)],
Expand All @@ -537,37 +632,17 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
weight * dist.Poisson(mu).logProbabilityOf(y.toInt)
}.sum()
}

override def project(mu: Double): Double = {
if (mu < epsilon) {
epsilon
} else if (mu.isInfinity) {
Double.MaxValue
} else {
mu
}
}
}

/**
* Gamma exponential family distribution.
* The default link for the Gamma family is the inverse link.
*/
private[regression] object Gamma extends Family("gamma") {
private[regression] object Gamma extends TweedieFamily(2.0) {

val defaultLink: Link = Inverse

override def initialize(y: Double, weight: Double): Double = {
require(y > 0.0, "The response variable of Gamma family " +
s"should be positive, but got $y")
y
}
override val name: String = "gamma"

override def variance(mu: Double): Double = mu * mu

override def deviance(y: Double, mu: Double, weight: Double): Double = {
-2.0 * weight * (math.log(y / mu) - (y - mu)/mu)
}
override val defaultLink: Link = Inverse

override def aic(
predictions: RDD[(Double, Double, Double)],
Expand All @@ -579,16 +654,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y)
}.sum() + 2.0
}

override def project(mu: Double): Double = {
if (mu < epsilon) {
epsilon
} else if (mu.isInfinity) {
Double.MaxValue
} else {
mu
}
}
}

/**
Expand Down Expand Up @@ -720,7 +785,8 @@ class GeneralizedLinearRegressionModel private[ml] (

import GeneralizedLinearRegression._

private lazy val familyObj = Family.fromName($(family))
private lazy val familyObj = Family.fromModel(this)

private lazy val linkObj = if (isDefined(link)) {
Link.fromName($(link))
} else {
Expand Down Expand Up @@ -905,7 +971,8 @@ class GeneralizedLinearRegressionSummary private[regression] (
*/
@Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset)

private[regression] lazy val family: Family = Family.fromName(model.getFamily)
private[regression] lazy val family: Family = Family.fromModel(model)

private[regression] lazy val link: Link = if (model.isDefined(model.link)) {
Link.fromName(model.getLink)
} else {
Expand Down
Loading