Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}
Expand All @@ -37,6 +37,37 @@ import org.apache.spark.sql.types._
*/
private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {

/**
* Param for how to order categories of a FEATURE string column used by `StringIndexer`.
* The last category after ordering is dropped when encoding strings.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this correct? Do you have any references? AFAIK, R formula drops the first category in ascending alphabetical order when encoding a string/category feature (which is consistent with your PR description). I think test("StringIndexer order types") in #17879 is correct. Could you double-check this?

* The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
* |
* | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this going to generate the right format, @HyukjinKwon do you know?
I understand not all markdown style is supported

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, up to my knowledge, it looks not. I guess @actuaryzhang meant to just write these out as they are? Let me double check by myself ....

Scaladoc

2017-05-20 1 40 35

Javadoc

2017-05-20 1 40 53

Copy link
Member

@HyukjinKwon HyukjinKwon May 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My humble suggestion is prose with simple - or resembling any other formats in this package if there are similar instances.

* | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c')
* | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b')
* | 'alphabetDesc' | last alphabetical category ('c') | first alphabetical category ('a')
* | 'alphabetAsc' | first alphabetical category ('a') | last alphabetical category ('c')
* |
* The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
* drops the same category as R when encoding strings.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The order should be alphabetAsc to match R.

* Note that this ordering option is NOT used for the label column. When the label column is
* indexed, it uses the default descending frequency ordering in `StringIndexer`.
*
* @group param
*/
@Since("2.3.0")
final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType",
  // Each fragment except the last ends with a space so that the concatenated help text
  // reads as separate sentences (previously "strings.Supported options" ran together).
  "How to order categories of a FEATURE string column used by StringIndexer. " +
  "The last category after ordering is dropped when encoding strings. " +
  "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
  "RFormula drops the same category as R when encoding strings. " +
  s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
  // Only the order types listed by StringIndexer.supportedStringOrderType are valid values.
  ParamValidators.inArray(StringIndexer.supportedStringOrderType))

/**
 * Returns the value of the `stringIndexerOrderType` param.
 *
 * @group getParam
 */
@Since("2.3.0")
def getStringIndexerOrderType: String = getOrDefault(stringIndexerOrderType)

/** Returns true when `schema` has a field whose name equals the value of the `labelCol` param. */
protected def hasLabelCol(schema: StructType): Boolean = {
  val labelName = $(labelCol)
  schema.exists(_.name == labelName)
}
Expand Down Expand Up @@ -125,6 +156,11 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
@Since("2.1.0")
def setForceIndexLabel(value: Boolean): this.type = set(forceIndexLabel, value)

/**
 * Sets the `stringIndexerOrderType` param.
 *
 * @group setParam
 */
@Since("2.3.0")
def setStringIndexerOrderType(value: String): this.type = {
  set(stringIndexerOrderType, value)
}
// Default ordering used when the caller never sets the param.
setDefault(stringIndexerOrderType -> StringIndexer.frequencyDesc)

/** Whether the formula specifies fitting an intercept. */
private[ml] def hasIntercept: Boolean = {
require(isDefined(formula), "Formula must be defined first.")
Expand Down Expand Up @@ -155,6 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
encoderStages += new StringIndexer()
.setInputCol(term)
.setOutputCol(indexCol)
.setStringOrderType($(stringIndexerOrderType))
prefixesToRewrite(indexCol + "_") = term + "_"
(term, indexCol)
case _ =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
* @group param
*/
@Since("1.6.0")
val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to handle " +
"invalid data (unseen labels or NULL values). " +
"Options are 'skip' (filter out rows with invalid data), error (throw an error), " +
"or 'keep' (put invalid data in a special additional bucket, at index numLabels).",
Expand All @@ -73,7 +73,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
*/
@Since("2.3.0")
final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
"how to order labels of string column. " +
"How to order labels of string column. " +
"The first label after ordering is assigned an index of 0. " +
s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
ParamValidators.inArray(StringIndexer.supportedStringOrderType))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,90 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
assert(result.collect() === expected.collect())
}

// Fits the same formula once per supported order type and compares the transformed output
// with the expected frame stored at the same position.
test("encodes string terms with string indexer order type") {
  val formula = new RFormula().setFormula("id ~ a + b")
  val dataset = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
    .toDF("id", "a", "b")

  // One expected frame per entry of StringIndexer.supportedStringOrderType, in the same order.
  val expectedByPosition = Seq(
    // expected for supportedStringOrderType(0)
    Seq(
      (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
      (2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, "aaz", 5, Vectors.dense(0.0, 1.0, 5.0), 4.0)
    ).toDF("id", "a", "b", "features", "label"),
    // expected for supportedStringOrderType(1)
    Seq(
      (1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, "bar", 4, Vectors.dense(0.0, 0.0, 4.0), 2.0),
      (3, "bar", 5, Vectors.dense(0.0, 0.0, 5.0), 3.0),
      (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
    ).toDF("id", "a", "b", "features", "label"),
    // expected for supportedStringOrderType(2)
    Seq(
      (1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0),
      (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
      (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
      (4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    ).toDF("id", "a", "b", "features", "label"),
    // expected for supportedStringOrderType(3)
    Seq(
      (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
      (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
      (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
      (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
    ).toDF("id", "a", "b", "features", "label")
  )

  StringIndexer.supportedStringOrderType.zipWithIndex.foreach { case (orderType, position) =>
    val fitted = formula.setStringIndexerOrderType(orderType).fit(dataset)
    val transformed = fitted.transform(dataset)
    val declaredSchema = fitted.transformSchema(dataset.schema)
    // The schema reported up front must agree with the schema actually produced.
    assert(transformed.schema.toString == declaredSchema.toString)
    assert(transformed.collect() === expectedByPosition(position).collect())
  }
}

// Checks that RFormula configured with StringIndexer.alphabetDesc treats "aaz" as the
// reference level, i.e. the same category that R's model.matrix leaves out for this data.
test("test consistency with R when encoding string terms") {
/*
R code:

df <- data.frame(id = c(1, 2, 3, 4),
a = c("foo", "bar", "bar", "aaz"),
b = c(4, 4, 5, 5))
model.matrix(id ~ a + b, df)[, -1]

abar afoo b
0 1 4
1 0 4
1 0 5
0 0 5
*/
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
.toDF("id", "a", "b")
val formula = new RFormula().setFormula("id ~ a + b")
.setStringIndexerOrderType(StringIndexer.alphabetDesc)

/*
Note that the category dropped after encoding is the same between R and Spark
(i.e., "aaz" is treated as the reference level).
However, the column order is still different:
R renders the columns in ascending alphabetical order ("bar", "foo"), while
RFormula renders the columns in descending alphabetical order ("foo", "bar").
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

R and RFormula should behave consistently if you fix the issue I mentioned above.

*/
// Expected feature layout: [a == "foo", a == "bar", b]; the "aaz" row has both indicators at 0.
val expected = Seq(
(1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0),
(2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
(3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
(4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)
).toDF("id", "a", "b", "features", "label")

val model = formula.fit(original)
val result = model.transform(original)
val resultSchema = model.transformSchema(original.schema)
// The schema reported by transformSchema must match the schema of the transformed data.
assert(result.schema.toString == resultSchema.toString)
assert(result.collect() === expected.collect())
}

test("index string label") {
val formula = new RFormula().setFormula("id ~ a + b")
val original =
Expand Down