Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 23 additions & 17 deletions mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,29 +38,35 @@ import org.apache.spark.sql.types._
private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {

/**
* Param for how to order labels of string column. The first label after ordering is assigned
* an index of 0.
* Options are:
* - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
* - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
* - 'alphabetDesc': descending alphabetical order
* - 'alphabetAsc': ascending alphabetical order
* Default is 'frequencyDesc'.
* When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R
* when encoding strings.
* Param for how to order categories of a FEATURE string column used by `StringIndexer`.
* The last category after ordering is dropped when encoding strings.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this correct? Do you have any references? AFAIK, R formula drops the first category in ascending alphabetical order when encoding string/category features (which is consistent with your PR description). I think test("StringIndexer order types") in #17879 is correct. Could you double-check this?

* The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
* |
* | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this going to generate the right format? @HyukjinKwon, do you know?
I understand that not all Markdown styles are supported.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, to the best of my knowledge, it does not look like it. I guess @actuaryzhang meant to just write these out as they are? Let me double-check myself ...

Scaladoc

2017-05-20 1 40 35

Javadoc

2017-05-20 1 40 53

Copy link
Member

@HyukjinKwon HyukjinKwon May 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My humble suggestion is to use prose with simple `-` bullets, or to follow whatever format is already used elsewhere in this package if there are similar instances.

* | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c')
* | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b')
* | 'alphabetDesc' | last alphabetical category ('c') | first alphabetical category ('a')
* | 'alphabetAsc' | first alphabetical category ('a') | last alphabetical category ('c')
* |
* The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
* drops the same category as R when encoding strings.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The order should be alphabetAsc to match R.

* Note that this ordering option is NOT used for the label column. When the label column is
* indexed, it uses the default descending frequency ordering in `StringIndexer`.
*
* @group param
*/
@Since("2.3.0")
final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
"How to order labels of string column. " +
"The first label after ordering is assigned an index of 0. " +
// Every help-text fragment ends with a space so the concatenated sentences do not run
// together (previously rendered as "...when encoding strings.Supported options: ...").
final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType",
  "How to order categories of a FEATURE string column used by StringIndexer. " +
  "The last category after ordering is dropped when encoding strings. " +
  "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
  "RFormula drops the same category as R when encoding strings. " +
  s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
  ParamValidators.inArray(StringIndexer.supportedStringOrderType))

/** @group getParam */
@Since("2.3.0")
def getStringOrderType: String = $(stringOrderType)
def getStringIndexerOrderType: String = $(stringIndexerOrderType)

protected def hasLabelCol(schema: StructType): Boolean = {
schema.map(_.name).contains($(labelCol))
Expand Down Expand Up @@ -152,8 +158,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)

/** @group setParam */
@Since("2.3.0")
def setStringOrderType(value: String): this.type = set(stringOrderType, value)
setDefault(stringOrderType, StringIndexer.frequencyDesc)
def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value)
setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc)

/** Whether the formula specifies fitting an intercept. */
private[ml] def hasIntercept: Boolean = {
Expand Down Expand Up @@ -185,7 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
encoderStages += new StringIndexer()
.setInputCol(term)
.setOutputCol(indexCol)
.setStringOrderType($(stringOrderType))
.setStringOrderType($(stringIndexerOrderType))
prefixesToRewrite(indexCol + "_") = term + "_"
(term, indexCol)
case _ =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
assert(result.collect() === expected.collect())
}

test("encodes string terms with string order type") {
test("encodes string terms with string indexer order type") {
val formula = new RFormula().setFormula("id ~ a + b")
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
.toDF("id", "a", "b")
Expand Down Expand Up @@ -163,7 +163,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul

var idx = 0
for (orderType <- StringIndexer.supportedStringOrderType) {
val model = formula.setStringOrderType(orderType).fit(original)
val model = formula.setStringIndexerOrderType(orderType).fit(original)
val result = model.transform(original)
val resultSchema = model.transformSchema(original.schema)
assert(result.schema.toString == resultSchema.toString)
Expand All @@ -190,7 +190,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
.toDF("id", "a", "b")
val formula = new RFormula().setFormula("id ~ a + b")
.setStringOrderType(StringIndexer.alphabetDesc)
.setStringIndexerOrderType(StringIndexer.alphabetDesc)

/*
Note that the category dropped after encoding is the same between R and Spark
Expand Down