-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-14659][ML] RFormula consistent with R when handling strings #17967
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
4d27123
6841c33
77fe864
a1be94c
698588e
147311b
5f31d31
341949c
24818a7
1a1e06c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,29 +38,35 @@ import org.apache.spark.sql.types._ | |
| private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { | ||
|
|
||
| /** | ||
| * Param for how to order labels of string column. The first label after ordering is assigned | ||
| * an index of 0. | ||
| * Options are: | ||
| * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) | ||
| * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) | ||
| * - 'alphabetDesc': descending alphabetical order | ||
| * - 'alphabetAsc': ascending alphabetical order | ||
| * Default is 'frequencyDesc'. | ||
| * When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R | ||
| * when encoding strings. | ||
| * Param for how to order categories of a FEATURE string column used by `StringIndexer`. | ||
| * The last category after ordering is dropped when encoding strings. | ||
| * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b' | ||
| * | | ||
| * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula | ||
|
||
| * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') | ||
| * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') | ||
| * | 'alphabetDesc' | last alphabetical category ('c') | first alphabetical category ('a') | ||
| * | 'alphabetAsc' | first alphabetical category ('a') | last alphabetical category ('c') | ||
| * | | ||
| * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula` | ||
| * drops the same category as R when encoding strings. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The order should be |
||
| * Note that this ordering option is NOT used for the label column. When the label column is | ||
| * indexed, it uses the default descending frequency ordering in `StringIndexer`. | ||
| * | ||
| * @group param | ||
| */ | ||
| @Since("2.3.0") | ||
| final val stringOrderType: Param[String] = new Param(this, "stringOrderType", | ||
| "How to order labels of string column. " + | ||
| "The first label after ordering is assigned an index of 0. " + | ||
| final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType", | ||
| "How to order categories of a FEATURE string column used by StringIndexer. " + | ||
| "The last category after ordering is dropped when encoding strings. " + | ||
| "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " + | ||
| "RFormula drops the same category as R when encoding strings. " + | ||
| s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", | ||
| ParamValidators.inArray(StringIndexer.supportedStringOrderType)) | ||
|
|
||
| /** @group getParam */ | ||
| @Since("2.3.0") | ||
| def getStringOrderType: String = $(stringOrderType) | ||
| def getStringIndexerOrderType: String = $(stringIndexerOrderType) | ||
|
|
||
| protected def hasLabelCol(schema: StructType): Boolean = { | ||
| schema.map(_.name).contains($(labelCol)) | ||
|
|
@@ -152,8 +158,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) | |
|
|
||
| /** @group setParam */ | ||
| @Since("2.3.0") | ||
| def setStringOrderType(value: String): this.type = set(stringOrderType, value) | ||
| setDefault(stringOrderType, StringIndexer.frequencyDesc) | ||
| def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value) | ||
| setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc) | ||
|
|
||
| /** Whether the formula specifies fitting an intercept. */ | ||
| private[ml] def hasIntercept: Boolean = { | ||
|
|
@@ -185,7 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) | |
| encoderStages += new StringIndexer() | ||
| .setInputCol(term) | ||
| .setOutputCol(indexCol) | ||
| .setStringOrderType($(stringOrderType)) | ||
| .setStringOrderType($(stringIndexerOrderType)) | ||
| prefixesToRewrite(indexCol + "_") = term + "_" | ||
| (term, indexCol) | ||
| case _ => | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this correct? Do you have any references? AFAIK, R formula drops the first category in ascending alphabetical order when encoding a string/category feature (which is consistent with your PR description). I think
test("StringIndexer order types") in #17879 is correct. Could you double check this?