-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-3614][MLLIB] Add minimumOccurence filtering to IDF #2494
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
c0cc643
4b974f5
a200bab
6897252
1801fd2
1fc09d8
9fb4093
47850ab
40fd70c
79978fc
9013447
30d20b3
bfa82ec
e6523a8
0aa3c63
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,9 +30,20 @@ import org.apache.spark.rdd.RDD | |
| * Inverse document frequency (IDF). | ||
| * The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`, where `m` is the total | ||
| * number of documents and `d(t)` is the number of documents that contain term `t`. | ||
| * | ||
| * This implementation supports filtering out terms which do not appear in a minimum number | ||
| * of documents (controlled by the variable minimumOccurence). For terms that are not in | ||
| * at least `minimumOccurence` documents, the IDF is found as 0, resulting in TF-IDFs of 0. | ||
| * | ||
| * @param minimumOccurence minimum of documents in which a term | ||
| * should appear for filtering | ||
| * | ||
| * | ||
| */ | ||
| @Experimental | ||
| class IDF { | ||
| class IDF(minimumOccurence: Long) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can add a val before minimumOccurence. Alternatively, if you want to set minimumOccurence after new IDF(), you can define a private field and use a setter to set the value. |
||
|
|
||
| def this() = this(0L) | ||
|
|
||
| // TODO: Allow different IDF formulations. | ||
|
|
||
|
|
@@ -41,7 +52,7 @@ class IDF { | |
| * @param dataset an RDD of term frequency vectors | ||
| */ | ||
| def fit(dataset: RDD[Vector]): IDFModel = { | ||
| val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator)( | ||
| val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(minimumOccurence=minimumOccurence))( | ||
| seqOp = (df, v) => df.add(v), | ||
| combOp = (df1, df2) => df1.merge(df2) | ||
| ).idf() | ||
|
|
@@ -60,7 +71,7 @@ class IDF { | |
| private object IDF { | ||
|
|
||
| /** Document frequency aggregator. */ | ||
| class DocumentFrequencyAggregator extends Serializable { | ||
| class DocumentFrequencyAggregator(minimumOccurence: Long) extends Serializable { | ||
|
|
||
| /** number of documents */ | ||
| private var m = 0L | ||
|
|
@@ -123,7 +134,17 @@ private object IDF { | |
| val inv = new Array[Double](n) | ||
| var j = 0 | ||
| while (j < n) { | ||
| inv(j) = math.log((m + 1.0)/ (df(j) + 1.0)) | ||
| /* | ||
| * If the term is not present in the minimum | ||
| * number of documents, set IDF to 0. This | ||
| * will cause multiplication in IDFModel to | ||
| * set TF-IDF to 0. | ||
| */ | ||
| if(df(j) >= minimumOccurence) { | ||
| inv(j) = math.log((m + 1.0)/ (df(j) + 1.0)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add space before |
||
| } else { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. else branch is not needed here
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the branch is needed. I don't modify the df vector -- I perform the filtering in idf().
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when you allocate your inv array, by default, all values are 0. You do not need to set it to 0 again.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see. I didn't consider that. I'll remove the else and add a comment to clarify the default behavior. Thanks. |
||
| inv(j) = 0.0 | ||
| } | ||
| j += 1 | ||
| } | ||
| Vectors.dense(inv) | ||
|
|
@@ -140,6 +161,11 @@ class IDFModel private[mllib] (val idf: Vector) extends Serializable { | |
|
|
||
| /** | ||
| * Transforms term frequency (TF) vectors to TF-IDF vectors. | ||
| * | ||
| * If minimumOccurence was set for the IDF calculation, | ||
| * the terms which occur in fewer than minimumOccurence | ||
| * documents will have an entry of 0. | ||
| * | ||
| * @param dataset an RDD of term frequency vectors | ||
| * @return an RDD of TF-IDF vectors | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,4 +54,38 @@ class IDFSuite extends FunSuite with LocalSparkContext { | |
| assert(tfidf2.indices === Array(1)) | ||
| assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) | ||
| } | ||
|
|
||
| test("idf minimum occurence filtering") { | ||
| val n = 4 | ||
| val localTermFrequencies = Seq( | ||
| Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), | ||
| Vectors.dense(0.0, 1.0, 2.0, 3.0), | ||
| Vectors.sparse(n, Array(1), Array(1.0)) | ||
| ) | ||
| val m = localTermFrequencies.size | ||
| val termFrequencies = sc.parallelize(localTermFrequencies, 2) | ||
| val idf = new IDF(minimumOccurence=1L) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add spaces around |
||
| val model = idf.fit(termFrequencies) | ||
| val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => | ||
| if(x > 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. space after |
||
| math.log((m.toDouble + 1.0) / (x + 1.0)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. toDouble is not needed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I copied that line from the original test suite function -- I kept it to be consistent. I don't think this change is relevant to this PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you remove |
||
| } else { | ||
| 0 | ||
| } | ||
| }) | ||
| assert(model.idf ~== expected absTol 1e-12) | ||
| val tfidf = model.transform(termFrequencies).cache().zipWithIndex().map(_.swap).collectAsMap() | ||
| assert(tfidf.size === 3) | ||
| val tfidf0 = tfidf(0L).asInstanceOf[SparseVector] | ||
| assert(tfidf0.indices === Array(1, 3)) | ||
| assert(Vectors.dense(tfidf0.values) ~== | ||
| Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) | ||
| val tfidf1 = tfidf(1L).asInstanceOf[DenseVector] | ||
| assert(Vectors.dense(tfidf1.values) ~== | ||
| Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) | ||
| val tfidf2 = tfidf(2L).asInstanceOf[SparseVector] | ||
| assert(tfidf2.indices === Array(1)) | ||
| assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove extra lines