Skip to content

Commit 96c5eee

Browse files
committed
Revert "[SPARK-7212] [MLLIB] Add sequence learning flag"
This reverts commit 25f574e. After speaking to some users and developers, we realized that FP-growth doesn't meet the requirement for frequent sequence mining. PrefixSpan (SPARK-6487) would be the correct algorithm for it.

feynmanliang

Author: Xiangrui Meng <meng@databricks.com>

Closes #7240 from mengxr/SPARK-7212.revert and squashes the following commits:

2b3d66b [Xiangrui Meng] Revert "[SPARK-7212] [MLLIB] Add sequence learning flag"
1 parent 1165b17 commit 96c5eee

3 files changed

Lines changed: 12 additions & 82 deletions

File tree

mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala

Lines changed: 8 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ import org.apache.spark.storage.StorageLevel
3636
* :: Experimental ::
3737
*
3838
* Model trained by [[FPGrowth]], which holds frequent itemsets.
39-
* @param freqItemsets frequent itemsets, which is an RDD of [[FreqItemset]]
39+
* @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]]
4040
* @tparam Item item type
4141
*/
4242
@Experimental
@@ -62,14 +62,13 @@ class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) ex
6262
@Experimental
6363
class FPGrowth private (
6464
private var minSupport: Double,
65-
private var numPartitions: Int,
66-
private var ordered: Boolean) extends Logging with Serializable {
65+
private var numPartitions: Int) extends Logging with Serializable {
6766

6867
/**
6968
* Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same
70-
* as the input data, ordered: `false`}.
69+
* as the input data}.
7170
*/
72-
def this() = this(0.3, -1, false)
71+
def this() = this(0.3, -1)
7372

7473
/**
7574
* Sets the minimal support level (default: `0.3`).
@@ -87,15 +86,6 @@ class FPGrowth private (
8786
this
8887
}
8988

90-
/**
91-
* Indicates whether to mine itemsets (unordered) or sequences (ordered) (default: false, mine
92-
* itemsets).
93-
*/
94-
def setOrdered(ordered: Boolean): this.type = {
95-
this.ordered = ordered
96-
this
97-
}
98-
9989
/**
10090
* Computes an FP-Growth model that contains frequent itemsets.
10191
* @param data input data set, each element contains a transaction
@@ -165,7 +155,7 @@ class FPGrowth private (
165155
.flatMap { case (part, tree) =>
166156
tree.extract(minCount, x => partitioner.getPartition(x) == part)
167157
}.map { case (ranks, count) =>
168-
new FreqItemset(ranks.map(i => freqItems(i)).reverse.toArray, count, ordered)
158+
new FreqItemset(ranks.map(i => freqItems(i)).toArray, count)
169159
}
170160
}
171161

@@ -181,12 +171,9 @@ class FPGrowth private (
181171
itemToRank: Map[Item, Int],
182172
partitioner: Partitioner): mutable.Map[Int, Array[Int]] = {
183173
val output = mutable.Map.empty[Int, Array[Int]]
184-
// Filter the basket by frequent items pattern
174+
// Filter the basket by frequent items pattern and sort their ranks.
185175
val filtered = transaction.flatMap(itemToRank.get)
186-
if (!this.ordered) {
187-
ju.Arrays.sort(filtered)
188-
}
189-
// Generate conditional transactions
176+
ju.Arrays.sort(filtered)
190177
val n = filtered.length
191178
var i = n - 1
192179
while (i >= 0) {
@@ -211,18 +198,9 @@ object FPGrowth {
211198
* Frequent itemset.
212199
* @param items items in this itemset. Java users should call [[FreqItemset#javaItems]] instead.
213200
* @param freq frequency
214-
* @param ordered indicates if items represents an itemset (false) or sequence (true)
215201
* @tparam Item item type
216202
*/
217-
class FreqItemset[Item](val items: Array[Item], val freq: Long, val ordered: Boolean)
218-
extends Serializable {
219-
220-
/**
221-
* Auxillary constructor, assumes unordered by default.
222-
*/
223-
def this(items: Array[Item], freq: Long) {
224-
this(items, freq, false)
225-
}
203+
class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable {
226204

227205
/**
228206
* Returns items in a Java List.

mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala

Lines changed: 2 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
2222
class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
2323

2424

25-
test("FP-Growth frequent itemsets using String type") {
25+
test("FP-Growth using String type") {
2626
val transactions = Seq(
2727
"r z h k p",
2828
"z y x w v u t s",
@@ -38,14 +38,12 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
3838
val model6 = fpg
3939
.setMinSupport(0.9)
4040
.setNumPartitions(1)
41-
.setOrdered(false)
4241
.run(rdd)
4342
assert(model6.freqItemsets.count() === 0)
4443

4544
val model3 = fpg
4645
.setMinSupport(0.5)
4746
.setNumPartitions(2)
48-
.setOrdered(false)
4947
.run(rdd)
5048
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
5149
(itemset.items.toSet, itemset.freq)
@@ -63,59 +61,17 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
6361
val model2 = fpg
6462
.setMinSupport(0.3)
6563
.setNumPartitions(4)
66-
.setOrdered(false)
6764
.run(rdd)
6865
assert(model2.freqItemsets.count() === 54)
6966

7067
val model1 = fpg
7168
.setMinSupport(0.1)
7269
.setNumPartitions(8)
73-
.setOrdered(false)
7470
.run(rdd)
7571
assert(model1.freqItemsets.count() === 625)
7672
}
7773

78-
test("FP-Growth frequent sequences using String type"){
79-
val transactions = Seq(
80-
"r z h k p",
81-
"z y x w v u t s",
82-
"s x o n r",
83-
"x z y m t s q e",
84-
"z",
85-
"x z y r q t p")
86-
.map(_.split(" "))
87-
val rdd = sc.parallelize(transactions, 2).cache()
88-
89-
val fpg = new FPGrowth()
90-
91-
val model1 = fpg
92-
.setMinSupport(0.5)
93-
.setNumPartitions(2)
94-
.setOrdered(true)
95-
.run(rdd)
96-
97-
/*
98-
Use the following R code to verify association rules using arulesSequences package.
99-
100-
data = read_baskets("path", info = c("sequenceID","eventID","SIZE"))
101-
freqItemSeq = cspade(data, parameter = list(support = 0.5))
102-
resSeq = as(freqItemSeq, "data.frame")
103-
resSeq$support = resSeq$support * length(transactions)
104-
names(resSeq)[names(resSeq) == "support"] = "freq"
105-
resSeq
106-
*/
107-
val expected = Set(
108-
(Seq("r"), 3L), (Seq("s"), 3L), (Seq("t"), 3L), (Seq("x"), 4L), (Seq("y"), 3L),
109-
(Seq("z"), 5L), (Seq("z", "y"), 3L), (Seq("x", "t"), 3L), (Seq("y", "t"), 3L),
110-
(Seq("z", "t"), 3L), (Seq("z", "y", "t"), 3L)
111-
)
112-
val freqItemseqs1 = model1.freqItemsets.collect().map { itemset =>
113-
(itemset.items.toSeq, itemset.freq)
114-
}.toSet
115-
assert(freqItemseqs1 == expected)
116-
}
117-
118-
test("FP-Growth frequent itemsets using Int type") {
74+
test("FP-Growth using Int type") {
11975
val transactions = Seq(
12076
"1 2 3",
12177
"1 2 3 4",
@@ -132,14 +88,12 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
13288
val model6 = fpg
13389
.setMinSupport(0.9)
13490
.setNumPartitions(1)
135-
.setOrdered(false)
13691
.run(rdd)
13792
assert(model6.freqItemsets.count() === 0)
13893

13994
val model3 = fpg
14095
.setMinSupport(0.5)
14196
.setNumPartitions(2)
142-
.setOrdered(false)
14397
.run(rdd)
14498
assert(model3.freqItemsets.first().items.getClass === Array(1).getClass,
14599
"frequent itemsets should use primitive arrays")
@@ -155,14 +109,12 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
155109
val model2 = fpg
156110
.setMinSupport(0.3)
157111
.setNumPartitions(4)
158-
.setOrdered(false)
159112
.run(rdd)
160113
assert(model2.freqItemsets.count() === 15)
161114

162115
val model1 = fpg
163116
.setMinSupport(0.1)
164117
.setNumPartitions(8)
165-
.setOrdered(false)
166118
.run(rdd)
167119
assert(model1.freqItemsets.count() === 65)
168120
}

python/pyspark/mllib/fpm.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@ class FPGrowthModel(JavaModelWrapper):
3939
>>> data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
4040
>>> rdd = sc.parallelize(data, 2)
4141
>>> model = FPGrowth.train(rdd, 0.6, 2)
42-
>>> sorted(model.freqItemsets().collect(), key=lambda x: x.items)
43-
[FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'a', u'c'], freq=3), ...
42+
>>> sorted(model.freqItemsets().collect())
43+
[FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ...
4444
"""
4545

4646
def freqItemsets(self):

0 commit comments

Comments
 (0)