-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24146][PySpark][ML] spark.ml parity for sequential pattern mining - PrefixSpan: Python API #21265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-24146][PySpark][ML] spark.ml parity for sequential pattern mining - PrefixSpan: Python API #21265
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,8 +16,9 @@ | |
| # | ||
|
|
||
| from pyspark import keyword_only, since | ||
| from pyspark.sql import DataFrame | ||
| from pyspark.ml.util import * | ||
| from pyspark.ml.wrapper import JavaEstimator, JavaModel | ||
| from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, _jvm | ||
| from pyspark.ml.param.shared import * | ||
|
|
||
| __all__ = ["FPGrowth", "FPGrowthModel"] | ||
|
|
@@ -243,3 +244,104 @@ def setParams(self, minSupport=0.3, minConfidence=0.8, itemsCol="items", | |
|
|
||
| def _create_model(self, java_model): | ||
| return FPGrowthModel(java_model) | ||
|
|
||
|
|
||
| class PrefixSpan(JavaParams): | ||
| """ | ||
| .. note:: Experimental | ||
|
|
||
| A parallel PrefixSpan algorithm to mine frequent sequential patterns. | ||
| The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns | ||
| Efficiently by Prefix-Projected Pattern Growth | ||
| (see <a href="http://doi.org/10.1109/ICDE.2001.914830">here</a>). | ||
| This class is not yet an Estimator/Transformer, use :py:func:`findFrequentSequentialPatterns` | ||
| method to run the PrefixSpan algorithm. | ||
|
|
||
| @see <a href="https://en.wikipedia.org/wiki/Sequential_Pattern_Mining">Sequential Pattern Mining | ||
| (Wikipedia)</a> | ||
| .. versionadded:: 2.4.0 | ||
|
|
||
| """ | ||
|
|
||
| minSupport = Param(Params._dummy(), "minSupport", "The minimal support level of the " + | ||
| "sequential pattern. Sequential pattern that appears more than " + | ||
| "(minSupport * size-of-the-dataset) times will be output. Must be >= 0.", | ||
| typeConverter=TypeConverters.toFloat) | ||
|
|
||
| maxPatternLength = Param(Params._dummy(), "maxPatternLength", | ||
| "The maximal length of the sequential pattern. Must be > 0.", | ||
| typeConverter=TypeConverters.toInt) | ||
|
|
||
| maxLocalProjDBSize = Param(Params._dummy(), "maxLocalProjDBSize", | ||
| "The maximum number of items (including delimiters used in the " + | ||
| "internal storage format) allowed in a projected database before " + | ||
| "local processing. If a projected database exceeds this size, " + | ||
| "another iteration of distributed prefix growth is run. " + | ||
| "Must be > 0.", | ||
| typeConverter=TypeConverters.toInt) | ||
|
|
||
| sequenceCol = Param(Params._dummy(), "sequenceCol", "The name of the sequence column in " + | ||
| "dataset, rows with nulls in this column are ignored.", | ||
| typeConverter=TypeConverters.toString) | ||
|
|
||
| @keyword_only | ||
| def __init__(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, | ||
| sequenceCol="sequence"): | ||
| """ | ||
| __init__(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \ | ||
| sequenceCol="sequence") | ||
| """ | ||
| super(PrefixSpan, self).__init__() | ||
| self._java_obj = self._new_java_obj("org.apache.spark.ml.fpm.PrefixSpan", self.uid) | ||
| self._setDefault(minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, | ||
| sequenceCol="sequence") | ||
| kwargs = self._input_kwargs | ||
| self.setParams(**kwargs) | ||
|
|
||
| @keyword_only | ||
| @since("2.4.0") | ||
| def setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, | ||
| sequenceCol="sequence"): | ||
| """ | ||
| setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \ | ||
| sequenceCol="sequence") | ||
| """ | ||
| kwargs = self._input_kwargs | ||
| return self._set(**kwargs) | ||
|
|
||
| @since("2.4.0") | ||
| def findFrequentSequentialPatterns(self, dataset): | ||
| """ | ||
| .. note:: Experimental | ||
| Finds the complete set of frequent sequential patterns in the input sequences of itemsets. | ||
|
|
||
| :param dataset: A dataframe containing a sequence column which is | ||
| `ArrayType(ArrayType(T))` type, T is the item type for the input dataset. | ||
| :return: A `DataFrame` that contains columns of sequence and corresponding frequency. | ||
| The schema of it will be: | ||
| - `sequence: ArrayType(ArrayType(T))` (T is the item type) | ||
| - `freq: Long` | ||
|
|
||
| >>> from pyspark.ml.fpm import PrefixSpan | ||
| >>> from pyspark.sql import Row | ||
| >>> df = sc.parallelize([Row(sequence=[[1, 2], [3]]), | ||
|
||
| ... Row(sequence=[[1], [3, 2], [1, 2]]), | ||
| ... Row(sequence=[[1, 2], [5]]), | ||
| ... Row(sequence=[[6]])]).toDF() | ||
| >>> prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5) | ||
| >>> prefixSpan.findFrequentSequentialPatterns(df).sort("sequence").show(truncate=False) | ||
| +----------+----+ | ||
| |sequence |freq| | ||
| +----------+----+ | ||
| |[[1]] |3 | | ||
| |[[1], [3]]|2 | | ||
| |[[1, 2]] |3 | | ||
| |[[2]] |3 | | ||
| |[[3]] |2 | | ||
| +----------+----+ | ||
|
|
||
| .. versionadded:: 2.4.0 | ||
| """ | ||
| self._transfer_params_to_java() | ||
| jdf = self._java_obj.findFrequentSequentialPatterns(dataset._jdf) | ||
| return DataFrame(jdf, dataset.sql_ctx) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There isn't
TypeConverters.toLong, do I need to add it ?My idea is that
TypeConverters.toIntalso fit to Long type in python side so I do not add it for now.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just test that python 'int' type range is the same with java 'long' type.