apache · Yunni · Nov 4, 2016 · Nov 4, 2016 · Jan 25, 2017 · Jan 26, 2017
diff --git a/docs/ml-features.md b/docs/ml-features.md
@@ -1558,6 +1558,15 @@ for more details on the API.
 
 {% include_example java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java %}
 </div>
+
+<div data-lang="python" markdown="1">
+
+Refer to the [BucketedRandomProjectionLSH Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.BucketedRandomProjectionLSH)
+for more details on the API.
+
+{% include_example python/ml/bucketed_random_projection_lsh.py %}
+</div>
+
 </div>
 
 ### MinHash for Jaccard Distance
@@ -1590,4 +1599,12 @@ for more details on the API.
 
 {% include_example java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java %}
 </div>
+
+<div data-lang="python" markdown="1">
+
+Refer to the [MinHashLSH Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.MinHashLSH)
+for more details on the API.
+
+{% include_example python/ml/min_hash_lsh.py %}
+</div>
 </div>
diff --git a/examples/src/main/python/ml/bucketed_random_projection_lsh.py b/examples/src/main/python/ml/bucketed_random_projection_lsh.py
@@ -0,0 +1,76 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.feature import BucketedRandomProjectionLSH
+from pyspark.ml.linalg import Vectors
+# $example off$
+from pyspark.sql import SparkSession
+
+"""
+An example demonstrating BucketedRandomProjectionLSH.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh.py
+"""
+
+if __name__ == "__main__":
+    spark = SparkSession \
+        .builder \
+        .appName("BucketedRandomProjectionLSHExample") \
+        .getOrCreate()
+
+    # $example on$
+    dataA = [(0, Vectors.dense([1.0, 1.0]),),
+             (1, Vectors.dense([1.0, -1.0]),),
+             (2, Vectors.dense([-1.0, -1.0]),),
+             (3, Vectors.dense([-1.0, 1.0]),)]
+    dfA = spark.createDataFrame(dataA, ["id", "keys"])
+
+    dataB = [(4, Vectors.dense([1.0, 0.0]),),
+             (5, Vectors.dense([-1.0, 0.0]),),
+             (6, Vectors.dense([0.0, 1.0]),),
+             (7, Vectors.dense([0.0, -1.0]),)]
+    dfB = spark.createDataFrame(dataB, ["id", "keys"])
+
+    key = Vectors.dense([1.0, 0.0])
+
+    brp = BucketedRandomProjectionLSH(inputCol="keys", outputCol="values", bucketLength=2.0,
+                                      numHashTables=3)
+    model = brp.fit(dfA)
+
+    # Feature Transformation
+    model.transform(dfA).show()
+    # Cache the transformed columns
+    transformedA = model.transform(dfA).cache()
+    transformedB = model.transform(dfB).cache()
+
+    # Approximate similarity join
+    model.approxSimilarityJoin(dfA, dfB, 1.5).show()
+    model.approxSimilarityJoin(transformedA, transformedB, 1.5).show()
+    # Self Join
+    model.approxSimilarityJoin(dfA, dfA, 2.5).filter("datasetA.id < datasetB.id").show()
+
+    # Approximate nearest neighbor search
+    model.approxNearestNeighbors(dfA, key, 2).show()
+    model.approxNearestNeighbors(transformedA, key, 2).show()
+
+    # $example off$
+
+    spark.stop()
diff --git a/examples/src/main/python/ml/min_hash_lsh.py b/examples/src/main/python/ml/min_hash_lsh.py
@@ -0,0 +1,75 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.feature import MinHashLSH
+from pyspark.ml.linalg import Vectors
+# $example off$
+from pyspark.sql import SparkSession
+
+"""
+An example demonstrating MinHashLSH.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/min_hash_lsh.py
+"""
+
+if __name__ == "__main__":
+    spark = SparkSession \
+        .builder \
+        .appName("MinHashLSHExample") \
+        .getOrCreate()
+
+    # $example on$
+    dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
+             (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
+             (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
+    dfA = spark.createDataFrame(dataA, ["id", "keys"])
+
+    dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
+             (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
+             (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
+    dfB = spark.createDataFrame(dataB, ["id", "keys"])
+
+    key = Vectors.sparse(6, [1, 3], [1.0, 1.0])
+
+    mh = MinHashLSH(inputCol="keys", outputCol="values", numHashTables=3)
+    model = mh.fit(dfA)
+
+    # Feature Transformation
+    model.transform(dfA).show()
+
+    # Cache the transformed columns
+    transformedA = model.transform(dfA).cache()
+    transformedB = model.transform(dfB).cache()
+
+    # Approximate similarity join
+    model.approxSimilarityJoin(dfA, dfB, 0.6).show()
+    model.approxSimilarityJoin(transformedA, transformedB, 0.6).show()
+
+    # Self Join
+    model.approxSimilarityJoin(dfA, dfA, 0.6).filter("datasetA.id < datasetB.id").show()
+
+    # Approximate nearest neighbor search
+    model.approxNearestNeighbors(dfA, key, 2).show()
+    model.approxNearestNeighbors(transformedA, key, 2).show()
+
+    # $example off$
+
+    spark.stop()