@@ -19,7 +19,7 @@ package org.apache.spark.ml.feature
1919
2020import org .apache .spark .annotation .AlphaComponent
2121import org .apache .spark .ml .UnaryTransformer
22- import org .apache .spark .ml .param .{ ParamMap , IntParam , BooleanParam , Param }
22+ import org .apache .spark .ml .param ._
2323import org .apache .spark .sql .types .{DataType , StringType , ArrayType }
2424
2525/**
@@ -43,20 +43,20 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] {
4343/**
4444 * :: AlphaComponent ::
4545 * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
46- * or using it to split the text (set matching to false). Optional parameters also allow to fold
47- * the text to lowercase prior to it being tokenized and to filer tokens using a minimal length.
46+ * or using it to split the text (set gaps to true). Optional parameters also allow filtering
47+ * tokens using a minimum length.
4848 * It returns an array of strings that can be empty.
49- * The default parameters are regex = "\\p{L}+|[^\\p{L}\\s]+", matching = true,
50- * lowercase = false, minTokenLength = 1
5149 */
5250@ AlphaComponent
5351class RegexTokenizer extends UnaryTransformer [String , Seq [String ], RegexTokenizer ] {
5452
5553 /**
56- * param for minimum token length, default is one to avoid returning empty strings
54+ * Minimum token length, >= 0.
55+ * Default: 1, to avoid returning empty strings
5756 * @group param
5857 */
59- val minTokenLength : IntParam = new IntParam (this , " minLength" , " minimum token length" )
58+ val minTokenLength : IntParam = new IntParam (this , " minLength" , " minimum token length (>= 0)" ,
59+ ParamValidators .gtEq(0 ))
6060
6161 /** @group setParam */
6262 def setMinTokenLength (value : Int ): this .type = set(minTokenLength, value)
@@ -65,7 +65,8 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize
6565 def getMinTokenLength : Int = getOrDefault(minTokenLength)
6666
6767 /**
68- * param sets regex as splitting on gaps (true) or matching tokens (false)
68+ * Indicates whether the regex splits on gaps (true) or matches tokens (false).
69+ * Default: false
6970 * @group param
7071 */
7172 val gaps : BooleanParam = new BooleanParam (this , " gaps" , " Set regex to match gaps or tokens" )
@@ -77,7 +78,8 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize
7778 def getGaps : Boolean = getOrDefault(gaps)
7879
7980 /**
80- * param sets regex pattern used by tokenizer
81+ * Regex pattern used by tokenizer.
82+ * Default: `"\\p{L}+|[^\\p{L}\\s]+"`
8183 * @group param
8284 */
8385 val pattern : Param [String ] = new Param (this , " pattern" , " regex pattern used for tokenizing" )
0 commit comments