-
Notifications
You must be signed in to change notification settings - Fork 29k
Implement the RLike & Like in catalyst #224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
2c8929e
91cfd33
319edb7
aeeb1d7
84f72e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,11 +19,113 @@ package org.apache.spark.sql | |
| package catalyst | ||
| package expressions | ||
|
|
||
| import java.util.regex.Pattern | ||
|
|
||
| import org.apache.spark.sql.catalyst.types.DataType | ||
| import org.apache.spark.sql.catalyst.types.StringType | ||
| import org.apache.spark.sql.catalyst.types.BooleanType | ||
| import org.apache.spark.sql.catalyst.trees.TreeNode | ||
| import org.apache.spark.sql.catalyst.errors.`package`.TreeNodeException | ||
|
|
||
|
|
||
| /** | ||
|  * Signals that a value could not be turned into a regular expression pattern. | ||
|  * | ||
|  * @param tree the tree node the failure is reported against | ||
|  * @param reason description handed on to the underlying TreeNodeException | ||
|  * @tparam TreeType the concrete tree node type of `tree` | ||
|  */ | ||
| class InvalidRegExException[TreeType <: TreeNode[_]](tree: TreeType, reason: String) | ||
|   extends TreeNodeException(tree, s"$reason", null) | ||
|
|
||
| trait StringRegexExpression { | ||
| self: BinaryExpression => | ||
|
|
||
| type EvaluatedType = Any | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, forgot that the evaluation result can be `null`. |
||
|
|
||
| def escape(v: String): String | ||
| def nullable: Boolean = true | ||
| def dataType: DataType = BooleanType | ||
|
|
||
| // Compiled once, on first use, when the right operand is a string Literal. | ||
| // Stays null for any other right operand, in which case `pattern` compiles per call. | ||
| private lazy val cache: Pattern = right match { | ||
|   case Literal(value: String, StringType) => compile(value) | ||
|   case _ => null | ||
| } | ||
|
|
||
| /** | ||
|  * Compiles `str` into a [[java.util.regex.Pattern]] after passing it through `escape`. | ||
|  * | ||
|  * @param str the pattern value; expected to be a String or null | ||
|  * @return the compiled pattern, or null when `str` is null | ||
|  * @throws InvalidRegExException if `str` is neither a String nor null | ||
|  */ | ||
| protected def compile(str: Any): Pattern = str match { | ||
|   // A typed pattern such as `x: String` never matches null, so null needs its own case. | ||
|   case null => null | ||
|   // TODO: should an invalid regex yield null instead of letting Pattern.compile throw? | ||
|   case x: String => Pattern.compile(escape(x)) | ||
|   case _ => throw new InvalidRegExException(this, s"$str can not be compiled to regex pattern") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is throwing an exception the right thing to do here, or should we just return null? I'm not sure what Hive's semantics are.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The parameter type of Hive's UDFLike.evaluate is String (Text), which means the UDF initialization will fail if the wrong type is passed in. I throw an exception here just to guarantee that we are passed the right type. You are right, it will also throw an exception if a type mismatch happens, though that may never happen if HiveTypeCoercion always does the right type casting.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I was also thinking of the case where the regex itself is invalid, but it looks like Hive is going to fail here too. [marmbrus@michaels-mbp spark (javaSchemaRDD)]$ sbt hive/console
[info] Starting scala interpreter...
[info]
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.dsl._
import org.apache.spark.sql.catalyst.errors._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution
import org.apache.spark.sql.hive._
import org.apache.spark.sql.hive.TestHive._
import org.apache.spark.sql.parquet.ParquetTestData
Welcome to Scala version 2.10.3 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_45).
Type in expressions to have them evaluated.
Type :help for more information.
scala> TestHive.runSqlHive("SELECT 'a' RLIKE '**' FROM src LIMIT 1")
======================
HIVE FAILURE OUTPUT
======================
set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=/private/var/folders/36/cjkbrr953xg2p_krwrmn8h_r0000gn/T/sparkHiveMetastore5920310799452446901;create=true
set hive.metastore.warehouse.dir=/private/var/folders/36/cjkbrr953xg2p_krwrmn8h_r0000gn/T/sparkHiveWarehouse505596429372573669
OK
Copying data from file:/Users/marmbrus/workspace/hive/data/files/kv1.txt
Copying file: file:/Users/marmbrus/workspace/hive/data/files/kv1.txt
Loading data to table default.src
Table default.src stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 5812, raw_data_size: 0]
OK
FAILED: ParseException line 1:0 cannot recognize input near 'test' '<EOF>' '<EOF>'
FAILED: SemanticException [Error 10014]: Line 1:7 Wrong arguments ''**'': org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public org.apache.hadoop.io.BooleanWritable org.apache.hadoop.hive.ql.udf.UDFRegExp.evaluate(org.apache.hadoop.io.Text,org.apache.hadoop.io.Text) on object org.apache.hadoop.hive.ql.udf.UDFRegExp@37348663 of class org.apache.hadoop.hive.ql.udf.UDFRegExp with arguments {a:org.apache.hadoop.io.Text, **:org.apache.hadoop.io.Text} of size 2
======================
END HIVE FAILURE OUTPUT
======================
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, but in some database systems it results in "null". |
||
| } | ||
|
|
||
| // Uses the pre-compiled pattern when one was cached; otherwise compiles `str` on the spot. | ||
| protected def pattern(str: String): Pattern = if (cache != null) cache else compile(str) | ||
|
|
||
| // Matches an already-evaluated (subject string, pattern string) pair. | ||
| // A null on either side yields false; a pattern that resolves to null yields null. | ||
| protected def filter: PartialFunction[(Row, (String, String)), Any] = { | ||
|   case (_, (null, _)) => false | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A few comments on this function:
|
||
|   case (_, (_, null)) => false | ||
|   case (_, (subject, patternStr)) => | ||
|     pattern(patternStr) match { | ||
|       case null => null | ||
|       case compiled => compiled.matcher(subject).matches | ||
|     } | ||
| } | ||
|
|
||
| // Evaluates both operands against a row and hands them to `filter` when neither is null. | ||
| case class Like(left: Expression, right: Expression) extends BinaryExpression { | ||
|   def dataType = BooleanType | ||
|   // NOTE(review): nullability follows the left operand only (the original remark was | ||
|   // "Right cannot be null"), yet apply still null-checks the right operand - confirm. | ||
|   def nullable = left.nullable | ||
|   // Yields null as soon as an operand evaluates to null; the right operand is only | ||
|   // evaluated once the left one turned out to be non-null. | ||
|   override def apply(input: Row): Any = left.apply(input) match { | ||
|     case null => null | ||
|     case lhs => | ||
|       right.apply(input) match { | ||
|         case null => null | ||
|         case rhs => | ||
|           filter.lift((input, (lhs.asInstanceOf[String], rhs.asInstanceOf[String]))).get | ||
|       } | ||
|   } | ||
| } | ||
|
|
||
| /** | ||
|  * SQL LIKE matching: the right operand is a LIKE pattern, which `escape` rewrites into a | ||
|  * Java regular expression before it is compiled. | ||
|  */ | ||
| case class Like(left: Expression, right: Expression) | ||
|   extends BinaryExpression with StringRegexExpression { | ||
|
|
||
|   def symbol = "LIKE" | ||
|
|
||
|   /** | ||
|    * Translates a LIKE pattern into the source of a Java regular expression. | ||
|    * | ||
|    *  - `_` becomes `.` (exactly one character) | ||
|    *  - `%` becomes `.*` (zero or more characters) | ||
|    *  - a backslash followed by `_` or `%` becomes that character, taken literally | ||
|    *  - every other character is wrapped in `Pattern.quote`, so it only matches itself | ||
|    * | ||
|    * The result is prefixed with `(?s)` (DOTALL) so that the wildcards also match line | ||
|    * terminators; a bare `.` would stop at them and multi-line values could never match. | ||
|    * | ||
|    * @param v the LIKE pattern | ||
|    * @return the equivalent regular expression source | ||
|    */ | ||
|   override def escape(v: String): String = { | ||
|     val regex = new StringBuilder("(?s)") | ||
|     var i = 0 | ||
|     while (i < v.length) { | ||
|       v.charAt(i) match { | ||
|         case '\\' if i + 1 < v.length && "_%".indexOf(v.charAt(i + 1)) >= 0 => | ||
|           // Escaped wildcard: emit it literally and step over the backslash. | ||
|           regex.append(v.charAt(i + 1)) | ||
|           i += 1 | ||
|         case '_' => regex.append('.') | ||
|         case '%' => regex.append(".*") | ||
|         case c => regex.append(Pattern.quote(Character.toString(c))) | ||
|       } | ||
|       i += 1 | ||
|     } | ||
|
|
||
|     regex.toString() | ||
|   } | ||
| } | ||
|
|
||
| /** | ||
|  * Regular expression matching: the right operand is used as a Java regular expression as-is. | ||
|  */ | ||
| case class RLike(left: Expression, right: Expression) | ||
|   extends BinaryExpression with StringRegexExpression { | ||
|
|
||
|   def symbol: String = "RLIKE" | ||
|
|
||
|   // The operand already is a regular expression, so it is handed back untouched. | ||
|   override def escape(v: String): String = v | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add scala doc to this trait and the classes below.