@@ -114,6 +114,8 @@ class SqlParser extends StandardTokenParsers
protected val NULL = Keyword("NULL")
protected val ON = Keyword("ON")
protected val OR = Keyword("OR")
protected val LIKE = Keyword("LIKE")
protected val RLIKE = Keyword("RLIKE")
protected val ORDER = Keyword("ORDER")
protected val OUTER = Keyword("OUTER")
protected val RIGHT = Keyword("RIGHT")
@@ -267,6 +269,8 @@ class SqlParser extends StandardTokenParsers
termExpression ~ ">=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => GreaterThanOrEqual(e1, e2) } |
termExpression ~ "!=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
termExpression ~ "<>" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
termExpression ~ RLIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
termExpression ~ LIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => Like(e1, e2) } |
termExpression ~ IN ~ "(" ~ rep1sep(termExpression, ",") <~ ")" ^^ {
case e1 ~ _ ~ _ ~ e2 => In(e1, e2)
} |
@@ -19,11 +19,113 @@ package org.apache.spark.sql
package catalyst
package expressions

import java.util.regex.Pattern

import org.apache.spark.sql.catalyst.types.DataType
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.types.BooleanType
import org.apache.spark.sql.catalyst.trees.TreeNode
import org.apache.spark.sql.catalyst.errors.`package`.TreeNodeException


/**
* Thrown when an invalid RegEx string is found.
*/
class InvalidRegExException[TreeType <: TreeNode[_]](tree: TreeType, reason: String) extends
errors.TreeNodeException(tree, s"$reason", null)

trait StringRegexExpression {
Contributor: Please add scala doc to this trait and the classes below.

self: BinaryExpression =>

type EvaluatedType = Any
Contributor: EvaluatedType can be set to Boolean.

Contributor: Ah, forgot that evaluation result can be null...
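To make that point concrete (an illustrative note, not part of the patch):

// With the apply method shown further down, a null child short-circuits the
// whole expression to null rather than a Boolean, so EvaluatedType stays Any:
//   Like(Literal(null, StringType), Literal("a%", StringType)).apply(row)   // => null
//   Like(Literal("abc", StringType), Literal("a%", StringType)).apply(row)  // => true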


def escape(v: String): String
def nullable: Boolean = true
def dataType: DataType = BooleanType

// Try to cache the compiled pattern when the right expression is a string Literal
private lazy val cache: Pattern = right match {
case x @ Literal(value: String, StringType) => compile(value)
case _ => null
}

protected def compile(str: Any): Pattern = str match {
// TODO: or should it be null if the regex could not be compiled?
case x: String if (x != null) => Pattern.compile(escape(x))
case x: String => null
case _ => throw new InvalidRegExException(this, s"$str can not be compiled to regex pattern")
Contributor: Is throwing an exception the right thing to do, or should we just return null? I'm not sure what Hive's semantics are.

Contributor (author): The parameter type of Hive's UDFLike.evaluate is String (Text), which means the UDF initialization will fail if the wrong type is passed in. I put the exception here just to guarantee we pass the right type. You are right, it will also throw if a mismatched case happens, though that may never happen if HiveTypeCoercion always does the right type casting.

Contributor: Ah, I was also thinking of the case where the regex itself is invalid, but it looks like Hive is going to fail here too.

[marmbrus@michaels-mbp spark (javaSchemaRDD)]$ sbt hive/console
[info] Starting scala interpreter...
[info] 
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.dsl._
import org.apache.spark.sql.catalyst.errors._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution
import org.apache.spark.sql.hive._
import org.apache.spark.sql.hive.TestHive._
import org.apache.spark.sql.parquet.ParquetTestData
Welcome to Scala version 2.10.3 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_45).
Type in expressions to have them evaluated.
Type :help for more information.

scala> TestHive.runSqlHive("SELECT 'a' RLIKE '**' FROM src LIMIT 1")

======================
HIVE FAILURE OUTPUT
======================
set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=/private/var/folders/36/cjkbrr953xg2p_krwrmn8h_r0000gn/T/sparkHiveMetastore5920310799452446901;create=true
set hive.metastore.warehouse.dir=/private/var/folders/36/cjkbrr953xg2p_krwrmn8h_r0000gn/T/sparkHiveWarehouse505596429372573669
OK
Copying data from file:/Users/marmbrus/workspace/hive/data/files/kv1.txt
Copying file: file:/Users/marmbrus/workspace/hive/data/files/kv1.txt
Loading data to table default.src
Table default.src stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 5812, raw_data_size: 0]
OK
FAILED: ParseException line 1:0 cannot recognize input near 'test' '<EOF>' '<EOF>'

FAILED: SemanticException [Error 10014]: Line 1:7 Wrong arguments ''**'': org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public org.apache.hadoop.io.BooleanWritable org.apache.hadoop.hive.ql.udf.UDFRegExp.evaluate(org.apache.hadoop.io.Text,org.apache.hadoop.io.Text)  on object org.apache.hadoop.hive.ql.udf.UDFRegExp@37348663 of class org.apache.hadoop.hive.ql.udf.UDFRegExp with arguments {a:org.apache.hadoop.io.Text, **:org.apache.hadoop.io.Text} of size 2

======================
END HIVE FAILURE OUTPUT
======================
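
Stripped of the Hive machinery, that failure is just a regex syntax error; a minimal reproduction with java.util.regex alone (illustrative snippet, not from this PR):

import java.util.regex.Pattern

// '*' with nothing to repeat is rejected by the regex compiler, which is what
// the Hive UDF above trips over.
Pattern.compile("**")  // throws PatternSyntaxException: Dangling meta character '*'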

Contributor (author): Yes, but in some database systems it results in null. Anyway, thank you for checking; I think we should follow Hive, and I will remove the TODO.

}

protected def pattern(str: String) = if(cache == null) compile(str) else cache

protected def filter: PartialFunction[(Row, (String, String)), Any] = {
case (row, (null, r)) => { false }
Contributor: A few comments on this function:

  • I think you are already checking for null values below, so these cases will never match.
  • In match statements you can use _ to denote values where you do not need to check the value or use it.
  • I'm not sure why this is a partial function. Since you are lifting the partial function and using get to retrieve the result, it ends up being equivalent to just using a match statement, which would be syntactically much simpler.
  • The use of tuples as arguments to functions should be discouraged. Where there is a very clear (key, value) relationship or when they are only used locally it is maybe okay, but here it is very difficult to trace what the arguments to this function are supposed to be.
  • Given the above, I'd remove this PartialFunction and just embed the last case in the apply function (see the sketch after this function).

case (row, (l, null)) => { false }
case (row, (l, r)) => {
val regex = pattern(r)
if(regex == null) {
null
} else {
regex.matcher(l).matches
}
}
}
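
For concreteness, a minimal sketch of the refactor suggested in the comment above: fold the match into apply and drop the tuple-typed PartialFunction. The names follow the surrounding code, but this is an illustration rather than the committed change.

override def apply(input: Row): Any = {
  val l = left.apply(input)
  if (l == null) {
    null
  } else {
    val r = right.apply(input)
    if (r == null) {
      null
    } else {
      // compile (or reuse the cached) pattern, then do a full-string match
      val regex = pattern(r.asInstanceOf[String])
      if (regex == null) null else regex.matcher(l.asInstanceOf[String]).matches
    }
  }
}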

case class Like(left: Expression, right: Expression) extends BinaryExpression {
def dataType = BooleanType
def nullable = left.nullable // Right cannot be null.
override def apply(input: Row): Any = {
val l = left.apply(input)
if(l == null) {
null
} else {
val r = right.apply(input)
if(r == null) {
null
} else {
filter.lift(input, (l.asInstanceOf[String], r.asInstanceOf[String])).get
}
}
}
}

/**
* Simple RegEx pattern matching function
*/
case class Like(left: Expression, right: Expression)
extends BinaryExpression with StringRegexExpression {

def symbol = "LIKE"

// replace _ with . to match exactly one occurrence of any character
// replace % with .* to match any character zero or more times
override def escape(v: String) = {
val sb = new StringBuilder()
var i = 0;
while (i < v.length) {
// Make a special case for "\\_" and "\\%"
val n = v.charAt(i);
if (n == '\\' && i + 1 < v.length && (v.charAt(i + 1) == '_' || v.charAt(i + 1) == '%')) {
sb.append(v.charAt(i + 1))
i += 1
} else {
if (n == '_') {
sb.append(".");
} else if (n == '%') {
sb.append(".*");
} else {
sb.append(Pattern.quote(Character.toString(n)));
}
}

i += 1
}

sb.toString()
}
}
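
A quick check of those escape rules using plain java.util.regex (hypothetical values, not code from this PR):

import java.util.regex.Pattern

// escape("n_t%") would produce "\Qn\E.\Qt\E.*":
// '_' becomes ".", '%' becomes ".*", and every other character is Pattern.quote()d.
val p = Pattern.compile("\\Qn\\E.\\Qt\\E.*")
p.matcher("nut shell").matches()  // true: "." matched 'u' and ".*" matched " shell"
p.matcher("Nut shell").matches()  // false: quoted characters stay literal and case-sensitive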

case class RLike(left: Expression, right: Expression)
extends BinaryExpression with StringRegexExpression {

def symbol = "RLIKE"

override def escape(v: String) = v
}
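
And the corresponding behaviour for RLIKE, where escape is the identity and the right operand is handed to java.util.regex unchanged (again an illustrative snippet):

import java.util.regex.Pattern

// Regex metacharacters keep their meaning here, and with regex.matcher(l).matches
// above, the whole string has to match the pattern.
Pattern.compile("spark-\\d\\.\\d").matcher("spark-1.0").matches()  // true
Pattern.compile("a%").matcher("abc").matches()                     // false: '%' is not a wildcard for RLIKE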