-
Notifications
You must be signed in to change notification settings - Fork 29k
SPARK-2686 Add Length and OctetLen support to Spark SQL #1586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5f33482
8ddbcce
7ea3391
f780ad1
46bccf5
a91f6a3
c638587
31dcd4f
afe17e2
0d9db98
b08c87f
1e10e00
dea01f5
aae4b68
2fc131e
42f5016
ad3859e
6a6222a
81c64c3
94fcbd3
a0a03d7
91761be
22eddbc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,13 +17,16 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.expressions | ||
|
|
||
| import java.io.UnsupportedEncodingException | ||
| import java.util.regex.Pattern | ||
|
|
||
| import org.apache.spark.Logging | ||
|
|
||
| import scala.collection.IndexedSeqOptimized | ||
|
|
||
|
|
||
| import org.apache.spark.sql.catalyst.analysis.UnresolvedException | ||
| import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType} | ||
| import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType, IntegerType} | ||
|
|
||
| trait StringRegexExpression { | ||
| self: BinaryExpression => | ||
|
|
@@ -208,6 +211,83 @@ case class EndsWith(left: Expression, right: Expression) | |
| def compare(l: String, r: String) = l.endsWith(r) | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * A function that returns the number of characters (Unicode code points) in an expression | ||
| */ | ||
| case class Length(child: Expression) extends UnaryExpression { | ||
|
|
||
| type EvaluatedType = Any | ||
|
|
||
| override def dataType = IntegerType | ||
|
|
||
| override def foldable = child.foldable | ||
|
|
||
| override def nullable = child.nullable | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, now
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi Ueshin, based on your prior comment I changed the logic it now returns null instead of throwing the exception. So I do not understand this comment - would you please clarify? thanks.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, this is not related to my prior comment. After that, you changed the logic to return
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK I think I got what you are saying. Is the following code looking better to you? a) Change to not throw exception in the case of non-string input: b) Remove unnecessary isInstanceOf : NOTE: that is incorrect, can not compile So we need the following Maybe there is another place you are referring to?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I'm just saying: override def nullable = true instead of override def nullable = child.nullable
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, made that change. |
||
|
|
||
| override def toString = s"Length($child)" | ||
|
|
||
| /** | ||
| * Evaluates the child and measures the result: null stays null, a String yields its | ||
| * Unicode code point count, and any other value yields the length of its toString form. | ||
| */ | ||
| override def eval(input: Row): EvaluatedType = { | ||
| child.eval(input) match { | ||
| case null => null | ||
| case text: String => text.codePointCount(0, text.length) | ||
| case other => other.toString.length | ||
| } | ||
| } | ||
| } | ||
|
|
||
| object OctetLengthConstants { | ||
| val DefaultEncoding = "UTF-8" | ||
| } | ||
|
|
||
| /** | ||
| * A function that returns the number of bytes in a string expression under a given encoding | ||
| */ | ||
| case class OctetLength(child: Expression, encoding : Expression) extends UnaryExpression | ||
| with Logging { | ||
|
|
||
| type EvaluatedType = Any | ||
|
|
||
| override def dataType = IntegerType | ||
|
|
||
| override def foldable = child.foldable | ||
|
|
||
| override def nullable = true | ||
|
|
||
| override def toString = s"OctetLen($child, $encoding)" | ||
|
|
||
| /** | ||
| * Evaluates the child and returns the number of bytes its string value occupies when | ||
| * encoded with the evaluated encoding name. | ||
| * | ||
| * Returns null when the child evaluates to null or to a non-string value (the latter is | ||
| * logged at debug level). When the encoding expression evaluates to null, | ||
| * OctetLengthConstants.DefaultEncoding is used instead. | ||
| * | ||
| * @throws UnsupportedEncodingException if the encoding name is not supported | ||
| */ | ||
| override def eval(input: Row): EvaluatedType = { | ||
| child.eval(input) match { | ||
| case null => | ||
| null | ||
| case str: String => | ||
| // Fall back to the default encoding when the encoding expression yields null. | ||
| val encodingName = Option(encoding.eval(input)).map(_.toString) | ||
| val strEncoding = encodingName.getOrElse(OctetLengthConstants.DefaultEncoding) | ||
| try { | ||
| str.getBytes(strEncoding).length | ||
| } catch { | ||
| case ue: UnsupportedEncodingException => | ||
| // Re-throw with the encoding name in the message, keeping the original as the cause. | ||
| val wrapped = new UnsupportedEncodingException( | ||
| s"OctetLen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") | ||
| wrapped.initCause(ue) | ||
| throw wrapped | ||
| } | ||
| case other => | ||
| log.debug(s"Non-string value [$other] provided to OctetLen") | ||
| null | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * A function that takes a substring of its first argument starting at a given position. | ||
| * Defined for String and Binary types. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be
IntegerType?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we make it IntegerType, then the input has to be an integer. I have defined the semantics here so that ANY type may be provided.