-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-48221][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (Contains, StartsWith, EndsWith, StringLocate) #46511
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
4981a14
b5794fc
dd261bf
31b757b
35dd21a
0baa713
37e44ad
6f0ceb5
aa91b15
f72998c
86bf00b
77904fa
5b47499
aceeba8
3945137
30774a3
4923181
68bada3
8f59422
4b9b83e
eb602e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,7 +118,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { | |
| return l.contains(r); | ||
| } | ||
| public static boolean execLowercase(final UTF8String l, final UTF8String r) { | ||
| return l.containsInLowerCase(r); | ||
| if (r.numChars() == 0) return true; | ||
| if (l.numChars() < r.numChars()) return false; | ||
| return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; | ||
| } | ||
| public static boolean execICU(final UTF8String l, final UTF8String r, | ||
| final int collationId) { | ||
|
|
@@ -156,7 +158,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { | |
| return l.startsWith(r); | ||
| } | ||
| public static boolean execLowercase(final UTF8String l, final UTF8String r) { | ||
| return l.startsWithInLowerCase(r); | ||
| if (r.numBytes() == 0) return true; | ||
uros-db marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if (l.numChars() < r.numChars()) return false; | ||
uros-db marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), 0, r.numChars()); | ||
| } | ||
| public static boolean execICU(final UTF8String l, final UTF8String r, | ||
| final int collationId) { | ||
|
|
@@ -193,7 +197,10 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { | |
| return l.endsWith(r); | ||
| } | ||
| public static boolean execLowercase(final UTF8String l, final UTF8String r) { | ||
| return l.endsWithInLowerCase(r); | ||
| if (r.numBytes() == 0) return true; | ||
uros-db marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if (l.numChars() < r.numChars()) return false; | ||
uros-db marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), | ||
| l.numChars() - r.numChars(), r.numChars()); | ||
| } | ||
| public static boolean execICU(final UTF8String l, final UTF8String r, | ||
| final int collationId) { | ||
|
|
@@ -354,7 +361,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring | |
| return string.indexOf(substring, 0); | ||
| } | ||
| public static int execLowercase(final UTF8String string, final UTF8String substring) { | ||
| return string.toLowerCase().indexOf(substring.toLowerCase(), 0); | ||
| return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0); | ||
| } | ||
| public static int execICU(final UTF8String string, final UTF8String substring, | ||
| final int collationId) { | ||
|
|
@@ -430,7 +437,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring | |
| } | ||
| public static int execLowercase(final UTF8String string, final UTF8String substring, | ||
| final int start) { | ||
| return string.toLowerCase().indexOf(substring.toLowerCase(), start); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To confirm, the previous implementation here is correct, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, unfortunately it's not. While it works fine for ASCII, it gives wrong results in some special cases involving conditional case mapping: when a character's lowercase equivalent consists of multiple characters, or when the lowercase mapping depends on where the character is found in the string (context-awareness).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So, as part of this PR, we actually changed the core definition of string searching in UTF8_BINARY_LCASE relative to the old implementation, i.e. what it means for one substring (pattern) to be found in another string (target) under UTF8_BINARY_LCASE, and this is all due to the fact that |
||
| return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start); | ||
| } | ||
| public static int execICU(final UTF8String string, final UTF8String substring, final int start, | ||
| final int collationId) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.