Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,143 @@
* Utility class for collation-aware UTF8String operations.
*/
public class CollationAwareUTF8String {

/**
* The constant value to indicate that the match is not found
* when searching for a pattern string in a target string.
*/
private static final int MATCH_NOT_FOUND = -1;

/**
 * Checks whether some substring of the target string that begins at the given
 * position matches the supplied pattern under the UTF8_BINARY_LCASE collation.
 * The pattern is expected to be lowercased by the caller, which avoids calling
 * .toLowerCase() repeatedly on the same pattern string.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param startPos the start position for searching (in the target string)
 * @return true if a match beginning at startPos exists, false otherwise
 */
public static boolean lowercaseMatchFrom(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  // The length lookup reports MATCH_NOT_FOUND when nothing matches at this position.
  final int matchedLength = lowercaseMatchLengthFrom(target, lowercasePattern, startPos);
  return matchedLength != MATCH_NOT_FOUND;
}

/**
 * Returns the length of the shortest substring of the target string that begins
 * at the specified position and whose lowercase form equals the specified prefix,
 * with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
 * prefix is already lowercased. Only the part of the target string that starts
 * from the specified position is considered.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param startPos the start position for searching (in the target string);
 *                 must be non-negative
 * @return length of the target substring that starts at startPos and matches the
 *         specified prefix in lowercase, or MATCH_NOT_FOUND (-1) if there is none
 */
public static int lowercaseMatchLengthFrom(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  assert startPos >= 0;
  // The longest candidate length is loop-invariant, so compute it once.
  final int maxLength = target.numChars() - startPos;
  // Candidate lengths are tried in increasing order rather than using the pattern
  // length directly, since a substring and its lowercase form may differ in
  // character count.
  for (int len = 0; len <= maxLength; ++len) {
    if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
      return len;
    }
  }
  return MATCH_NOT_FOUND;
}

/**
 * Returns the position of the first occurrence of the pattern string
 * in the target string from the specified position (0-based index),
 * with respect to the UTF8_BINARY_LCASE collation. The method assumes
 * that the pattern string is already lowercased prior to method call.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param startPos the start position for searching (in the target string);
 *                 must be non-negative
 * @return the position of the first occurrence of pattern in target at or after
 *         startPos, or MATCH_NOT_FOUND (-1) if there is none
 */
public static int lowercaseFind(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  // Fail fast on the same precondition that the per-position matcher asserts.
  assert startPos >= 0;
  // The scan limit does not change during the search, so evaluate it once.
  final int lastPos = target.numChars();
  // The limit is inclusive: position numChars() is still probed.
  for (int i = startPos; i <= lastPos; ++i) {
    if (lowercaseMatchFrom(target, lowercasePattern, i)) {
      return i;
    }
  }
  return MATCH_NOT_FOUND;
}

/**
 * Checks whether some substring of the target string that ends at the given
 * position matches the supplied pattern under the UTF8_BINARY_LCASE collation.
 * The pattern is expected to be lowercased by the caller, which avoids calling
 * .toLowerCase() repeatedly on the same pattern string.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param endPos the end position for searching (in the target string)
 * @return true if a match ending at endPos exists, false otherwise
 */
public static boolean lowercaseMatchUntil(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  // The length lookup reports MATCH_NOT_FOUND when nothing matches up to this position.
  final int matchedLength = lowercaseMatchLengthUntil(target, lowercasePattern, endPos);
  return matchedLength != MATCH_NOT_FOUND;
}

/**
 * Computes the length of the shortest substring of the target string that ends
 * at the given position and whose lowercase form equals the supplied suffix,
 * with respect to the UTF8_BINARY_LCASE collation. The suffix is expected to be
 * lowercased by the caller. Only the part of the target string that ends at the
 * given position is examined.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param endPos the end position for searching (in the target string)
 * @return length of the target substring that ends at endPos and matches the
 *         suffix in lowercase, or MATCH_NOT_FOUND if there is none
 */
public static int lowercaseMatchLengthUntil(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  assert endPos <= target.numChars();
  // Walk the candidate start index backwards from endPos, so candidates grow
  // from the empty substring up to the whole range [0, endPos).
  for (int begin = endPos; begin >= 0; --begin) {
    final UTF8String candidate = target.substring(begin, endPos);
    if (candidate.toLowerCase().equals(lowercasePattern)) {
      return endPos - begin;
    }
  }
  return MATCH_NOT_FOUND;
}

/**
 * Searches the target string backwards for the last occurrence of the pattern
 * string that ends at or before the specified position (0-based index), with
 * respect to the UTF8_BINARY_LCASE collation. The method assumes that the
 * pattern string is already lowercased prior to method call.
 *
 * Note that the returned value is the position at which the matching substring
 * ends (exclusive), not the position at which it starts.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param endPos the end position for searching (in the target string);
 *               must not exceed the number of characters in target
 * @return the (exclusive) end position of the last occurrence of pattern in target
 *         that ends at or before endPos, or MATCH_NOT_FOUND (-1) if there is none
 */
public static int lowercaseRFind(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  // Fail fast on the same precondition that the per-position matcher asserts.
  assert endPos <= target.numChars();
  // Probe every candidate end position from endPos down to 0 (inclusive).
  for (int i = endPos; i >= 0; --i) {
    if (lowercaseMatchUntil(target, lowercasePattern, i)) {
      return i;
    }
  }
  return MATCH_NOT_FOUND;
}

public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -183,6 +320,22 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
return 0;
}

/**
 * Locates the first occurrence of the pattern string in the target string,
 * searching from the given position (0-based index), with respect to the
 * UTF8_BINARY_LCASE collation. The pattern is lowercased here before the
 * search, so callers may pass it in any case.
 *
 * @param target the string to be searched in
 * @param pattern the string to be searched for
 * @param start the start position for searching (in the target string)
 * @return 0 if the pattern is empty; otherwise the position of the first occurrence
 *         of pattern in target, or -1 if it is not found
 */
public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
    final int start) {
  // An empty pattern short-circuits to position 0, regardless of start.
  if (pattern.numChars() == 0) {
    return 0;
  }
  final UTF8String lowercasePattern = pattern.toLowerCase();
  return lowercaseFind(target, lowercasePattern, start);
}

public static int indexOf(final UTF8String target, final UTF8String pattern,
final int start, final int collationId) {
if (pattern.numBytes() == 0) {
Expand Down Expand Up @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
return UTF8String.EMPTY_UTF8;
}

UTF8String lowercaseString = string.toLowerCase();
UTF8String lowercaseDelimiter = delimiter.toLowerCase();

if (count > 0) {
int idx = -1;
// search left to right (note: the start code point is inclusive)
int matchLength = -1;
while (count > 0) {
idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
if (idx >= 0) {
count--;
} else {
// can not find enough delim
return string;
}
}
if (idx == 0) {
return UTF8String.EMPTY_UTF8;
matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
else return string; // cannot find enough delimiters in the string
}
byte[] bytes = new byte[idx];
copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
return UTF8String.fromBytes(bytes);

if (matchLength == 0) return UTF8String.EMPTY_UTF8;
return string.substring(0, matchLength);
} else {
int idx = string.numBytes() - delimiter.numBytes() + 1;
// search right to left (note: the end code point is exclusive)
int matchLength = string.numChars() + 1;
count = -count;
while (count > 0) {
idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
if (idx >= 0) {
count--;
} else {
// can not find enough delim
return string;
}
matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
else return string; // cannot find enough delimiters in the string
}
if (idx + delimiter.numBytes() == string.numBytes()) {
return UTF8String.EMPTY_UTF8;
}
int size = string.numBytes() - delimiter.numBytes() - idx;
byte[] bytes = new byte[size];
copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
bytes, BYTE_ARRAY_OFFSET, size);
return UTF8String.fromBytes(bytes);
if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8;
return string.substring(matchLength, string.numChars());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
return string.indexOf(substring, 0);
}
/**
 * Looks up the substring in the string using the lowercase-aware search,
 * starting from position 0.
 *
 * The lookup is delegated to CollationAwareUTF8String.lowercaseIndexOf instead of
 * lowercasing both strings and calling indexOf on the results; the stale second
 * return statement was unreachable and has been removed.
 *
 * @param string the string to be searched in
 * @param substring the string to be searched for
 * @return the value returned by CollationAwareUTF8String.lowercaseIndexOf
 */
public static int execLowercase(final UTF8String string, final UTF8String substring) {
  return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
}
public static int execICU(final UTF8String string, final UTF8String substring,
final int collationId) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

import static org.junit.jupiter.api.Assertions.*;


// checkstyle.off: AvoidEscapedUnicodeCharacters
public class CollationSupportSuite {

/**
Expand Down Expand Up @@ -567,8 +567,26 @@ public void testStringInstr() throws SparkException {
assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
assertStringInstr("i̇", "i", "UNICODE_CI", 0);
assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0);
assertStringInstr("i̇", "İ", "UNICODE_CI", 1);
assertStringInstr("İ", "i", "UNICODE_CI", 0);
assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI
assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI
assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1);
assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0);
assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
}

private void assertFindInSet(String word, String set, String collationName,
Expand Down Expand Up @@ -798,6 +816,30 @@ public void testSubstringIndex() throws SparkException {
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12");
assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12");
assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab");
assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI
assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI
assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab");
assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
}

private void assertStringTrim(
Expand Down Expand Up @@ -1008,3 +1050,4 @@ public void testStringTrim() throws SparkException {
// TODO: Test other collation-aware expressions.

}
// checkstyle.on: AvoidEscapedUnicodeCharacters