Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,20 @@ private static int lowercaseRFind(
return MATCH_NOT_FOUND;
}

/**
 * Case-insensitive UTF8String comparison used for the UTF8_BINARY_LCASE collation.
 *
 * Each operand is converted to a Java String and lowercased code point by code point via
 * {@code lowerCaseCodePoints}, which applies ICU single-code-point case mapping plus the
 * special cases for one-to-many mappings. The two lowercased strings are then ordered with
 * {@link String#compareTo(String)}.
 *
 * @param left The first UTF8String to compare.
 * @param right The second UTF8String to compare.
 * @return A negative value, zero, or a positive value if the lowercased {@code left} sorts
 *         before, equal to, or after the lowercased {@code right}.
 */
public static int lowercaseCompare(final UTF8String left, final UTF8String right) {
  final String loweredLeft = lowerCaseCodePoints(left.toString());
  final String loweredRight = lowerCaseCodePoints(right.toString());
  return loweredLeft.compareTo(loweredRight);
}

public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -296,6 +310,45 @@ public static String toLowerCase(final String target, final int collationId) {
return UCharacter.toLowerCase(locale, target);
}

/**
 * Appends the lowercase form of a single code point to {@code sb}, without looking at the
 * surrounding characters.
 *
 * Two code points are handled explicitly; every other code point is delegated to ICU's
 * single-code-point {@code UCharacter.toLowerCase(int)}.
 *
 * @param codePoint The code point to convert to lowercase.
 * @param sb The StringBuilder that receives the lowercase code point(s).
 */
private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
  switch (codePoint) {
    case 0x0130:
      // U+0130 (Latin capital letter I with dot above) lowercases to two code points:
      // U+0069 (Latin small letter i) followed by U+0307 (combining dot above).
      sb.appendCodePoint(0x0069).appendCodePoint(0x0307);
      break;
    case 0x03C2:
      // U+03C2 (Greek small letter final sigma) is folded to U+03C3 (Greek small letter
      // sigma) so that final and non-final sigma are mapped the same.
      sb.appendCodePoint(0x03C3);
      break;
    default:
      // All other characters follow context-unaware ICU single-code-point case mapping.
      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
      break;
  }
}

/**
 * Converts an entire string to lowercase using ICU rules, code point by code point, with
 * special handling for one-to-many case mappings (i.e. characters that map to multiple
 * characters in lowercase). This method omits information about context-sensitive case mappings.
 *
 * The input is walked by Unicode code point rather than by UTF-16 {@code char}, so a
 * supplementary character (encoded as a surrogate pair) is lowercased exactly once and its
 * low surrogate is never processed a second time as a stand-alone code unit.
 *
 * @param target The target string to convert to lowercase.
 * @return The string converted to lowercase in a context-unaware manner.
 */
public static String lowerCaseCodePoints(final String target) {
  final int length = target.length();
  // The result can be longer than the input (one-to-many mappings); this is only a size hint.
  final StringBuilder sb = new StringBuilder(length);
  int i = 0;
  while (i < length) {
    final int codePoint = target.codePointAt(i);
    lowercaseCodePoint(codePoint, sb);
    // Step over both halves of a surrogate pair (2 chars) or a single BMP char (1 char).
    i += Character.charCount(codePoint);
  }
  return sb.toString();
}

public static String toTitleCase(final String target, final int collationId) {
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -412,9 +412,9 @@ protected Collation buildCollation() {
"UTF8_BINARY_LCASE",
PROVIDER_SPARK,
null,
UTF8String::compareLowerCase,
CollationAwareUTF8String::lowercaseCompare,
"1.0",
s -> (long) s.toLowerCase().hashCode(),
s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.spark.unsafe.types;

import org.apache.spark.SparkException;
import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
import org.apache.spark.sql.catalyst.util.CollationFactory;
import org.apache.spark.sql.catalyst.util.CollationSupport;
import org.junit.jupiter.api.Test;
Expand All @@ -26,6 +27,121 @@
// checkstyle.off: AvoidEscapedUnicodeCharacters
public class CollationSupportSuite {

/**
* Collation-aware UTF8String comparison.
*/

/**
 * Compares two strings with the comparator of the named collation and asserts on the sign
 * of the result only, so {@code expected} may be any negative, zero, or positive value.
 *
 * @param s1 The left operand.
 * @param s2 The right operand.
 * @param collationName The collation name passed to CollationFactory.fetchCollation.
 * @param expected A value whose sign is the expected sign of the comparison.
 */
private void assertStringCompare(String s1, String s2, String collationName, int expected)
    throws SparkException {
  final UTF8String left = UTF8String.fromString(s1);
  final UTF8String right = UTF8String.fromString(s2);
  final int actual =
    CollationFactory.fetchCollation(collationName).comparator.compare(left, right);
  // Normalise both sides to -1/0/1 so the magnitude of the comparator result is irrelevant.
  assertEquals(Integer.signum(expected), Integer.signum(actual));
}

/**
 * Exercises the comparators of the UTF8_BINARY, UTF8_BINARY_LCASE, UNICODE and UNICODE_CI
 * collations through assertStringCompare. Only the sign of each expected value is checked.
 */
@Test
public void testCompare() throws SparkException {
// Edge cases: empty strings against each other and against a single character.
assertStringCompare("", "", "UTF8_BINARY", 0);
assertStringCompare("a", "", "UTF8_BINARY", 1);
assertStringCompare("", "a", "UTF8_BINARY", -1);
assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
assertStringCompare("", "", "UNICODE", 0);
assertStringCompare("a", "", "UNICODE", 1);
assertStringCompare("", "a", "UNICODE", -1);
assertStringCompare("", "", "UNICODE_CI", 0);
assertStringCompare("a", "", "UNICODE_CI", 1);
assertStringCompare("", "a", "UNICODE_CI", -1);
// Basic tests: ASCII strings that differ only in letter case.
assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
// Accent variation: non-ASCII letters that differ only in letter case.
assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
// Case-variable character length: the single character "İ" against the two-character
// sequence "i\u0307"; the case-insensitive collations are expected to treat them as equal.
assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
assertStringCompare("i\u0307", "İ", "UNICODE", -1);
assertStringCompare("İ", "i\u0307", "UNICODE", 1);
assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
// Conditional case mapping: the three Greek sigma forms "ς", "σ" and "Σ"; the
// case-insensitive collations are expected to treat all three as equal.
assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
assertStringCompare("ς", "σ", "UNICODE", 1);
assertStringCompare("ς", "Σ", "UNICODE", 1);
assertStringCompare("σ", "Σ", "UNICODE", -1);
assertStringCompare("ς", "σ", "UNICODE_CI", 0);
assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
}

/**
 * Asserts the lowercase form of {@code target} using one of two lowercasing paths.
 *
 * The flag is a primitive {@code boolean}: it is only ever used as a branch condition, and a
 * boxed {@code Boolean} would be unboxed there and throw a NullPointerException on null.
 *
 * @param target The string to lowercase.
 * @param expected The expected lowercase result.
 * @param useCodePoints If true, check CollationAwareUTF8String.lowerCaseCodePoints on the
 *                      String form of {@code target}; otherwise check UTF8String.toLowerCase.
 */
private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected,
    boolean useCodePoints) {
  if (useCodePoints) {
    assertEquals(expected.toString(),
      CollationAwareUTF8String.lowerCaseCodePoints(target.toString()));
  } else {
    assertEquals(expected, target.toLowerCase());
  }
}

/**
 * Checks lowercasing through assertLowerCaseCodePoints. A trailing {@code false} checks
 * UTF8String.toLowerCase; a trailing {@code true} checks
 * CollationAwareUTF8String.lowerCaseCodePoints.
 */
@Test
public void testLowerCaseCodePoints() {
// Edge cases: the empty string on both paths.
assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false);
assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true);
// Basic tests: ASCII input, already lowercase and mixed case.
assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false);
assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false);
assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true);
assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true);
// Accent variation: non-ASCII uppercase letters.
assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false);
assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), UTF8String.fromString("abcδ"), true);
// Case-variable character length: "İ" is expected to lowercase to a longer sequence, and
// input that already contains that sequence is expected to stay unchanged.
assertLowerCaseCodePoints(
UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false);
assertLowerCaseCodePoints(
UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false);
assertLowerCaseCodePoints(
UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true);
assertLowerCaseCodePoints(
UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true);
// Conditional case mapping: the two paths differ on a word-final capital sigma. The
// toLowerCase path expects final sigma "ς"; the code point path expects "σ".
assertLowerCaseCodePoints(
UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false);
assertLowerCaseCodePoints(
UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true);
// Surrogate pairs are treated as invalid UTF8 sequences
// Both paths are expected to yield two U+FFFD replacement characters for these six bytes.
assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
{(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}),
UTF8String.fromString("\ufffd\ufffd"), false);
assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
{(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}),
UTF8String.fromString("\ufffd\ufffd"), true);
}

/**
* Collation-aware string expressions.
*/
Expand Down