diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 056b202bc398..54d0d3fbcc3a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -386,6 +386,25 @@ public static String lowerCaseCodePoints(final String target) { return sb.toString(); } + /** + * Convert the input string to titlecase using the ICU root locale rules, not the JVM default. + */ + public static UTF8String toTitleCase(final UTF8String target) { + return UTF8String.fromString(toTitleCase(target.toString())); + } + + public static String toTitleCase(final String target) { + return UCharacter.toTitleCase( + ULocale.ROOT, target, BreakIterator.getWordInstance(ULocale.ROOT)); + } + + /** + * Convert the input string to titlecase using the specified ICU collation rules. + */ + public static UTF8String toTitleCase(final UTF8String target, final int collationId) { + return UTF8String.fromString(toTitleCase(target.toString(), collationId)); + } + public static String toTitleCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index d5bcc61bac2a..cc6528c28a29 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -260,8 +260,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) { public static class InitCap { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation 
collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { - return execUTF8(v); + if (collation.supportsBinaryEquality) { + return execBinary(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); } else { return execICU(v, collationId); } @@ -270,25 +272,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) { public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.InitCap.exec"; - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { - return String.format(expr + "UTF8(%s)", v); + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s)", v); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } - - public static UTF8String execUTF8(final UTF8String v) { + public static UTF8String execBinary(final UTF8String v) { return v.toLowerCase().toTitleCase(); } - + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toTitleCase(v); + } public static UTF8String execICU(final UTF8String v, final int collationId) { - return UTF8String.fromString( - CollationAwareUTF8String.toTitleCase( - CollationAwareUTF8String.toLowerCase( - v.toString(), - collationId - ), - collationId)); + return CollationAwareUTF8String.toTitleCase(v, collationId); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index fefa5b52a0c2..86a25b6e7579 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -745,10 +745,48 @@ public void testInitCap() throws SparkException { assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); // Case-variable character length - assertInitCap("İo", "UTF8_BINARY", "İo"); - assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "İo"); - assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); + assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); + assertInitCap("İo", "UNICODE", "İo"); + assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o"); + assertInitCap("i\u0307o", "UTF8_BINARY_LCASE", "I\u0307o"); + assertInitCap("i\u0307o", "UNICODE", "I\u0307o"); + assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o"); + // Different possible word boundaries + assertInitCap("a b c", "UTF8_BINARY", "A B C"); + assertInitCap("a b c", "UNICODE", "A B C"); + assertInitCap("a b c", "UTF8_BINARY_LCASE", "A B C"); + assertInitCap("a b c", "UNICODE_CI", "A B C"); + assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c"); + assertInitCap("a.b,c", "UNICODE", "A.b,C"); + assertInitCap("a.b,c", "UTF8_BINARY_LCASE", "A.b,C"); + assertInitCap("a.b,c", "UNICODE_CI", "A.b,C"); + assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c"); + assertInitCap("a. b-c", "UNICODE", "A. B-C"); + assertInitCap("a. b-c", "UTF8_BINARY_LCASE", "A. B-C"); + assertInitCap("a. b-c", "UNICODE_CI", "A. 
B-C"); + assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c"); + assertInitCap("a?b世c", "UNICODE", "A?B世C"); + assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C"); + assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); + // Titlecase characters that are different from uppercase characters + assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz"); + assertInitCap("dzDZDz", "UNICODE", "Dzdzdz"); + assertInitCap("dzDZDz", "UTF8_BINARY_LCASE", "Dzdzdz"); + assertInitCap("dzDZDz", "UNICODE_CI", "Dzdzdz"); + assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY_LCASE", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY", + "ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); } private void assertStringInstr(String string, String substring, String collationName,