From 4e0fcca9ea7e1c3875c1a3fcbc3fa452fc6ad232 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 09:44:34 +0200 Subject: [PATCH 1/9] Initial commit --- .../util/CollationAwareUTF8String.java | 22 +++++++++++++++ .../sql/catalyst/util/CollationSupport.java | 27 +++++++++---------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e65..2be360d75f42 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -147,6 +147,28 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Convert the input string to titlecase using the ICU root locale rules. + * + * @param target the input string + * @return the titlecase string + */ + public static UTF8String toTitleCase(final UTF8String target) { + return UTF8String.fromString(toTitleCase(target.toString())); + } + public static String toTitleCase(final String target) { + return UCharacter.toTitleCase(target, BreakIterator.getWordInstance()); + } + + /** + * Convert the input string to titlecase using the specified ICU collation rules. + * + * @param target the input string + * @return the titlecase string + */ + public static UTF8String toTitleCase(final UTF8String target, final int collationId) { + return UTF8String.fromString(toTitleCase(target.toString(), collationId)); + } public static String toTitleCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bea3dc08b448..c3516c002a24 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -260,8 +260,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) { public static class InitCap { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { - return execUTF8(v); + if (collation.supportsBinaryEquality) { + return execBinary(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); } else { return execICU(v, collationId); } @@ -270,25 +272,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) { public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.InitCap.exec"; - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { - return String.format(expr + "UTF8(%s)", v); + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s)", v); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } - - public static UTF8String execUTF8(final UTF8String v) { + public static UTF8String execBinary(final UTF8String v) { return v.toLowerCase().toTitleCase(); } - + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toTitleCase(v); + } public static UTF8String execICU(final UTF8String v, final int collationId) { - return UTF8String.fromString( - CollationAwareUTF8String.toTitleCase( - CollationAwareUTF8String.toLowerCase( - v.toString(), - collationId - ), - collationId)); + return CollationAwareUTF8String.toTitleCase(v, collationId); } } From 3c58f82c3aa2326814c2b9e31bfd66cb83fc97d4 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 09:44:43 +0200 Subject: [PATCH 2/9] Update tests --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7fc3c4e349c3..7bfd65e0dff7 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -526,10 +526,10 @@ public void testInitCap() throws SparkException { assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); // Case-variable character length - assertInitCap("İo", "UTF8_BINARY", "İo"); - assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "İo"); - assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); + assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); + assertInitCap("İo", "UNICODE", "I\u0307o"); + assertInitCap("İo", "UNICODE_CI", "İo"); } private void assertStringInstr(String string, String substring, String collationName, From 05972b68cb9cf09aa29c234c4b1a864d7e1942ce Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 10:06:23 +0200 Subject: [PATCH 3/9] More tests for word boundaries --- .../unsafe/types/CollationSupportSuite.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7bfd65e0dff7..5c856a9fa937 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -530,6 +530,23 @@ public void testInitCap() throws SparkException { assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); assertInitCap("İo", "UNICODE", "I\u0307o"); assertInitCap("İo", "UNICODE_CI", "İo"); + // Different possible word boundaries + assertInitCap("a b c", "UTF8_BINARY", "A B C"); + assertInitCap("a b c", "UNICODE", "A B C"); + assertInitCap("a b c", "UTF8_BINARY_LCASE", "A B C"); + assertInitCap("a b c", "UNICODE_CI", "A B C"); + assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c"); + assertInitCap("a.b,c", "UNICODE", "A.b,c"); + assertInitCap("a.b,c", "UTF8_BINARY_LCASE", "A.b,C"); + assertInitCap("a.b,c", "UNICODE_CI", "A.b,C"); + assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c"); + assertInitCap("a. b-c", "UNICODE", "A. B-c"); + assertInitCap("a. b-c", "UTF8_BINARY_LCASE", "A. B-C"); + assertInitCap("a. b-c", "UNICODE_CI", "A. B-C"); + assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c"); + assertInitCap("a?b世c", "UNICODE", "A?b世c"); + assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C"); + assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); } private void assertStringInstr(String string, String substring, String collationName, From 2f7db12b5e3f68b172055b959e681e7c87f27606 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 10:12:48 +0200 Subject: [PATCH 4/9] More tests for titlecase characters --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 5c856a9fa937..20f1137b46d1 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -547,6 +547,15 @@ public void testInitCap() throws SparkException { assertInitCap("a?b世c", "UNICODE", "A?b世c"); assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C"); assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); + // Titlecase characters that are different from uppercase characters + assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz"); + assertInitCap("dzDZDz", "UNICODE", "Dzdzdz"); + assertInitCap("dzDZDz", "UTF8_BINARY_LCASE", "Dzdzdz"); + assertInitCap("dzDZDz", "UNICODE_CI", "Dzdzdz"); + assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY_LCASE", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova"); } private void assertStringInstr(String string, String substring, String collationName, From 8c14f50cb6435f9c2d12a6f010ce03bef155de60 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 10:18:08 +0200 Subject: [PATCH 5/9] More tests for case mapping --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 20f1137b46d1..fd77b8cb0f7c 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -530,6 +530,10 @@ public void testInitCap() throws SparkException { assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); assertInitCap("İo", "UNICODE", "I\u0307o"); assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o"); + assertInitCap("i\u0307o", "UTF8_BINARY_LCASE", "I\u0307o"); + assertInitCap("i\u0307o", "UNICODE", "I\u0307o"); + assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o"); // Different possible word boundaries assertInitCap("a b c", "UTF8_BINARY", "A B C"); assertInitCap("a b c", "UNICODE", "A B C"); From 41962e48df10af2ac10142f2e648af7ac3cd8206 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 20:49:16 +0200 Subject: [PATCH 6/9] Fix Java lint --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index fd77b8cb0f7c..157256e7c920 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { /** @@ -1038,3 +1038,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. } +// checkstyle.on: AvoidEscapedUnicodeCharacters From 7691b877fa486d58bcb05f90eb8c730ffdb056ea Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 5 Jun 2024 10:24:01 +0200 Subject: [PATCH 7/9] Small fixes --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ef753e3c3140..4e6e6a9ff474 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -298,26 +298,22 @@ public static String toLowerCase(final String target, final int collationId) { /** * Convert the input string to titlecase using the ICU root locale rules. - * - * @param target the input string - * @return the titlecase string */ public static UTF8String toTitleCase(final UTF8String target) { return UTF8String.fromString(toTitleCase(target.toString())); } + public static String toTitleCase(final String target) { return UCharacter.toTitleCase(target, BreakIterator.getWordInstance()); } /** * Convert the input string to titlecase using the specified ICU collation rules. - * - * @param target the input string - * @return the titlecase string */ public static UTF8String toTitleCase(final UTF8String target, final int collationId) { return UTF8String.fromString(toTitleCase(target.toString(), collationId)); } + public static String toTitleCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); From 18427e73f623a701b22dcf631435e2f12ab5bf84 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 7 Jun 2024 12:34:52 +0200 Subject: [PATCH 8/9] Fix tests --- .../spark/unsafe/types/CollationSupportSuite.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 312a082b0bad..1237856361d6 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -747,7 +747,7 @@ public void testInitCap() throws SparkException { // Case-variable character length assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "I\u0307o"); + assertInitCap("İo", "UNICODE", "İo"); assertInitCap("İo", "UNICODE_CI", "İo"); assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o"); assertInitCap("i\u0307o", "UTF8_BINARY_LCASE", "I\u0307o"); @@ -759,15 +759,15 @@ public void testInitCap() throws SparkException { assertInitCap("a b c", "UTF8_BINARY_LCASE", "A B C"); assertInitCap("a b c", "UNICODE_CI", "A B C"); assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c"); - assertInitCap("a.b,c", "UNICODE", "A.b,c"); + assertInitCap("a.b,c", "UNICODE", "A.b,C"); assertInitCap("a.b,c", "UTF8_BINARY_LCASE", "A.b,C"); assertInitCap("a.b,c", "UNICODE_CI", "A.b,C"); assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c"); - assertInitCap("a. b-c", "UNICODE", "A. B-c"); + assertInitCap("a. b-c", "UNICODE", "A. B-C"); assertInitCap("a. b-c", "UTF8_BINARY_LCASE", "A. B-C"); assertInitCap("a. b-c", "UNICODE_CI", "A. B-C"); assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c"); - assertInitCap("a?b世c", "UNICODE", "A?b世c"); + assertInitCap("a?b世c", "UNICODE", "A?B世C"); assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C"); assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); // Titlecase characters that are different from uppercase characters @@ -779,6 +779,10 @@ public void testInitCap() throws SparkException { assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova"); assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY_LCASE", "Džaba Ljubav Njegova"); assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY", "ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE", "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE", "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI", "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); } private void assertStringInstr(String string, String substring, String collationName, From 39285ba2c996f754baa5910480ceac71284792d6 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 7 Jun 2024 21:29:09 +0200 Subject: [PATCH 9/9] Fix lint --- .../spark/unsafe/types/CollationSupportSuite.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 1237856361d6..86a25b6e7579 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -779,10 +779,14 @@ public void testInitCap() throws SparkException { assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova"); assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY_LCASE", "Džaba Ljubav Njegova"); assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY", "ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE", "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE", "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); - assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI", "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY", + "ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); } private void assertStringInstr(String string, String substring, String collationName,