apache · uros-db · May 27, 2024 · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -18,6 +18,8 @@
 
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.text.StringSearch;
 import com.ibm.icu.util.ULocale;
 
@@ -27,6 +29,8 @@
 import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET;
 import static org.apache.spark.unsafe.Platform.copyMemory;
 
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -424,27 +428,56 @@ private static UTF8String toLowerCaseSlow(final UTF8String target, final int col
    * @param codePoint The code point to convert to lowercase.
    * @param sb The StringBuilder to append the lowercase character to.
    */
-  private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
-    if (codePoint == 0x0130) {
+  private static void appendLowercaseCodePoint(final int codePoint, final StringBuilder sb) {
+    int lowercaseCodePoint = getLowercaseCodePoint(codePoint);
+    if (lowercaseCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
       // Latin capital letter I with dot above is mapped to 2 lowercase characters.
       sb.appendCodePoint(0x0069);
       sb.appendCodePoint(0x0307);
+    } else {
+      // Otherwise the result is a single code point, which can be appended directly.
+      sb.appendCodePoint(lowercaseCodePoint);
+    }
+  }
+
+  private static final int CODE_POINT_LOWERCASE_I = 0x69;
+  private static final int CODE_POINT_COMBINING_DOT = 0x307;
+
+  /**
+   * `CODE_POINT_COMBINED_LOWERCASE_I_DOT` is an internal marker for the two-character lowercase
+   * form of Latin capital letter I with dot above (U+0130): ASCII lowercase letter i (U+0069)
+   * followed by a combining dot above (U+0307). The value packs both code points into one
+   * integer, so it is an artificial marker and not a valid code point itself.
+   */
+  private static final int CODE_POINT_COMBINED_LOWERCASE_I_DOT =
+    CODE_POINT_LOWERCASE_I << 16 | CODE_POINT_COMBINING_DOT;
+
+  /**
+   * Returns the lowercase version of the provided code point, with special handling for
+   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
+   * context-sensitive case mappings (i.e. characters that map to different characters based on
+   * the position in the string relative to other characters in lowercase).
+   */
+  private static int getLowercaseCodePoint(final int codePoint) {
+    if (codePoint == 0x0130) {
+      // Latin capital letter I with dot above maps to 2 characters; return the marker for them.
+      return CODE_POINT_COMBINED_LOWERCASE_I_DOT;
     }
     else if (codePoint == 0x03C2) {
       // Greek final and non-final capital letter sigma should be mapped the same.
-      sb.appendCodePoint(0x03C3);
+      return 0x03C3;
     }
     else {
       // All other characters should follow context-unaware ICU single-code point case mapping.
-      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+      return UCharacter.toLowerCase(codePoint);
     }
   }
 
   /**
    * Converts an entire string to lowercase using ICU rules, code point by code point, with
    * special handling for one-to-many case mappings (i.e. characters that map to multiple
    * characters in lowercase). Also, this method omits information about context-sensitive case
-   * mappings using special handling in the `lowercaseCodePoint` method.
+   * mappings using special handling in the `appendLowercaseCodePoint` method.
    *
    * @param target The target string to convert to lowercase.
    * @return The string converted to lowercase in a context-unaware manner.
@@ -458,7 +491,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) {
     String targetString = target.toValidString();
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < targetString.length(); ++i) {
-      lowercaseCodePoint(targetString.codePointAt(i), sb);
+      appendLowercaseCodePoint(targetString.codePointAt(i), sb);
     }
     return UTF8String.fromString(sb.toString());
   }
@@ -655,38 +688,106 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
     }
   }
 
-  public static Map<String, String> getCollationAwareDict(UTF8String string,
-      Map<String, String> dict, int collationId) {
-    // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
-    String srcStr = string.toString();
-
-    Map<String, String> collationAwareDict = new HashMap<>();
-    for (String key : dict.keySet()) {
-      StringSearch stringSearch =
-        CollationFactory.getStringSearch(string, UTF8String.fromString(key), collationId);
+  /**
+   * Converts the original translation dictionary (`dict`) to a dictionary with lowercased keys.
+   * This method is used to create a dictionary that can be used for the UTF8_LCASE collation.
+   * Note that `StringTranslate.buildDict` will ensure that all strings are validated properly.
+   *
+   * The method returns a map with lowercased code points as keys, while the values remain
+   * unchanged. Only the first code point of each original key is used, since `dict` is built on
+   * a character by character basis. If several original keys lowercase to the same code point,
+   * the entry visited first is kept (`putIfAbsent`). The Turkish dotted uppercase letter I
+   * (U+0130) is stored under the internal `CODE_POINT_COMBINED_LOWERCASE_I_DOT` marker.
+   */
+  private static Map<Integer, String> getLowercaseDict(final Map<String, String> dict) {
+    // Replace all the keys in the dict with lowercased code points.
+    Map<Integer, String> lowercaseDict = new HashMap<>();
+    for (Map.Entry<String, String> entry : dict.entrySet()) {
+      int codePoint = entry.getKey().codePointAt(0);
+      lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint), entry.getValue());
+    }
+    return lowercaseDict;
+  }
 
-      int pos = 0;
-      while ((pos = stringSearch.next()) != StringSearch.DONE) {
-        int codePoint = srcStr.codePointAt(pos);
-        int charCount = Character.charCount(codePoint);
-        String newKey = srcStr.substring(pos, pos + charCount);
+  /**
+   * Translates the `input` string using the translation map `dict`, for UTF8_LCASE collation.
+   * String translation is performed by iterating over the input string, from left to right, and
+   * looking up the lowercased version of each code point in a lowercased copy of `dict`. The
+   * sequence "i" (U+0069) + combining dot (U+0307) is treated as one unit when `dict` has a key
+   * for U+0130. Code points mapped to "\0" are deleted; unmapped ones are copied unchanged.
+   *
+   * @param input the string to be translated
+   * @param dict the original translation dictionary; its keys are lowercased by this method
+   * @return the translated string
+   */
+  public static UTF8String lowercaseTranslate(final UTF8String input,
+      final Map<String, String> dict) {
+    Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+    StringBuilder sb = new StringBuilder();
+    for (int charIndex = 0; charIndex < input.numChars(); ++charIndex) {
+      int codePoint = input.getChar(charIndex);
+      if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+          codePoint == CODE_POINT_LOWERCASE_I && charIndex + 1 < input.numChars() &&
+          input.getChar(charIndex + 1) == CODE_POINT_COMBINING_DOT) {
+        // Special handling for letter i (U+0069) followed by a combining dot (U+0307)
+        codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+        ++charIndex;
+      }
+      String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+      if (translated == null) {
+        sb.appendCodePoint(codePoint);
+      } else if (!"\0".equals(translated)) {
+        sb.append(translated);
+      }
+    }
+    return UTF8String.fromString(sb.toString());
+  }
 
-        boolean exists = false;
-        for (String existingKey : collationAwareDict.keySet()) {
-          if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
-            collationAwareDict.put(newKey, collationAwareDict.get(existingKey));
-            exists = true;
-            break;
+  /**
+   * Translates the `input` string using the translation map `dict`, for all ICU collations.
+   * String translation is performed by iterating over the input string, from left to right, and
+   * repeatedly translating the longest possible substring that matches a key in the dictionary.
+   * For ICU collations, the method runs a collation-aware `StringSearch` for every key and keeps
+   * the longest match that starts exactly at the current position.
+   *
+   * @param input the string to be translated
+   * @param dict the translation dictionary; its keys are matched using the given collation
+   * @param collationId the collation ID to use for string translation
+   * @return the translated string
+   */
+  public static UTF8String translate(final UTF8String input,
+      final Map<String, String> dict, final int collationId) {
+    String inputString = input.toValidString();
+    CharacterIterator target = new StringCharacterIterator(inputString);
+    Collator collator = CollationFactory.fetchCollation(collationId).collator;
+    StringBuilder sb = new StringBuilder();
+    int charIndex = 0;
+    while (charIndex < inputString.length()) {
+      int longestMatchLen = 0;
+      String longestMatch = "";
+      for (String key : dict.keySet()) {
+        StringSearch stringSearch = new StringSearch(key, target, (RuleBasedCollator) collator);
+        stringSearch.setIndex(charIndex);
+        int matchIndex = stringSearch.next();
+        if (matchIndex == charIndex) {
+          int matchLen = stringSearch.getMatchLength();
+          if (matchLen > longestMatchLen) {
+            longestMatchLen = matchLen;
+            longestMatch = key;
           }
         }
-
-        if (!exists) {
-          collationAwareDict.put(newKey, dict.get(key));
+      }
+      if (longestMatchLen == 0) {
+        sb.append(inputString.charAt(charIndex));
+        charIndex++;
+      } else {
+        if (!"\0".equals(dict.get(longestMatch))) {
+          sb.append(dict.get(longestMatch));
         }
+        charIndex += longestMatchLen;
       }
     }
-
-    return collationAwareDict;
+    return UTF8String.fromString(sb.toString());
   }
 
   public static UTF8String lowercaseTrim(

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -813,12 +813,24 @@ public static String[] getICULocaleNames() {
     return Collation.CollationSpecICU.ICULocaleNames;
   }
 
-  public static UTF8String getCollationKey(UTF8String input, int collationId) {
+  public static String getCollationKey(String input, int collationId) {
     Collation collation = fetchCollation(collationId);
     if (collation.supportsBinaryEquality) {
       return input;
     } else if (collation.supportsLowercaseEquality) {
       return input.toLowerCase();
+    } else {
+      CollationKey collationKey = collation.collator.getCollationKey(input);
+      return Arrays.toString(collationKey.toByteArray());
+    }
+  }
+
+  public static UTF8String getCollationKey(UTF8String input, int collationId) {
+    Collation collation = fetchCollation(collationId);
+    if (collation.supportsBinaryEquality) {
+      return input;
+    } else if (collation.supportsLowercaseEquality) {
+      return CollationAwareUTF8String.toLowerCase(input);
     } else {
       CollationKey collationKey = collation.collator.getCollationKey(input.toString());
       return UTF8String.fromBytes(collationKey.toByteArray());

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -514,34 +514,19 @@ public static String genCode(final String source, final String dict, final int c
         return String.format(expr + "Binary(%s, %s)", source, dict);
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", source, dict);
-      } else {
+      }  else {
         return String.format(expr + "ICU(%s, %s, %d)", source, dict, collationId);
       }
     }
     public static UTF8String execBinary(final UTF8String source, Map<String, String> dict) {
       return source.translate(dict);
     }
     public static UTF8String execLowercase(final UTF8String source, Map<String, String> dict) {
-      String srcStr = source.toString();
-      StringBuilder sb = new StringBuilder();
-      int charCount = 0;
-      for (int k = 0; k < srcStr.length(); k += charCount) {
-        int codePoint = srcStr.codePointAt(k);
-        charCount = Character.charCount(codePoint);
-        String subStr = srcStr.substring(k, k + charCount);
-        String translated = dict.get(subStr.toLowerCase());
-        if (null == translated) {
-          sb.append(subStr);
-        } else if (!"\0".equals(translated)) {
-          sb.append(translated);
-        }
-      }
-      return UTF8String.fromString(sb.toString());
+      return CollationAwareUTF8String.lowercaseTranslate(source, dict);
     }
     public static UTF8String execICU(final UTF8String source, Map<String, String> dict,
         final int collationId) {
-      return source.translate(CollationAwareUTF8String.getCollationAwareDict(
-        source, dict, collationId));
+      return CollationAwareUTF8String.translate(source, dict, collationId);
     }
   }
 

diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1051,15 +1051,35 @@ case class Overlay(input: Expression, replace: Expression, pos: Expression, len:
 
 object StringTranslate {
 
-  def buildDict(matchingString: UTF8String, replaceString: UTF8String, collationId: Int)
+  /**
+   * Build a translation dictionary from UTF8Strings. The inputs are first converted to Java
+   * Strings: UTF8_BINARY keeps the plain `toString` conversion to avoid any behavior change,
+   * while all other collations use `UTF8String.toValidString` so that the result is valid.
+   */
+  def buildDict(matchingString: UTF8String, replaceString: UTF8String, collationId: Integer)
     : JMap[String, String] = {
-    val matching = if (CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
-      matchingString.toString().toLowerCase()
+    val isUtf8Binary = collationId == CollationFactory.UTF8_BINARY_COLLATION_ID
+    val matching: String = if (isUtf8Binary) {
+      matchingString.toString
+    } else {
+      matchingString.toValidString
+    }
+    val replace: String = if (isUtf8Binary) {
+      replaceString.toString
     } else {
-      matchingString.toString()
+      replaceString.toValidString
     }
+    buildDict(matching, replace)
+  }
 
-    val replace = replaceString.toString()
+  /**
+   * Build a translation dictionary from Strings. This method assumes that the input strings are
+   * already valid. The result dictionary maps each character in `matching` to the corresponding
+   * character in `replace`. If `replace` is shorter than `matching`, the extra characters in
+   * `matching` will be mapped to null terminator, which causes characters to get deleted during
+   * translation. If `replace` is longer than `matching`, the extra characters will be ignored.
+   */
+  private def buildDict(matching: String, replace: String): JMap[String, String] = {
     val dict = new HashMap[String, String]()
     var i = 0
     var j = 0
@@ -1083,6 +1103,7 @@ object StringTranslate {
     }
     dict
   }
+
 }
 
 /**