apache · RussellSpitzer · Jan 21, 2022 · Jan 25, 2022 · Jan 31, 2022 · Jan 31, 2022
diff --git a/build.gradle b/build.gradle
@@ -222,6 +222,7 @@ project(':iceberg-core') {
     }
 
     testImplementation "org.xerial:sqlite-jdbc"
+    testImplementation "org.apache.commons:commons-lang3"
     testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
   }
 }

diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.util;
+
+import java.util.Arrays;
+
+/**
+ * Within Z-Ordering the byte representations of objects being compared must be ordered,
+ * this requires several types to be transformed when converted to bytes. The goal is to
+ * map objects whose byte representations are not lexicographically ordered into representations
+ * that are lexicographically ordered.
+ * Most of these techniques are derived from
+ * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/
+ */
+public class ZOrderByteUtils {
+
+  private ZOrderByteUtils() {
+
+  }
+
+  /**
+   * Signed ints do not have their bytes in magnitude order because of the sign bit.
+   * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially
+   * shifts the 0 value so that we don't break our ordering when we cross the new 0 value.
+   * The input is modified in place and returned; a null input yields {@code size} zero bytes.
+   */
+  public static byte[] orderIntLikeBytes(byte[] intBytes, int size) {
+    if (intBytes == null) {
+      return new byte[size];
+    }
+    intBytes[0] = (byte) (intBytes[0] ^ (1 << 7));
+    return intBytes;
+  }
+
+  /**
+   * IEEE 754 :
+   * “If two floating-point numbers in the same format are ordered (say, x &lt; y),
+   * they are ordered the same way when their bits are reinterpreted as sign-magnitude integers.”
+   *
+   * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically
+   * comparable bytes. The input is modified in place and returned; a null input yields {@code size} zero bytes.
+   */
+  public static byte[] orderFloatLikeBytes(byte[] floatBytes, int size) {
+    if (floatBytes == null) {
+      return new byte[size];
+    }
+    if ((floatBytes[0] & (1 << 7)) == 0) {
+      // The sign-magnitude value is positive: set the first bit so positives order after negatives
+      floatBytes[0] = (byte) (floatBytes[0] | (1 << 7));
+    } else {
+      // The sign-magnitude value is negative: flip every bit. Clearing the first bit orders negatives
+      // before positives, and inverting the rest makes greater negative magnitudes come before
+      // lesser ones (reversing their order)
+      for (int i = 0; i < floatBytes.length; i++) {
+        floatBytes[i] = (byte) ~floatBytes[i];
+      }
+    }
+    return floatBytes;
+  }
+
+  /**
+   * Strings are lexicographically sortable BUT differing byte array lengths will
+   * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time).
+   * This implementation just uses a set size for all output byte representations, truncating longer strings
+   * and right padding shorter strings with 0. A null input yields {@code size} zero bytes.
+   */
+  public static byte[] orderUTF8LikeBytes(byte[] stringBytes, int size) {
+    if (stringBytes == null) {
+      return new byte[size];
+    }
+    return Arrays.copyOf(stringBytes, size);
+  }
+
+  /**
+   * Interleave bits using a naive loop. Bits are taken round-robin from the columns, most significant
+   * bit first; once a column runs out of bytes it is skipped and the remaining columns continue.
+   * @param columnsBinary an array of byte arrays; empty arrays (in any position) contribute no bits
+   * @return their bits interleaved, with length equal to the sum of the input lengths
+   */
+  public static byte[] interleaveBits(byte[][] columnsBinary) {
+    int interleavedSize = 0;
+    int longestColumn = 0;
+    for (byte[] column : columnsBinary) {
+      interleavedSize += column.length;
+      longestColumn = Math.max(longestColumn, column.length);
+    }
+
+    byte[] interleavedBytes = new byte[interleavedSize];
+    int interleaveBit = 7;
+    int interleaveByte = 0;
+    for (int sourceByte = 0; sourceByte < longestColumn; sourceByte++) {
+      for (int sourceBit = 7; sourceBit >= 0; sourceBit--) {
+        for (byte[] column : columnsBinary) {
+          if (column.length <= sourceByte) {
+            // This column is exhausted (or empty), it has no bit to contribute at this position
+            continue;
+          }
+          // Get the source bit of the source byte and move it to the interleaveBit position
+          int bit = (column[sourceByte] >> sourceBit) & 1;
+          interleavedBytes[interleaveByte] |= (byte) (bit << interleaveBit);
+          if (--interleaveBit == -1) {
+            // Finished a byte in our interleave byte array, start a new byte
+            interleaveByte++;
+            interleaveBit = 7;
+          }
+        }
+      }
+    }
+    return interleavedBytes;
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.iceberg.util;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Random;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for the bit interleaving and order-preserving byte transforms in {@link ZOrderByteUtils}.
+ */
+public class TestZOrderByteUtil {
+  private static final byte IIIIIIII = (byte) 255;
+  private static final byte IOIOIOIO = (byte) 170;
+  private static final byte OIOIOIOI = (byte) 85;
+  private static final byte OOOOIIII = (byte) 15;
+  private static final byte OOOOOOOI = (byte) 1;
+  private static final byte OOOOOOOO = (byte) 0;
+
+  private static final int NUM_TESTS = 100000;
+
+  private final Random random = new Random(42); // fixed seed so the generated inputs are reproducible
+
+  private String bytesToString(byte[] bytes) {
+    StringBuilder result = new StringBuilder();
+    for (byte b : bytes) {
+      result.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(' ', '0'));
+    }
+    return result.toString();
+  }
+
+  /**
+   * Returns a random byte array whose length is between 1 and 100 (never empty)
+   */
+  private byte[] generateRandomBytes() {
+    int length = random.nextInt(100) + 1;
+    byte[] result = new byte[length];
+    random.nextBytes(result);
+    return result;
+  }
+
+  /**
+   * Reference interleaving: takes characters round-robin from the strings, skipping exhausted ones
+   */
+  private String interleaveStrings(String[] strings) {
+    StringBuilder result = new StringBuilder();
+    int totalLength = Arrays.stream(strings).mapToInt(String::length).sum();
+    int substringIndex = 0;
+    int characterIndex = 0;
+    while (characterIndex < totalLength) {
+      for (String str : strings) {
+        if (substringIndex < str.length()) {
+          result.append(str.charAt(substringIndex));
+          characterIndex++;
+        }
+      }
+      substringIndex++;
+    }
+    return result.toString();
+  }
+
+  /**
+   * Compares the result of a string based interleaving algorithm implemented above
+   * versus the binary bit-shifting algorithm used in ZOrderByteUtils. Either both
+   * algorithms are identically wrong or are both identically correct.
+   */
+  @Test
+  public void testInterleaveRandomExamples() {
+    for (int test = 0; test < NUM_TESTS; test++) {
+      int numByteArrays = random.nextInt(6) + 1;
+      byte[][] testBytes = new byte[numByteArrays][];
+      String[] testStrings = new String[numByteArrays];
+      for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) {
+        testBytes[byteIndex] = generateRandomBytes();
+        testStrings[byteIndex] = bytesToString(testBytes[byteIndex]);
+      }
+      byte[] byteResult = ZOrderByteUtils.interleaveBits(testBytes);
+      String byteResultAsString = bytesToString(byteResult);
+
+      String stringResult = interleaveStrings(testStrings);
+
+      Assert.assertEquals("String interleave didn't match byte interleave", stringResult, byteResultAsString);
+    }
+  }
+
+  @Test
+  public void testInterleaveEmptyBits() {
+    byte[][] test = new byte[4][10];
+    byte[] expected = new byte[40];
+
+    Assert.assertArrayEquals("Should combine empty arrays", expected, ZOrderByteUtils.interleaveBits(test));
+  }
+
+  @Test
+  public void testInterleaveFullBits() {
+    byte[][] test = new byte[4][];
+    test[0] = new byte[]{IIIIIIII, IIIIIIII};
+    test[1] = new byte[]{IIIIIIII};
+    test[2] = new byte[0];
+    test[3] = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII};
+    byte[] expected = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII};
+
+    Assert.assertArrayEquals("Should combine full arrays", expected, ZOrderByteUtils.interleaveBits(test));
+  }
+
+  @Test
+  public void testInterleaveMixedBits() {
+    byte[][] test = new byte[4][];
+    test[0] = new byte[]{OOOOOOOI, IIIIIIII, OOOOOOOO, OOOOIIII};
+    test[1] = new byte[]{OOOOOOOI, OOOOOOOO, IIIIIIII};
+    test[2] = new byte[]{OOOOOOOI};
+    test[3] = new byte[]{OOOOOOOI};
+    byte[] expected = new byte[]{
+        OOOOOOOO, OOOOOOOO, OOOOOOOO, OOOOIIII,
+        IOIOIOIO, IOIOIOIO,
+        OIOIOIOI, OIOIOIOI,
+        OOOOIIII};
+    Assert.assertArrayEquals("Should combine mixed byte arrays", expected, ZOrderByteUtils.interleaveBits(test));
+  }
+
+  @Test
+  public void testIntOrdering() {
+    for (int i = 0; i < NUM_TESTS; i++) {
+      int aInt = random.nextInt();
+      int bInt = random.nextInt();
+      int intCompare = Integer.compare(aInt, bInt);
+      byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aInt), 4);
+      byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bInt), 4);
+      int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes);
+
+      Assert.assertTrue(String.format(
+          "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ",
+          aInt, bInt, intCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare),
+          Integer.signum(intCompare) == Integer.signum(byteCompare)); // signs must agree, 0 only with 0
+    }
+  }
+
+  @Test
+  public void testLongOrdering() {
+    for (int i = 0; i < NUM_TESTS; i++) {
+      long aLong = random.nextLong(); // nextLong so values outside the int range are exercised
+      long bLong = random.nextLong();
+      int longCompare = Long.compare(aLong, bLong);
+      byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aLong), 8);
+      byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bLong), 8);
+      int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes);
+
+      Assert.assertTrue(String.format(
+          "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ",
+          aLong, bLong, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare),
+          Integer.signum(longCompare) == Integer.signum(byteCompare)); // signs must agree, 0 only with 0
+    }
+  }
+
+  @Test
+  public void testFloatOrdering() {
+    for (int i = 0; i < NUM_TESTS; i++) {
+      float aFloat = (float) random.nextGaussian(); // Gaussian values cover both signs and are never NaN
+      float bFloat = (float) random.nextGaussian();
+      int floatCompare = Float.compare(aFloat, bFloat);
+      byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aFloat), 4);
+      byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bFloat), 4);
+      int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes);
+
+      Assert.assertTrue(String.format(
+          "Ordering of floats should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ",
+          aFloat, bFloat, floatCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare),
+          Integer.signum(floatCompare) == Integer.signum(byteCompare)); // signs must agree, 0 only with 0
+    }
+  }
+
+  @Test
+  public void testDoubleOrdering() {
+    for (int i = 0; i < NUM_TESTS; i++) {
+      double aDouble = random.nextGaussian(); // Gaussian values cover both signs and are never NaN
+      double bDouble = random.nextGaussian();
+      int doubleCompare = Double.compare(aDouble, bDouble);
+      byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aDouble), 8);
+      byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bDouble), 8);
+      int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes);
+
+      Assert.assertTrue(String.format(
+          "Ordering of doubles should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ",
+          aDouble, bDouble, doubleCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare),
+          Integer.signum(doubleCompare) == Integer.signum(byteCompare)); // signs must agree, 0 only with 0
+    }
+  }
+
+  @Test
+  public void testStringOrdering() {
+    for (int i = 0; i < NUM_TESTS; i++) {
+      String aString = RandomStringUtils.random(random.nextInt(35), 0, 0, true, true, null, random); // seeded
+      String bString = RandomStringUtils.random(random.nextInt(35), 0, 0, true, true, null, random);
+      int stringCompare = aString.compareTo(bString);
+      byte[] aBytes = ZOrderByteUtils.orderUTF8LikeBytes(aString.getBytes(StandardCharsets.UTF_8), 128);
+      byte[] bBytes = ZOrderByteUtils.orderUTF8LikeBytes(bString.getBytes(StandardCharsets.UTF_8), 128);
+      int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes);
+
+      Assert.assertTrue(String.format(
+          "Ordering of strings should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ",
+          aString, bString, stringCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare),
+          Integer.signum(stringCompare) == Integer.signum(byteCompare)); // signs must agree, 0 only with 0
+    }
+  }
+
+  private byte[] bytesOf(int num) {
+    return ByteBuffer.allocate(4).putInt(num).array();
+  }
+
+  private byte[] bytesOf(long num) {
+    return ByteBuffer.allocate(8).putLong(num).array();
+  }
+
+  private byte[] bytesOf(float num) {
+    return ByteBuffer.allocate(4).putFloat(num).array();
+  }
+
+  private byte[] bytesOf(double num) {
+    return ByteBuffer.allocate(8).putDouble(num).array();
+  }
+}
diff --git a/versions.props b/versions.props
@@ -1,6 +1,7 @@
 org.slf4j:* = 1.7.25
 org.apache.avro:avro = 1.10.1
 org.apache.calcite:* = 1.10.0
+org.apache.commons:commons-lang3 = 3.12.0
 org.apache.flink:* = 1.12.5
 org.apache.hadoop:* = 2.7.3
 org.apache.hive:* = 2.3.8
-Original file line number
+Diff line change
@@ Expand Up / @@ -222,6 +222,7 @@ project(':iceberg-core') { @@
         }
         testImplementation "org.xerial:sqlite-jdbc"
+        testImplementation "org.apache.commons:commons-lang3"
         testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
       }
     }
@@ Expand Down @@