apache · RussellSpitzer · Jan 21, 2022 · Jan 25, 2022 · Jan 31, 2022 · Jan 31, 2022
diff --git a/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java b/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java
@@ -21,6 +21,7 @@
 
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 
 public class ByteBuffers {
 
@@ -46,6 +47,15 @@ public static byte[] toByteArray(ByteBuffer buffer) {
     }
   }
 
+  public static ByteBuffer reuse(ByteBuffer reuse, int length) {
+    Preconditions.checkArgument(reuse.hasArray(), "Cannot reuse buffer: Should be an array"); // guards arrayOffset()
+    Preconditions.checkArgument(reuse.arrayOffset() == 0 && reuse.capacity() == length,
+        "Cannot reuse buffer: Should have an offset of 0 was %s, should be of size %s was %s",
+        reuse.arrayOffset(), length, reuse.capacity());
+    reuse.clear(); // position = 0 and limit = capacity, which the check above pins to length
+    return reuse;
+  }
+
   public static ByteBuffer copy(ByteBuffer buffer) {
     if (buffer == null) {
       return null;

diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java
@@ -44,35 +44,35 @@ private ZOrderByteUtils() {
    * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially
    * shifts the 0 value so that we don't break our ordering when we cross the new 0 value.
    */
-  public static byte[] intToOrderedBytes(int val) {
-    ByteBuffer bytes = ByteBuffer.allocate(Integer.BYTES);
+  public static byte[] intToOrderedBytes(int val, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, Integer.BYTES); // result aliases reuse's backing array, not a copy
     bytes.putInt(val ^ 0x80000000);
     return bytes.array();
   }
 
   /**
-   * Signed longs are treated the same as the signed ints
+   * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)}
    */
-  public static byte[] longToOrderBytes(long val) {
-    ByteBuffer bytes = ByteBuffer.allocate(Long.BYTES);
+  public static byte[] longToOrderedBytes(long val, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, Long.BYTES); // result aliases reuse's backing array, not a copy
     bytes.putLong(val ^ 0x8000000000000000L);
     return bytes.array();
   }
 
   /**
-   * Signed shorts are treated the same as the signed ints
+   * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)}
    */
-  public static byte[] shortToOrderBytes(short val) {
-    ByteBuffer bytes = ByteBuffer.allocate(Short.BYTES);
+  public static byte[] shortToOrderedBytes(short val, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, Short.BYTES); // result aliases reuse's backing array, not a copy
     bytes.putShort((short) (val ^ (0x8000)));
     return bytes.array();
   }
 
   /**
-   * Signed tiny ints are treated the same as the signed ints
+   * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)}
    */
-  public static byte[] tinyintToOrderedBytes(byte val) {
-    ByteBuffer bytes = ByteBuffer.allocate(Byte.BYTES);
+  public static byte[] tinyintToOrderedBytes(byte val, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, Byte.BYTES); // result aliases reuse's backing array, not a copy
     bytes.put((byte) (val ^ (0x80)));
     return bytes.array();
   }
@@ -85,19 +85,19 @@ public static byte[] tinyintToOrderedBytes(byte val) {
    * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically
    * comparable bytes
    */
-  public static byte[] floatToOrderedBytes(float val) {
-    ByteBuffer bytes = ByteBuffer.allocate(Integer.BYTES);
+  public static byte[] floatToOrderedBytes(float val, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, Float.BYTES); // result aliases reuse's backing array, not a copy
     int ival = Float.floatToIntBits(val);
     ival ^= ((ival >> (Integer.SIZE - 1)) | Integer.MIN_VALUE);
     bytes.putInt(ival);
     return bytes.array();
   }
 
   /**
-   * Doubles are treated the same as floats
+   * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)}
    */
-  public static byte[] doubleToOrderedBytes(double val) {
-    ByteBuffer bytes = ByteBuffer.allocate(Long.BYTES);
+  public static byte[] doubleToOrderedBytes(double val, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, Double.BYTES);
     long lng = Double.doubleToLongBits(val);
     lng ^= ((lng >> (Long.SIZE - 1)) | Long.MIN_VALUE);
     bytes.putLong(lng);
@@ -108,54 +108,70 @@ public static byte[] doubleToOrderedBytes(double val) {
    * Strings are lexicographically sortable BUT if different byte array lengths will
    * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time).
    * This implementation just uses a set size to for all output byte representations. Truncating longer strings
-   * and right padding 0 for shorter strings.
+   * and right padding 0 for shorter strings. Requires UTF8 (or ASCII) encoding for ordering guarantees to hold.
    */
-  public static byte[] stringToOrderedBytes(String val, int length) {
-    ByteBuffer bytes = ByteBuffer.allocate(length);
+  public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reuse) {
+    ByteBuffer bytes = ByteBuffers.reuse(reuse, length); // result aliases reuse's backing array, not a copy
+    Arrays.fill(bytes.array(), 0, length, (byte) 0x00); // wipe the previous value so short strings are 0-padded
     if (val != null) {
-      int maxLength = Math.min(length, val.length());
-      bytes.put(val.getBytes(), 0, maxLength);
+      // Encode as UTF-8 regardless of platform charset; cap by encoded bytes, not chars. May truncate mid-character
+      byte[] encoded = val.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+      bytes.put(encoded, 0, Math.min(length, encoded.length));
     }
     return bytes.array();
   }
 
   /**
-   * Interleave bits using a naive loop.
-   * @param columnsBinary an array of byte arrays, none of which are empty
-   * @return their bits interleaved
+   * Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent ordering it is
+   * required that every column contribute the same number of bytes in each invocation. Bits are interleaved from all
+   * columns that have a bit available at that position. Once a column has no more bits to produce it is skipped in the
+   * interleaving. The first column is read unconditionally, so it must be non-empty unless all columns are empty.
+   * @param columnsBinary an array of ordered byte representations of the columns being ZOrdered
+   * @return the column bytes interleaved; its length is the sum of the input lengths
    */
   public static byte[] interleaveBits(byte[][] columnsBinary) {
     int interleavedSize = Arrays.stream(columnsBinary).mapToInt(a -> a.length).sum();
     byte[] interleavedBytes = new byte[interleavedSize];
-    int sourceBit = 7;
-    int sourceByte = 0;
     int sourceColumn = 0;
-    int interleaveBit = 7;
+    int sourceByte = 0;
+    int sourceBit = 7; // bits are consumed from the highest order bit (7) down to 0
     int interleaveByte = 0;
-    while (interleaveByte < interleavedSize) {
-      // Take what we have, Get the source Bit of the source Byte, move it to the interleaveBit position
-      interleavedBytes[interleaveByte] =
-          (byte) (interleavedBytes[interleaveByte] |
-              (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >> sourceBit << interleaveBit);
+    int interleaveBit = 7; // output bytes are likewise filled from bit 7 down to 0
 
+    while (interleaveByte < interleavedSize) {
+      // Isolate the source bit (the shift binds tighter than &), move it to bit 0, then to the output bit position
+      interleavedBytes[interleaveByte] |=
+              (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >>> sourceBit << interleaveBit;
       --interleaveBit;
+
+      // Check if an output byte has been completed
       if (interleaveBit == -1) {
-        // Finished a byte in our interleave byte array start a new byte
+        // Move to the next output byte
         interleaveByte++;
+        // Move to the highest order bit of the new output byte
         interleaveBit = 7;
       }
 
-      // Find next column with a byte we can use
+      // Stop once the last output byte is complete; no source bits remain for the search below to find
+      if (interleaveByte == interleavedSize) {
+        break;
+      }
+
+      // Find the next source bit to interleave
       do {
+        // Move to next column
         ++sourceColumn;
         if (sourceColumn == columnsBinary.length) {
+          // If the last source column was used, reset to next bit of first column
           sourceColumn = 0;
-          if (--sourceBit == -1) {
+          --sourceBit;
+          if (sourceBit == -1) {
+            // If the last bit of the source byte was used, reset to the highest bit of the next byte
             sourceByte++;
             sourceBit = 7;
           }
         }
-      } while (columnsBinary[sourceColumn].length <= sourceByte && interleaveByte < interleavedSize);
+      } while (columnsBinary[sourceColumn].length <= sourceByte); // skip columns with no byte at this position
     }
     return interleavedBytes;
   }

diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java
@@ -20,6 +20,7 @@
 
 package org.apache.iceberg.util;
 
+import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Random;
 import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes;
@@ -36,6 +37,7 @@ public class TestZOrderByteUtil {
   private static final byte OOOOOOOO = (byte) 0;
 
   private static final int NUM_TESTS = 100000;
+  private static final int NUM_INTERLEAVE_TESTS = 1000;
 
   private final Random random = new Random(42);
 
@@ -84,7 +86,7 @@ private String interleaveStrings(String[] strings) {
    */
   @Test
   public void testInterleaveRandomExamples() {
-    for (int test = 0; test < NUM_TESTS; test++) {
+    for (int test = 0; test < NUM_INTERLEAVE_TESTS; test++) {
       int numByteArrays = Math.abs(random.nextInt(6)) + 1;
       byte[][] testBytes =  new byte[numByteArrays][];
       String[] testStrings = new String[numByteArrays];
@@ -141,12 +143,14 @@ public void testInterleaveMixedBits() {
 
   @Test
   public void testIntOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(Integer.BYTES);
+    ByteBuffer bBuffer = ByteBuffer.allocate(Integer.BYTES);
     for (int i = 0; i < NUM_TESTS; i++) {
       int aInt = random.nextInt();
       int bInt = random.nextInt();
       int intCompare = Integer.signum(Integer.compare(aInt, bInt));
-      byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt);
-      byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt);
+      byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(
@@ -158,12 +162,14 @@ public void testIntOrdering() {
 
   @Test
   public void testLongOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(Long.BYTES);
+    ByteBuffer bBuffer = ByteBuffer.allocate(Long.BYTES);
     for (int i = 0; i < NUM_TESTS; i++) {
       long aLong = random.nextInt();
       long bLong = random.nextInt();
       int longCompare = Integer.signum(Long.compare(aLong, bLong));
-      byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aLong);
-      byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bLong);
+      byte[] aBytes = ZOrderByteUtils.longToOrderedBytes(aLong, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.longToOrderedBytes(bLong, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(
@@ -175,12 +181,14 @@ public void testLongOrdering() {
 
   @Test
   public void testShortOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(Short.BYTES);
+    ByteBuffer bBuffer = ByteBuffer.allocate(Short.BYTES);
     for (int i = 0; i < NUM_TESTS; i++) {
       short aShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1));
       short bShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1));
       int longCompare = Integer.signum(Long.compare(aShort, bShort));
-      byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aShort);
-      byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bShort);
+      byte[] aBytes = ZOrderByteUtils.shortToOrderedBytes(aShort, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.shortToOrderedBytes(bShort, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(
@@ -192,12 +200,14 @@ public void testShortOrdering() {
 
   @Test
   public void testTinyOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(Byte.BYTES);
+    ByteBuffer bBuffer = ByteBuffer.allocate(Byte.BYTES);
     for (int i = 0; i < NUM_TESTS; i++) {
-      long aByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1));
-      long bByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1));
+      byte aByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1));
+      byte bByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1));
       int longCompare = Integer.signum(Long.compare(aByte, bByte));
-      byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aByte);
-      byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bByte);
+      byte[] aBytes = ZOrderByteUtils.tinyintToOrderedBytes(aByte, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.tinyintToOrderedBytes(bByte, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(
@@ -209,12 +219,14 @@ public void testTinyOrdering() {
 
   @Test
   public void testFloatOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(Float.BYTES);
+    ByteBuffer bBuffer = ByteBuffer.allocate(Float.BYTES);
     for (int i = 0; i < NUM_TESTS; i++) {
       float aFloat = random.nextFloat();
       float bFloat = random.nextFloat();
       int floatCompare = Integer.signum(Float.compare(aFloat, bFloat));
-      byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat);
-      byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat);
+      byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(
@@ -226,12 +238,14 @@ public void testFloatOrdering() {
 
   @Test
   public void testDoubleOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(Double.BYTES);
+    ByteBuffer bBuffer = ByteBuffer.allocate(Double.BYTES);
     for (int i = 0; i < NUM_TESTS; i++) {
       double aDouble = random.nextDouble();
       double bDouble = random.nextDouble();
       int doubleCompare = Integer.signum(Double.compare(aDouble, bDouble));
-      byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble);
-      byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble);
+      byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(
@@ -243,12 +257,14 @@ public void testDoubleOrdering() {
 
   @Test
   public void testStringOrdering() {
+    ByteBuffer aBuffer = ByteBuffer.allocate(128);
+    ByteBuffer bBuffer = ByteBuffer.allocate(128);
     for (int i = 0; i < NUM_TESTS; i++) {
       String aString =  (String) RandomUtil.generatePrimitive(Types.StringType.get(), random);
       String bString =  (String) RandomUtil.generatePrimitive(Types.StringType.get(), random);
       int stringCompare = Integer.signum(aString.compareTo(bString));
-      byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128);
-      byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128);
+      byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128, aBuffer);
+      byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128, bBuffer);
       int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes));
 
       Assert.assertEquals(String.format(