-
Notifications
You must be signed in to change notification settings - Fork 3k
Spark: Spark3 ZOrder Rewrite Strategy #3983
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 25 commits
7cb94b2
92516f7
ef1d214
545e373
1374247
2c48f0c
55fa4c8
8c7eef7
62a74b9
0bdabea
1e7e660
41d855c
2dfad57
a3e8543
fa2add8
6974f45
ba43cae
a50b496
82bfb07
e96b020
14637da
2e68428
859c558
46b1a16
a6981c8
ecf04d8
e5fdd4b
f4a100d
49a9703
bec34e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,121 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
|
|
||
| package org.apache.iceberg.util; | ||
|
|
||
| import java.nio.ByteBuffer; | ||
| import java.util.Random; | ||
| import java.util.concurrent.TimeUnit; | ||
| import org.openjdk.jmh.annotations.Benchmark; | ||
| import org.openjdk.jmh.annotations.BenchmarkMode; | ||
| import org.openjdk.jmh.annotations.Fork; | ||
| import org.openjdk.jmh.annotations.Measurement; | ||
| import org.openjdk.jmh.annotations.Mode; | ||
| import org.openjdk.jmh.annotations.Scope; | ||
| import org.openjdk.jmh.annotations.Setup; | ||
| import org.openjdk.jmh.annotations.State; | ||
| import org.openjdk.jmh.annotations.Threads; | ||
| import org.openjdk.jmh.annotations.Timeout; | ||
| import org.openjdk.jmh.infra.Blackhole; | ||
|
|
||
| @Fork(1) | ||
| @State(Scope.Benchmark) | ||
| @Measurement(iterations = 5) | ||
| @BenchmarkMode(Mode.SingleShotTime) | ||
| @Timeout(time = 1000, timeUnit = TimeUnit.HOURS) | ||
| public class ZOrderByteUtilsBenchmark { | ||
|
|
||
| private static final int NUM_ENTRIES = 10000000; | ||
|
|
||
| private byte[][][] fourColumnInput; | ||
| private byte[][][] threeColumnInput; | ||
| private byte[][][] twoColumnInput; | ||
|
|
||
| @Setup | ||
| public void setupBench() { | ||
| Random rand = new Random(42); | ||
| fourColumnInput = new byte[NUM_ENTRIES][][]; | ||
| threeColumnInput = new byte[NUM_ENTRIES][][]; | ||
| twoColumnInput = new byte[NUM_ENTRIES][][]; | ||
| for (int i = 0; i < NUM_ENTRIES; i++) { | ||
| fourColumnInput[i] = new byte[4][]; | ||
| threeColumnInput[i] = new byte[3][]; | ||
| twoColumnInput[i] = new byte[2][]; | ||
| for (int j = 0; j < 4; j++) { | ||
| byte[] value = ByteBuffer.allocate(Long.BYTES).putLong(rand.nextLong()).array(); | ||
| if (j < 2) { | ||
| twoColumnInput[i][j] = value; | ||
| } | ||
| if (j < 3) { | ||
| threeColumnInput[i][j] = value; | ||
| } | ||
| fourColumnInput[i][j] = value; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Benchmark | ||
| @Threads(1) | ||
| public void interleaveValuesFourColumns(Blackhole blackhole) { | ||
| int outputSize = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE * 4; | ||
| ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); | ||
|
|
||
| for (int i = 0; i < fourColumnInput.length; i++) { | ||
| byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(fourColumnInput[i], outputSize, outputBuffer); | ||
| blackhole.consume(interleavedBytes); | ||
| } | ||
| } | ||
|
|
||
| @Benchmark | ||
| @Threads(1) | ||
| public void interleaveValuesThreeColumns(Blackhole blackhole) { | ||
| int outputSize = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE * 3; | ||
| ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); | ||
|
|
||
| for (int i = 0; i < fourColumnInput.length; i++) { | ||
| byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(threeColumnInput[i], outputSize, outputBuffer); | ||
| blackhole.consume(interleavedBytes); | ||
| } | ||
| } | ||
|
|
||
| @Benchmark | ||
| @Threads(1) | ||
| public void interleaveValuesTwoColumns(Blackhole blackhole) { | ||
| int outputSize = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE * 2; | ||
| ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); | ||
|
|
||
| for (int i = 0; i < fourColumnInput.length; i++) { | ||
| byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(twoColumnInput[i], outputSize, outputBuffer); | ||
| blackhole.consume(interleavedBytes); | ||
| } | ||
| } | ||
|
|
||
| @Benchmark | ||
| @Threads(1) | ||
| public void interleaveValuesFourColumns8ByteOutput(Blackhole blackhole) { | ||
| int outputSize = 8; | ||
| ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); | ||
|
|
||
| for (int i = 0; i < fourColumnInput.length; i++) { | ||
| byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(fourColumnInput[i], outputSize, outputBuffer); | ||
| blackhole.consume(interleavedBytes); | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,214 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does this relate to #3960?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see comment below about splitting it out.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep sorry, this pr is built on top of 3960 so hopefully we can get things in faster. |
||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.util; | ||
|
|
||
| import java.nio.ByteBuffer; | ||
| import java.nio.CharBuffer; | ||
| import java.nio.charset.CharsetEncoder; | ||
| import java.nio.charset.StandardCharsets; | ||
| import java.util.Arrays; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
|
|
||
| /** | ||
| * Within Z-Ordering the byte representations of objects being compared must be ordered; | ||
| * this requires several types to be transformed when converted to bytes. The goal is to | ||
| * map objects whose byte representations are not lexicographically ordered into representations | ||
| * that are lexicographically ordered. Bytes produced should be compared lexicographically as | ||
| * unsigned bytes, big-endian. | ||
| * <p> | ||
| * All types except for String are stored within an 8 Byte Buffer | ||
| * <p> | ||
| * Most of these techniques are derived from | ||
| * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ | ||
| * <p> | ||
| * Some implementation is taken from | ||
| * https://github.com/apache/hbase/blob/master/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java | ||
| */ | ||
| public class ZOrderByteUtils { | ||
|
|
||
| public static final int PRIMITIVE_BUFFER_SIZE = 8; | ||
|
|
||
| private ZOrderByteUtils() { | ||
|
|
||
|
||
| } | ||
|
|
||
| static ByteBuffer allocatePrimitiveBuffer() { | ||
| return ByteBuffer.allocate(PRIMITIVE_BUFFER_SIZE); | ||
| } | ||
|
|
||
| /** | ||
| * Signed ints do not have their bytes in magnitude order because of the sign bit. | ||
| * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially | ||
| * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. | ||
| */ | ||
| public static ByteBuffer intToOrderedBytes(int val, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); | ||
| bytes.putLong(((long) val) ^ 0x8000000000000000L); | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} | ||
| */ | ||
| public static ByteBuffer longToOrderedBytes(long val, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); | ||
| bytes.putLong(val ^ 0x8000000000000000L); | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} | ||
| */ | ||
| public static ByteBuffer shortToOrderedBytes(short val, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); | ||
| bytes.putLong(((long) val) ^ 0x8000000000000000L); | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} | ||
| */ | ||
| public static ByteBuffer tinyintToOrderedBytes(byte val, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); | ||
| bytes.putLong(((long) val) ^ 0x8000000000000000L); | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * IEEE 754 : | ||
| * “If two floating-point numbers in the same format are ordered (say, x {@literal <} y), | ||
| * they are ordered the same way when their bits are reinterpreted as sign-magnitude integers.” | ||
| * | ||
| * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically | ||
| * comparable bytes | ||
| */ | ||
| public static ByteBuffer floatToOrderedBytes(float val, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); | ||
| long lval = Double.doubleToLongBits(val); | ||
| lval ^= ((lval >> (Integer.SIZE - 1)) | Long.MIN_VALUE); | ||
| bytes.putLong(lval); | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)} | ||
| */ | ||
| public static ByteBuffer doubleToOrderedBytes(double val, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); | ||
| long lval = Double.doubleToLongBits(val); | ||
| lval ^= ((lval >> (Integer.SIZE - 1)) | Long.MIN_VALUE); | ||
| bytes.putLong(lval); | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * Strings are lexicographically sortable BUT if different byte array lengths will | ||
| * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time). | ||
| * This implementation just uses a set size to for all output byte representations. Truncating longer strings | ||
| * and right padding 0 for shorter strings. | ||
| */ | ||
| public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer reuse, CharsetEncoder encoder) { | ||
| Preconditions.checkArgument(encoder.charset().equals(StandardCharsets.UTF_8), | ||
| "Cannot use an encoder not using UTF_8 as it's Charset"); | ||
|
|
||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, length); | ||
| Arrays.fill(bytes.array(), 0, length, (byte) 0x00); | ||
| if (val != null) { | ||
| CharBuffer inputBuffer = CharBuffer.wrap(val); | ||
| encoder.encode(inputBuffer, bytes, true); | ||
| } | ||
| return bytes; | ||
| } | ||
|
|
||
| /** | ||
| * Return a bytebuffer with the given bytes truncated to length, or filled with 0's to length depending on whether | ||
| * the given bytes are larger or smaller than the given length. | ||
| */ | ||
| public static ByteBuffer byteTruncateOrFill(byte[] val, int length, ByteBuffer reuse) { | ||
| ByteBuffer bytes = ByteBuffers.reuse(reuse, length); | ||
| if (val.length < length) { | ||
| bytes.put(val, 0, val.length); | ||
| Arrays.fill(bytes.array(), val.length, length, (byte) 0x00); | ||
| } else { | ||
| bytes.put(val, 0, length); | ||
| } | ||
| return bytes; | ||
| } | ||
|
|
||
| static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize) { | ||
| return interleaveBits(columnsBinary, interleavedSize, ByteBuffer.allocate(interleavedSize)); | ||
| } | ||
|
|
||
| /** | ||
| * Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent ordering it is | ||
| * required that every column contribute the same number of bytes in each invocation. Bits are interleaved from all | ||
| * columns that have a bit available at that position. Once a Column has no more bits to produce it is skipped in the | ||
| * interleaving. | ||
| * @param columnsBinary an array of ordered byte representations of the columns being ZOrdered | ||
| * @param interleavedSize the number of bytes to use in the output | ||
| * @return the columnbytes interleaved | ||
| */ | ||
| public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize, ByteBuffer reuse) { | ||
| byte[] interleavedBytes = reuse.array(); | ||
| int sourceColumn = 0; | ||
| int sourceByte = 0; | ||
| int sourceBit = 7; | ||
| int interleaveByte = 0; | ||
| int interleaveBit = 7; | ||
|
|
||
| while (interleaveByte < interleavedSize) { | ||
| // Take the source bit from source byte and move it to the output bit position | ||
| interleavedBytes[interleaveByte] |= | ||
| (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >>> sourceBit << interleaveBit; | ||
| --interleaveBit; | ||
|
|
||
| // Check if an output byte has been completed | ||
| if (interleaveBit == -1) { | ||
| // Move to the next output byte | ||
| interleaveByte++; | ||
| // Move to the highest order bit of the new output byte | ||
| interleaveBit = 7; | ||
| } | ||
|
|
||
| // Check if the last output byte has been completed | ||
| if (interleaveByte == interleavedSize) { | ||
| break; | ||
| } | ||
|
|
||
| // Find the next source bit to interleave | ||
| do { | ||
| // Move to next column | ||
| ++sourceColumn; | ||
| if (sourceColumn == columnsBinary.length) { | ||
| // If the last source column was used, reset to next bit of first column | ||
| sourceColumn = 0; | ||
| --sourceBit; | ||
| if (sourceBit == -1) { | ||
| // If the last bit of the source byte was used, reset to the highest bit of the next byte | ||
| sourceByte++; | ||
| sourceBit = 7; | ||
| } | ||
| } | ||
| } while (columnsBinary[sourceColumn].length <= sourceByte); | ||
| } | ||
| return interleavedBytes; | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I remember in the original design zOrder is considered a part of the sort order, so it makes more sense to me for the input here to be
SortOrder instead of String... columns. I guess this will be a shortcut for something like SortOrder.builderFor(schema).zOrder().columns(a,b,c).bitWidth(10) when that interface is out? But how do people define things like string column width to cutoff in this case?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
End users would not define any of the inner parameters for z ordering, Ideally in a follow up version we take some basic statistics from the data set and use that to develop our z ordering. I think providing any additional user config here is essentially just giving users a bunch of very difficult knobs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do you have any current thoughts about how this would be determined by statistics? I always imagined some human decision is still needed to tune the tradeoffs among different configs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For most types my goal would be to bound and the condense the byte representation domain. So for example a column which has many values that translate to
We want to shift to
This involves finding min and max values, binning within that range and shifting left
At least those are my current thoughts