-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Adds Utility Class for Implementing ZOrdering #3966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
ade1a1f
781a121
fda817c
c6954e6
3f6fc92
5ccf8f4
30c4633
10e561c
74d20a4
848de3b
83586f1
57e1462
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.util; | ||
|
|
||
| import java.util.Arrays; | ||
|
|
||
| /** | ||
| * Within Z-Ordering the byte representations of objects being compared must be ordered, | ||
| * this requires several types to be transformed when converted to bytes. The goal is to | ||
| * map object's whose byte representation are not lexicographically ordered into representations | ||
| * that are lexicographically ordered. | ||
| * Most of these techniques are derived from | ||
| * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ | ||
| */ | ||
| public class ZOrderByteUtils { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It should be documented somewhere the produced byte[] should be compared lexicographically, with bytes as unsigned.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, I will add this |
||
|
|
||
| private ZOrderByteUtils() { | ||
|
|
||
| } | ||
|
|
||
| /** | ||
| * Signed ints do not have their bytes in magnitude order because of the sign bit. | ||
| * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially | ||
| * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. | ||
| */ | ||
| public static byte[] orderIntLikeBytes(byte[] intBytes, int size) { | ||
|
||
| if (intBytes == null) { | ||
| return new byte[size]; | ||
| } | ||
| intBytes[0] = (byte) (intBytes[0] ^ (1 << 7)); | ||
| return intBytes; | ||
| } | ||
|
|
||
| /** | ||
| * IEEE 754 : | ||
| * “If two floating-point numbers in the same format are ordered (say, x \< y), | ||
| * they are ordered the same way when their bits are reinterpreted as sign-magnitude integers.” | ||
| * | ||
| * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically | ||
| * comparable bytes | ||
| */ | ||
| public static byte[] orderFloatLikeBytes(byte[] floatBytes, int size) { | ||
|
||
| if (floatBytes == null) { | ||
| return new byte[size]; | ||
| } | ||
| if ((floatBytes[0] & (1 << 7)) == 0) { | ||
|
||
| // The signed magnitude is positive set the first bit (reversing the sign so positives order after negatives) | ||
| floatBytes[0] = (byte) (floatBytes[0] | (1 << 7)); | ||
| } else { | ||
| // The signed magnitude is negative so flip the first bit (reversing the sign so positives order after negatives) | ||
| // Then flip all remaining bits so numbers with greater negative magnitude come before those | ||
| // with less magnitude (reverse the order) | ||
| for (int i = 0; i < floatBytes.length; i++) { | ||
| floatBytes[i] = (byte) ~floatBytes[i]; | ||
| } | ||
| } | ||
| return floatBytes; | ||
| } | ||
|
|
||
| /** | ||
| * Strings are lexicographically sortable BUT if different byte array lengths will | ||
| * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time). | ||
| * This implementation just uses a set size to for all output byte representations. Truncating longer strings | ||
| * and right padding 0 for shorter strings. | ||
| */ | ||
| public static byte[] orderUTF8LikeBytes(byte[] stringBytes, int size) { | ||
| if (stringBytes == null) { | ||
| return new byte[size]; | ||
| } | ||
| return Arrays.copyOf(stringBytes, size); | ||
| } | ||
|
|
||
| /** | ||
| * Interleave bits using a naive loop. | ||
| * @param columnsBinary an array of byte arrays, none of which are empty | ||
|
||
| * @return their bits interleaved | ||
| */ | ||
| public static byte[] interleaveBits(byte[][] columnsBinary) { | ||
|
||
| int interleavedSize = Arrays.stream(columnsBinary).mapToInt(a -> a.length).sum(); | ||
| byte[] interleavedBytes = new byte[interleavedSize]; | ||
| int sourceBit = 7; | ||
| int sourceByte = 0; | ||
| int sourceColumn = 0; | ||
| int interleaveBit = 7; | ||
| int interleaveByte = 0; | ||
| while (interleaveByte < interleavedSize) { | ||
| // Take what we have, Get the source Bit of the source Byte, move it to the interleaveBit position | ||
| interleavedBytes[interleaveByte] = | ||
| (byte) (interleavedBytes[interleaveByte] | | ||
|
||
| (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >> sourceBit << interleaveBit); | ||
|
||
|
|
||
| if (--interleaveBit == -1) { | ||
| // Finished a byte in our interleave byte array start a new byte | ||
| interleaveByte++; | ||
| interleaveBit = 7; | ||
| } | ||
|
|
||
| // Find next column with a byte we can use | ||
| do { | ||
| if (++sourceColumn == columnsBinary.length) { | ||
|
||
| sourceColumn = 0; | ||
| if (--sourceBit == -1) { | ||
|
||
| sourceByte++; | ||
| sourceBit = 7; | ||
| } | ||
| } | ||
| } while (columnsBinary[sourceColumn].length <= sourceByte && interleaveByte < interleavedSize); | ||
|
||
| } | ||
| return interleavedBytes; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,244 @@ | ||||||||||||||||||||||||||||||||
| /* | ||||||||||||||||||||||||||||||||
| * Licensed to the Apache Software Foundation (ASF) under one | ||||||||||||||||||||||||||||||||
| * or more contributor license agreements. See the NOTICE file | ||||||||||||||||||||||||||||||||
| * distributed with this work for additional information | ||||||||||||||||||||||||||||||||
| * regarding copyright ownership. The ASF licenses this file | ||||||||||||||||||||||||||||||||
| * to you under the Apache License, Version 2.0 (the | ||||||||||||||||||||||||||||||||
| * "License"); you may not use this file except in compliance | ||||||||||||||||||||||||||||||||
| * with the License. You may obtain a copy of the License at | ||||||||||||||||||||||||||||||||
| * | ||||||||||||||||||||||||||||||||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||||||||||||||||||
| * | ||||||||||||||||||||||||||||||||
| * Unless required by applicable law or agreed to in writing, | ||||||||||||||||||||||||||||||||
| * software distributed under the License is distributed on an | ||||||||||||||||||||||||||||||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||||||||||||||||||||||||||||||||
| * KIND, either express or implied. See the License for the | ||||||||||||||||||||||||||||||||
| * specific language governing permissions and limitations | ||||||||||||||||||||||||||||||||
| * under the License. | ||||||||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| package org.apache.iceberg.util; | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| import java.nio.ByteBuffer; | ||||||||||||||||||||||||||||||||
| import java.nio.charset.StandardCharsets; | ||||||||||||||||||||||||||||||||
| import java.util.Arrays; | ||||||||||||||||||||||||||||||||
| import java.util.Random; | ||||||||||||||||||||||||||||||||
| import org.apache.commons.lang3.RandomStringUtils; | ||||||||||||||||||||||||||||||||
| import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes; | ||||||||||||||||||||||||||||||||
| import org.junit.Assert; | ||||||||||||||||||||||||||||||||
| import org.junit.Test; | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| public class TestZOrderByteUtil { | ||||||||||||||||||||||||||||||||
| private static final byte IIIIIIII = (byte) 255; | ||||||||||||||||||||||||||||||||
| private static final byte IOIOIOIO = (byte) 170; | ||||||||||||||||||||||||||||||||
| private static final byte OIOIOIOI = (byte) 85; | ||||||||||||||||||||||||||||||||
| private static final byte OOOOIIII = (byte) 15; | ||||||||||||||||||||||||||||||||
| private static final byte OOOOOOOI = (byte) 1; | ||||||||||||||||||||||||||||||||
| private static final byte OOOOOOOO = (byte) 0; | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private static final int NUM_TESTS = 100000; | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private final Random random = new Random(42); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private String bytesToString(byte[] bytes) { | ||||||||||||||||||||||||||||||||
| StringBuilder result = new StringBuilder(); | ||||||||||||||||||||||||||||||||
| for (byte b : bytes) { | ||||||||||||||||||||||||||||||||
| result.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(' ', '0')); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| return result.toString(); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||||||||
| * Returns a non-0 length byte array | ||||||||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||||||||
| private byte[] generateRandomBytes() { | ||||||||||||||||||||||||||||||||
| int length = Math.abs(random.nextInt(100) + 1); | ||||||||||||||||||||||||||||||||
| byte[] result = new byte[length]; | ||||||||||||||||||||||||||||||||
| random.nextBytes(result); | ||||||||||||||||||||||||||||||||
| return result; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||||||||
| * Test method to ensure correctness of byte interleaving code | ||||||||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||||||||
| private String interleaveStrings(String[] strings) { | ||||||||||||||||||||||||||||||||
| StringBuilder result = new StringBuilder(); | ||||||||||||||||||||||||||||||||
| int totalLength = Arrays.stream(strings).mapToInt(String::length).sum(); | ||||||||||||||||||||||||||||||||
| int substringIndex = 0; | ||||||||||||||||||||||||||||||||
| int characterIndex = 0; | ||||||||||||||||||||||||||||||||
| while (characterIndex < totalLength) { | ||||||||||||||||||||||||||||||||
| for (String str : strings) { | ||||||||||||||||||||||||||||||||
| if (substringIndex < str.length()) { | ||||||||||||||||||||||||||||||||
| result.append(str.charAt(substringIndex)); | ||||||||||||||||||||||||||||||||
| characterIndex++; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| substringIndex++; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| return result.toString(); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||||||||
| * Compares the result of a string based interleaving algorithm implemented above | ||||||||||||||||||||||||||||||||
| * versus the binary bit-shifting algorithm used in ZOrderByteUtils. Either both | ||||||||||||||||||||||||||||||||
| * algorithms are identically wrong or are both identically correct. | ||||||||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testInterleaveRandomExamples() { | ||||||||||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. takes 17-18 seconds to run on my local machine, just wondering whether we want to have such long running unit tests
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd probably
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can cut down the number of example :) I just really wanted to have some kind of fuzzing since I know we are going to want to optimize the interleave example with fancier byte tricks in the future.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed the number of trials run so now this takes a few hundred ms. The longest test in the suite now is the String compare test which is around 1 second on my 2019 MBP |
||||||||||||||||||||||||||||||||
| for (int test = 0; test < NUM_TESTS; test++) { | ||||||||||||||||||||||||||||||||
| int numByteArrays = Math.abs(random.nextInt(6)) + 1; | ||||||||||||||||||||||||||||||||
| byte[][] testBytes = new byte[numByteArrays][]; | ||||||||||||||||||||||||||||||||
| String[] testStrings = new String[numByteArrays]; | ||||||||||||||||||||||||||||||||
| for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) { | ||||||||||||||||||||||||||||||||
| testBytes[byteIndex] = generateRandomBytes(); | ||||||||||||||||||||||||||||||||
| testStrings[byteIndex] = bytesToString(testBytes[byteIndex]); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| byte[] byteResult = ZOrderByteUtils.interleaveBits(testBytes); | ||||||||||||||||||||||||||||||||
| String byteResultAsString = bytesToString(byteResult); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| String stringResult = interleaveStrings(testStrings); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertEquals("String interleave didn't match byte interleave", stringResult, byteResultAsString); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testInterleaveEmptyBits() { | ||||||||||||||||||||||||||||||||
| byte[][] test = new byte[4][10]; | ||||||||||||||||||||||||||||||||
| byte[] expected = new byte[40]; | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertArrayEquals("Should combine empty arrays", | ||||||||||||||||||||||||||||||||
| expected, ZOrderByteUtils.interleaveBits(test)); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testInterleaveFullBits() { | ||||||||||||||||||||||||||||||||
| byte[][] test = new byte[4][]; | ||||||||||||||||||||||||||||||||
| test[0] = new byte[]{IIIIIIII, IIIIIIII}; | ||||||||||||||||||||||||||||||||
| test[1] = new byte[]{IIIIIIII}; | ||||||||||||||||||||||||||||||||
| test[2] = new byte[0]; | ||||||||||||||||||||||||||||||||
| test[3] = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII}; | ||||||||||||||||||||||||||||||||
| byte[] expected = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII}; | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertArrayEquals("Should combine full arrays", | ||||||||||||||||||||||||||||||||
| expected, ZOrderByteUtils.interleaveBits(test)); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testInterleaveMixedBits() { | ||||||||||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure that this is the correct behavior in all cases. I think that this is the correct behavior for strings because the length of the string doesn't matter. short v1 = 0xFFFF
int v2 = 0x0000FFFF
byte[] result = ZOrderByteUtils.interleaveBits(new byte[] { shortToOrderedBytes(v1), intToOrderedBytes(v2) }Result is going to be: This may be handy when trying to handle magnitude problems, assuming that the magnitude is always reflected in the type. But you could easily have a case where v1 and v2 are both in the range of
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
nit: I think the result fails to account for
Not sure if the nit above addresses your concerns and what exactly you mean by not a useful z-order, could you expand on them? My understanding is that as long as the input bytes that get interleaved have an equivalent lexicographic order to the original inputs it is a z-order and should maintain the clustering properties z-orders provide, if you then compare the output bytes lexicographically. The only other principled option I could see here would be to zero pad all numeric values to the same number of bytes, but without thinking too deeply about it I think this just takes extra space.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (strings are always expected to be padded to a fixed length)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thinking about this some more, I think the padding (casting to highest bit-width) is the right thing to do (below are tables showing 1 x 2 bit examples (I think what @rdblue meant by useful vs not-useful can be seen by following). The top and outer left most columns represent the input numbers Existing (this is essentially normal sorting by each element):
With padding:
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have been talking about this a bit, but I think the only way to really solve this is to know the range of all column values before getting their binary values and shifting then. I think at this point in the algorithm (during interleaving) we don't have enough information to know whether we can bit shift a value into higher order bits but I do agree eventually we should have a representation that packs the information into the highest bits possible. Ideally I think this is a responsibility of the "toOrderedBinary" which I would like to have a "min/max" parameters in the future so we can shift and tighten the domain if possible.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I agree. I think this might imply that we always want an equal number of bits on each input to interleave (need to think about this some more)? I guess falling back to traditional sorting isn't a bad fallback but we are wasting compute to make that happen.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, I didn't do the sign bit flip, but the result is pretty much the same. All of the relevant bits of the short are interleaved with the irrelevant bits of the int and you effectively get an expensive way to sort by the short and then by the int. I think it makes the most sense to cast everything to the widest type. I think that is the behavior most people would expect: if you zorder a short with an int, it happens by magnitude not bit position.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have the general feeling that different columns sharing the same magnitude values but different types is more of a special case but I don't have any empirical evidence for that. I'm fine with doing the upcasting but maybe it makes more sense to add it in as part of a more complicated builder as suggested by @emkornfield. Like perhaps we have a class that takes all the column types (and in the future stats?) and it chooses the correct ordered byte representations and returns a ZOrder method for those specific input parameters.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's another simple option that I think solves an additional challenge... I'd propose that we cast everything to float so we don't have to worry about choosing a type and converting everything to it. That also gives us a more compact representation that is better for packing multiple columns. That's because floating point values are represented as scientific notation with base-2. The exponent encodes how many positions the fraction was shifted so that its first digit is a 1. Alternatively, you can think of it as encoding how many bits at the start of the number are 0, then storing a normalized fraction. That effectively compresses the most significant parts of a number into the highest bits. For float32, there's a sign bit, 8 bits of exponent, and 23 bits of fraction (mantissa). Converting all the values to floating point would solve the immediate problem because all numbers would use the same representation, giving what I think is the expected behavior (same result regardless of type). But it also means that we can fit more columns in 32 bits or 64 bits, which are going to be much faster for comparison once we pass them off to Spark because we could use native ints or longs rather than byte arrays. For example, if we took 4 integers and interleaved them, it would take 128 bits. But if we convert those to float first, then the most significant parts can fit in half that: 16 bits per value can carry the sign bit, the full 8-bit exponent, and 7 bits of the fraction. That's about 128 partitions of the number space per exponent value. If you don't want to lose so much of the value, then fewer columns do better. And we can always fall back to producing the full byte array if we want. What do you guys think? |
||||||||||||||||||||||||||||||||
| byte[][] test = new byte[4][]; | ||||||||||||||||||||||||||||||||
| test[0] = new byte[]{OOOOOOOI, IIIIIIII, OOOOOOOO, OOOOIIII}; | ||||||||||||||||||||||||||||||||
| test[1] = new byte[]{OOOOOOOI, OOOOOOOO, IIIIIIII}; | ||||||||||||||||||||||||||||||||
| test[2] = new byte[]{OOOOOOOI}; | ||||||||||||||||||||||||||||||||
| test[3] = new byte[]{OOOOOOOI}; | ||||||||||||||||||||||||||||||||
| byte[] expected = new byte[]{ | ||||||||||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could argue that the last byte of
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My base implementation of this for spark just uses a fixed byte size for strings since each column has to contribute the same number of bytes every time. I put this responsibility in the toOrderedBytes function and not in the interleave itself. We need all entries from the same column to be the same length for the following situation Imagine columns A,B, and C where A is between 0 and 2 bits, but B and C are always 2 bits. You could get ZValues like (AA, BB, CC) - >ABCABC The sorting is now dependent on the length of the column value and not the value itself. Shorter A's being clustered differently than longer A's. As for in the trailing situation, I think it's worthwhile to keep those bits as well since we can just use them for hierarchal sort within clusters. You can imagine the interleave bits creating multidimensional clustered groups and the final bits producing a hierarchal sort within those groups based on the unshared portion of the longest element. I do think in the future we may want to save space here and then we could trim the max length of a contributed byte array to the length of the second longest contributor.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thing you could do since it doesn't matter is just break out of the interleave loop if moving to the next column doesn't change the column index. It would be nice to not allocate the space as well.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think a good way to do this may be to change the api slightly to public static byte[] interleaveBits(byte[][] columnsBinary) {
return interleaveBits(columnsBinary,
Arrays.stream(columnsBinary).mapToInt(column -> column.length).max().getAsInt());
}
/**
* Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent ordering it is
* required that every column contribute the same number of bytes in each invocation. Bits are interleaved from all
* columns that have a bit available at that position. Once a Column has no more bits to produce it is skipped in the
* interleaving.
* @param columnsBinary an array of ordered byte representations of the columns being ZOrdered
* @param interleavedSize the number of bytes to use in the output
* @return the columnbytes interleaved
*/
public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize) {So that instead of calculating the output-size, the caller can just pass in the number of bytes it wants out of the algorithm. This handles bailing out earlier as well as not allocating as much. I can leave the original signature for the test suite.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 to asking for a specific interleaved size. You could also have versions that produce int or long if you only want 4 or 8 bytes. That would work really well with Spark because you wouldn't have to store this in the variable-length part of UnsafeRow. |
||||||||||||||||||||||||||||||||
| OOOOOOOO, OOOOOOOO, OOOOOOOO, OOOOIIII, | ||||||||||||||||||||||||||||||||
| IOIOIOIO, IOIOIOIO, | ||||||||||||||||||||||||||||||||
| OIOIOIOI, OIOIOIOI, | ||||||||||||||||||||||||||||||||
| OOOOIIII}; | ||||||||||||||||||||||||||||||||
| Assert.assertArrayEquals("Should combine mixed byte arrays", | ||||||||||||||||||||||||||||||||
| expected, ZOrderByteUtils.interleaveBits(test)); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testIntOrdering() { | ||||||||||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good tests. |
||||||||||||||||||||||||||||||||
| for (int i = 0; i < NUM_TESTS; i++) { | ||||||||||||||||||||||||||||||||
| int aInt = random.nextInt(); | ||||||||||||||||||||||||||||||||
| int bInt = random.nextInt(); | ||||||||||||||||||||||||||||||||
| int intCompare = Integer.compare(aInt, bInt); | ||||||||||||||||||||||||||||||||
| byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aInt), 4); | ||||||||||||||||||||||||||||||||
| byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bInt), 4); | ||||||||||||||||||||||||||||||||
| int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertTrue(String.format( | ||||||||||||||||||||||||||||||||
| "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", | ||||||||||||||||||||||||||||||||
| aInt, bInt, intCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), | ||||||||||||||||||||||||||||||||
| (intCompare ^ byteCompare) >= 0); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testLongOrdering() { | ||||||||||||||||||||||||||||||||
| for (int i = 0; i < NUM_TESTS; i++) { | ||||||||||||||||||||||||||||||||
| long aLong = random.nextInt(); | ||||||||||||||||||||||||||||||||
| long bLong = random.nextInt(); | ||||||||||||||||||||||||||||||||
| int longCompare = Long.compare(aLong, bLong); | ||||||||||||||||||||||||||||||||
| byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aLong), 8); | ||||||||||||||||||||||||||||||||
| byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bLong), 8); | ||||||||||||||||||||||||||||||||
| int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertTrue(String.format( | ||||||||||||||||||||||||||||||||
| "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", | ||||||||||||||||||||||||||||||||
| aLong, bLong, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), | ||||||||||||||||||||||||||||||||
| (longCompare ^ byteCompare) >= 0); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testFloatOrdering() { | ||||||||||||||||||||||||||||||||
| for (int i = 0; i < NUM_TESTS; i++) { | ||||||||||||||||||||||||||||||||
| float aFloat = random.nextFloat(); | ||||||||||||||||||||||||||||||||
| float bFloat = random.nextFloat(); | ||||||||||||||||||||||||||||||||
| int floatCompare = Float.compare(aFloat, bFloat); | ||||||||||||||||||||||||||||||||
| byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aFloat), 4); | ||||||||||||||||||||||||||||||||
| byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bFloat), 4); | ||||||||||||||||||||||||||||||||
| int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertTrue(String.format( | ||||||||||||||||||||||||||||||||
| "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", | ||||||||||||||||||||||||||||||||
| aFloat, bFloat, floatCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), | ||||||||||||||||||||||||||||||||
| (floatCompare ^ byteCompare) >= 0); | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testDoubleOrdering() { | ||||||||||||||||||||||||||||||||
| for (int i = 0; i < NUM_TESTS; i++) { | ||||||||||||||||||||||||||||||||
| double aDouble = random.nextDouble(); | ||||||||||||||||||||||||||||||||
| double bDouble = random.nextDouble(); | ||||||||||||||||||||||||||||||||
| int doubleCompare = Double.compare(aDouble, bDouble); | ||||||||||||||||||||||||||||||||
| byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aDouble), 8); | ||||||||||||||||||||||||||||||||
| byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bDouble), 8); | ||||||||||||||||||||||||||||||||
| int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertTrue(String.format( | ||||||||||||||||||||||||||||||||
| "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", | ||||||||||||||||||||||||||||||||
| aDouble, bDouble, doubleCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), | ||||||||||||||||||||||||||||||||
| (doubleCompare ^ byteCompare) >= 0); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||||||||
| public void testStringOrdering() { | ||||||||||||||||||||||||||||||||
| for (int i = 0; i < NUM_TESTS; i++) { | ||||||||||||||||||||||||||||||||
| String aString = RandomStringUtils.random(random.nextInt(35), true, true); | ||||||||||||||||||||||||||||||||
| String bString = RandomStringUtils.random(random.nextInt(35), true, true); | ||||||||||||||||||||||||||||||||
| int stringCompare = aString.compareTo(bString); | ||||||||||||||||||||||||||||||||
| byte[] aBytes = ZOrderByteUtils.orderUTF8LikeBytes(aString.getBytes(StandardCharsets.UTF_8), 128); | ||||||||||||||||||||||||||||||||
| byte[] bBytes = ZOrderByteUtils.orderUTF8LikeBytes(bString.getBytes(StandardCharsets.UTF_8), 128); | ||||||||||||||||||||||||||||||||
| int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Assert.assertTrue(String.format( | ||||||||||||||||||||||||||||||||
| "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", | ||||||||||||||||||||||||||||||||
| aString, bString, stringCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), | ||||||||||||||||||||||||||||||||
| (stringCompare ^ byteCompare) >= 0); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private byte[] bytesOf(int num) { | ||||||||||||||||||||||||||||||||
| return ByteBuffer.allocate(4).putInt(num).array(); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private byte[] bytesOf(long num) { | ||||||||||||||||||||||||||||||||
| return ByteBuffer.allocate(8).putLong(num).array(); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private byte[] bytesOf(float num) { | ||||||||||||||||||||||||||||||||
| return ByteBuffer.allocate(4).putFloat(num).array(); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| private byte[] bytesOf(double num) { | ||||||||||||||||||||||||||||||||
| return ByteBuffer.allocate(8).putDouble(num).array(); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.