apache · rdblue · Feb 6, 2017 · Feb 7, 2017 · Feb 7, 2017 · Feb 14, 2017
diff --git a/LogicalTypes.md b/LogicalTypes.md
@@ -37,6 +37,8 @@ may require additional metadata fields, as well as rules for those fields.
 `UTF8` may only be used to annotate the binary primitive type and indicates
 that the byte array should be interpreted as a UTF-8 encoded character string.
 
+The sort order used for `UTF8` strings must be `UNSIGNED` byte-wise comparison.
+
 ## Numeric Types
 
 ### Signed Integers
@@ -55,6 +57,8 @@ allows.
 implied by the `int32` and `int64` primitive types if no other annotation is
 present and should be considered optional.
 
+The sort order used for signed integer types must be `SIGNED`.
+
 ### Unsigned Integers
 
 `UINT_8`, `UINT_16`, `UINT_32`, and `UINT_64` annotations can be used to
@@ -70,6 +74,8 @@ allows.
 `UINT_8`, `UINT_16`, and `UINT_32` must annotate an `int32` primitive type and
 `UINT_64` must annotate an `int64` primitive type.
 
+The sort order used for unsigned integer types must be `UNSIGNED`.
+
 ### DECIMAL
 
 `DECIMAL` annotation represents arbitrary-precision signed decimal numbers of
@@ -98,6 +104,15 @@ integer. A precision too large for the underlying type (see below) is an error.
 A `SchemaElement` with the `DECIMAL` `ConvertedType` must also have both
 `scale` and `precision` fields set, even if scale is 0 by default.
 
+The sort order used for `DECIMAL` values must be `SIGNED`. The order must be
+equivalent to signed comparison of decimal values.
+
+If the column uses `int32` or `int64` physical types, then signed comparison of
+the integer values produces the correct ordering. If the physical type is
+fixed, then the correct ordering can be produced by flipping the
+most-significant bit in the first byte and then using unsigned byte-wise
+comparison.
+
 ## Date/Time Types
 
 ### DATE
@@ -106,30 +121,40 @@ A `SchemaElement` with the `DECIMAL` `ConvertedType` must also have both
 annotate an `int32` that stores the number of days from the Unix epoch, 1
 January 1970.
 
+The sort order used for `DATE` is `SIGNED`.
+
 ### TIME\_MILLIS
 
 `TIME_MILLIS` is used for a logical time type with millisecond precision,
 without a date. It must annotate an `int32` that stores the number of
 milliseconds after midnight.
 
+The sort order used for `TIME_MILLIS` is `SIGNED`.
+
 ### TIME\_MICROS
 
 `TIME_MICROS` is used for a logical time type with microsecond precision,
 without a date. It must annotate an `int64` that stores the number of
 microseconds after midnight.
 
+The sort order used for `TIME_MICROS` is `SIGNED`.
+
 ### TIMESTAMP\_MILLIS
 
 `TIMESTAMP_MILLIS` is used for a combined logical date and time type, with
 millisecond precision. It must annotate an `int64` that stores the number of
 milliseconds from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC.
 
+The sort order used for `TIMESTAMP_MILLIS` is `SIGNED`.
+
 ### TIMESTAMP\_MICROS
 
 `TIMESTAMP_MICROS` is used for a combined logical date and time type with
 microsecond precision. It must annotate an `int64` that stores the number of
 microseconds from the Unix epoch, 00:00:00.000000 on 1 January 1970, UTC.
 
+The sort order used for `TIMESTAMP_MICROS` is `SIGNED`.
+
 ### INTERVAL
 
 `INTERVAL` is used for an interval of time. It must annotate a
@@ -144,8 +169,13 @@ example, there is no requirement that a large number of days should be
 expressed as a mix of months and days because there is not a constant
 conversion from days to months.
 
+The sort order used for `INTERVAL` is `UNSIGNED`, produced by sorting by
+the value of months, then days, then milliseconds with unsigned comparison.
+
 ## Embedded Types
 
+Embedded types do not have type-specific orderings.
+
 ### JSON
 
 `JSON` is used for an embedded JSON document. It must annotate a `binary`

diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
@@ -547,6 +547,58 @@ struct RowGroup {
   4: optional list<SortingColumn> sorting_columns
 }
 
+/** Identifier for built-in sort order used to produce min and max values. */
+enum Order {
+  /**
+   * The signed ordering is the order produced by comparing single primitive
+   * values with a signed comparator, or the lexicographic ordering produced by
+   * comparing each byte of a binary or fixed using a signed comparator.
+   *
+   * (A signed comparator uses the most-significant bit as a sign bit; an
+   * unsigned comparator uses the most-significant bit as part of the value's
+   * magnitude. Note that unsigned comparison is not defined for floating
+   * point values.)
+   */
+  SIGNED = 0;
+
+  /**
+   * The unsigned ordering is the order produced by comparing single primitive
+   * values with an unsigned comparator, or the lexicographic ordering produced
+   * by comparing each byte of a binary or fixed using an unsigned comparator.
+   *
+   * (A signed comparator uses the most-significant bit as a sign bit; an
+   * unsigned comparator uses the most-significant bit as part of the value's
+   * magnitude. Note that unsigned comparison is not defined for floating
+   * point values.)
+   */
+  UNSIGNED = 1;
+
+  /**
+   * Identifiers for custom orderings, to be defined in the ColumnOrder struct.
+   *
+   * The enum value below is commented out, so SIGNED and UNSIGNED are the only
+   * values currently defined.
+   */
+  //CUSTOM = 2;
+}
+
+/**
+ * Descriptor for the order used for min, max, and sorting values in a column.
+ */
+struct ColumnOrder {
+  /** The order used for this column */
+  1: required Order order;
+
+  /**
+   * A string that identifies the order for this column. This field should be
+   * set if the order is any value other than SIGNED or UNSIGNED and is used to
+   * identify the actual order used for min, max, and sorting values.
+   *
+   * This identifier should follow one of the following formats:
+   * * 'icu54:<locale-keyword>' - ICU 54 ordering for the ICU 54 locale keyword
+   *
+   * To define order formats other than those listed above, contact the Parquet
+   * list.
+   *
+   * The field declaration below is commented out, so this field is not
+   * currently part of the struct.
+   */
+  //2: optional string custom_order;
+}
+
 /**
  * Description for file metadata
  */
@@ -576,5 +628,14 @@ struct FileMetaData {
    * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
    **/
   6: optional string created_by
+
+  /**
+   * Sort order used for each column in this file.
+   *
+   * If this list is not present, then the order for each column is assumed to
+   * be SIGNED. In addition, min and max values for INTERVAL or DECIMAL stored
+   * as fixed or bytes should be ignored.
+   */
+  7: optional list<ColumnOrder> column_orders;
 }