Skip to content

Commit 60095a4

Browse files
csringhofer authored and Impala Public Jenkins committed
IMPALA-5050: Add support to read TIMESTAMP_MILLIS and TIMESTAMP_MICROS from Parquet
Changes:
- parquet.thrift is updated to a newer version which contains the timestamp logical type.
- INT64 columns with converted types TIMESTAMP_MILLIS and TIMESTAMP_MICROS can be read as TIMESTAMP.
- If the logical type is timestamp, then the type will contain the information whether the UTC->local conversion is necessary. This feature is only supported for the new timestamp types, so INT96 timestamps must still use flag convert_legacy_hive_parquet_utc_timestamps.
- Min/max stat filtering is enabled again for columns that need UTC->local conversion. This was disabled in IMPALA-7559 because it could incorrectly drop column chunks.
- CREATE TABLE LIKE PARQUET converts these columns to TIMESTAMP - before the change, an error was returned instead.
- Bulk of the Parquet column stat logic was moved to a new class called "ColumnStatsReader".

Testing:
- Added unit tests for timezone conversion (this needed a new public function in timezone_db.h and adding CET to tzdb_tiny).
- Added parquet files (created with parquet-mr) with int64 timestamp columns.

Change-Id: I4c7c01fffa31b3d2ca3480adf6ff851137dadac3
Reviewed-on: http://gerrit.cloudera.org:8080/11057
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
1 parent c75b371 commit 60095a4

24 files changed

Lines changed: 927 additions & 209 deletions

be/src/exec/hdfs-parquet-scanner.cc

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -453,21 +453,6 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
453453

454454
if (!state_->query_options().parquet_read_statistics) return Status::OK();
455455

456-
// IMPALA-7559: if the values are converted from UTC to local time, then either the
457-
// stats need to be converted from UTC to local, or the predicate's min/max values
458-
// need to be converted from local to UTC. Doing this correctly is quite complex if
459-
// the timestamps fall into timezone rules changes (DST change or historical rule
460-
// change), so currently stat filtering is simply disabled for these columns.
461-
//
462-
// Note that parquet-mr only writes stats if min and max are equal, because it cannot
463-
// order timestamps correctly, so the only case affected here is when every value is
464-
// the same in the column chunk.
465-
// TODO: This topic needs more investigation related to IMPALA-5050, which will add
466-
// support for INT64 millisec/microsec timestamp columns, and also a metadata field
467-
// whether utc->local conversion is necessary. I am not sure how parquet-mr handles
468-
// stats for these types at the moment.
469-
bool disable_min_max_filter_for_timestamps = IsTimezoneConversionNeededForTimestamps();
470-
471456
const TupleDescriptor* min_max_tuple_desc = scan_node_->min_max_tuple_desc();
472457
if (!min_max_tuple_desc) return Status::OK();
473458

@@ -516,30 +501,36 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
516501
const parquet::ColumnChunk& col_chunk = row_group.columns[col_idx];
517502
const ColumnType& col_type = slot_desc->type();
518503

504+
DCHECK(node->element != nullptr);
505+
506+
ColumnStatsReader stat_reader(col_chunk, col_type, col_order, *node->element);
507+
if (col_type.IsTimestampType()) {
508+
stat_reader.SetTimestampDecoder(CreateTimestampDecoder(*node->element));
509+
}
510+
519511
int64_t null_count = 0;
520-
bool null_count_result = ColumnStatsBase::ReadNullCountStat(col_chunk, &null_count);
512+
bool null_count_result = stat_reader.ReadNullCountStat(&null_count);
521513
if (null_count_result && null_count == col_chunk.meta_data.num_values) {
522514
*skip_row_group = true;
523515
break;
524516
}
525517

526-
if (col_type.IsTimestampType() && disable_min_max_filter_for_timestamps) continue;
527-
528-
bool stats_read = false;
529-
void* slot = min_max_tuple_->GetSlot(slot_desc->tuple_offset());
530518
const string& fn_name = eval->root().function_name();
519+
ColumnStatsReader::StatsField stats_field;
531520
if (fn_name == "lt" || fn_name == "le") {
532521
// We need to get min stats.
533-
stats_read = ColumnStatsBase::ReadFromThrift(
534-
col_chunk, col_type, col_order, ColumnStatsBase::StatsField::MIN, slot);
522+
stats_field = ColumnStatsReader::StatsField::MIN;
535523
} else if (fn_name == "gt" || fn_name == "ge") {
536524
// We need to get max stats.
537-
stats_read = ColumnStatsBase::ReadFromThrift(
538-
col_chunk, col_type, col_order, ColumnStatsBase::StatsField::MAX, slot);
525+
stats_field = ColumnStatsReader::StatsField::MAX;
539526
} else {
540527
DCHECK(false) << "Unsupported function name for statistics evaluation: " << fn_name;
528+
continue;
541529
}
542530

531+
void* slot = min_max_tuple_->GetSlot(slot_desc->tuple_offset());
532+
bool stats_read = stat_reader.ReadFromThrift(stats_field, slot);
533+
543534
if (stats_read) {
544535
TupleRow row;
545536
row.SetTuple(0, min_max_tuple_);
@@ -1677,9 +1668,13 @@ Status HdfsParquetScanner::ValidateEndOfRowGroup(
16771668
return Status::OK();
16781669
}
16791670

1680-
bool HdfsParquetScanner::IsTimezoneConversionNeededForTimestamps() {
1681-
return FLAGS_convert_legacy_hive_parquet_utc_timestamps &&
1671+
ParquetTimestampDecoder HdfsParquetScanner::CreateTimestampDecoder(
1672+
const parquet::SchemaElement& element) {
1673+
bool timestamp_conversion_needed_for_int96_timestamps =
1674+
FLAGS_convert_legacy_hive_parquet_utc_timestamps &&
16821675
file_version_.application == "parquet-mr";
1683-
}
16841676

1677+
return ParquetTimestampDecoder(element, &state_->local_time_zone(),
1678+
timestamp_conversion_needed_for_int96_timestamps);
1679+
}
16851680
}

be/src/exec/hdfs-parquet-scanner.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -343,9 +343,9 @@ class HdfsParquetScanner : public HdfsScanner {
343343
llvm::Function** process_scratch_batch_fn)
344344
WARN_UNUSED_RESULT;
345345

346-
/// Returns true if the timestamps are expected to be in UTC and need to be
347-
/// converted to local time.
348-
bool IsTimezoneConversionNeededForTimestamps();
346+
/// Initializes a ParquetTimestampDecoder depending on writer, timezone, and the schema
347+
/// of the column.
348+
ParquetTimestampDecoder CreateTimestampDecoder(const parquet::SchemaElement& element);
349349

350350
/// The rep and def levels are set to this value to indicate the end of a row group.
351351
static const int16_t ROW_GROUP_END = numeric_limits<int16_t>::min();

be/src/exec/parquet-column-readers.cc

Lines changed: 108 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -372,16 +372,15 @@ class ScalarColumnReader : public BaseScalarColumnReader {
372372
/// the max length for VARCHAR columns. Unused otherwise.
373373
int fixed_len_size_;
374374

375-
/// Query-global timezone used as local timezone when executing the query.
376-
const Timezone& local_time_zone_;
375+
/// Contains extra data needed for Timestamp decoding.
376+
ParquetTimestampDecoder timestamp_decoder_;
377377
};
378378

379379
template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
380380
ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ScalarColumnReader(
381381
HdfsParquetScanner* parent, const SchemaNode& node, const SlotDescriptor* slot_desc)
382382
: BaseScalarColumnReader(parent, node, slot_desc),
383-
dict_decoder_(parent->scan_node_->mem_tracker()),
384-
local_time_zone_(parent->state_->local_time_zone()) {
383+
dict_decoder_(parent->scan_node_->mem_tracker()) {
385384
if (!MATERIALIZED) {
386385
// We're not materializing any values, just counting them. No need (or ability) to
387386
// initialize state used to materialize values.
@@ -399,9 +398,14 @@ ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ScalarColumnReader
399398
} else {
400399
fixed_len_size_ = -1;
401400
}
402-
needs_conversion_ = slot_desc_->type().type == TYPE_CHAR ||
403-
(slot_desc_->type().type == TYPE_TIMESTAMP &&
404-
parent->IsTimezoneConversionNeededForTimestamps());
401+
402+
needs_conversion_ = slot_desc_->type().type == TYPE_CHAR;
403+
404+
if (slot_desc_->type().type == TYPE_TIMESTAMP) {
405+
timestamp_decoder_ = parent->CreateTimestampDecoder(*node.element);
406+
dict_decoder_.SetTimestampHelper(timestamp_decoder_);
407+
needs_conversion_ = timestamp_decoder_.NeedsConversion();
408+
}
405409
}
406410

407411
template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
@@ -641,6 +645,30 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::DecodeValue(
641645
return true;
642646
}
643647

648+
template <>
649+
template <Encoding::type ENCODING>
650+
bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true>::DecodeValue(
651+
uint8_t** RESTRICT data, const uint8_t* RESTRICT data_end,
652+
TimestampValue* RESTRICT val) RESTRICT {
653+
DCHECK_EQ(page_encoding_, ENCODING);
654+
if (ENCODING == Encoding::PLAIN_DICTIONARY) {
655+
if (UNLIKELY(!dict_decoder_.GetNextValue(val))) {
656+
SetDictDecodeError();
657+
return false;
658+
}
659+
} else {
660+
DCHECK_EQ(ENCODING, Encoding::PLAIN);
661+
int encoded_len =
662+
timestamp_decoder_.Decode<parquet::Type::INT64>(*data, data_end, val);
663+
if (UNLIKELY(encoded_len < 0)) {
664+
SetPlainDecodeError();
665+
return false;
666+
}
667+
*data += encoded_len;
668+
}
669+
return true;
670+
}
671+
644672
template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
645673
void ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>
646674
::ReadPositionBatched(int16_t rep_level, int64_t* pos) {
@@ -674,14 +702,31 @@ ::NeedsConversionInline() const {
674702
return needs_conversion_;
675703
}
676704

705+
template <>
706+
inline bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true>
707+
::NeedsConversionInline() const {
708+
return needs_conversion_;
709+
}
710+
677711
template <>
678712
bool ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>::ConvertSlot(
679713
const TimestampValue* src, void* slot) {
680714
// Conversion should only happen when this flag is enabled.
681715
DCHECK(FLAGS_convert_legacy_hive_parquet_utc_timestamps);
716+
DCHECK(timestamp_decoder_.NeedsConversion());
682717
TimestampValue* dst_ts = reinterpret_cast<TimestampValue*>(slot);
683718
*dst_ts = *src;
684-
if (dst_ts->HasDateAndTime()) dst_ts->UtcToLocal(local_time_zone_);
719+
timestamp_decoder_.ConvertToLocalTime(dst_ts);
720+
return true;
721+
}
722+
723+
template <>
724+
bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true>::ConvertSlot(
725+
const TimestampValue* src, void* slot) {
726+
DCHECK(timestamp_decoder_.NeedsConversion());
727+
TimestampValue* dst_ts = reinterpret_cast<TimestampValue*>(slot);
728+
*dst_ts = *src;
729+
timestamp_decoder_.ConvertToLocalTime(static_cast<TimestampValue*>(dst_ts));
685730
return true;
686731
}
687732

@@ -691,6 +736,12 @@ ::NeedsValidationInline() const {
691736
return true;
692737
}
693738

739+
template <>
740+
inline bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true>
741+
::NeedsValidationInline() const {
742+
return true;
743+
}
744+
694745
template <>
695746
bool ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>::ValidateValue(
696747
TimestampValue* val) const {
@@ -711,6 +762,23 @@ bool ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>::ValidateVal
711762
return true;
712763
}
713764

765+
template <>
766+
bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true>::ValidateValue(
767+
TimestampValue* val) const {
768+
// The range was already checked during the int64_t->TimestampValue conversion, which
769+
// sets the date to invalid if it was out of range.
770+
if (UNLIKELY(!val->HasDate())) {
771+
ErrorMsg msg(TErrorCode::PARQUET_TIMESTAMP_OUT_OF_RANGE,
772+
filename(), node_.element->name);
773+
Status status = parent_->state_->LogOrReturnError(msg);
774+
if (!status.ok()) parent_->parse_status_ = status;
775+
return false;
776+
}
777+
DCHECK(TimestampValue::IsValidDate(val->date()));
778+
DCHECK(TimestampValue::IsValidTime(val->time()));
779+
return true;
780+
}
781+
714782
class BoolColumnReader : public BaseScalarColumnReader {
715783
public:
716784
BoolColumnReader(HdfsParquetScanner* parent, const SchemaNode& node,
@@ -1507,7 +1575,7 @@ void CollectionColumnReader::UpdateDerivedState() {
15071575
}
15081576

15091577
/// Returns a column reader for decimal types based on its size and parquet type.
1510-
static ParquetColumnReader* GetDecimalColumnReader(const SchemaNode& node,
1578+
static ParquetColumnReader* CreateDecimalColumnReader(const SchemaNode& node,
15111579
const SlotDescriptor* slot_desc, HdfsParquetScanner* parent) {
15121580
switch (node.element->type) {
15131581
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
@@ -1554,84 +1622,82 @@ static ParquetColumnReader* GetDecimalColumnReader(const SchemaNode& node,
15541622
ParquetColumnReader* ParquetColumnReader::Create(const SchemaNode& node,
15551623
bool is_collection_field, const SlotDescriptor* slot_desc,
15561624
HdfsParquetScanner* parent) {
1557-
ParquetColumnReader* reader = nullptr;
15581625
if (is_collection_field) {
15591626
// Create collection reader (note this handles both NULL and non-NULL 'slot_desc')
1560-
reader = new CollectionColumnReader(parent, node, slot_desc);
1627+
return new CollectionColumnReader(parent, node, slot_desc);
15611628
} else if (slot_desc != nullptr) {
15621629
// Create the appropriate ScalarColumnReader type to read values into 'slot_desc'
15631630
switch (slot_desc->type().type) {
15641631
case TYPE_BOOLEAN:
1565-
reader = new BoolColumnReader(parent, node, slot_desc);
1566-
break;
1632+
return new BoolColumnReader(parent, node, slot_desc);
15671633
case TYPE_TINYINT:
1568-
reader = new ScalarColumnReader<int8_t, parquet::Type::INT32, true>(parent, node,
1634+
return new ScalarColumnReader<int8_t, parquet::Type::INT32, true>(parent, node,
15691635
slot_desc);
1570-
break;
15711636
case TYPE_SMALLINT:
1572-
reader = new ScalarColumnReader<int16_t, parquet::Type::INT32, true>(parent, node,
1637+
return new ScalarColumnReader<int16_t, parquet::Type::INT32, true>(parent, node,
15731638
slot_desc);
1574-
break;
15751639
case TYPE_INT:
1576-
reader = new ScalarColumnReader<int32_t, parquet::Type::INT32, true>(parent, node,
1640+
return new ScalarColumnReader<int32_t, parquet::Type::INT32, true>(parent, node,
15771641
slot_desc);
1578-
break;
15791642
case TYPE_BIGINT:
15801643
switch (node.element->type) {
15811644
case parquet::Type::INT32:
1582-
reader = new ScalarColumnReader<int64_t, parquet::Type::INT32, true>(parent,
1645+
return new ScalarColumnReader<int64_t, parquet::Type::INT32, true>(parent,
15831646
node, slot_desc);
1584-
break;
15851647
default:
1586-
reader = new ScalarColumnReader<int64_t, parquet::Type::INT64, true>(parent,
1648+
return new ScalarColumnReader<int64_t, parquet::Type::INT64, true>(parent,
15871649
node, slot_desc);
1588-
break;
15891650
}
1590-
break;
15911651
case TYPE_FLOAT:
1592-
reader = new ScalarColumnReader<float, parquet::Type::FLOAT, true>(parent, node,
1652+
return new ScalarColumnReader<float, parquet::Type::FLOAT, true>(parent, node,
15931653
slot_desc);
1594-
break;
15951654
case TYPE_DOUBLE:
15961655
switch (node.element->type) {
15971656
case parquet::Type::INT32:
1598-
reader = new ScalarColumnReader<double , parquet::Type::INT32, true>(parent,
1657+
return new ScalarColumnReader<double , parquet::Type::INT32, true>(parent,
15991658
node, slot_desc);
1600-
break;
16011659
case parquet::Type::FLOAT:
1602-
reader = new ScalarColumnReader<double, parquet::Type::FLOAT, true>(parent,
1660+
return new ScalarColumnReader<double, parquet::Type::FLOAT, true>(parent,
16031661
node, slot_desc);
1604-
break;
16051662
default:
1606-
reader = new ScalarColumnReader<double, parquet::Type::DOUBLE, true>(parent,
1663+
return new ScalarColumnReader<double, parquet::Type::DOUBLE, true>(parent,
16071664
node, slot_desc);
1608-
break;
16091665
}
1610-
break;
16111666
case TYPE_TIMESTAMP:
1612-
reader = new ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>(
1613-
parent, node, slot_desc);
1614-
break;
1667+
return CreateTimestampColumnReader(node, slot_desc, parent);
16151668
case TYPE_STRING:
16161669
case TYPE_VARCHAR:
16171670
case TYPE_CHAR:
1618-
reader = new ScalarColumnReader<StringValue, parquet::Type::BYTE_ARRAY, true>(
1671+
return new ScalarColumnReader<StringValue, parquet::Type::BYTE_ARRAY, true>(
16191672
parent, node, slot_desc);
1620-
break;
16211673
case TYPE_DECIMAL:
1622-
reader = GetDecimalColumnReader(node, slot_desc, parent);
1623-
break;
1674+
return CreateDecimalColumnReader(node, slot_desc, parent);
16241675
default:
16251676
DCHECK(false) << slot_desc->type().DebugString();
1677+
return nullptr;
16261678
}
16271679
} else {
16281680
// Special case for counting scalar values (e.g. count(*), no materialized columns in
16291681
// the file, only materializing a position slot). We won't actually read any values,
16301682
// only the rep and def levels, so it doesn't matter what kind of reader we make.
1631-
reader = new ScalarColumnReader<int8_t, parquet::Type::INT32, false>(parent, node,
1683+
return new ScalarColumnReader<int8_t, parquet::Type::INT32, false>(parent, node,
16321684
slot_desc);
16331685
}
1634-
return parent->obj_pool_.Add(reader);
1686+
}
1687+
1688+
ParquetColumnReader* ParquetColumnReader::CreateTimestampColumnReader(
1689+
const SchemaNode& node, const SlotDescriptor* slot_desc,
1690+
HdfsParquetScanner* parent) {
1691+
if (node.element->type == parquet::Type::INT96) {
1692+
return new ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>(
1693+
parent, node, slot_desc);
1694+
}
1695+
else if (node.element->type == parquet::Type::INT64) {
1696+
return new ScalarColumnReader<TimestampValue, parquet::Type::INT64, true>(
1697+
parent, node, slot_desc);
1698+
}
1699+
DCHECK(false) << slot_desc->type().DebugString();
1700+
return nullptr;
16351701
}
16361702

16371703
}

be/src/exec/parquet-column-readers.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,12 @@ class ParquetLevelDecoder {
132132
/// level pair at a time. The current def and rep level are exposed to the user, and the
133133
/// corresponding value (if defined) can optionally be copied into a slot via
134134
/// ReadValue(). Can also write position slots.
135+
///
136+
/// The constructor adds the object to the obj_pool of the parent HdfsParquetScanner.
135137
class ParquetColumnReader {
136138
public:
137139
/// Creates a column reader for 'node' and associates it with the given parent scanner.
138-
/// Adds the new column reader to the parent's object pool.
140+
/// The constructor of column readers add the new object to the parent's object pool.
139141
/// 'slot_desc' may be NULL, in which case the returned column reader can only be used
140142
/// to read def/rep levels.
141143
/// 'is_collection_field' should be set to true if the returned reader is reading a
@@ -155,6 +157,9 @@ class ParquetColumnReader {
155157
static ParquetColumnReader* Create(const SchemaNode& node, bool is_collection_field,
156158
const SlotDescriptor* slot_desc, HdfsParquetScanner* parent);
157159

160+
static ParquetColumnReader* CreateTimestampColumnReader(const SchemaNode& node,
161+
const SlotDescriptor* slot_desc, HdfsParquetScanner* parent);
162+
158163
virtual ~ParquetColumnReader() { }
159164

160165
int def_level() const { return def_level_; }
@@ -303,6 +308,9 @@ class ParquetColumnReader {
303308
tuple_offset_(slot_desc == NULL ? -1 : slot_desc->tuple_offset()),
304309
null_indicator_offset_(slot_desc == NULL ? NullIndicatorOffset() :
305310
slot_desc->null_indicator_offset()) {
311+
DCHECK(parent != nullptr);
312+
parent->obj_pool_.Add(this);
313+
306314
DCHECK_GE(node_.max_rep_level, 0);
307315
DCHECK_LE(node_.max_rep_level, std::numeric_limits<int16_t>::max());
308316
DCHECK_GE(node_.max_def_level, 0);

0 commit comments

Comments (0)