@@ -372,16 +372,15 @@ class ScalarColumnReader : public BaseScalarColumnReader {
372372 // / the max length for VARCHAR columns. Unused otherwise.
373373 int fixed_len_size_;
374374
375- // / Query-global timezone used as local timezone when executing the query .
376- const Timezone& local_time_zone_ ;
375+ // / Contains extra data needed for Timestamp decoding .
376+ ParquetTimestampDecoder timestamp_decoder_ ;
377377};
378378
379379template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
380380ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ScalarColumnReader(
381381 HdfsParquetScanner* parent, const SchemaNode& node, const SlotDescriptor* slot_desc)
382382 : BaseScalarColumnReader(parent, node, slot_desc),
383- dict_decoder_ (parent->scan_node_->mem_tracker ()),
384- local_time_zone_(parent->state_->local_time_zone ()) {
383+ dict_decoder_ (parent->scan_node_->mem_tracker ()) {
385384 if (!MATERIALIZED) {
386385 // We're not materializing any values, just counting them. No need (or ability) to
387386 // initialize state used to materialize values.
@@ -399,9 +398,14 @@ ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ScalarColumnReader
399398 } else {
400399 fixed_len_size_ = -1 ;
401400 }
402- needs_conversion_ = slot_desc_->type ().type == TYPE_CHAR ||
403- (slot_desc_->type ().type == TYPE_TIMESTAMP &&
404- parent->IsTimezoneConversionNeededForTimestamps ());
401+
402+ needs_conversion_ = slot_desc_->type ().type == TYPE_CHAR;
403+
404+ if (slot_desc_->type ().type == TYPE_TIMESTAMP) {
405+ timestamp_decoder_ = parent->CreateTimestampDecoder (*node.element );
406+ dict_decoder_.SetTimestampHelper (timestamp_decoder_);
407+ needs_conversion_ = timestamp_decoder_.NeedsConversion ();
408+ }
405409}
406410
407411template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
@@ -641,6 +645,30 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::DecodeValue(
641645 return true ;
642646}
643647
648+ template <>
649+ template <Encoding::type ENCODING>
650+ bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true >::DecodeValue(
651+ uint8_t ** RESTRICT data, const uint8_t * RESTRICT data_end,
652+ TimestampValue* RESTRICT val) RESTRICT {
653+ DCHECK_EQ (page_encoding_, ENCODING);
654+ if (ENCODING == Encoding::PLAIN_DICTIONARY) {
655+ if (UNLIKELY (!dict_decoder_.GetNextValue (val))) {
656+ SetDictDecodeError ();
657+ return false ;
658+ }
659+ } else {
660+ DCHECK_EQ (ENCODING, Encoding::PLAIN);
661+ int encoded_len =
662+ timestamp_decoder_.Decode <parquet::Type::INT64>(*data, data_end, val);
663+ if (UNLIKELY (encoded_len < 0 )) {
664+ SetPlainDecodeError ();
665+ return false ;
666+ }
667+ *data += encoded_len;
668+ }
669+ return true ;
670+ }
671+
644672template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
645673void ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>
646674 ::ReadPositionBatched (int16_t rep_level, int64_t * pos) {
@@ -674,14 +702,31 @@ ::NeedsConversionInline() const {
674702 return needs_conversion_;
675703}
676704
705+ template <>
706+ inline bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true >
707+ ::NeedsConversionInline () const {
708+ return needs_conversion_;
709+ }
710+
677711template <>
678712bool ScalarColumnReader<TimestampValue, parquet::Type::INT96, true >::ConvertSlot(
679713 const TimestampValue* src, void * slot) {
680714 // Conversion should only happen when this flag is enabled.
681715 DCHECK (FLAGS_convert_legacy_hive_parquet_utc_timestamps);
716+ DCHECK (timestamp_decoder_.NeedsConversion ());
682717 TimestampValue* dst_ts = reinterpret_cast <TimestampValue*>(slot);
683718 *dst_ts = *src;
684- if (dst_ts->HasDateAndTime ()) dst_ts->UtcToLocal (local_time_zone_);
719+ timestamp_decoder_.ConvertToLocalTime (dst_ts);
720+ return true ;
721+ }
722+
723+ template <>
724+ bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true >::ConvertSlot(
725+ const TimestampValue* src, void * slot) {
726+ DCHECK (timestamp_decoder_.NeedsConversion ());
727+ TimestampValue* dst_ts = reinterpret_cast <TimestampValue*>(slot);
728+ *dst_ts = *src;
729+ timestamp_decoder_.ConvertToLocalTime (static_cast <TimestampValue*>(dst_ts));
685730 return true ;
686731}
687732
@@ -691,6 +736,12 @@ ::NeedsValidationInline() const {
691736 return true ;
692737}
693738
739+ template <>
740+ inline bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true >
741+ ::NeedsValidationInline () const {
742+ return true ;
743+ }
744+
694745template <>
695746bool ScalarColumnReader<TimestampValue, parquet::Type::INT96, true >::ValidateValue(
696747 TimestampValue* val) const {
@@ -711,6 +762,23 @@ bool ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>::ValidateVal
711762 return true ;
712763}
713764
765+ template <>
766+ bool ScalarColumnReader<TimestampValue, parquet::Type::INT64, true >::ValidateValue(
767+ TimestampValue* val) const {
768+ // The range was already checked during the int64_t->TimestampValue conversion, which
769+ // sets the date to invalid if it was out of range.
770+ if (UNLIKELY (!val->HasDate ())) {
771+ ErrorMsg msg (TErrorCode::PARQUET_TIMESTAMP_OUT_OF_RANGE,
772+ filename (), node_.element ->name );
773+ Status status = parent_->state_ ->LogOrReturnError (msg);
774+ if (!status.ok ()) parent_->parse_status_ = status;
775+ return false ;
776+ }
777+ DCHECK (TimestampValue::IsValidDate (val->date ()));
778+ DCHECK (TimestampValue::IsValidTime (val->time ()));
779+ return true ;
780+ }
781+
714782class BoolColumnReader : public BaseScalarColumnReader {
715783 public:
716784 BoolColumnReader (HdfsParquetScanner* parent, const SchemaNode& node,
@@ -1507,7 +1575,7 @@ void CollectionColumnReader::UpdateDerivedState() {
15071575}
15081576
15091577// / Returns a column reader for decimal types based on its size and parquet type.
1510- static ParquetColumnReader* GetDecimalColumnReader (const SchemaNode& node,
1578+ static ParquetColumnReader* CreateDecimalColumnReader (const SchemaNode& node,
15111579 const SlotDescriptor* slot_desc, HdfsParquetScanner* parent) {
15121580 switch (node.element ->type ) {
15131581 case parquet::Type::FIXED_LEN_BYTE_ARRAY:
@@ -1554,84 +1622,82 @@ static ParquetColumnReader* GetDecimalColumnReader(const SchemaNode& node,
15541622ParquetColumnReader* ParquetColumnReader::Create (const SchemaNode& node,
15551623 bool is_collection_field, const SlotDescriptor* slot_desc,
15561624 HdfsParquetScanner* parent) {
1557- ParquetColumnReader* reader = nullptr ;
15581625 if (is_collection_field) {
15591626 // Create collection reader (note this handles both NULL and non-NULL 'slot_desc')
1560- reader = new CollectionColumnReader (parent, node, slot_desc);
1627+ return new CollectionColumnReader (parent, node, slot_desc);
15611628 } else if (slot_desc != nullptr ) {
15621629 // Create the appropriate ScalarColumnReader type to read values into 'slot_desc'
15631630 switch (slot_desc->type ().type ) {
15641631 case TYPE_BOOLEAN:
1565- reader = new BoolColumnReader (parent, node, slot_desc);
1566- break ;
1632+ return new BoolColumnReader (parent, node, slot_desc);
15671633 case TYPE_TINYINT:
1568- reader = new ScalarColumnReader<int8_t , parquet::Type::INT32, true >(parent, node,
1634+ return new ScalarColumnReader<int8_t , parquet::Type::INT32, true >(parent, node,
15691635 slot_desc);
1570- break ;
15711636 case TYPE_SMALLINT:
1572- reader = new ScalarColumnReader<int16_t , parquet::Type::INT32, true >(parent, node,
1637+ return new ScalarColumnReader<int16_t , parquet::Type::INT32, true >(parent, node,
15731638 slot_desc);
1574- break ;
15751639 case TYPE_INT:
1576- reader = new ScalarColumnReader<int32_t , parquet::Type::INT32, true >(parent, node,
1640+ return new ScalarColumnReader<int32_t , parquet::Type::INT32, true >(parent, node,
15771641 slot_desc);
1578- break ;
15791642 case TYPE_BIGINT:
15801643 switch (node.element ->type ) {
15811644 case parquet::Type::INT32:
1582- reader = new ScalarColumnReader<int64_t , parquet::Type::INT32, true >(parent,
1645+ return new ScalarColumnReader<int64_t , parquet::Type::INT32, true >(parent,
15831646 node, slot_desc);
1584- break ;
15851647 default :
1586- reader = new ScalarColumnReader<int64_t , parquet::Type::INT64, true >(parent,
1648+ return new ScalarColumnReader<int64_t , parquet::Type::INT64, true >(parent,
15871649 node, slot_desc);
1588- break ;
15891650 }
1590- break ;
15911651 case TYPE_FLOAT:
1592- reader = new ScalarColumnReader<float , parquet::Type::FLOAT, true >(parent, node,
1652+ return new ScalarColumnReader<float , parquet::Type::FLOAT, true >(parent, node,
15931653 slot_desc);
1594- break ;
15951654 case TYPE_DOUBLE:
15961655 switch (node.element ->type ) {
15971656 case parquet::Type::INT32:
1598- reader = new ScalarColumnReader<double , parquet::Type::INT32, true >(parent,
1657+ return new ScalarColumnReader<double , parquet::Type::INT32, true >(parent,
15991658 node, slot_desc);
1600- break ;
16011659 case parquet::Type::FLOAT:
1602- reader = new ScalarColumnReader<double , parquet::Type::FLOAT, true >(parent,
1660+ return new ScalarColumnReader<double , parquet::Type::FLOAT, true >(parent,
16031661 node, slot_desc);
1604- break ;
16051662 default :
1606- reader = new ScalarColumnReader<double , parquet::Type::DOUBLE, true >(parent,
1663+ return new ScalarColumnReader<double , parquet::Type::DOUBLE, true >(parent,
16071664 node, slot_desc);
1608- break ;
16091665 }
1610- break ;
16111666 case TYPE_TIMESTAMP:
1612- reader = new ScalarColumnReader<TimestampValue, parquet::Type::INT96, true >(
1613- parent, node, slot_desc);
1614- break ;
1667+ return CreateTimestampColumnReader (node, slot_desc, parent);
16151668 case TYPE_STRING:
16161669 case TYPE_VARCHAR:
16171670 case TYPE_CHAR:
1618- reader = new ScalarColumnReader<StringValue, parquet::Type::BYTE_ARRAY, true >(
1671+ return new ScalarColumnReader<StringValue, parquet::Type::BYTE_ARRAY, true >(
16191672 parent, node, slot_desc);
1620- break ;
16211673 case TYPE_DECIMAL:
1622- reader = GetDecimalColumnReader (node, slot_desc, parent);
1623- break ;
1674+ return CreateDecimalColumnReader (node, slot_desc, parent);
16241675 default :
16251676 DCHECK (false ) << slot_desc->type ().DebugString ();
1677+ return nullptr ;
16261678 }
16271679 } else {
16281680 // Special case for counting scalar values (e.g. count(*), no materialized columns in
16291681 // the file, only materializing a position slot). We won't actually read any values,
16301682 // only the rep and def levels, so it doesn't matter what kind of reader we make.
1631- reader = new ScalarColumnReader<int8_t , parquet::Type::INT32, false >(parent, node,
1683+ return new ScalarColumnReader<int8_t , parquet::Type::INT32, false >(parent, node,
16321684 slot_desc);
16331685 }
1634- return parent->obj_pool_ .Add (reader);
1686+ }
1687+
1688+ ParquetColumnReader* ParquetColumnReader::CreateTimestampColumnReader (
1689+ const SchemaNode& node, const SlotDescriptor* slot_desc,
1690+ HdfsParquetScanner* parent) {
1691+ if (node.element ->type == parquet::Type::INT96) {
1692+ return new ScalarColumnReader<TimestampValue, parquet::Type::INT96, true >(
1693+ parent, node, slot_desc);
1694+ }
1695+ else if (node.element ->type == parquet::Type::INT64) {
1696+ return new ScalarColumnReader<TimestampValue, parquet::Type::INT64, true >(
1697+ parent, node, slot_desc);
1698+ }
1699+ DCHECK (false ) << slot_desc->type ().DebugString ();
1700+ return nullptr ;
16351701}
16361702
16371703}