Read support for parquet int96 timestamps #1184
@@ -19,6 +19,8 @@
package org.apache.iceberg.data.parquet;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;

@@ -28,6 +30,7 @@
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetSchemaUtil;
import org.apache.iceberg.parquet.ParquetValueReader;

@@ -299,6 +302,10 @@ public ParquetValueReader<?> primitive(org.apache.iceberg.types.Type.PrimitiveTy
      case INT64:
      case DOUBLE:
        return new ParquetValueReaders.UnboxedReader<>(desc);
      case INT96:
        // Impala & Spark used to write timestamps as INT96 without a logical type. For backwards
        // compatibility we try to read INT96 as timestamps.
        return new TimestampInt96Reader(desc);
      default:
        throw new UnsupportedOperationException("Unsupported type: " + primitive);
    }

@@ -345,6 +352,25 @@ public LocalDateTime read(LocalDateTime reuse) {
    }
  }

  private static class TimestampInt96Reader extends ParquetValueReaders.PrimitiveReader<LocalDateTime> {
    private static final long UNIX_EPOCH_JULIAN = 2_440_588L;

    private TimestampInt96Reader(ColumnDescriptor desc) {
      super(desc);
    }

    @Override
    public LocalDateTime read(LocalDateTime reuse) {
      final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
Contributor
Note for reviewers (and future me):
      final long timeOfDayNanos = byteBuffer.getLong();
      final int julianDay = byteBuffer.getInt();

      return Instant
          .ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN))
          .plusNanos(timeOfDayNanos).atOffset(ZoneOffset.UTC).toLocalDateTime();
    }
  }

  private static class TimestamptzReader extends ParquetValueReaders.PrimitiveReader<OffsetDateTime> {
    private TimestamptzReader(ColumnDescriptor desc) {
      super(desc);
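As an aside on the decoding above: an INT96 timestamp is twelve bytes in little-endian order, with the first eight bytes holding the nanoseconds elapsed since midnight and the last four holding the Julian day number; Julian day 2,440,588 is 1970-01-01, which is what the UNIX_EPOCH_JULIAN constant encodes. The standalone sketch below (not part of this diff; class and method names are made up for illustration) reproduces the same arithmetic on a hand-built buffer.

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.concurrent.TimeUnit;

public class Int96DecodeExample {
  // Julian day number of 1970-01-01, the Unix epoch.
  private static final long UNIX_EPOCH_JULIAN = 2_440_588L;

  static LocalDateTime decodeInt96(byte[] bytes) {
    // 12 bytes, little-endian: 8 bytes of nanos-of-day followed by a 4-byte Julian day.
    ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
    long timeOfDayNanos = buf.getLong();
    int julianDay = buf.getInt();
    return Instant
        .ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN))
        .plusNanos(timeOfDayNanos)
        .atOffset(ZoneOffset.UTC)
        .toLocalDateTime();
  }

  public static void main(String[] args) {
    // Julian day 2_458_850 is 18_262 days after the epoch, i.e. 2020-01-01.
    ByteBuffer buf = ByteBuffer.allocate(12).order(ByteOrder.LITTLE_ENDIAN);
    buf.putLong(0L);         // midnight
    buf.putInt(2_458_850);   // Julian day of 2020-01-01
    System.out.println(decodeInt96(buf.array()));  // prints 2020-01-01T00:00
  }
}
```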
@@ -21,21 +21,30 @@
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.generic.GenericData;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;
import org.hamcrest.CoreMatchers;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Test;

import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe;
import static org.apache.iceberg.types.Types.NestedField.required;

public class TestSparkParquetReader extends AvroDataTest {
  @Override

@@ -67,4 +76,41 @@ protected void writeAndValidate(Schema schema) throws IOException {
      Assert.assertFalse("Should not have extra rows", rows.hasNext());
    }
  }

  protected List<InternalRow> rowsFromFile(InputFile inputFile, Schema schema) throws IOException {
    try (CloseableIterable<InternalRow> reader =
        Parquet.read(inputFile)
            .project(schema)
            .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type))
            .build()) {
      return Lists.newArrayList(reader);
    }
  }

  @Test
  public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException {
    final Schema schema = new Schema(required(1, "ts", Types.TimestampType.asSparkInt96()));
    final StructType sparkSchema = SparkSchemaUtil.convert(schema);
    final Path parquetFile = Paths.get(temp.getRoot().getAbsolutePath(), "parquet_int96.parquet");
    final List<InternalRow> rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L));

    try (FileAppender<InternalRow> writer =
        Parquet.write(Files.localOutput(parquetFile.toString()))
            .writeSupport(
                new org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport())
            .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json())
            .set("org.apache.spark.legacyDateTime", "false")
            .set("spark.sql.parquet.int96AsTimestamp", "true")
            .set("spark.sql.parquet.writeLegacyFormat", "false")
            .set("spark.sql.parquet.outputTimestampType", "INT96")
            .schema(schema)
            .build()) {
      writer.addAll(rows);
    }

    final List<InternalRow> readRows =
        rowsFromFile(Files.localInput(parquetFile.toString()), schema);
    Assert.assertEquals(rows.size(), readRows.size());
    Assert.assertThat(readRows, CoreMatchers.is(rows));
  }
}
I don't think we should change the type system to support this. INT96 may be something that we can read, but Iceberg cannot write it, per the spec.
Was this needed to build the tests?
Agreed. I found a way to have the tests running that doesn't add a new type: I had to create an implementation of ParquetWriter.Builder that uses Spark's ParquetWriteSupport and Iceberg's ParquetWriteAdapter to avoid creating a SparkSession.
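As a rough sketch of the approach described in that comment (not the code that ultimately landed; the class name and configuration wiring are assumptions, though the configuration keys mirror those set in the test above), a parquet-mr ParquetWriter.Builder can return Spark's ParquetWriteSupport so that InternalRows are written without starting a SparkSession:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport;
import org.apache.spark.sql.types.StructType;

// Hypothetical builder: lets parquet-mr write Spark InternalRows directly.
class SparkParquetWriterBuilder
    extends ParquetWriter.Builder<InternalRow, SparkParquetWriterBuilder> {

  private final StructType sparkSchema;

  SparkParquetWriterBuilder(Path path, StructType sparkSchema) {
    super(path);
    this.sparkSchema = sparkSchema;
  }

  @Override
  protected SparkParquetWriterBuilder self() {
    return this;
  }

  @Override
  protected WriteSupport<InternalRow> getWriteSupport(Configuration conf) {
    // Spark's write support picks up the row schema and timestamp format from the
    // Hadoop configuration; these are the same keys the test above sets.
    conf.set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json());
    conf.set("spark.sql.parquet.outputTimestampType", "INT96");
    conf.set("spark.sql.parquet.writeLegacyFormat", "false");
    return new ParquetWriteSupport();
  }
}
```

The resulting ParquetWriter<InternalRow> could then be wrapped in Iceberg's ParquetWriteAdapter to expose it as a FileAppender in tests.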