Skip to content

Commit 70460e2

Browse files
committed
Implement support for apache/parquet-format#221 (IEEE total ordering of floating point numbers)
1 parent 94d529e commit 70460e2

10 files changed

Lines changed: 434 additions & 224 deletions

File tree

extension/parquet/include/parquet_column_schema.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ enum class ParquetExtraTypeInfo {
2727
FLOAT16
2828
};
2929

30+
// Ordering that the min/max statistics of a Parquet column are expressed in.
enum class ParquetTypeOrder {
	// default value of ParquetColumnSchema::type_order
	STANDARD,
	// assigned when the file's column order entry has IEEE_754_TOTAL_ORDER set
	IEEE_754_TOTAL_ORDER
};
31+
3032
struct ParquetColumnSchema {
3133
ParquetColumnSchema() = default;
3234
ParquetColumnSchema(idx_t max_define, idx_t max_repeat, idx_t schema_index, idx_t file_index,
@@ -47,6 +49,7 @@ struct ParquetColumnSchema {
4749
uint32_t type_scale = 0;
4850
duckdb_parquet::Type::type parquet_type = duckdb_parquet::Type::INT32;
4951
ParquetExtraTypeInfo type_info = ParquetExtraTypeInfo::NONE;
52+
ParquetTypeOrder type_order = ParquetTypeOrder::STANDARD;
5053
vector<ParquetColumnSchema> children;
5154

5255
unique_ptr<BaseStatistics> Stats(ParquetReader &reader, idx_t row_group_idx_p,

extension/parquet/include/parquet_reader.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,10 @@ class ParquetReader : public BaseFileReader {
210210
MultiFileColumnDefinition ParseColumnDefinition(const duckdb_parquet::FileMetaData &file_meta_data,
211211
ParquetColumnSchema &element);
212212

213+
void ParseColumnOrdersRecursive(ParquetColumnSchema &column_schema,
214+
const duckdb_parquet::FileMetaData &file_meta_data, idx_t &column_order_idx);
215+
void ParseColumnOrders(ParquetColumnSchema &column_schema, const duckdb_parquet::FileMetaData &file_meta_data);
216+
213217
private:
214218
unique_ptr<FileHandle> file_handle;
215219
};

extension/parquet/include/parquet_writer.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ class ParquetWriter {
150150

151151
private:
152152
void GatherWrittenStatistics();
153+
// Appends one ColumnOrder entry to file_meta_data.column_orders for every leaf column reachable from column_schema
void WriteColumnOrders(const ParquetColumnSchema &column_schema, duckdb_parquet::FileMetaData &file_meta_data);
153154

154155
private:
155156
ClientContext &context;

extension/parquet/parquet_reader.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,8 @@ unique_ptr<ParquetColumnSchema> ParquetReader::ParseSchema() {
596596
if (root.type.id() != LogicalTypeId::STRUCT) {
597597
throw InvalidInputException("Root element of Parquet file must be a struct");
598598
}
599+
// parse the column orders (if any)
600+
ParseColumnOrders(root, *file_meta_data);
599601
D_ASSERT(next_schema_idx == file_meta_data->schema.size() - 1);
600602
D_ASSERT(file_meta_data->row_groups.empty() || next_file_idx == file_meta_data->row_groups[0].columns.size());
601603
if (parquet_options.file_row_number) {
@@ -611,6 +613,33 @@ unique_ptr<ParquetColumnSchema> ParquetReader::ParseSchema() {
611613
return make_uniq<ParquetColumnSchema>(root);
612614
}
613615

616+
// Walks the schema tree depth-first and assigns the column order found in the file metadata to each leaf column.
// column_order_idx is the position of the next unread entry in file_meta_data.column_orders; it is advanced by one
// for every leaf that is visited. Once all entries are consumed the remaining columns keep their current type_order.
void ParquetReader::ParseColumnOrdersRecursive(ParquetColumnSchema &column_schema,
                                               const duckdb_parquet::FileMetaData &file_meta_data,
                                               idx_t &column_order_idx) {
	const auto &column_orders = file_meta_data.column_orders;
	if (column_order_idx >= column_orders.size()) {
		// no column orders left to hand out
		return;
	}
	auto &children = column_schema.children;
	if (!children.empty()) {
		// nested column - only the leaves below it consume column order entries
		for (auto &child : children) {
			ParseColumnOrdersRecursive(child, file_meta_data, column_order_idx);
		}
		return;
	}
	// leaf column - consume the next column order entry
	const auto &leaf_order = column_orders[column_order_idx];
	column_order_idx++;
	if (leaf_order.__isset.IEEE_754_TOTAL_ORDER) {
		column_schema.type_order = ParquetTypeOrder::IEEE_754_TOTAL_ORDER;
	}
}
636+
637+
// Entry point for reading column orders: starts at the first entry of file_meta_data.column_orders and
// distributes the entries over the leaf columns below column_schema.
void ParquetReader::ParseColumnOrders(ParquetColumnSchema &column_schema,
                                      const duckdb_parquet::FileMetaData &file_meta_data) {
	idx_t next_order_idx = 0;
	ParseColumnOrdersRecursive(column_schema, file_meta_data, next_order_idx);
}
642+
614643
MultiFileColumnDefinition ParquetReader::ParseColumnDefinition(const FileMetaData &file_meta_data,
615644
ParquetColumnSchema &element) {
616645
MultiFileColumnDefinition result(element.name, element.type);

extension/parquet/parquet_statistics.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::CreateNumericStats(const Logi
5454
static unique_ptr<BaseStatistics> CreateFloatingPointStats(const LogicalType &type,
5555
const ParquetColumnSchema &schema_ele,
5656
const duckdb_parquet::Statistics &parquet_stats) {
57+
if (schema_ele.type_order == ParquetTypeOrder::IEEE_754_TOTAL_ORDER) {
58+
// if we have the total order defined we can treat floating point stats as regular numeric stats
59+
return ParquetStatisticsUtils::CreateNumericStats(type, schema_ele, parquet_stats);
60+
}
5761
auto stats = NumericStats::CreateUnknown(type);
5862

5963
// floating point values can always have NaN values - hence we cannot use the max value from the file

extension/parquet/parquet_writer.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,11 +417,32 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
417417
column_writers.push_back(
418418
ColumnWriter::CreateWriterRecursive(context, *this, file_meta_data.schema, child_schema, path_in_schema));
419419
}
420+
for (auto &child_schema : column_schemas) {
421+
WriteColumnOrders(child_schema, file_meta_data);
422+
}
423+
file_meta_data.__isset.column_orders = true;
420424
}
421425

422426
ParquetWriter::~ParquetWriter() {
423427
}
424428

429+
void ParquetWriter::WriteColumnOrders(const ParquetColumnSchema &column_schema,
430+
duckdb_parquet::FileMetaData &file_meta_data) {
431+
if (column_schema.children.empty()) {
432+
// root schema - write the orders
433+
duckdb_parquet::ColumnOrder order;
434+
if (column_schema.type.IsFloating()) {
435+
duckdb_parquet::IEEE754TotalOrder floating_order;
436+
order.__set_IEEE_754_TOTAL_ORDER(floating_order);
437+
}
438+
file_meta_data.column_orders.push_back(order);
439+
return;
440+
}
441+
for (auto &child_schema : column_schema.children) {
442+
WriteColumnOrders(child_schema, file_meta_data);
443+
}
444+
}
445+
425446
void ParquetWriter::PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result) {
426447
// We write 8 columns at a time so that iterating over ColumnDataCollection is more efficient
427448
static constexpr idx_t COLUMNS_PER_PASS = 8;
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# name: test/sql/copy/parquet/test_parquet_fp_order.test
2+
# description: Test floating point order
3+
# group: [parquet]
4+
5+
require parquet
6+
7+
statement ok
8+
COPY (SELECT i::DOUBLE d FROM range(10) t(i)) TO '__TEST_DIR__/fp_order.parquet';
9+
10+
# verify that we can read back the correct stats
11+
query I
12+
SELECT stats(d) FROM '__TEST_DIR__/fp_order.parquet' LIMIT 1
13+
----
14+
<REGEX>:.*Min.*0.0.*Max.*9.0.*
15+
16+
statement ok
17+
SET explain_output = PHYSICAL_ONLY
18+
19+
# we can prune this
20+
query II
21+
EXPLAIN SELECT d FROM '__TEST_DIR__/fp_order.parquet' WHERE d>100
22+
----
23+
physical_plan <REGEX>:.*EMPTY_RESULT.*

third_party/parquet/parquet.thrift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -943,6 +943,9 @@ struct RowGroup {
943943
/** Empty struct to signal the order defined by the physical or logical type */
944944
struct TypeDefinedOrder {}
945945

946+
/**
 * Empty struct to signal IEEE 754 total order for floating point types.
 * Selected through the IEEE_754_TOTAL_ORDER member of the ColumnOrder union.
 */
947+
struct IEEE754TotalOrder {}
948+
946949
/**
947950
* Union to specify the order used for the min_value and max_value fields for a
948951
* column. This union takes the role of an enhanced enum that allows rich
@@ -1008,6 +1011,7 @@ union ColumnOrder {
10081011
* `-0.0` should be written into the min statistics field.
10091012
*/
10101013
1: TypeDefinedOrder TYPE_ORDER;
1014+
2: IEEE754TotalOrder IEEE_754_TOTAL_ORDER;
10111015
}
10121016

10131017
struct PageLocation {

0 commit comments

Comments
 (0)