diff --git a/src/Client/BuzzHouse/Generator/ServerSettings.cpp b/src/Client/BuzzHouse/Generator/ServerSettings.cpp index 5c59d69bdb15..bc3d452e4f3c 100644 --- a/src/Client/BuzzHouse/Generator/ServerSettings.cpp +++ b/src/Client/BuzzHouse/Generator/ServerSettings.cpp @@ -702,6 +702,7 @@ static std::unordered_map serverSettings2 = { }, {}, false)}, + {"output_format_parquet_enum_as_byte_array", CHSetting(trueOrFalse, {}, false)}, {"output_format_parquet_datetime_as_uint32", trueOrFalseSettingNoOracle}, {"output_format_parquet_fixed_string_as_fixed_byte_array", trueOrFalseSettingNoOracle}, {"output_format_parquet_parallel_encoding", trueOrFalseSettingNoOracle}, diff --git a/src/Core/FormatFactorySettings.h b/src/Core/FormatFactorySettings.h index 022194d20c57..b46a51774847 100644 --- a/src/Core/FormatFactorySettings.h +++ b/src/Core/FormatFactorySettings.h @@ -1023,6 +1023,9 @@ Where in the parquet file to place the bloom filters. Bloom filters will be writ )", 0) \ DECLARE(Bool, output_format_parquet_datetime_as_uint32, false, R"( Write DateTime values as raw unix timestamp (read back as UInt32), instead of converting to milliseconds (read back as DateTime64(3)). + )", 0) \ + DECLARE(Bool, output_format_parquet_enum_as_byte_array, true, R"( +Write enum using parquet physical type: BYTE_ARRAY and logical type: ENUM )", 0) \ DECLARE(String, output_format_avro_codec, "", R"( Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'. diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 6e6bfe580117..89a550f3326a 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -67,6 +67,11 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() /// controls new feature and it's 'true' by default, use 'false' as previous_value). /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) /// Note: please check if the key already exists to prevent duplicate entries. + + addSettingsChanges(settings_changes_history, "25.6.5.2000", + { + {"output_format_parquet_enum_as_byte_array", true, true, "Enable writing Enum as byte array in Parquet by default"}, + }); addSettingsChanges(settings_changes_history, "25.6", { {"output_format_native_use_flattened_dynamic_and_json_serialization", false, false, "Add flattened Dynamic/JSON serializations to Native format"}, diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 8b85e48acfc6..db22070bd7f6 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -200,6 +200,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.parquet.output_string_as_string = settings[Setting::output_format_parquet_string_as_string]; format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings[Setting::output_format_parquet_fixed_string_as_fixed_byte_array]; format_settings.parquet.output_datetime_as_uint32 = settings[Setting::output_format_parquet_datetime_as_uint32]; + format_settings.parquet.output_enum_as_byte_array = settings[Setting::output_format_parquet_enum_as_byte_array]; format_settings.parquet.max_block_size = settings[Setting::input_format_parquet_max_block_size]; format_settings.parquet.prefer_block_bytes = settings[Setting::input_format_parquet_prefer_block_bytes]; format_settings.parquet.output_compression_method = settings[Setting::output_format_parquet_compression_method]; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index cd1dcf7de329..90c490223cff 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -288,6 +288,7 @@ struct FormatSettings bool output_string_as_string = false; bool output_fixed_string_as_fixed_byte_array = true; bool output_datetime_as_uint32 = false; + bool output_enum_as_byte_array = false; bool preserve_order = false; bool use_custom_encoder = true; bool parallel_encoding = true; diff --git a/src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp b/src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp index ee63f429717c..66f5b1828ba5 100644 --- a/src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp +++ b/src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp @@ -322,12 +322,22 @@ void preparePrimitiveColumn(ColumnPtr column, DataTypePtr type, const std::strin case TypeIndex::Int64: types(T::INT64); break; case TypeIndex::Float32: types(T::FLOAT); break; case TypeIndex::Float64: types(T::DOUBLE); break; - - /// These don't have suitable parquet logical types, so we write them as plain numbers. - /// (Parquet has "enums" but they're just strings, with nowhere to declare all possible enum - /// values in advance as part of the data type.) - case TypeIndex::Enum8: types(T::INT32, C::INT_8, int_type(8, true)); break; // Int8 - case TypeIndex::Enum16: types(T::INT32, C::INT_16, int_type(16, true)); break; // Int16 + case TypeIndex::Enum8: + case TypeIndex::Enum16: + { + if (options.output_enum_as_byte_array) + { + parq::LogicalType t; + t.__set_ENUM({}); + types(T::BYTE_ARRAY, C::ENUM, t); + } + else if (type->getTypeId() == TypeIndex::Enum8) + types(T::INT32, C::INT_8, int_type(8, true)); + else + types(T::INT32, C::INT_16, int_type(16, true)); + break; + } + /// IPv4 does not have suitable parquet logical types, so we write them as plain numbers. case TypeIndex::IPv4: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32 /// Parquet doesn't have 16-bit date type, so we cast Date to 32 bits. diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index a88b6cb7a301..4b51541f4624 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -18,6 +18,10 @@ #include #include #include +#include +#include +#include + #if USE_SNAPPY #include @@ -337,6 +341,34 @@ struct ConverterString } }; +template +struct ConverterEnumAsString +{ + using Statistics = StatisticsStringRef; + + explicit ConverterEnumAsString(const ColumnPtr & c, const DataTypePtr & enum_type_) + : column(assert_cast &>(*c)), enum_type(assert_cast *>(enum_type_.get())) {} + + const ColumnVector & column; + const DataTypeEnum * enum_type; + PODArray buf; + + const parquet::ByteArray * getBatch(size_t offset, size_t count) + { + buf.resize(count); + + const auto & data = column.getData(); + + for (size_t i = 0; i < count; ++i) + { + const T value = data[offset + i]; + const StringRef s = enum_type->getNameForValue(value); + buf[i] = parquet::ByteArray(static_cast(s.size), reinterpret_cast(s.data)); + } + return buf.data(); + } +}; + struct ConverterFixedString { using Statistics = StatisticsFixedStringRef; @@ -991,8 +1023,24 @@ void writeColumnChunkBody( break; case TypeIndex::UInt16 : N(UInt16, Int32Type); break; case TypeIndex::UInt64 : N(UInt64, Int64Type); break; - case TypeIndex::Int8 : N(Int8, Int32Type); break; - case TypeIndex::Int16 : N(Int16, Int32Type); break; + case TypeIndex::Int8: + { + if (options.output_enum_as_byte_array && isEnum8(s.type)) + writeColumnImpl( + s, options, out, ConverterEnumAsString(s.primitive_column, s.type)); + else + N(Int8, Int32Type); + break; + } + case TypeIndex::Int16: + { + if (options.output_enum_as_byte_array && isEnum16(s.type)) + writeColumnImpl( + s, options, out, ConverterEnumAsString(s.primitive_column, s.type)); + else + N(Int16, Int32Type); + break; + } case TypeIndex::Int32 : N(Int32, Int32Type); break; case TypeIndex::Int64 : N(Int64, Int64Type); break; diff --git a/src/Processors/Formats/Impl/Parquet/Write.h b/src/Processors/Formats/Impl/Parquet/Write.h index 83a15da2c252..2335b9230550 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.h +++ b/src/Processors/Formats/Impl/Parquet/Write.h @@ -23,6 +23,7 @@ struct WriteOptions bool output_string_as_string = false; bool output_fixed_string_as_fixed_byte_array = true; bool output_datetime_as_uint32 = false; + bool output_enum_as_byte_array = false; CompressionMethod compression = CompressionMethod::Lz4; int compression_level = 3; diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index fbe556540ee6..b4524085749c 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -105,6 +105,7 @@ ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Blo options.output_string_as_string = format_settings.parquet.output_string_as_string; options.output_fixed_string_as_fixed_byte_array = format_settings.parquet.output_fixed_string_as_fixed_byte_array; options.output_datetime_as_uint32 = format_settings.parquet.output_datetime_as_uint32; + options.output_enum_as_byte_array = format_settings.parquet.output_enum_as_byte_array; options.data_page_size = format_settings.parquet.data_page_size; options.write_batch_size = format_settings.parquet.write_batch_size; options.write_page_index = format_settings.parquet.write_page_index; diff --git a/tests/queries/0_stateless/02735_parquet_encoder.sql b/tests/queries/0_stateless/02735_parquet_encoder.sql index 44de248aaae9..616f97ccacd5 100644 --- a/tests/queries/0_stateless/02735_parquet_encoder.sql +++ b/tests/queries/0_stateless/02735_parquet_encoder.sql @@ -7,6 +7,7 @@ set output_format_parquet_batch_size = 100; set output_format_parquet_row_group_size_bytes = 1000000000; set engine_file_truncate_on_insert=1; set allow_suspicious_low_cardinality_types=1; +set output_format_parquet_enum_as_byte_array=0; -- Write random data to parquet file, then read from it and check that it matches what we wrote. -- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive), diff --git a/tests/queries/0_stateless/03525_parquet_string_enum.reference b/tests/queries/0_stateless/03525_parquet_string_enum.reference new file mode 100644 index 000000000000..215675430a9b --- /dev/null +++ b/tests/queries/0_stateless/03525_parquet_string_enum.reference @@ -0,0 +1 @@ +physical_type: BYTE_ARRAY logical_type: Enum diff --git a/tests/queries/0_stateless/03525_parquet_string_enum.sh b/tests/queries/0_stateless/03525_parquet_string_enum.sh new file mode 100755 index 000000000000..abe55a3277f2 --- /dev/null +++ b/tests/queries/0_stateless/03525_parquet_string_enum.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "INSERT INTO FUNCTION file(03525_enum.parquet, Parquet, 'animal Enum8(\'dog\' = 1, \'cat\' = 2)') SETTINGS output_format_parquet_enum_as_byte_array=1, engine_file_truncate_on_insert=1 VALUES ('dog'), ('cat');" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('03525_enum.parquet', 'ParquetMetadata') FORMAT JSONCompactEachRow" | jq -r '.[7][] | select(.name == "animal") | "physical_type: \(.physical_type) logical_type: \(.logical_type)"'