1 change: 1 addition & 0 deletions src/Client/BuzzHouse/Generator/ServerSettings.cpp
@@ -702,6 +702,7 @@ static std::unordered_map<String, CHSetting> serverSettings2 = {
},
{},
false)},
{"output_format_parquet_enum_as_byte_array", CHSetting(trueOrFalse, {}, false)},
{"output_format_parquet_datetime_as_uint32", trueOrFalseSettingNoOracle},
{"output_format_parquet_fixed_string_as_fixed_byte_array", trueOrFalseSettingNoOracle},
{"output_format_parquet_parallel_encoding", trueOrFalseSettingNoOracle},
3 changes: 3 additions & 0 deletions src/Core/FormatFactorySettings.h
@@ -1023,6 +1023,9 @@ Where in the parquet file to place the bloom filters. Bloom filters will be writ
)", 0) \
DECLARE(Bool, output_format_parquet_datetime_as_uint32, false, R"(
Write DateTime values as raw unix timestamp (read back as UInt32), instead of converting to milliseconds (read back as DateTime64(3)).
)", 0) \
DECLARE(Bool, output_format_parquet_enum_as_byte_array, true, R"(
Write Enum values using the Parquet physical type BYTE_ARRAY and logical type ENUM.
)", 0) \
DECLARE(String, output_format_avro_codec, "", R"(
Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'.
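For context, a minimal usage sketch of the new setting (file name and column definition are illustrative; the new stateless test further down exercises the same path):

INSERT INTO FUNCTION file('enum_demo.parquet', Parquet, 'animal Enum8(\'dog\' = 1, \'cat\' = 2)')
SETTINGS output_format_parquet_enum_as_byte_array = 1, engine_file_truncate_on_insert = 1
VALUES ('dog'), ('cat');

-- Inspect the declared column types; with the setting enabled the column should be reported
-- with physical_type BYTE_ARRAY and logical_type Enum (see the test reference output below).
SELECT * FROM file('enum_demo.parquet', ParquetMetadata);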
5 changes: 5 additions & 0 deletions src/Core/SettingsChangesHistory.cpp
@@ -67,6 +67,11 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
/// controls new feature and it's 'true' by default, use 'false' as previous_value).
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
/// Note: please check if the key already exists to prevent duplicate entries.

addSettingsChanges(settings_changes_history, "25.6.5.2000",
{
{"output_format_parquet_enum_as_byte_array", true, true, "Enable writing Enum as byte array in Parquet by default"},
});
addSettingsChanges(settings_changes_history, "25.6",
{
{"output_format_native_use_flattened_dynamic_and_json_serialization", false, false, "Add flattened Dynamic/JSON serializations to Native format"},
1 change: 1 addition & 0 deletions src/Formats/FormatFactory.cpp
@@ -200,6 +200,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.parquet.output_string_as_string = settings[Setting::output_format_parquet_string_as_string];
format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings[Setting::output_format_parquet_fixed_string_as_fixed_byte_array];
format_settings.parquet.output_datetime_as_uint32 = settings[Setting::output_format_parquet_datetime_as_uint32];
format_settings.parquet.output_enum_as_byte_array = settings[Setting::output_format_parquet_enum_as_byte_array];
format_settings.parquet.max_block_size = settings[Setting::input_format_parquet_max_block_size];
format_settings.parquet.prefer_block_bytes = settings[Setting::input_format_parquet_prefer_block_bytes];
format_settings.parquet.output_compression_method = settings[Setting::output_format_parquet_compression_method];
1 change: 1 addition & 0 deletions src/Formats/FormatSettings.h
@@ -288,6 +288,7 @@ struct FormatSettings
bool output_string_as_string = false;
bool output_fixed_string_as_fixed_byte_array = true;
bool output_datetime_as_uint32 = false;
bool output_enum_as_byte_array = false;
bool preserve_order = false;
bool use_custom_encoder = true;
bool parallel_encoding = true;
22 changes: 16 additions & 6 deletions src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp
@@ -322,12 +322,22 @@ void preparePrimitiveColumn(ColumnPtr column, DataTypePtr type, const std::strin
case TypeIndex::Int64: types(T::INT64); break;
case TypeIndex::Float32: types(T::FLOAT); break;
case TypeIndex::Float64: types(T::DOUBLE); break;

/// These don't have suitable parquet logical types, so we write them as plain numbers.
/// (Parquet has "enums" but they're just strings, with nowhere to declare all possible enum
/// values in advance as part of the data type.)
case TypeIndex::Enum8: types(T::INT32, C::INT_8, int_type(8, true)); break; // Int8
case TypeIndex::Enum16: types(T::INT32, C::INT_16, int_type(16, true)); break; // Int16
case TypeIndex::Enum8:
case TypeIndex::Enum16:
{
if (options.output_enum_as_byte_array)
{
parq::LogicalType t;
t.__set_ENUM({});
types(T::BYTE_ARRAY, C::ENUM, t);
}
else if (type->getTypeId() == TypeIndex::Enum8)
types(T::INT32, C::INT_8, int_type(8, true));
else
types(T::INT32, C::INT_16, int_type(16, true));
break;
}
/// IPv4 does not have suitable parquet logical types, so we write them as plain numbers.
case TypeIndex::IPv4: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32

/// Parquet doesn't have 16-bit date type, so we cast Date to 32 bits.
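To contrast with the else-branch above, a hedged sketch of the disabled path (illustrative file name): with output_format_parquet_enum_as_byte_array = 0 the column keeps the pre-existing integer encoding (INT32 with an Int(8)/Int(16) annotation), which is why the existing round-trip test below pins the setting to 0.

INSERT INTO FUNCTION file('enum_as_int.parquet', Parquet, 'animal Enum8(\'dog\' = 1, \'cat\' = 2)')
SETTINGS output_format_parquet_enum_as_byte_array = 0, engine_file_truncate_on_insert = 1
VALUES ('dog'), ('cat');

-- ParquetMetadata should now report physical_type INT32 for the column instead of BYTE_ARRAY.
SELECT * FROM file('enum_as_int.parquet', ParquetMetadata);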
52 changes: 50 additions & 2 deletions src/Processors/Formats/Impl/Parquet/Write.cpp
@@ -18,6 +18,10 @@
#include <Common/config_version.h>
#include <Common/formatReadable.h>
#include <Common/HashTable/HashSet.h>
#include <DataTypes/DataTypeEnum.h>
#include <Core/Block.h>
#include <DataTypes/DataTypeCustom.h>


#if USE_SNAPPY
#include <snappy.h>
@@ -337,6 +341,34 @@ struct ConverterString
}
};

template <typename T>
struct ConverterEnumAsString
{
using Statistics = StatisticsStringRef;

explicit ConverterEnumAsString(const ColumnPtr & c, const DataTypePtr & enum_type_)
: column(assert_cast<const ColumnVector<T> &>(*c)), enum_type(assert_cast<const DataTypeEnum<T> *>(enum_type_.get())) {}

const ColumnVector<T> & column;
const DataTypeEnum<T> * enum_type;
PODArray<parquet::ByteArray> buf;

const parquet::ByteArray * getBatch(size_t offset, size_t count)
{
buf.resize(count);

const auto & data = column.getData();

for (size_t i = 0; i < count; ++i)
{
const T value = data[offset + i];
const StringRef s = enum_type->getNameForValue(value);
buf[i] = parquet::ByteArray(static_cast<UInt32>(s.size), reinterpret_cast<const uint8_t *>(s.data));
}
return buf.data();
}
};

struct ConverterFixedString
{
using Statistics = StatisticsFixedStringRef;
@@ -991,8 +1023,24 @@ void writeColumnChunkBody(
break;
case TypeIndex::UInt16 : N(UInt16, Int32Type); break;
case TypeIndex::UInt64 : N(UInt64, Int64Type); break;
case TypeIndex::Int8 : N(Int8, Int32Type); break;
case TypeIndex::Int16 : N(Int16, Int32Type); break;
case TypeIndex::Int8:
{
if (options.output_enum_as_byte_array && isEnum8(s.type))
writeColumnImpl<parquet::ByteArrayType>(
s, options, out, ConverterEnumAsString<Int8>(s.primitive_column, s.type));
else
N(Int8, Int32Type);
break;
}
case TypeIndex::Int16:
{
if (options.output_enum_as_byte_array && isEnum16(s.type))
writeColumnImpl<parquet::ByteArrayType>(
s, options, out, ConverterEnumAsString<Int16>(s.primitive_column, s.type));
else
N(Int16, Int32Type);
break;
}
case TypeIndex::Int32 : N(Int32, Int32Type); break;
case TypeIndex::Int64 : N(Int64, Int64Type); break;

1 change: 1 addition & 0 deletions src/Processors/Formats/Impl/Parquet/Write.h
@@ -23,6 +23,7 @@ struct WriteOptions
bool output_string_as_string = false;
bool output_fixed_string_as_fixed_byte_array = true;
bool output_datetime_as_uint32 = false;
bool output_enum_as_byte_array = false;

CompressionMethod compression = CompressionMethod::Lz4;
int compression_level = 3;
1 change: 1 addition & 0 deletions src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp
@@ -105,6 +105,7 @@ ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Blo
options.output_string_as_string = format_settings.parquet.output_string_as_string;
options.output_fixed_string_as_fixed_byte_array = format_settings.parquet.output_fixed_string_as_fixed_byte_array;
options.output_datetime_as_uint32 = format_settings.parquet.output_datetime_as_uint32;
options.output_enum_as_byte_array = format_settings.parquet.output_enum_as_byte_array;
options.data_page_size = format_settings.parquet.data_page_size;
options.write_batch_size = format_settings.parquet.write_batch_size;
options.write_page_index = format_settings.parquet.write_page_index;
1 change: 1 addition & 0 deletions tests/queries/0_stateless/02735_parquet_encoder.sql
@@ -7,6 +7,7 @@ set output_format_parquet_batch_size = 100;
set output_format_parquet_row_group_size_bytes = 1000000000;
set engine_file_truncate_on_insert=1;
set allow_suspicious_low_cardinality_types=1;
set output_format_parquet_enum_as_byte_array=0;

-- Write random data to parquet file, then read from it and check that it matches what we wrote.
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),
1 change: 1 addition & 0 deletions tests/queries/0_stateless/03525_parquet_string_enum.reference
@@ -0,0 +1 @@
physical_type: BYTE_ARRAY logical_type: Enum
10 changes: 10 additions & 0 deletions tests/queries/0_stateless/03525_parquet_string_enum.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Tags: no-fasttest, no-parallel

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

${CLICKHOUSE_CLIENT} -q "INSERT INTO FUNCTION file(03525_enum.parquet, Parquet, 'animal Enum8(\'dog\' = 1, \'cat\' = 2)') SETTINGS output_format_parquet_enum_as_byte_array=1, engine_file_truncate_on_insert=1 VALUES ('dog'), ('cat');"

${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('03525_enum.parquet', 'ParquetMetadata') FORMAT JSONCompactEachRow" | jq -r '.[7][] | select(.name == "animal") | "physical_type: \(.physical_type) logical_type: \(.logical_type)"'