diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 5c367bb69f0d..eebb65879695 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -322,6 +322,69 @@ static std::shared_ptr getNestedArrowColumn(std::shared_ptr return std::make_shared(array_vector); } +static ColumnWithTypeAndName createLCColumnFromArrowDictionaryValues( + const std::shared_ptr & dict_values, + const ColumnPtr & indexes_column, + const String & column_name +) +{ + auto lc_type = std::make_shared(dict_values->type); + + auto lc_column = lc_type->createColumn(); + + for (auto i = 0u; i < indexes_column->size(); i++) + { + Field f; + dict_values->column->get(indexes_column->getUInt(i), f); + lc_column->insert(f); + } + + return {std::move(lc_column), std::move(lc_type), column_name}; +} + +/* + * Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index. + * It doesn't have the concept of null or default values. + * An empty string is just a regular value appended at any position of the dictionary. + * Null values have an index of 0, but it should be ignored since the nullmap will return null. + * In ClickHouse LowCardinality, it's different. The dictionary contains null and default values at the beginning. + * [null, default, ...]. Therefore, null values have an index of 0 and default values have an index of 1. + * No nullmap is used. + * */ +static ColumnWithTypeAndName createLCOfNullableColumnFromArrowDictionaryValues( + const std::shared_ptr & dict_values, + const ColumnPtr & indexes_column, + const ColumnPtr & nullmap_column, + const String & column_name +) +{ + /* + * ArrowColumn format handles nulls by maintaining a nullmap column, there is no nullable type. + * Therefore, dict_values->type is the actual data type/ non-nullable. It needs to be transformed into nullable + * so LC column is created from nullable type and a null value at the beginning of the collection + * is automatically added. + * */ + auto lc_type = std::make_shared(makeNullable(dict_values->type)); + + auto lc_column = lc_type->createColumn(); + + for (auto i = 0u; i < indexes_column->size(); i++) + { + if (nullmap_column && nullmap_column->getBool(i)) + { + lc_column->insertDefault(); + } + else + { + Field f; + dict_values->column->get(indexes_column->getUInt(i), f); + lc_column->insert(f); + } + } + + return {std::move(lc_column), std::move(lc_type), column_name}; +} + static ColumnWithTypeAndName readColumnFromArrowColumn( std::shared_ptr & arrow_column, const std::string & column_name, @@ -331,7 +394,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( bool read_ints_as_dates) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST - && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) + && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT && + arrow_column->type()->id() != arrow::Type::DICTIONARY) { auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); auto nullmap_column = readByteMapFromArrowColumn(arrow_column); @@ -439,12 +503,6 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( } auto arrow_dict_column = std::make_shared(dict_array); auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); - - /// We should convert read column to ColumnUnique. - auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); - auto tmp_dict_column = IColumn::mutate(assert_cast(tmp_lc_column.get())->getDictionaryPtr()); - static_cast(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); - dict_column.column = std::move(tmp_dict_column); dict_values = std::make_shared(std::move(dict_column)); } @@ -457,9 +515,19 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( auto arrow_indexes_column = std::make_shared(indexes_array); auto indexes_column = readColumnWithIndexesData(arrow_indexes_column); - auto lc_column = ColumnLowCardinality::create(dict_values->column, indexes_column); - auto lc_type = std::make_shared(dict_values->type); - return {std::move(lc_column), std::move(lc_type), column_name}; + + const auto contains_null = arrow_column->null_count() > 0; + + if (contains_null) + { + auto nullmap_column = readByteMapFromArrowColumn(arrow_column); + + return createLCOfNullableColumnFromArrowDictionaryValues(dict_values, indexes_column, nullmap_column, column_name); + } + else + { + return createLCColumnFromArrowDictionaryValues(dict_values, indexes_column, column_name); + } } # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ case ARROW_NUMERIC_TYPE: \ diff --git a/tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.reference b/tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.reference new file mode 100644 index 000000000000..a6c4a5b13a22 --- /dev/null +++ b/tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.reference @@ -0,0 +1,9 @@ +lc_nullable_string +LowCardinality(Nullable(String)) +one +\N +three + +\N + +six diff --git a/tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.sh b/tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.sh new file mode 100755 index 000000000000..2d854c956b25 --- /dev/null +++ b/tests/queries/0_stateless/02381_arrow_dict_of_nullable_string_to_lc.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# ## reading ArrowStream file from python +# import pyarrow as pa +# stream = pa.ipc.open_stream("test.arrows") +# x = stream.read_all() +# print(x) + +## writing ArrowStream file from python +# import pyarrow as pa +# data = [ +# pa.array(["one", None, "three", "", None, "", "six"]).dictionary_encode(), +# ] +# batch = pa.record_batch(data, names=['id', 'lc_nullable', 'lc_int_nullable', 'bool_nullable']) +# writer = pa.ipc.new_stream("test4.arrows", batch.schema) +# writer.write_batch(batch) +# writer.close() + +# cat data.arrow | gzip | base64 + +cat <