From e1123506dd5e4199dc2c4580d9aceff2835283eb Mon Sep 17 00:00:00 2001 From: robot-clickhouse-ci-1 <118761991+robot-clickhouse-ci-1@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:46:50 +0200 Subject: [PATCH] Merge pull request #48838 from amosbird/issue_48735 Fix key condition on duplicate primary keys --- src/Storages/MergeTree/KeyCondition.cpp | 33 +++---- src/Storages/MergeTree/KeyCondition.h | 8 +- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 26 ++--- .../02540_duplicate_primary_key.sql | 16 +-- .../02540_duplicate_primary_key2.reference | 1 + .../02540_duplicate_primary_key2.sql | 99 +++++++++++++++++++ 6 files changed, 142 insertions(+), 41 deletions(-) create mode 100644 tests/queries/0_stateless/02540_duplicate_primary_key2.reference create mode 100644 tests/queries/0_stateless/02540_duplicate_primary_key2.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 0d4af4880e01..710fb05c5a18 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -743,9 +743,16 @@ KeyCondition::KeyCondition( , single_point(single_point_) , strict(strict_) { + size_t key_index = 0; for (const auto & name : key_column_names) + { if (!key_columns.contains(name)) + { key_columns[name] = key_columns.size(); + key_indices.push_back(key_index); + } + ++key_index; + } auto filter_node = buildFilterNode(query, additional_filter_asts); @@ -808,9 +815,16 @@ KeyCondition::KeyCondition( , single_point(single_point_) , strict(strict_) { + size_t key_index = 0; for (const auto & name : key_column_names) + { if (!key_columns.contains(name)) + { key_columns[name] = key_columns.size(); + key_indices.push_back(key_index); + } + ++key_index; + } if (!filter_dag) { @@ -2561,25 +2575,6 @@ bool KeyCondition::alwaysFalse() const return rpn_stack[0] == 0; } -size_t KeyCondition::getMaxKeyColumn() const -{ - size_t res = 0; - for (const auto & element : rpn) - { - if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE - || element.function == RPNElement::FUNCTION_IN_RANGE - || element.function == RPNElement::FUNCTION_IS_NULL - || element.function == RPNElement::FUNCTION_IS_NOT_NULL - || element.function == RPNElement::FUNCTION_IN_SET - || element.function == RPNElement::FUNCTION_NOT_IN_SET) - { - if (element.key_column > res) - res = element.key_column; - } - } - return res; -} - bool KeyCondition::hasMonotonicFunctionsChain() const { for (const auto & element : rpn) diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 0a4ac93b082b..f29ace57e32f 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -286,9 +286,6 @@ class KeyCondition bool alwaysFalse() const; - /// Get the maximum number of the key element used in the condition. - size_t getMaxKeyColumn() const; - bool hasMonotonicFunctionsChain() const; /// Impose an additional condition: the value in the column `column` must be in the range `range`. @@ -297,6 +294,9 @@ class KeyCondition String toString() const; + /// Get the key indices of key names used in the condition. + const std::vector & getKeyIndices() const { return key_indices; } + /// Condition description for EXPLAIN query. struct Description { @@ -478,6 +478,8 @@ class KeyCondition RPN rpn; ColumnIndices key_columns; + std::vector key_indices; + /// Expression which is used for key condition. const ExpressionActionsPtr key_expr; /// All intermediate columns are used to calculate key_expr. diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 2039912106ce..22df8f298c4e 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1430,18 +1430,21 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( return res; } - size_t used_key_size = key_condition.getMaxKeyColumn() + 1; - const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name; + const auto & primary_key = metadata_snapshot->getPrimaryKey(); + auto index_columns = std::make_shared(); + const auto & key_indices = key_condition.getKeyIndices(); + DataTypes key_types; + for (size_t i : key_indices) + { + index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]}); + key_types.emplace_back(primary_key.data_types[i]); + } - std::function create_field_ref; /// If there are no monotonic functions, there is no need to save block reference. /// Passing explicit field to FieldRef allows to optimize ranges and shows better performance. - const auto & primary_key = metadata_snapshot->getPrimaryKey(); + std::function create_field_ref; if (key_condition.hasMonotonicFunctionsChain()) { - auto index_columns = std::make_shared(); - for (size_t i = 0; i < used_key_size; ++i) - index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]}); create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field) { @@ -1453,9 +1456,9 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( } else { - create_field_ref = [&index](size_t row, size_t column, FieldRef & field) + create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field) { - index[column]->get(row, field); + (*index_columns)[column].column->get(row, field); // NULL_LAST if (field.isNull()) field = POSITIVE_INFINITY; @@ -1463,6 +1466,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( } /// NOTE Creating temporary Field objects to pass to KeyCondition. + size_t used_key_size = key_indices.size(); std::vector index_left(used_key_size); std::vector index_right(used_key_size); @@ -1487,10 +1491,10 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( create_field_ref(range.end, i, index_right[i]); } } - return key_condition.mayBeTrueInRange( - used_key_size, index_left.data(), index_right.data(), primary_key.data_types); + return key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types); }; + const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name; if (!key_condition.matchesExactContinuousRange()) { // Do exclusion search, where we drop ranges that do not match diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.sql b/tests/queries/0_stateless/02540_duplicate_primary_key.sql index 322b6d748455..a084d76964be 100644 --- a/tests/queries/0_stateless/02540_duplicate_primary_key.sql +++ b/tests/queries/0_stateless/02540_duplicate_primary_key.sql @@ -90,16 +90,16 @@ ORDER BY (coverage, situation_name, NAME_toe, NAME_cockroach); insert into test select * from generateRandom() limit 10; -with dissonance as ( - Select cast(toStartOfInterval(coverage, INTERVAL 1 day) as Date) as flour, count() as regulation +with dissonance as ( + Select cast(toStartOfInterval(coverage, INTERVAL 1 day) as Date) as flour, count() as regulation from test - group by flour having flour >= toDate(now())-100 + group by flour having flour >= toDate(now())-100 ), -cheetah as ( - Select flour, regulation from dissonance - union distinct - Select toDate(now())-1, ifnull((select regulation from dissonance where flour = toDate(now())-1),0) as regulation -) +cheetah as ( + Select flour, regulation from dissonance + union distinct + Select toDate(now())-1, ifnull((select regulation from dissonance where flour = toDate(now())-1),0) as regulation +) Select flour, regulation from cheetah order by flour with fill step 1 limit 100 format Null; drop table test; diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key2.reference b/tests/queries/0_stateless/02540_duplicate_primary_key2.reference new file mode 100644 index 000000000000..08839f6bb296 --- /dev/null +++ b/tests/queries/0_stateless/02540_duplicate_primary_key2.reference @@ -0,0 +1 @@ +200 diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key2.sql b/tests/queries/0_stateless/02540_duplicate_primary_key2.sql new file mode 100644 index 000000000000..d0f02a894f2b --- /dev/null +++ b/tests/queries/0_stateless/02540_duplicate_primary_key2.sql @@ -0,0 +1,99 @@ +drop table if exists test; + +set allow_suspicious_low_cardinality_types = 1; + +CREATE TABLE test +( + `timestamp` DateTime, + `latitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)), + `longitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)), + `xxxx1` LowCardinality(UInt8), + `xxxx2` LowCardinality(Nullable(Int16)), + `xxxx3` LowCardinality(Nullable(Int16)), + `xxxx4` Nullable(Int32), + `xxxx5` LowCardinality(Nullable(Int32)), + `xxxx6` Nullable(Int32), + `xxxx7` Nullable(Int32), + `xxxx8` LowCardinality(Int32), + `xxxx9` LowCardinality(Nullable(Int16)), + `xxxx10` LowCardinality(Nullable(Int16)), + `xxxx11` LowCardinality(Nullable(Int16)), + `xxxx12` LowCardinality(String), + `xxxx13` Nullable(Float32), + `xxxx14` LowCardinality(String), + `xxxx15` LowCardinality(Nullable(String)), + `xxxx16` LowCardinality(String), + `xxxx17` LowCardinality(String), + `xxxx18` FixedString(19), + `xxxx19` FixedString(17), + `xxxx20` LowCardinality(UInt8), + `xxxx21` LowCardinality(Nullable(Int16)), + `xxxx22` LowCardinality(Nullable(Int16)), + `xxxx23` LowCardinality(Nullable(Int16)), + `xxxx24` LowCardinality(Nullable(Int16)), + `xxxx25` LowCardinality(Nullable(Int16)), + `xxxx26` LowCardinality(Nullable(Int16)), + `xxxx27` Nullable(Float32), + `xxxx28` LowCardinality(Nullable(String)), + `xxxx29` LowCardinality(String), + `xxxx30` LowCardinality(String), + `xxxx31` LowCardinality(Nullable(String)), + `xxxx32` UInt64, + PROJECTION cumsum_projection_simple + ( + SELECT + xxxx1, + toStartOfInterval(timestamp, toIntervalMonth(1)), + toStartOfWeek(timestamp, 8), + toStartOfInterval(timestamp, toIntervalDay(1)), + xxxx17, + xxxx16, + xxxx14, + xxxx9, + xxxx10, + xxxx21, + xxxx22, + xxxx11, + sum(multiIf(xxxx21 IS NULL, 0, 1)), + sum(multiIf(xxxx22 IS NULL, 0, 1)), + sum(multiIf(xxxx23 IS NULL, 0, 1)), + max(toStartOfInterval(timestamp, toIntervalDay(1))), + max(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')), + min(toStartOfInterval(timestamp, toIntervalDay(1))), + min(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')), + count(), + sum(1), + COUNTDistinct(xxxx16), + COUNTDistinct(xxxx31), + COUNTDistinct(xxxx14), + COUNTDistinct(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)')) + GROUP BY + xxxx1, + toStartOfInterval(timestamp, toIntervalMonth(1)), + toStartOfWeek(timestamp, 8), + toStartOfInterval(timestamp, toIntervalDay(1)), + xxxx1, + toStartOfInterval(timestamp, toIntervalMonth(1)), + toStartOfWeek(timestamp, 8), + toStartOfInterval(timestamp, toIntervalDay(1)), + xxxx17, + xxxx16, + xxxx14, + xxxx9, + xxxx10, + xxxx21, + xxxx22, + xxxx11 + ) +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(timestamp) +ORDER BY (xxxx17, xxxx14, xxxx16, toStartOfDay(timestamp), left(xxxx19, 10), timestamp); + +INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'Airtel' as xxxx14 FROM generateRandom() LIMIT 100; +INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'BSNL' as xxxx14 FROM generateRandom() LIMIT 100; +INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'xxx' as xxxx14 FROM generateRandom() LIMIT 100; + +select sum(1) from test where toStartOfInterval(timestamp, INTERVAL 1 day) >= TIMESTAMP '2022-02-01 01:00:00' and xxxx14 in ('Airtel', 'BSNL') and xxxx1 = 1 GROUP BY xxxx16; + +drop table test;