From e1123506dd5e4199dc2c4580d9aceff2835283eb Mon Sep 17 00:00:00 2001
From: robot-clickhouse-ci-1
 <118761991+robot-clickhouse-ci-1@users.noreply.github.com>
Date: Fri, 21 Apr 2023 12:46:50 +0200
Subject: [PATCH] Merge pull request #48838 from amosbird/issue_48735

Fix key condition on duplicate primary keys
---
 src/Storages/MergeTree/KeyCondition.cpp       | 33 +++----
 src/Storages/MergeTree/KeyCondition.h         |  8 +-
 .../MergeTree/MergeTreeDataSelectExecutor.cpp | 26 ++---
 .../02540_duplicate_primary_key.sql           | 16 +--
 .../02540_duplicate_primary_key2.reference    |  1 +
 .../02540_duplicate_primary_key2.sql          | 99 +++++++++++++++++++
 6 files changed, 142 insertions(+), 41 deletions(-)
 create mode 100644 tests/queries/0_stateless/02540_duplicate_primary_key2.reference
 create mode 100644 tests/queries/0_stateless/02540_duplicate_primary_key2.sql

diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp
index 0d4af4880e01..710fb05c5a18 100644
--- a/src/Storages/MergeTree/KeyCondition.cpp
+++ b/src/Storages/MergeTree/KeyCondition.cpp
@@ -743,9 +743,16 @@ KeyCondition::KeyCondition(
     , single_point(single_point_)
     , strict(strict_)
 {
+    size_t key_index = 0;
     for (const auto & name : key_column_names)
+    {
         if (!key_columns.contains(name))
+        {
             key_columns[name] = key_columns.size();
+            key_indices.push_back(key_index);
+        }
+        ++key_index;
+    }
 
     auto filter_node = buildFilterNode(query, additional_filter_asts);
 
@@ -808,9 +815,16 @@ KeyCondition::KeyCondition(
     , single_point(single_point_)
     , strict(strict_)
 {
+    size_t key_index = 0;
     for (const auto & name : key_column_names)
+    {
         if (!key_columns.contains(name))
+        {
             key_columns[name] = key_columns.size();
+            key_indices.push_back(key_index);
+        }
+        ++key_index;
+    }
 
     if (!filter_dag)
     {
@@ -2561,25 +2575,6 @@ bool KeyCondition::alwaysFalse() const
     return rpn_stack[0] == 0;
 }
 
-size_t KeyCondition::getMaxKeyColumn() const
-{
-    size_t res = 0;
-    for (const auto & element : rpn)
-    {
-        if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE
-            || element.function == RPNElement::FUNCTION_IN_RANGE
-            || element.function == RPNElement::FUNCTION_IS_NULL
-            || element.function == RPNElement::FUNCTION_IS_NOT_NULL
-            || element.function == RPNElement::FUNCTION_IN_SET
-            || element.function == RPNElement::FUNCTION_NOT_IN_SET)
-        {
-            if (element.key_column > res)
-                res = element.key_column;
-        }
-    }
-    return res;
-}
-
 bool KeyCondition::hasMonotonicFunctionsChain() const
 {
     for (const auto & element : rpn)
diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h
index 0a4ac93b082b..f29ace57e32f 100644
--- a/src/Storages/MergeTree/KeyCondition.h
+++ b/src/Storages/MergeTree/KeyCondition.h
@@ -286,9 +286,6 @@ class KeyCondition
 
     bool alwaysFalse() const;
 
-    /// Get the maximum number of the key element used in the condition.
-    size_t getMaxKeyColumn() const;
-
     bool hasMonotonicFunctionsChain() const;
 
     /// Impose an additional condition: the value in the column `column` must be in the range `range`.
@@ -297,6 +294,9 @@ class KeyCondition
 
     String toString() const;
 
+    /// Get the key indices of key names used in the condition.
+    const std::vector<size_t> & getKeyIndices() const { return key_indices; }
+
     /// Condition description for EXPLAIN query.
     struct Description
     {
@@ -478,6 +478,8 @@ class KeyCondition
     RPN rpn;
 
     ColumnIndices key_columns;
+    std::vector<size_t> key_indices;
+
     /// Expression which is used for key condition.
     const ExpressionActionsPtr key_expr;
     /// All intermediate columns are used to calculate key_expr.
diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
index 2039912106ce..22df8f298c4e 100644
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -1430,18 +1430,21 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
         return res;
     }
 
-    size_t used_key_size = key_condition.getMaxKeyColumn() + 1;
-    const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name;
+    const auto & primary_key = metadata_snapshot->getPrimaryKey();
+    auto index_columns = std::make_shared<ColumnsWithTypeAndName>();
+    const auto & key_indices = key_condition.getKeyIndices();
+    DataTypes key_types;
+    for (size_t i : key_indices)
+    {
+        index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});
+        key_types.emplace_back(primary_key.data_types[i]);
+    }
 
-    std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
     /// If there are no monotonic functions, there is no need to save block reference.
     /// Passing explicit field to FieldRef allows to optimize ranges and shows better performance.
-    const auto & primary_key = metadata_snapshot->getPrimaryKey();
+    std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
     if (key_condition.hasMonotonicFunctionsChain())
     {
-        auto index_columns = std::make_shared<ColumnsWithTypeAndName>();
-        for (size_t i = 0; i < used_key_size; ++i)
-            index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});
 
         create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
         {
@@ -1453,9 +1456,9 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
     }
     else
     {
-        create_field_ref = [&index](size_t row, size_t column, FieldRef & field)
+        create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
         {
-            index[column]->get(row, field);
+            (*index_columns)[column].column->get(row, field);
             // NULL_LAST
             if (field.isNull())
                 field = POSITIVE_INFINITY;
@@ -1463,6 +1466,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
     }
 
     /// NOTE Creating temporary Field objects to pass to KeyCondition.
+    size_t used_key_size = key_indices.size();
     std::vector<FieldRef> index_left(used_key_size);
     std::vector<FieldRef> index_right(used_key_size);
 
@@ -1487,10 +1491,10 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
                 create_field_ref(range.end, i, index_right[i]);
             }
         }
-        return key_condition.mayBeTrueInRange(
-            used_key_size, index_left.data(), index_right.data(), primary_key.data_types);
+        return key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types);
     };
 
+    const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name;
     if (!key_condition.matchesExactContinuousRange())
     {
         // Do exclusion search, where we drop ranges that do not match
diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.sql b/tests/queries/0_stateless/02540_duplicate_primary_key.sql
index 322b6d748455..a084d76964be 100644
--- a/tests/queries/0_stateless/02540_duplicate_primary_key.sql
+++ b/tests/queries/0_stateless/02540_duplicate_primary_key.sql
@@ -90,16 +90,16 @@ ORDER BY (coverage, situation_name, NAME_toe, NAME_cockroach);
 
 insert into test select * from generateRandom() limit 10;
 
-with dissonance as ( 
-    Select cast(toStartOfInterval(coverage, INTERVAL 1 day) as Date) as flour, count() as regulation 
+with dissonance as (
+    Select cast(toStartOfInterval(coverage, INTERVAL 1 day) as Date) as flour, count() as regulation
     from test
-    group by flour having flour >= toDate(now())-100 
+    group by flour having flour >= toDate(now())-100
     ),
-cheetah as ( 
-    Select flour, regulation from dissonance 
-    union distinct 
-    Select toDate(now())-1, ifnull((select regulation from dissonance where flour = toDate(now())-1),0) as regulation 
-) 
+cheetah as (
+    Select flour, regulation from dissonance
+    union distinct
+    Select toDate(now())-1, ifnull((select regulation from dissonance where flour = toDate(now())-1),0) as regulation
+)
 Select flour, regulation from cheetah order by flour with fill step 1 limit 100 format Null;
 
 drop table test;
diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key2.reference b/tests/queries/0_stateless/02540_duplicate_primary_key2.reference
new file mode 100644
index 000000000000..08839f6bb296
--- /dev/null
+++ b/tests/queries/0_stateless/02540_duplicate_primary_key2.reference
@@ -0,0 +1 @@
+200
diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key2.sql b/tests/queries/0_stateless/02540_duplicate_primary_key2.sql
new file mode 100644
index 000000000000..d0f02a894f2b
--- /dev/null
+++ b/tests/queries/0_stateless/02540_duplicate_primary_key2.sql
@@ -0,0 +1,99 @@
+drop table if exists test;
+
+set allow_suspicious_low_cardinality_types = 1;
+
+CREATE TABLE test
+(
+    `timestamp` DateTime,
+    `latitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)),
+    `longitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)),
+    `xxxx1` LowCardinality(UInt8),
+    `xxxx2` LowCardinality(Nullable(Int16)),
+    `xxxx3` LowCardinality(Nullable(Int16)),
+    `xxxx4` Nullable(Int32),
+    `xxxx5` LowCardinality(Nullable(Int32)),
+    `xxxx6` Nullable(Int32),
+    `xxxx7` Nullable(Int32),
+    `xxxx8` LowCardinality(Int32),
+    `xxxx9` LowCardinality(Nullable(Int16)),
+    `xxxx10` LowCardinality(Nullable(Int16)),
+    `xxxx11` LowCardinality(Nullable(Int16)),
+    `xxxx12` LowCardinality(String),
+    `xxxx13` Nullable(Float32),
+    `xxxx14` LowCardinality(String),
+    `xxxx15` LowCardinality(Nullable(String)),
+    `xxxx16` LowCardinality(String),
+    `xxxx17` LowCardinality(String),
+    `xxxx18` FixedString(19),
+    `xxxx19` FixedString(17),
+    `xxxx20` LowCardinality(UInt8),
+    `xxxx21` LowCardinality(Nullable(Int16)),
+    `xxxx22` LowCardinality(Nullable(Int16)),
+    `xxxx23` LowCardinality(Nullable(Int16)),
+    `xxxx24` LowCardinality(Nullable(Int16)),
+    `xxxx25` LowCardinality(Nullable(Int16)),
+    `xxxx26` LowCardinality(Nullable(Int16)),
+    `xxxx27` Nullable(Float32),
+    `xxxx28` LowCardinality(Nullable(String)),
+    `xxxx29` LowCardinality(String),
+    `xxxx30` LowCardinality(String),
+    `xxxx31` LowCardinality(Nullable(String)),
+    `xxxx32` UInt64,
+    PROJECTION cumsum_projection_simple
+    (
+        SELECT
+            xxxx1,
+            toStartOfInterval(timestamp, toIntervalMonth(1)),
+            toStartOfWeek(timestamp, 8),
+            toStartOfInterval(timestamp, toIntervalDay(1)),
+            xxxx17,
+            xxxx16,
+            xxxx14,
+            xxxx9,
+            xxxx10,
+            xxxx21,
+            xxxx22,
+            xxxx11,
+            sum(multiIf(xxxx21 IS NULL, 0, 1)),
+            sum(multiIf(xxxx22 IS NULL, 0, 1)),
+            sum(multiIf(xxxx23 IS NULL, 0, 1)),
+            max(toStartOfInterval(timestamp, toIntervalDay(1))),
+            max(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')),
+            min(toStartOfInterval(timestamp, toIntervalDay(1))),
+            min(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')),
+            count(),
+            sum(1),
+            COUNTDistinct(xxxx16),
+            COUNTDistinct(xxxx31),
+            COUNTDistinct(xxxx14),
+            COUNTDistinct(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'))
+        GROUP BY
+            xxxx1,
+            toStartOfInterval(timestamp, toIntervalMonth(1)),
+            toStartOfWeek(timestamp, 8),
+            toStartOfInterval(timestamp, toIntervalDay(1)),
+            xxxx1,
+            toStartOfInterval(timestamp, toIntervalMonth(1)),
+            toStartOfWeek(timestamp, 8),
+            toStartOfInterval(timestamp, toIntervalDay(1)),
+            xxxx17,
+            xxxx16,
+            xxxx14,
+            xxxx9,
+            xxxx10,
+            xxxx21,
+            xxxx22,
+            xxxx11
+    )
+)
+ENGINE = MergeTree
+PARTITION BY toYYYYMM(timestamp)
+ORDER BY (xxxx17, xxxx14, xxxx16, toStartOfDay(timestamp), left(xxxx19, 10), timestamp);
+
+INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'Airtel' as xxxx14 FROM generateRandom() LIMIT 100;
+INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'BSNL' as xxxx14 FROM generateRandom() LIMIT 100;
+INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'xxx' as xxxx14 FROM generateRandom() LIMIT 100;
+
+select sum(1) from test where toStartOfInterval(timestamp, INTERVAL 1 day) >= TIMESTAMP '2022-02-01 01:00:00' and xxxx14 in ('Airtel', 'BSNL') and xxxx1 = 1 GROUP BY xxxx16;
+
+drop table test;