From 3a11a8307451766527758825cab14bbd25aa3a04 Mon Sep 17 00:00:00 2001 From: Pervakov Grigorii Date: Fri, 6 Jun 2025 14:48:02 +0000 Subject: [PATCH 1/2] Merge pull request #79969 from filimonov/fix_index_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix match(col, '^…') index analysis with escaped metacharacters to avoid wrong results and crashes --- src/Storages/MergeTree/KeyCondition.cpp | 11 +- ...56_match_index_prefix_extraction.reference | 52 ++++++++ .../03456_match_index_prefix_extraction.sql | 111 ++++++++++++++++++ 3 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03456_match_index_prefix_extraction.reference create mode 100644 tests/queries/0_stateless/03456_match_index_prefix_extraction.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 64d33da1d2c5..fbc33ec800f6 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -101,7 +101,7 @@ static String extractFixedPrefixFromRegularExpression(const String & regexp) const char * pos = begin; const char * end = regexp.data() + regexp.size(); - while (pos != end) + while (pos < end) { switch (*pos) { @@ -124,19 +124,22 @@ static String extractFixedPrefixFromRegularExpression(const String & regexp) case '$': case '.': case '[': + case ']': case '?': case '*': case '+': + case '\\': case '{': + case '}': + case '-': fixed_prefix += *pos; + ++pos; break; default: /// all other escape sequences are not supported - pos = end; - break; + pos = end; } - ++pos; break; } diff --git a/tests/queries/0_stateless/03456_match_index_prefix_extraction.reference b/tests/queries/0_stateless/03456_match_index_prefix_extraction.reference new file mode 100644 index 000000000000..72aa8c7e403f --- /dev/null +++ b/tests/queries/0_stateless/03456_match_index_prefix_extraction.reference @@ -0,0 +1,52 @@ +Condition: (path in [\'xxx(zzz\', 
\'xxx(zz{\')) +1 +Condition: (path in [\'xxx)zzz\', \'xxx)zz{\')) +1 +Condition: (path in [\'xxx^zzz\', \'xxx^zz{\')) +1 +Condition: (path in [\'xxx$zzz\', \'xxx$zz{\')) +1 +Condition: (path in [\'xxx.zzz\', \'xxx.zz{\')) +1 +Condition: (path in [\'xxx[zzz\', \'xxx[zz{\')) +1 +Condition: (path in [\'xxx]zzz\', \'xxx]zz{\')) +1 +Condition: (path in [\'xxx?zzz\', \'xxx?zz{\')) +1 +Condition: (path in [\'xxx*zzz\', \'xxx*zz{\')) +1 +Condition: (path in [\'xxx+zzz\', \'xxx+zz{\')) +1 +Condition: (path in [\'xxx\\\\zzz\', \'xxx\\\\zz{\')) +1 +Condition: (path in [\'xxx{zzz\', \'xxx{zz{\')) +1 +Condition: (path in [\'xxx}zzz\', \'xxx}zz{\')) +1 +Condition: (path in [\'xxx-zzz\', \'xxx-zz{\')) +1 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +15 +Condition: (path in [\'xxx\', \'xxy\')) +15 +Condition: (path in [\'xxx\', \'xxy\')) +15 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: true +Condition: true diff --git a/tests/queries/0_stateless/03456_match_index_prefix_extraction.sql b/tests/queries/0_stateless/03456_match_index_prefix_extraction.sql new file mode 100644 index 000000000000..56eeb8bdb93d --- /dev/null +++ b/tests/queries/0_stateless/03456_match_index_prefix_extraction.sql @@ -0,0 +1,111 @@ +SET parallel_replicas_local_plan=1; + +drop table if exists foo; + +CREATE TABLE foo (id UInt8, path String) engine = MergeTree ORDER BY (path) SETTINGS index_granularity=1; + +INSERT INTO foo VALUES (1, 'xxx|yyy'), +(2, 'xxx(zzz'), +(3, 'xxx)zzz'), +(4, 'xxx^zzz'), +(5, 'xxx$zzz'), +(6, 'xxx.zzz'), +(7, 'xxx[zzz'), +(8, 'xxx]zzz'), +(9, 'xxx?zzz'), +(10, 'xxx*zzz'), +(11, 'xxx+zzz'), +(12, 'xxx\\zzz'), +(13, 'xxx{zzz'), +(14, 'xxx}zzz'), +(15, 'xxx-zzz'); + + +-- 
check that escaped sequences are also properly extracted +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\(zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\(zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\)zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\)zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\^zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\^zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\$zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\$zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\.zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\.zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\[zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\[zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\]zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\]zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\?zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\?zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE 
match(path, '^xxx\\*zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\*zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\+zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\+zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\\\zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\\\zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\{zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\{zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\}zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\}zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\-zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\-zzz') SETTINGS force_primary_key = 1; + + +-- those regex chars prevent the index use (only 3 first chars used during index scan) +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\0bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\0bla') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx(bla)')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx(bla)') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx[bla]')) WHERE explain like 
'%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx[bla]') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx^bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx^bla') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx.bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx.bla') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx+bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx+bla') SETTINGS force_primary_key = 1; + + +-- here the fourth char is not used during index analysis, because its quantifier allows zero occurrences +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx{0,1}')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx{0,1}') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx?')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx?') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx*')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx*') SETTINGS force_primary_key = 1; + +-- some unsupported regex chars - only the first 3 chars are used during index scan +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\d+')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\d+') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\w+')) WHERE explain like '%Condition%'; +SELECT 
count() FROM foo WHERE match(path, '^xxx\w+') SETTINGS force_primary_key = 1; + + +-- fully disabled for pipes - see https://github.com/ClickHouse/ClickHouse/pull/54696 +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\|zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\|zzz') SETTINGS force_primary_key = 1; -- { serverError INDEX_NOT_USED } + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx|foo')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx|foo') SETTINGS force_primary_key = 1; -- { serverError INDEX_NOT_USED } From b24162488a669b7734e763da53805eb5e70b6fd1 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 18 Jun 2025 08:45:09 +0300 Subject: [PATCH 2/2] Poke CI once more