diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 3895cf3e77db..645d87dfda63 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -102,7 +102,17 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } case State::FLUSH_PAIR: { - return flushPair(file, key, value, row_offset); + incrementRowOffset(row_offset); + return state_handler.flushPair(file, key, value); + } + case State::FLUSH_PAIR_AFTER_QUOTED_VALUE: + { + incrementRowOffset(row_offset); + return state_handler.flushPairAfterQuotedValue(file, key, value); + } + case State::WAITING_PAIR_DELIMITER: + { + return state_handler.waitPairDelimiter(file); } case State::END: { @@ -111,8 +121,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } } - NextState flushPair(const std::string_view & file, auto & key, - auto & value, uint64_t & row_offset) + void incrementRowOffset(uint64_t & row_offset) { row_offset++; @@ -120,11 +129,6 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor { throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); } - - key.commit(); - value.commit(); - - return {0, file.empty() ? 
State::END : State::WAITING_KEY}; } void reset(auto & key, auto & value) diff --git a/src/Functions/keyvaluepair/impl/NeedleFactory.h b/src/Functions/keyvaluepair/impl/NeedleFactory.h index 83862a2281a5..5e58523fb277 100644 --- a/src/Functions/keyvaluepair/impl/NeedleFactory.h +++ b/src/Functions/keyvaluepair/impl/NeedleFactory.h @@ -20,7 +20,7 @@ template class NeedleFactory { public: - SearchSymbols getWaitNeedles(const Configuration & extractor_configuration) + SearchSymbols getWaitKeyNeedles(const Configuration & extractor_configuration) { const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; @@ -39,6 +39,17 @@ class NeedleFactory return SearchSymbols {std::string{needles.data(), needles.size()}}; } + SearchSymbols getWaitPairDelimiterNeedles(const Configuration & extractor_configuration) + { + const auto & pair_delimiters = extractor_configuration.pair_delimiters; + + std::vector needles; + + std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles)); + + return SearchSymbols {std::string{needles.data(), needles.size()}}; + } + SearchSymbols getReadKeyNeedles(const Configuration & extractor_configuration) { const auto & [key_value_delimiter, quoting_character, pair_delimiters] diff --git a/src/Functions/keyvaluepair/impl/StateHandler.h b/src/Functions/keyvaluepair/impl/StateHandler.h index 178974e9d366..f102debec609 100644 --- a/src/Functions/keyvaluepair/impl/StateHandler.h +++ b/src/Functions/keyvaluepair/impl/StateHandler.h @@ -30,6 +30,11 @@ class StateHandler READING_QUOTED_VALUE, // In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END. FLUSH_PAIR, + // The `READING_QUOTED_VALUE` will assert the closing quoting character is found and then flush the pair. In this case, we should not + // move from `FLUSH_PAIR` directly to `WAITING_KEY` because a pair delimiter has not been found. 
Might jump to WAITING_PAIR_DELIMITER or END. + FLUSH_PAIR_AFTER_QUOTED_VALUE, + // Might jump to WAITING_KEY or END. + WAITING_PAIR_DELIMITER, END }; diff --git a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h index cf31d30b9dc4..8b18b2050fdd 100644 --- a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h +++ b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h @@ -40,10 +40,11 @@ class StateHandlerImpl : public StateHandler * */ NeedleFactory needle_factory; - wait_needles = needle_factory.getWaitNeedles(configuration); + wait_key_needles = needle_factory.getWaitKeyNeedles(configuration); read_key_needles = needle_factory.getReadKeyNeedles(configuration); read_value_needles = needle_factory.getReadValueNeedles(configuration); read_quoted_needles = needle_factory.getReadQuotedNeedles(configuration); + wait_pair_delimiter_needles = needle_factory.getWaitPairDelimiterNeedles(configuration); } /* @@ -51,7 +52,7 @@ class StateHandlerImpl : public StateHandler * */ [[nodiscard]] NextState waitKey(std::string_view file) const { - if (const auto * p = find_first_not_symbols_or_null(file, wait_needles)) + if (const auto * p = find_first_not_symbols_or_null(file, wait_key_needles)) { const size_t character_position = p - file.begin(); if (isQuotingCharacter(*p)) @@ -284,7 +285,7 @@ class StateHandlerImpl : public StateHandler { value.append(file.begin() + pos, file.begin() + character_position); - return {next_pos, State::FLUSH_PAIR}; + return {next_pos, State::FLUSH_PAIR_AFTER_QUOTED_VALUE}; } pos = next_pos; @@ -292,14 +293,44 @@ class StateHandlerImpl : public StateHandler return {file.size(), State::END}; } + [[nodiscard]] NextState flushPair(std::string_view file, auto & key, auto & value) const + { + key.commit(); + value.commit(); + + return {0, file.empty() ? 
State::END : State::WAITING_KEY}; + } + + [[nodiscard]] NextState flushPairAfterQuotedValue(std::string_view file, auto & key, auto & value) const + { + key.commit(); + value.commit(); + + return {0, file.empty() ? State::END : State::WAITING_PAIR_DELIMITER}; + } + + [[nodiscard]] NextState waitPairDelimiter(std::string_view file) const + { + if (const auto * p = find_first_symbols_or_null(file, wait_pair_delimiter_needles)) + { + const size_t character_position = p - file.data(); + size_t next_pos = character_position + 1u; + + return {next_pos, State::WAITING_KEY}; + } + + return {file.size(), State::END}; + } + const Configuration configuration; private: - SearchSymbols wait_needles; + SearchSymbols wait_key_needles; SearchSymbols read_key_needles; SearchSymbols read_value_needles; SearchSymbols read_quoted_needles; + SearchSymbols wait_pair_delimiter_needles; /* * Helper method to copy bytes until `character_pos` and process possible escape sequence. Returns a pair containing a boolean diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index 9a0cfdffcb50..bf6f30bfc1b1 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -381,3 +381,15 @@ WITH SELECT x; {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +-- after parsing a quoted value, the next key should only start after a pair delimiter +WITH + extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; +{'key':'quoted_value','second_key':'0'} diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 
4f3db3f166b4..ce05ee6a3f3b 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -516,3 +516,15 @@ WITH ) AS x SELECT x; + +-- after parsing a quoted value, the next key should only start after a pair delimiter +WITH + extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x;