Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,17 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
}
case State::FLUSH_PAIR:
{
return flushPair(file, key, value, row_offset);
incrementRowOffset(row_offset);
return state_handler.flushPair(file, key, value);
}
case State::FLUSH_PAIR_AFTER_QUOTED_VALUE:
{
incrementRowOffset(row_offset);
return state_handler.flushPairAfterQuotedValue(file, key, value);
}
case State::WAITING_PAIR_DELIMITER:
{
return state_handler.waitPairDelimiter(file);
}
case State::END:
{
Expand All @@ -111,20 +121,14 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
}
}

NextState flushPair(const std::string_view & file, auto & key,
auto & value, uint64_t & row_offset)
void incrementRowOffset(uint64_t & row_offset)
{
row_offset++;

if (row_offset > max_number_of_pairs)
{
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs);
}

key.commit();
value.commit();

return {0, file.empty() ? State::END : State::WAITING_KEY};
}

void reset(auto & key, auto & value)
Expand Down
13 changes: 12 additions & 1 deletion src/Functions/keyvaluepair/impl/NeedleFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ template <bool WITH_ESCAPING>
class NeedleFactory
{
public:
SearchSymbols getWaitNeedles(const Configuration & extractor_configuration)
SearchSymbols getWaitKeyNeedles(const Configuration & extractor_configuration)
{
const auto & [key_value_delimiter, quoting_character, pair_delimiters]
= extractor_configuration;
Expand All @@ -39,6 +39,17 @@ class NeedleFactory
return SearchSymbols {std::string{needles.data(), needles.size()}};
}

/*
 * Builds the search symbols used while waiting for a pair delimiter.
 * Only the configured pair delimiters are included, in their configured order.
 * */
SearchSymbols getWaitPairDelimiterNeedles(const Configuration & extractor_configuration)
{
    const auto & delimiters = extractor_configuration.pair_delimiters;

    // Construct the needle string straight from the delimiter range.
    std::string needle_characters(delimiters.begin(), delimiters.end());

    return SearchSymbols {needle_characters};
}

SearchSymbols getReadKeyNeedles(const Configuration & extractor_configuration)
{
const auto & [key_value_delimiter, quoting_character, pair_delimiters]
Expand Down
5 changes: 5 additions & 0 deletions src/Functions/keyvaluepair/impl/StateHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ class StateHandler
READING_QUOTED_VALUE,
// In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END.
FLUSH_PAIR,
// `READING_QUOTED_VALUE` asserts that the closing quoting character is found and then flushes the pair. In this case, we should not
// move from `FLUSH_PAIR` directly to `WAITING_KEY` because a pair delimiter has not been found. Might jump to WAITING_PAIR_DELIMITER or END.
FLUSH_PAIR_AFTER_QUOTED_VALUE,
// Might jump to WAITING_KEY or END.
WAITING_PAIR_DELIMITER,
END
};

Expand Down
39 changes: 35 additions & 4 deletions src/Functions/keyvaluepair/impl/StateHandlerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,19 @@ class StateHandlerImpl : public StateHandler
* */
NeedleFactory<WITH_ESCAPING> needle_factory;

wait_needles = needle_factory.getWaitNeedles(configuration);
wait_key_needles = needle_factory.getWaitKeyNeedles(configuration);
read_key_needles = needle_factory.getReadKeyNeedles(configuration);
read_value_needles = needle_factory.getReadValueNeedles(configuration);
read_quoted_needles = needle_factory.getReadQuotedNeedles(configuration);
wait_pair_delimiter_needles = needle_factory.getWaitPairDelimiterNeedles(configuration);
}

/*
* Find first character that is considered a valid key character and proceeds to READING_KEY like states.
* */
[[nodiscard]] NextState waitKey(std::string_view file) const
{
if (const auto * p = find_first_not_symbols_or_null(file, wait_needles))
if (const auto * p = find_first_not_symbols_or_null(file, wait_key_needles))
{
const size_t character_position = p - file.begin();
if (isQuotingCharacter(*p))
Expand Down Expand Up @@ -284,22 +285,52 @@ class StateHandlerImpl : public StateHandler
{
value.append(file.begin() + pos, file.begin() + character_position);

return {next_pos, State::FLUSH_PAIR};
return {next_pos, State::FLUSH_PAIR_AFTER_QUOTED_VALUE};
}

pos = next_pos;
}

return {file.size(), State::END};
}
/*
 * Commits the collected key and value.
 * Proceeds to END when `file` is empty, otherwise to WAITING_KEY. The returned position is always 0.
 * */
[[nodiscard]] NextState flushPair(std::string_view file, auto & key, auto & value) const
{
    key.commit();
    value.commit();

    if (file.empty())
    {
        return {0, State::END};
    }

    return {0, State::WAITING_KEY};
}

/*
 * Commits the collected key and value.
 * Proceeds to END when `file` is empty, otherwise to WAITING_PAIR_DELIMITER (not WAITING_KEY).
 * The returned position is always 0.
 * */
[[nodiscard]] NextState flushPairAfterQuotedValue(std::string_view file, auto & key, auto & value) const
{
    key.commit();
    value.commit();

    if (file.empty())
    {
        return {0, State::END};
    }

    return {0, State::WAITING_PAIR_DELIMITER};
}

/*
 * Looks for the first occurrence of any pair delimiter needle in `file`.
 * If one is found, proceeds to WAITING_KEY at the position right after it.
 * If none is found, consumes the whole `file` and proceeds to END.
 * */
[[nodiscard]] NextState waitPairDelimiter(std::string_view file) const
{
    const auto * delimiter = find_first_symbols_or_null(file, wait_pair_delimiter_needles);

    if (!delimiter)
    {
        return {file.size(), State::END};
    }

    // Offset of the delimiter inside `file`; the next state starts one character past it.
    const size_t delimiter_offset = delimiter - file.data();
    const size_t position_after_delimiter = delimiter_offset + 1u;

    return {position_after_delimiter, State::WAITING_KEY};
}


const Configuration configuration;

private:
SearchSymbols wait_needles;
SearchSymbols wait_key_needles;
SearchSymbols read_key_needles;
SearchSymbols read_value_needles;
SearchSymbols read_quoted_needles;
SearchSymbols wait_pair_delimiter_needles;

/*
* Helper method to copy bytes until `character_pos` and process possible escape sequence. Returns a pair containing a boolean
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -381,3 +381,15 @@ WITH
SELECT
x;
{'age':'31','name':'neymar','nationality':'brazil','team':'psg'}
-- after parsing a quoted value, the next key should only start after a pair delimiter
WITH
extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x;
{'key':'quoted_value','second_key':'0'}
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,15 @@ WITH
) AS x
SELECT
x;

-- after parsing a quoted value, the next key should only start after a pair delimiter
WITH
extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x;
Loading