diff --git a/CMakeLists.txt b/CMakeLists.txt index 7af8b272..891a4565 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,8 @@ set(SOURCE_FILES src/log_surgeon/wildcard_query_parser/ExpressionCharacter.hpp src/log_surgeon/wildcard_query_parser/ExpressionView.cpp src/log_surgeon/wildcard_query_parser/ExpressionView.hpp + src/log_surgeon/wildcard_query_parser/Query.cpp + src/log_surgeon/wildcard_query_parser/Query.hpp src/log_surgeon/wildcard_query_parser/QueryInterpretation.cpp src/log_surgeon/wildcard_query_parser/QueryInterpretation.hpp src/log_surgeon/wildcard_query_parser/StaticQueryToken.hpp diff --git a/docs/doxygen/mainpage.dox b/docs/doxygen/mainpage.dox index ab640c7f..329e4875 100644 --- a/docs/doxygen/mainpage.dox +++ b/docs/doxygen/mainpage.dox @@ -17,6 +17,7 @@ * - @ref unit_tests_expression_view "Expression View" * - @ref unit_tests_nfa "NFA" * - @ref unit_tests_prefix_tree "Prefix tree" + * - @ref unit_tests_query "Query" * - @ref unit_tests_query_interpretation "Query Interpretation" * - @ref unit_tests_regex_ast "Regex AST" * - @ref unit_tests_register_handler "Register handler" diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 78ed68d8..9503f681 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -152,6 +152,10 @@ class Lexer { [[nodiscard]] auto get_has_delimiters() const -> bool const& { return m_has_delimiters; } + [[nodiscard]] auto get_delim_table() const -> std::array const& { + return m_is_delimiter; + } + [[nodiscard]] auto is_delimiter(uint8_t byte) const -> bool const& { return m_is_delimiter[byte]; } @@ -252,7 +256,10 @@ class Lexer { std::array m_is_first_char_of_a_variable{false}; std::vector> m_rules; uint32_t m_line{0}; + + // For performance, `m_has_delimiters` caches whether any element in `m_is_delimiter` is true. 
bool m_has_delimiters{false}; + std::unique_ptr> m_dfa; std::optional m_first_delimiter_pos{std::nullopt}; bool m_asked_for_more_data{false}; diff --git a/src/log_surgeon/wildcard_query_parser/Expression.hpp b/src/log_surgeon/wildcard_query_parser/Expression.hpp index cac314fa..01633ffe 100644 --- a/src/log_surgeon/wildcard_query_parser/Expression.hpp +++ b/src/log_surgeon/wildcard_query_parser/Expression.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_WILDCARD_QUERY_PARSER_EXPRESSION_HPP #define LOG_SURGEON_WILDCARD_QUERY_PARSER_EXPRESSION_HPP +#include #include #include @@ -24,6 +25,8 @@ class Expression { [[nodiscard]] auto get_search_string() const -> std::string const& { return m_search_string; } + [[nodiscard]] auto length() const -> size_t { return m_search_string.size(); } + private: std::vector m_chars; std::string m_search_string; diff --git a/src/log_surgeon/wildcard_query_parser/ExpressionCharacter.hpp b/src/log_surgeon/wildcard_query_parser/ExpressionCharacter.hpp index 37f6e387..e5e907bb 100644 --- a/src/log_surgeon/wildcard_query_parser/ExpressionCharacter.hpp +++ b/src/log_surgeon/wildcard_query_parser/ExpressionCharacter.hpp @@ -1,8 +1,11 @@ #ifndef LOG_SURGEON_WILDCARD_QUERY_PARSER_EXPRESSION_CHARACTER_HPP #define LOG_SURGEON_WILDCARD_QUERY_PARSER_EXPRESSION_CHARACTER_HPP +#include #include +#include + namespace log_surgeon::wildcard_query_parser { class ExpressionCharacter { public: @@ -23,6 +26,19 @@ class ExpressionCharacter { return Type::NonGreedyWildcard == m_type; } + [[nodiscard]] auto is_wildcard() const -> bool { + return Type::GreedyWildcard == m_type || Type::NonGreedyWildcard == m_type; + } + + [[nodiscard]] auto is_delim(std::array const& delim_table) const -> bool { + return delim_table.at(static_cast(m_value)); + } + + [[nodiscard]] auto is_delim_or_wildcard(std::array const& delim_table) const + -> bool { + return is_delim(delim_table) || is_wildcard(); + } + [[nodiscard]] auto is_escape() const -> bool { return Type::Escape == m_type; } 
private: diff --git a/src/log_surgeon/wildcard_query_parser/ExpressionView.cpp b/src/log_surgeon/wildcard_query_parser/ExpressionView.cpp index 15e5339d..f055c453 100644 --- a/src/log_surgeon/wildcard_query_parser/ExpressionView.cpp +++ b/src/log_surgeon/wildcard_query_parser/ExpressionView.cpp @@ -1,12 +1,14 @@ #include "ExpressionView.hpp" #include +#include #include #include #include #include #include +#include #include #include @@ -42,6 +44,39 @@ auto ExpressionView::extend_to_adjacent_greedy_wildcards() const return {is_extended, wildcard_expression_view}; } +auto ExpressionView::is_surrounded_by_delims(std::array const& delim_table) const + -> bool { + auto const [begin_idx, end_idx]{get_indices()}; + + bool has_left_boundary{false}; + if (0 == begin_idx) { + has_left_boundary = true; + } else { + auto const& preceding_char{m_expression->get_chars()[begin_idx - 1]}; + has_left_boundary = preceding_char.is_delim_or_wildcard(delim_table) + || (false == m_chars.empty() && m_chars.front().is_greedy_wildcard()); + } + + bool has_right_boundary{false}; + if (m_expression->length() == end_idx) { + has_right_boundary = true; + } else { + auto const& succeeding_char{m_expression->get_chars()[end_idx]}; + if (succeeding_char.is_escape()) { + if (m_expression->length() > end_idx + 1) { + auto const& logical_succeeding_char{m_expression->get_chars()[end_idx + 1]}; + has_right_boundary = logical_succeeding_char.is_delim(delim_table); + } + } else { + has_right_boundary = succeeding_char.is_delim_or_wildcard(delim_table); + } + has_right_boundary = has_right_boundary + || (false == m_chars.empty() && m_chars.back().is_greedy_wildcard()); + } + + return has_left_boundary && has_right_boundary; +} + auto ExpressionView::is_well_formed() const -> bool { if (m_chars.empty()) { return true; diff --git a/src/log_surgeon/wildcard_query_parser/ExpressionView.hpp b/src/log_surgeon/wildcard_query_parser/ExpressionView.hpp index cf228643..a69575c5 100644 --- 
a/src/log_surgeon/wildcard_query_parser/ExpressionView.hpp +++ b/src/log_surgeon/wildcard_query_parser/ExpressionView.hpp @@ -1,12 +1,14 @@ #ifndef LOG_SURGEON_WILDCARD_QUERY_PARSER_EXPRESSION_VIEW_HPP #define LOG_SURGEON_WILDCARD_QUERY_PARSER_EXPRESSION_VIEW_HPP +#include #include #include #include #include #include +#include #include #include @@ -41,6 +43,33 @@ class ExpressionView { && (m_chars[0].is_greedy_wildcard() || m_chars.back().is_greedy_wildcard()); } + /** + * Checks whether the view is surrounded by delimiters. The start and end of an expression are + * always considered a delimiter. A greedy wildcard may represent a string that includes a + * flanking delimiter. + * + * A view is considered bounded if both its left and right boundary satisfy certain + * requirements. + * + * Left boundary: + * - The view is at the start of the expression, or + * - The first character is a greedy wildcard (if non-empty), or + * - Immediately left of the view is a delimiter or wildcard. + * + * Right boundary: + * - The view is at the end of the expression, or + * - The last character is a greedy wildcard (if non-empty), or + * - Immediately right of the view is a delimiter or wildcard, or + * - Immediately right of the view is an escape character and the character to its + * immediate right is a delimiter. + * + * @param delim_table Table indicating for each character whether or not it is a delimiter. + * @return true when both left and right boundaries qualify; false otherwise. + */ + [[nodiscard]] auto is_surrounded_by_delims( + std::array const& delim_table + ) const -> bool; + /** * Checks whether this `ExpressionView` represents a well-formed subrange. 
* diff --git a/src/log_surgeon/wildcard_query_parser/Query.cpp b/src/log_surgeon/wildcard_query_parser/Query.cpp new file mode 100644 index 00000000..5a730eda --- /dev/null +++ b/src/log_surgeon/wildcard_query_parser/Query.cpp @@ -0,0 +1,176 @@ +#include "Query.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using log_surgeon::finite_automata::ByteDfaState; +using log_surgeon::finite_automata::ByteNfaState; +using log_surgeon::lexers::ByteLexer; +using std::set; +using std::string; +using std::vector; + +using ByteDfa = log_surgeon::finite_automata::Dfa; +using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteNfa = log_surgeon::finite_automata::Nfa; + +namespace log_surgeon::wildcard_query_parser { +Query::Query(string const& query_string) { + m_processed_query_string.reserve(query_string.size()); + Expression const expression(query_string); + + bool prev_is_escape{false}; + string unhandled_wildcard_sequence; + bool unhandled_wildcard_sequence_contains_greedy_wildcard{false}; + for (auto c : expression.get_chars()) { + if (false == unhandled_wildcard_sequence.empty() && false == c.is_wildcard()) { + if (unhandled_wildcard_sequence_contains_greedy_wildcard) { + m_processed_query_string.push_back('*'); + } else { + m_processed_query_string += unhandled_wildcard_sequence; + } + unhandled_wildcard_sequence.clear(); + unhandled_wildcard_sequence_contains_greedy_wildcard = false; + } + + if (prev_is_escape) { + m_processed_query_string.push_back(c.value()); + prev_is_escape = false; + } else if (c.is_escape()) { + prev_is_escape = true; + m_processed_query_string.push_back(c.value()); + } else if (c.is_greedy_wildcard()) { + unhandled_wildcard_sequence.push_back(c.value()); + unhandled_wildcard_sequence_contains_greedy_wildcard = true; + } else if (c.is_non_greedy_wildcard()) { + 
unhandled_wildcard_sequence.push_back(c.value()); + } else { + m_processed_query_string.push_back(c.value()); + } + } + if (false == unhandled_wildcard_sequence.empty()) { + if (unhandled_wildcard_sequence_contains_greedy_wildcard) { + m_processed_query_string.push_back('*'); + } else { + m_processed_query_string += unhandled_wildcard_sequence; + } + } +} + +auto Query::get_all_multi_token_interpretations(ByteLexer const& lexer) const + -> std::set { + if (m_processed_query_string.empty()) { + return {}; + } + + Expression const expression{m_processed_query_string}; + vector> query_interpretations(expression.length()); + for (size_t end_idx = 1; end_idx <= expression.length(); ++end_idx) { + for (size_t begin_idx = 0; begin_idx < end_idx; ++begin_idx) { + ExpressionView const expression_view{expression, begin_idx, end_idx}; + if ("*" != expression_view.get_search_string() + && expression_view.starts_or_ends_with_greedy_wildcard()) + { + continue; + } + + auto const extended_view{expression_view.extend_to_adjacent_greedy_wildcards().second}; + auto const single_token_interpretations{ + get_all_single_token_interpretations(extended_view, lexer) + }; + if (single_token_interpretations.empty()) { + continue; + } + + if (begin_idx == 0) { + query_interpretations[end_idx - 1].insert( + std::make_move_iterator(single_token_interpretations.begin()), + std::make_move_iterator(single_token_interpretations.end()) + ); + } else { + for (auto const& prefix : query_interpretations[begin_idx - 1]) { + for (auto const& suffix : single_token_interpretations) { + QueryInterpretation combined{prefix}; + combined.append_query_interpretation(suffix); + query_interpretations[end_idx - 1].insert(std::move(combined)); + } + } + } + } + } + return query_interpretations.back(); +} + +auto Query::get_all_single_token_interpretations( + ExpressionView const& expression_view, + ByteLexer const& lexer +) -> std::vector { + vector interpretations; + + if (false == 
expression_view.is_well_formed()) { + return interpretations; + } + if ("*" == expression_view.get_search_string()) { + interpretations.emplace_back("*"); + return interpretations; + } + if (false == expression_view.is_surrounded_by_delims(lexer.get_delim_table())) { + interpretations.emplace_back(string{expression_view.get_search_string()}); + return interpretations; + } + + auto const [regex_string, contains_wildcard]{expression_view.generate_regex_string()}; + + auto const matching_var_type_ids{get_matching_variable_types(regex_string, lexer)}; + if (matching_var_type_ids.empty() || contains_wildcard) { + interpretations.emplace_back(string{expression_view.get_search_string()}); + } + + for (auto const variable_type_id : matching_var_type_ids) { + interpretations.emplace_back( + variable_type_id, + string{expression_view.get_search_string()}, + contains_wildcard + ); + if (false == contains_wildcard) { + break; + } + } + return interpretations; +} + +auto Query::get_matching_variable_types(string const& regex_string, ByteLexer const& lexer) + -> set { + NonTerminal::m_next_children_start = 0; + + Schema schema; + schema.add_variable("search:" + regex_string, -1); + auto const schema_ast = schema.release_schema_ast_ptr(); + auto& rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); + vector rules; + rules.emplace_back(0, std::move(rule_ast.m_regex_ptr)); + ByteNfa const nfa{rules}; + ByteDfa const dfa{nfa}; + + auto var_types = lexer.get_dfa()->get_intersect(&dfa); + return var_types; +} +} // namespace log_surgeon::wildcard_query_parser diff --git a/src/log_surgeon/wildcard_query_parser/Query.hpp b/src/log_surgeon/wildcard_query_parser/Query.hpp new file mode 100644 index 00000000..915077dd --- /dev/null +++ b/src/log_surgeon/wildcard_query_parser/Query.hpp @@ -0,0 +1,137 @@ +#ifndef LOG_SURGEON_WILDCARD_QUERY_PARSER_QUERY_HPP +#define LOG_SURGEON_WILDCARD_QUERY_PARSER_QUERY_HPP + +#include +#include +#include +#include + +#include +#include +#include + 
+namespace log_surgeon::wildcard_query_parser { +class Query { +public: + explicit Query(std::string const& query_string); + + /** + * Generates all k-token interpretations of the n-character query string, where 1 <= k <= n. + * + * 1. Interpret each substring [a,b) as a single token (k=1). + * - Substrings adjacent to greedy wildcards must be interpreted as if they include them. To + * implement this, we extend all substrings to include adjacent greedy wildcards. + * - Example: consider query "a*b" and variable type `hasNum` ("\w*\d+\w*"): + * - Without extension: + * - "a" -> static-text + * - "b" -> static-text + * - "a*" -> (a*) + * - "*b" -> (*b) + * - Multi-token interpretations (via step 2 below): + * - {a*b}, + * - {(a*)b}, + * - {a(*b)}. + * - None of these match a string like "a1 c 1b", which has interpretation + * {(a1) c (1b)}. By interpreting "a" as "a*" and "b" as "*b", the '*' + * is preserved allowing for interpretation {(a*)*(*b)}, which matches + * {(a1) c (1b)}. + * - Special cases: + * - Single-character greedy wildcards ("*") are not extended as they have no adjacent + * greedy wildcards (repeated wildcards are collapsed during preprocessing). + * - Substrings are not extended to non-greedy wildcards (`?`) as "a?b" =/= "a??b". + * - Substrings of length >= 2 that begin or end with a greedy wildcard are skipped as they + * are redundant. + * - Example: in "a*b", substring [0,1) extends to "a*", therefore substring [0,2) "a*" is + * redundant. This avoids producing interpretation {(a*)b}, which is a subset of + * {(a*)*b}. + * - Note: The length >= 2 requirement avoids skipping 1-length greedy substrings ("*") as + * they are never redundant (i.e., no 0-length substring exists to extend). + * + * 2. Let I(a) be the set of all k-token interpretations of substring [0,a), where 1 <= k <= a.
+ * - We can then compute I(a) recursively: + * + * I(a) = T(0,a) + * U (I(1) x T(1,a)) + * U (I(2) x T(2,a)) + * ... + * U (I(a-1) x T(a-1,a)) + * + * where x denotes the cross product: all combinations of prefix interpretations from I(i) + * and suffix interpretations from T(i,a). + * + * 3. Use dynamic programming to compute I(n) efficiently: + * - Instead of generating all possible combinations naively, we store only unique + * interpretations by recursively building up the combinations as shown below. + * - Compute I(n) iteratively in increasing order of substring length: + * - Compute T(0,1), then I(1) + * - Compute T(0,2), T(1,2), then I(2) + * - Compute T(0,3), T(1,3), T(2,3), then I(3) + * - ... + * - Compute T(0,n), ..., T(n-1,n), then I(n) + * + * @param lexer The lexer used to determine variable types and delimiters. + * @return A set of `QueryInterpretation` representing all valid multi-token interpretations of + * the full query string. + */ + [[nodiscard]] auto get_all_multi_token_interpretations(lexers::ByteLexer const& lexer) const + -> std::set; + + [[nodiscard]] auto get_processed_query_string() const -> std::string const& { + return m_processed_query_string; + } + +private: + /** + * Generates all single-token interpretations for a given expression view matching a given + * lexer. + * + * A single-token interpretation can be one of: + * - A static token (literal text). + * - A variable token (e.g., int, float, hasNumber) as defined by the lexer's schema. Each + * unique variable type is considered a distinct interpretation. + * + * Rules: + * - If the substring is malformed (has hanging escape characters): + * - There are no valid interpretations. + * - Else if the substring: + * - Is an isolated greedy wildcard, "*", or + * - Is not surrounded by delimiters or wildcards (lexer won't consider it a variable), or + * - Does not match any variable. + * - Then: + * - The only interpretation is a static token. 
+ * - Else if the substring contains a wildcard: + * - The interpretations include a static token, plus a variable token for each matching type. + * - Else: + * - The only interpretation is the variable token corresponding to the highest priority + * match. + * + * @param expression_view The view of the substring to interpret. + * @param lexer The lexer used to determine variable types and delimiters. + * @return A vector of `QueryInterpretation` objects representing all valid single-token + * interpretations for the given substring. + */ + [[nodiscard]] static auto get_all_single_token_interpretations( + ExpressionView const& expression_view, + lexers::ByteLexer const& lexer + ) -> std::vector; + + /** + * Determines the set of variable types matched by the lexer for all strings generated from the + * input regex. + * + * Generates a DFA from the input regex and computes its intersection with the lexer's DFA. + * + * @param regex_string The input regex string for which to find matching variable types. + * @param lexer The lexer whose DFA is used for matching. + * @return The set of all matching variable type IDs. 
+ */ + [[nodiscard]] static auto + get_matching_variable_types(std::string const& regex_string, lexers::ByteLexer const& lexer) + -> std::set; + + std::string m_processed_query_string; +}; +} // namespace log_surgeon::wildcard_query_parser + +#endif // LOG_SURGEON_WILDCARD_QUERY_PARSER_QUERY_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2838d8a4..2121195d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,6 +11,7 @@ target_sources( test-expression-view.cpp test-nfa.cpp test-prefix-tree.cpp + test-query.cpp test-query-interpretation.cpp test-regex-ast.cpp test-register-handler.cpp diff --git a/tests/test-query.cpp b/tests/test-query.cpp new file mode 100644 index 00000000..19687c6b --- /dev/null +++ b/tests/test-query.cpp @@ -0,0 +1,408 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +/** + * @defgroup unit_tests_query `Query` unit tests. + * @brief Unit tests for `Query` construction and interpretation. + + * These unit tests contain the `Query` tag. + */ + +using log_surgeon::lexers::ByteLexer; +using log_surgeon::Schema; +using log_surgeon::SchemaVarAST; +using log_surgeon::wildcard_query_parser::Query; +using std::set; +using std::string; +using std::string_view; +using std::vector; + +namespace { +/** + * Creates a query from the given query string and tests that its processed query string and + * interpretations match the expected values. + * + * @param raw_query_string The search query. + * @param expected_processed_query_string The processed search query. + * @param schema_rules A vector of strings, each string representing a schema rule. + * @param expected_serialized_interpretations The expected set of serialized interpretations. 
+ */ +auto test_query( + string_view raw_query_string, + string_view expected_processed_query_string, + vector const& schema_rules, + set const& expected_serialized_interpretations +) -> void; + +/** + * Initializes a `ByteLexer` with space as a delimiter and the given `schema_rules`. + * + * @param schema_rules A vector of strings, each string representing a schema rule. + * @return The initialized `ByteLexer`. + */ +auto make_test_lexer(vector const& schema_rules) -> ByteLexer; + +auto test_query( + string_view const raw_query_string, + string_view const expected_processed_query_string, + vector const& schema_rules, + set const& expected_serialized_interpretations +) -> void { + auto const lexer{make_test_lexer(schema_rules)}; + + Query const query{string(raw_query_string)}; + REQUIRE(expected_processed_query_string == query.get_processed_query_string()); + + auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; + set serialized_interpretations; + for (auto const& interpretation : interpretations) { + serialized_interpretations.insert(interpretation.serialize()); + } + + REQUIRE(expected_serialized_interpretations == serialized_interpretations); +} + +auto make_test_lexer(vector const& schema_rules) -> ByteLexer { + ByteLexer lexer; + lexer.set_delimiters({' '}); + + Schema schema; + for (auto const& schema_rule : schema_rules) { + schema.add_variable(schema_rule, -1); + } + + auto const schema_ast = schema.release_schema_ast_ptr(); + REQUIRE(nullptr != schema_ast); + REQUIRE(schema_rules.size() == schema_ast->m_schema_vars.size()); + for (size_t i{0}; i < schema_ast->m_schema_vars.size(); ++i) { + REQUIRE(nullptr != schema_ast->m_schema_vars[i]); + auto* capture_rule_ast{dynamic_cast(schema_ast->m_schema_vars[i].get())}; + REQUIRE(nullptr != capture_rule_ast); + lexer.add_rule(i, std::move(capture_rule_ast->m_regex_ptr)); + } + + lexer.generate(); + return lexer; +} +} // namespace + +/** + * @ingroup unit_tests_query + * @brief Creates 
and tests an empty `Query`. + */ +TEST_CASE("empty_query", "[Query]") { + constexpr string_view cRawQueryString; + constexpr string_view cProcessedQueryString; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a greedy wildcard `Query`. + */ +TEST_CASE("greedy_wildcard_query", "[Query]") { + constexpr string_view cRawQueryString{"*"}; + constexpr string_view cProcessedQueryString{"*"}; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{"logtype='*', contains_wildcard='0'"}; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with repeated greedy wildcards. + */ +TEST_CASE("repeated_greedy_wildcard_query", "[Query]") { + constexpr string_view cRawQueryString{"a**b"}; + constexpr string_view cProcessedQueryString{"a*b"}; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + "logtype='a*b', contains_wildcard='0'", + "logtype='a***b', contains_wildcard='0'", + "logtype='<0>(a*)**b', contains_wildcard='10'", + "logtype='<0>(a*)*<0>(*b)', contains_wildcard='101'", + "logtype='<0>(a*b)', contains_wildcard='1'", + "logtype='a**<0>(*b)', contains_wildcard='01'" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with a non-greedy wildcard followed by a greedy wildcard. 
+ */ +TEST_CASE("short_wildcard_sequence_query", "[Query]") { + constexpr string_view cRawQueryString{"a?*b"}; + constexpr string_view cProcessedQueryString{"a*b"}; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + "logtype='a*b', contains_wildcard='0'", + "logtype='a***b', contains_wildcard='0'", + "logtype='<0>(a*)**b', contains_wildcard='10'", + "logtype='<0>(a*)*<0>(*b)', contains_wildcard='101'", + "logtype='<0>(a*b)', contains_wildcard='1'", + "logtype='a**<0>(*b)', contains_wildcard='01'" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with a long mixed wildcard sequence. + */ +TEST_CASE("long_mixed_wildcard_sequence_query", "[Query]") { + constexpr string_view cRawQueryString{"a?*?*?*?b"}; + constexpr string_view cProcessedQueryString{"a*b"}; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + "logtype='a*b', contains_wildcard='0'", + "logtype='a***b', contains_wildcard='0'", + "logtype='<0>(a*)**b', contains_wildcard='10'", + "logtype='<0>(a*)*<0>(*b)', contains_wildcard='101'", + "logtype='<0>(a*b)', contains_wildcard='1'", + "logtype='a**<0>(*b)', contains_wildcard='01'" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with a long non-greedy wildcard sequence. 
+ */ +TEST_CASE("long_non_greedy_wildcard_sequence_query", "[Query]") { + constexpr string_view cRawQueryString{"a????b"}; + constexpr string_view cProcessedQueryString{"a????b"}; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + R"(logtype='a????b', contains_wildcard='0')", + + R"(logtype='<0>(a?)???b', contains_wildcard='10')", + R"(logtype='<0>(a??)??b', contains_wildcard='10')", + R"(logtype='<0>(a???)?b', contains_wildcard='10')", + R"(logtype='<0>(a????b)', contains_wildcard='1')", + + R"(logtype='a?<0>(?)??b', contains_wildcard='010')", + R"(logtype='a?<0>(??)?b', contains_wildcard='010')", + R"(logtype='a?<0>(???b)', contains_wildcard='01')", + R"(logtype='a?<0>(?)?<0>(?b)', contains_wildcard='0101')", + + R"(logtype='a??<0>(?)?b', contains_wildcard='010')", + R"(logtype='a??<0>(??b)', contains_wildcard='01')", + + R"(logtype='a???<0>(?b)', contains_wildcard='01')", + + R"(logtype='<0>(a?)?<0>(?)?b', contains_wildcard='1010')", + R"(logtype='<0>(a?)?<0>(??b)', contains_wildcard='101')", + R"(logtype='<0>(a?)??<0>(?b)', contains_wildcard='101')", + + R"(logtype='<0>(a??)?<0>(?b)', contains_wildcard='101')", + + // Double dipping on delimiters + R"(logtype='<0>(a?)<0>(?)??b', contains_wildcard='110')", + R"(logtype='<0>(a?)<0>(??)?b', contains_wildcard='110')", + R"(logtype='<0>(a?)<0>(???b)', contains_wildcard='11')", + R"(logtype='<0>(a?)<0>(?)?<0>(?b)', contains_wildcard='1101')", + R"(logtype='<0>(a?)?<0>(?)<0>(?b)', contains_wildcard='1011')", + + R"(logtype='<0>(a??)<0>(?)?b', contains_wildcard='110')", + R"(logtype='<0>(a??)<0>(??b)', contains_wildcard='11')", + + R"(logtype='<0>(a???)<0>(?b)', contains_wildcard='11')", + + R"(logtype='a?<0>(?)<0>(?)?b', contains_wildcard='0110')", + R"(logtype='a?<0>(?)<0>(??b)', contains_wildcard='011')", + + R"(logtype='a?<0>(??)<0>(?b)', contains_wildcard='011')", + R"(logtype='a??<0>(?)<0>(?b)', contains_wildcard='011')", + + 
R"(logtype='<0>(a?)<0>(?)<0>(?)?b', contains_wildcard='1110')", + R"(logtype='<0>(a?)<0>(?)<0>(??b)', contains_wildcard='111')", + R"(logtype='<0>(a?)<0>(??)<0>(?b)', contains_wildcard='111')", + R"(logtype='<0>(a??)<0>(?)<0>(?b)', contains_wildcard='111')", + R"(logtype='a?<0>(?)<0>(?)<0>(?b)', contains_wildcard='0111')", + + R"(logtype='<0>(a?)<0>(?)<0>(?)<0>(?b)', contains_wildcard='1111')" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with an escaped '*' character. + */ +TEST_CASE("escaped_star_query", "[Query]") { + constexpr string_view cRawQueryString{R"(a\*b)"}; + constexpr string_view cProcessedQueryString{R"(a\*b)"}; + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + R"(logtype='a\*b', contains_wildcard='0')" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query consisting of a single integer. + * + * NOTE: This has a static-text case as strings "1", "2", and "3" in isolation aren't surrounded by + * delimiters. These tokens then build up the interpretation "123". Although additional + * interpretations don't impact correctness, they may impact performance. We can optimize these out, + * but it'll make the code messy. Instead, we should eventually remove the explicit tracking of + * static-tokens, in favor of only tracking variable tokens.
+ */ +TEST_CASE("int_query", "[Query]") { + constexpr string_view cRawQueryString{"123"}; + constexpr string_view cProcessedQueryString{"123"}; + vector const schema_rules{{R"(int:\d+)"}}; + set const expected_serialized_interpretations{ + R"(logtype='123', contains_wildcard='0')", + R"(logtype='<0>(123)', contains_wildcard='0')" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with multiple variable types. + * + * This test ensures that each non-wildcard token is assigned to the highest priority variable. + * + * NOTE: Similar to the above `int_query` test there are unneeded interpretations due to aggressively + * generating static-text tokens. + */ +TEST_CASE("non_wildcard_multi_variable_query", "[Query]") { + constexpr string_view cRawQueryString{"abc123 123"}; + constexpr string_view cProcessedQueryString{"abc123 123"}; + + SECTION("int_priority") { + vector const schema_rules{{R"(int:(\d+))"}, {R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + R"(logtype='abc123 123', contains_wildcard='0')", + R"(logtype='abc123 <0>(123)', contains_wildcard='00')", + R"(logtype='<1>(abc123) 123', contains_wildcard='00')", + R"(logtype='<1>(abc123) <0>(123)', contains_wildcard='000')" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); + } + + SECTION("has_number_priority") { + vector const schema_rules{{R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}, {R"(int:(\d+))"}}; + set const expected_serialized_interpretations{ + R"(logtype='abc123 123', contains_wildcard='0')", + R"(logtype='abc123 <0>(123)', contains_wildcard='00')", + R"(logtype='<0>(abc123) 123', contains_wildcard='00')", + R"(logtype='<0>(abc123) <0>(123)', contains_wildcard='000')" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, +
expected_serialized_interpretations + ); + } +} + +/** + * @ingroup unit_tests_query + * @brief Creates and tests a query with multiple variable types. + * + * This test ensures that each greedy wildcard token is identified as all correct token types. + * + * NOTE: Similar to the above `int_query` test there are unneeded interpretations due to aggressively + * generating static-text tokens. This same issue causes interpretations with redundant wildcards. + */ +TEST_CASE("wildcard_multi_variable_query", "[Query]") { + constexpr string_view cRawQueryString{"abc123* *123"}; + constexpr string_view cProcessedQueryString{"abc123* *123"}; + + vector const schema_rules{{R"(int:(\d+))"}, {R"(hasNumber:[A-Za-z]*\d+[A-Za-z]*)"}}; + set const expected_serialized_interpretations{ + R"(logtype='abc123* *123', contains_wildcard='0')", + R"(logtype='abc123*** *123', contains_wildcard='0')", + R"(logtype='abc123* ***123', contains_wildcard='0')", + R"(logtype='abc123*** ***123', contains_wildcard='0')", + R"(logtype='abc123* **<0>(*123)', contains_wildcard='01')", + R"(logtype='abc123*** **<0>(*123)', contains_wildcard='01')", + R"(logtype='abc123* **<1>(*123)', contains_wildcard='01')", + R"(logtype='abc123*** **<1>(*123)', contains_wildcard='01')", + R"(logtype='<1>(abc123*)** *123', contains_wildcard='10')", + R"(logtype='<1>(abc123*)** ***123', contains_wildcard='10')", + R"(logtype='<1>(abc123*)** **<0>(*123)', contains_wildcard='101')", + R"(logtype='<1>(abc123*)** **<1>(*123)', contains_wildcard='101')" + }; + + test_query( + cRawQueryString, + cProcessedQueryString, + schema_rules, + expected_serialized_interpretations + ); +}