diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ad59f75..1a1a8482 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,20 @@ set(SOURCE_FILES src/log_surgeon/Constants.hpp src/log_surgeon/FileReader.cpp src/log_surgeon/FileReader.hpp + src/log_surgeon/finite_automata/Capture.hpp + src/log_surgeon/finite_automata/Dfa.hpp + src/log_surgeon/finite_automata/DfaState.hpp + src/log_surgeon/finite_automata/DfaStatePair.hpp + src/log_surgeon/finite_automata/Nfa.hpp + src/log_surgeon/finite_automata/NfaState.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp + src/log_surgeon/finite_automata/RegexAST.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp + src/log_surgeon/finite_automata/StateType.hpp + src/log_surgeon/finite_automata/TaggedTransition.hpp + src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp + src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp src/log_surgeon/Lalr1Parser.cpp src/log_surgeon/Lalr1Parser.hpp src/log_surgeon/Lalr1Parser.tpp @@ -93,20 +107,8 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp - src/log_surgeon/finite_automata/PrefixTree.cpp - src/log_surgeon/finite_automata/PrefixTree.hpp - src/log_surgeon/finite_automata/RegexAST.hpp - src/log_surgeon/finite_automata/Dfa.hpp - src/log_surgeon/finite_automata/DfaState.hpp - src/log_surgeon/finite_automata/DfaStatePair.hpp - src/log_surgeon/finite_automata/Nfa.hpp - src/log_surgeon/finite_automata/NfaState.hpp - src/log_surgeon/finite_automata/RegisterHandler.hpp - src/log_surgeon/finite_automata/StateType.hpp - src/log_surgeon/finite_automata/Tag.hpp - src/log_surgeon/finite_automata/TaggedTransition.hpp - src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp - src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp + src/log_surgeon/types.hpp + src/log_surgeon/UniqueIdGenerator.hpp ) set(LCHIP_INSTALL_CONFIG_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/log_surgeon) diff --git 
a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 4f68a168..5e9672b1 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -2,22 +2,22 @@ #define LOG_SURGEON_LEXER_HPP #include -#include #include #include +#include #include #include -#include +#include #include #include #include #include -#include #include #include #include #include +#include namespace log_surgeon { template @@ -35,13 +35,11 @@ class Lexer { /** * Add lexical rule to the lexer's list of rules - * @param id - * @param regex + * @param rule_id + * @param rule */ - auto add_rule( - uint32_t const& id, - std::unique_ptr> rule - ) -> void; + auto add_rule(rule_id_t rule_id, std::unique_ptr> rule) + -> void; /** * Return regex pattern for a rule name @@ -51,7 +49,8 @@ class Lexer { auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; /** - * Generate DFA for lexer + * Generate DFA for lexer. + * @throw std::invalid_argument if `m_rules` contains multipe captures with the same name. */ auto generate() -> void; @@ -122,8 +121,75 @@ class Lexer { return m_dfa; } - std::unordered_map m_symbol_id; - std::unordered_map m_id_symbol; + /** + * @param rule_id ID associated with a rule. + * @return A vector of capture IDs corresponding to each rule that contain the variable on + * success. + * @return std::nullopt if the variable is never captured in any rule. + */ + [[nodiscard]] auto get_capture_ids_from_rule_id(rule_id_t const rule_id + ) const -> std::optional> { + if (m_rule_id_to_capture_ids.contains(rule_id)) { + return m_rule_id_to_capture_ids.at(rule_id); + } + return std::nullopt; + } + + /** + * @param capture_id ID associated with a capture within a rule. + * @return The start and end tag of the capture on success. + * @return std::nullopt if no capture is associated with the given capture ID. 
+ */ + [[nodiscard]] auto get_tag_id_pair_from_capture_id(capture_id_t const capture_id + ) const -> std::optional> { + if (m_capture_id_to_tag_id_pair.contains(capture_id)) { + return m_capture_id_to_tag_id_pair.at(capture_id); + } + return std::nullopt; + } + + /** + * @param tag_id ID associated with a tag. + * @return The final register ID tracking the value of the tag ID during DFA simulation on + * success. + * @return std::nullopt if no tag is associated with the given tag ID. + */ + [[nodiscard]] auto get_reg_id_from_tag_id(tag_id_t const tag_id + ) const -> std::optional { + if (m_tag_to_reg_id.contains(tag_id)) { + return m_tag_to_reg_id.at(tag_id); + } + return std::nullopt; + } + + /** + * @param capture_id ID associated with a capture within a rule. + * @return The start and end final register IDs tracking the position of the capture on success. + * @return std::nullopt if no capture is associated with the given capture ID. + */ + [[nodiscard]] auto get_reg_ids_from_capture_id(capture_id_t const capture_id + ) const -> std::optional> { + auto const optional_tag_id_pair{get_tag_id_pair_from_capture_id(capture_id)}; + if (false == optional_tag_id_pair.has_value()) { + return std::nullopt; + } + auto const [start_tag_id, end_tag_id]{optional_tag_id_pair.value()}; + + auto const optional_start_reg_id{get_reg_id_from_tag_id(start_tag_id)}; + if (false == optional_start_reg_id.has_value()) { + return std::nullopt; + } + + auto const optional_end_reg_id{get_reg_id_from_tag_id(end_tag_id)}; + if (false == optional_end_reg_id.has_value()) { + return std::nullopt; + } + + return {optional_start_reg_id.value(), optional_end_reg_id.value()}; + } + + std::unordered_map m_symbol_id; + std::unordered_map m_id_symbol; private: /** @@ -148,6 +214,9 @@ class Lexer { std::unique_ptr> m_dfa; bool m_asked_for_more_data{false}; TypedDfaState const* m_prev_state{nullptr}; + std::unordered_map> m_rule_id_to_capture_ids; + std::unordered_map> m_capture_id_to_tag_id_pair; + 
std::unordered_map m_tag_to_reg_id; }; namespace lexers { diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index a4e36f55..799be801 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -4,11 +4,13 @@ #include #include #include +#include #include #include #include #include +#include /** * utf8 format (https://en.wikipedia.org/wiki/UTF-8) @@ -358,17 +360,17 @@ void Lexer::add_delimiters(std::vector c template void Lexer::add_rule( - uint32_t const& id, + rule_id_t const rule_id, std::unique_ptr> rule ) { - m_rules.emplace_back(id, std::move(rule)); + m_rules.emplace_back(rule_id, std::move(rule)); } template -auto Lexer::get_rule(uint32_t const variable_id +auto Lexer::get_rule(rule_id_t const rule_id ) -> finite_automata::RegexAST* { for (auto const& rule : m_rules) { - if (rule.get_variable_id() == variable_id) { + if (rule.get_variable_id() == rule_id) { return rule.get_regex(); } } @@ -377,8 +379,31 @@ auto Lexer::get_rule(uint32_t const variable_id template void Lexer::generate() { - finite_automata::Nfa nfa{std::move(m_rules)}; - // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" + for (auto const& rule : m_rules) { + for (auto const* capture : rule.get_captures()) { + std::string const capture_name{capture->get_name()}; + if (m_symbol_id.contains(capture_name)) { + throw std::invalid_argument("`m_rules` contains capture names that are not unique." 
+ ); + } + auto const capture_id{m_symbol_id.size()}; + m_symbol_id.emplace(capture_name, capture_id); + m_id_symbol.emplace(capture_id, capture_name); + + auto const rule_id{rule.get_variable_id()}; + m_rule_id_to_capture_ids.try_emplace(rule_id); + m_rule_id_to_capture_ids.at(rule_id).push_back(capture_id); + } + } + + finite_automata::Nfa nfa{m_rules}; + for (auto const& [capture, tag_id_pair] : nfa.get_capture_to_tag_id_pair()) { + std::string const capture_name{capture->get_name()}; + auto const capture_id{m_symbol_id.at(capture_name)}; + m_capture_id_to_tag_id_pair.emplace(capture_id, tag_id_pair); + } + + // TODO: DFA ignores captures. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = std::make_unique>(std::move(nfa)); auto const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index 6ab7e861..b962bd8d 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -23,6 +23,10 @@ class LexicalRule { */ auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; + [[nodiscard]] auto get_captures() const -> std::vector const& { + return m_regex->get_subtree_positive_captures(); + } + [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { @@ -40,7 +44,7 @@ void LexicalRule::add_to_nfa(finite_automata::Nfa* auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); - m_regex->add_to_nfa_with_negative_tags(nfa, end_state); + m_regex->add_to_nfa_with_negative_captures(nfa, end_state); } } // namespace log_surgeon diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index d36271ca..1960e997 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -9,8 +9,8 @@ #include #include +#include #include -#include #include 
#include #include @@ -167,7 +167,7 @@ static auto regex_capture_rule(NonTerminal const* m) -> std::unique_ptrnon_terminal_cast(5)->get_parser_ast()->get>(); return std::make_unique(make_unique( std::move(r6), - std::make_unique(r4->m_name) + std::make_unique(r4->m_name) )); } @@ -202,7 +202,7 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr { static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); - // To handle negative tags we treat `R*` as `R+ | ∅`. + // To handle negative captures we treat `R*` as `R+ | ∅`. return make_unique(make_unique( make_unique(), make_unique(std::move(r1), 1, 0) @@ -248,7 +248,7 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); if (0 == min) { - // To handle negative tags we treat `R*` as `R+ | ∅`. + // To handle negative captures we treat `R*` as `R+ | ∅`. return make_unique(make_unique( make_unique(), make_unique(std::move(r1), 1, max) diff --git a/src/log_surgeon/UniqueIdGenerator.hpp b/src/log_surgeon/UniqueIdGenerator.hpp new file mode 100644 index 00000000..647853ff --- /dev/null +++ b/src/log_surgeon/UniqueIdGenerator.hpp @@ -0,0 +1,16 @@ +#ifndef LOG_SURGEON_UNIQUEIDGENERATOR_HPP +#define LOG_SURGEON_UNIQUEIDGENERATOR_HPP + +#include + +namespace log_surgeon { +class UniqueIdGenerator { +public: + [[nodiscard]] auto generate_id() -> uint32_t { return m_current_id++; } + +private: + uint32_t m_current_id{0}; +}; +} // namespace log_surgeon + +#endif // LOG_SURGEON_UNIQUEIDGENERATOR_HPP diff --git a/src/log_surgeon/finite_automata/Tag.hpp b/src/log_surgeon/finite_automata/Capture.hpp similarity index 55% rename from src/log_surgeon/finite_automata/Tag.hpp rename to src/log_surgeon/finite_automata/Capture.hpp index 3a3b4d7f..84480eab 100644 --- a/src/log_surgeon/finite_automata/Tag.hpp +++ b/src/log_surgeon/finite_automata/Capture.hpp @@ -1,14 +1,14 @@ -#ifndef 
LOG_SURGEON_FINITE_AUTOMATA_TAG -#define LOG_SURGEON_FINITE_AUTOMATA_TAG +#ifndef LOG_SURGEON_FINITE_AUTOMATA_CAPTURE +#define LOG_SURGEON_FINITE_AUTOMATA_CAPTURE #include #include #include namespace log_surgeon::finite_automata { -class Tag { +class Capture { public: - explicit Tag(std::string name) : m_name{std::move(name)} {} + explicit Capture(std::string name) : m_name{std::move(name)} {} [[nodiscard]] auto get_name() const -> std::string_view { return m_name; } @@ -17,4 +17,4 @@ class Tag { }; } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_TAG +#endif // LOG_SURGEON_FINITE_AUTOMATA_CAPTURE diff --git a/src/log_surgeon/finite_automata/Dfa.hpp b/src/log_surgeon/finite_automata/Dfa.hpp index baceaec3..577d3c71 100644 --- a/src/log_surgeon/finite_automata/Dfa.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -2,6 +2,7 @@ #define LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP #include +#include #include #include #include diff --git a/src/log_surgeon/finite_automata/Nfa.hpp b/src/log_surgeon/finite_automata/Nfa.hpp index 8eaaaadd..a6b04b98 100644 --- a/src/log_surgeon/finite_automata/Nfa.hpp +++ b/src/log_surgeon/finite_automata/Nfa.hpp @@ -15,14 +15,24 @@ #include #include #include +#include +#include namespace log_surgeon::finite_automata { +/** + * Represents a Non-Deterministic Finite Automaton (NFA) designed to recognize a language based on + * a set of rules provided during initialization. This class serves as an intermediate + * representation used for generating the corresponding Deterministic Finite Automaton (DFA). + * + * NOTE: It is assumed that all capture groups have unique names, even across different rules. + * @tparam TypedNfaState + */ template class Nfa { public: using StateVec = std::vector; - explicit Nfa(std::vector> rules); + explicit Nfa(std::vector> const& rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. 
@@ -30,40 +40,28 @@ class Nfa { */ [[nodiscard]] auto new_state() -> TypedNfaState*; - /** - * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to - * `m_states`. - * @param tag - * @param dest_state - * @return A new state with a positive tagged end transition to `dest_state`. - */ - [[nodiscard]] auto new_state_with_positive_tagged_end_transition( - Tag const* tag, - TypedNfaState const* dest_state - ) -> TypedNfaState*; - /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. - * @param tags + * @param captures * @param dest_state * @return TypedNfaState* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( - std::vector tags, + std::vector const& captures, TypedNfaState const* dest_state ) -> TypedNfaState*; /** * Creates the start and end states for a capture group. - * @param tag The tag associated with the capture group. + * @param capture The capture associated with the capture group. * @param dest_state * @return A pair of states: * - A new state with a positive tagged start transition from `m_root`. * - A new state with a positive tagged end transition to `dest_state`. */ [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( - Tag const* tag, + Capture const* capture, TypedNfaState const* dest_state ) -> std::pair; @@ -86,23 +84,61 @@ class Nfa { auto get_root() -> TypedNfaState* { return m_root; } + [[nodiscard]] auto get_capture_to_tag_id_pair( + ) const -> std::unordered_map> const& { + return m_capture_to_tag_id_pair; + } + private: + /** + * Creates start and end tags for the specified capture if they don't currently exist. + * @param capture The variable to be captured. + * @return A pair of tags: + * - The start tag for the `capture`. + * - The end tag for the `capture`. 
+ */ + [[nodiscard]] auto get_or_create_capture_tag_pair(Capture const* capture + ) -> std::pair; + + /** + * Creates a `unique_ptr` for an NFA state with a positive tagged end transition and adds it to + * `m_states`. + * @param tag_id + * @param dest_state + * @return A new state with a positive tagged end transition to `dest_state`. + */ + [[nodiscard]] auto new_state_with_positive_tagged_end_transition( + tag_id_t tag_id, + TypedNfaState const* dest_state + ) -> TypedNfaState*; + std::vector> m_states; + // TODO: Lexer currently enforces unique naming across capture groups. However, this limits use + // cases. Possibly initialize this in the lexer and pass it in during construction. + std::unordered_map> m_capture_to_tag_id_pair; TypedNfaState* m_root; - // Store the rules locally as they contain information needed by the NFA. E.g., transitions in - // the NFA point to tags in the rule ASTs. - std::vector> m_rules; + UniqueIdGenerator m_unique_id_generator; }; template -Nfa::Nfa(std::vector> rules) - : m_root{new_state()}, - m_rules{std::move(rules)} { - for (auto const& rule : m_rules) { +Nfa::Nfa(std::vector> const& rules) + : m_root{new_state()} { + for (auto const& rule : rules) { rule.add_to_nfa(this); } } +template +auto Nfa::get_or_create_capture_tag_pair(Capture const* capture +) -> std::pair { + if (false == m_capture_to_tag_id_pair.contains(capture)) { + auto const start_tag{m_unique_id_generator.generate_id()}; + auto const end_tag{m_unique_id_generator.generate_id()}; + m_capture_to_tag_id_pair.emplace(capture, std::make_pair(start_tag, end_tag)); + } + return m_capture_to_tag_id_pair.at(capture); +} + template auto Nfa::new_state() -> TypedNfaState* { m_states.emplace_back(std::make_unique()); @@ -111,31 +147,38 @@ auto Nfa::new_state() -> TypedNfaState* { template auto Nfa::new_state_with_positive_tagged_end_transition( - Tag const* tag, + tag_id_t const tag_id, TypedNfaState const* dest_state ) -> TypedNfaState* { - 
m_states.emplace_back(std::make_unique(tag, dest_state)); + m_states.emplace_back(std::make_unique(tag_id, dest_state)); return m_states.back().get(); } template auto Nfa::new_state_with_negative_tagged_transition( - std::vector tags, + std::vector const& captures, TypedNfaState const* dest_state ) -> TypedNfaState* { + std::vector tags; + for (auto const capture : captures) { + auto const [start_tag, end_tag]{get_or_create_capture_tag_pair(capture)}; + tags.push_back(start_tag); + tags.push_back(end_tag); + } + m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } template auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( - Tag const* tag, + Capture const* capture, TypedNfaState const* dest_state ) -> std::pair { + auto const [start_tag, end_tag]{get_or_create_capture_tag_pair(capture)}; auto* start_state = new_state(); - m_root->add_positive_tagged_start_transition(tag, start_state); - - auto* end_state = new_state_with_positive_tagged_end_transition(tag, dest_state); + m_root->add_positive_tagged_start_transition(start_tag, start_state); + auto* end_state{new_state_with_positive_tagged_end_transition(end_tag, dest_state)}; return {start_state, end_state}; } diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 590c1607..e80e231e 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -16,6 +16,7 @@ #include #include #include +#include namespace log_surgeon::finite_automata { template @@ -31,11 +32,12 @@ class NfaState { NfaState() = default; - NfaState(Tag const* tag, NfaState const* dest_state) - : m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {} + NfaState(tag_id_t tag_id, NfaState const* dest_state) + : m_positive_tagged_end_transition{PositiveTaggedTransition{tag_id, dest_state}} {} - NfaState(std::vector tags, NfaState const* dest_state) - : 
m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} + NfaState(std::vector tag_ids, NfaState const* dest_state) + : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tag_ids), dest_state} + } {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -49,8 +51,9 @@ class NfaState { return m_matching_variable_id; } - auto add_positive_tagged_start_transition(Tag const* tag, NfaState const* dest_state) -> void { - m_positive_tagged_start_transitions.emplace_back(tag, dest_state); + auto add_positive_tagged_start_transition(tag_id_t const tag_id, NfaState const* dest_state) + -> void { + m_positive_tagged_start_transitions.emplace_back(tag_id, dest_state); } [[nodiscard]] auto get_positive_tagged_start_transitions( diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index bb55f62d..974eaf02 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include namespace log_surgeon::finite_automata { @@ -30,12 +30,12 @@ class Nfa; // TODO: rename `RegexAST` to `RegexASTNode` /** * Base class for a Regex AST node. - * Unique integer tags are used to differentiate each capture group node. Every node will maintain - * two sets of tags: - * 1. `m_subtree_positive_tags`: the set of tags matched by all capture groups within the subtree - * rooted at this node. - * 2. `m_negative_tags`: the set of tags that are guaranteed to be unmatched when traversing this - * node, as the alternative path contains these tags. + * Unique capture pointers are used to differentiate each capture group node. Every node will + * maintain two sets of captures: + * 1. `m_subtree_positive_captures`: the set of captures matched by all capture groups within the + * subtree rooted at this node. + * 2. 
`m_negative_captures`: the set of captures that are guaranteed to be unmatched when traversing + * this node, as the alternative path contains these captures. * * ASTs built using this class are assumed to be constructed in a bottom-up manner, where all * descendant nodes are created first. @@ -83,24 +83,26 @@ class RegexAST { */ [[nodiscard]] virtual auto serialize() const -> std::u32string = 0; - [[nodiscard]] auto get_subtree_positive_tags() const -> std::vector const& { - return m_subtree_positive_tags; + [[nodiscard]] auto get_subtree_positive_captures() const -> std::vector const& { + return m_subtree_positive_captures; } - auto set_subtree_positive_tags(std::vector subtree_positive_tags) -> void { - m_subtree_positive_tags = std::move(subtree_positive_tags); + auto set_subtree_positive_captures(std::vector subtree_positive_captures + ) -> void { + m_subtree_positive_captures = std::move(subtree_positive_captures); } - auto add_subtree_positive_tags(std::vector const& subtree_positive_tags) -> void { - m_subtree_positive_tags.insert( - m_subtree_positive_tags.end(), - subtree_positive_tags.cbegin(), - subtree_positive_tags.cend() + auto add_subtree_positive_captures(std::vector const& subtree_positive_captures + ) -> void { + m_subtree_positive_captures.insert( + m_subtree_positive_captures.end(), + subtree_positive_captures.cbegin(), + subtree_positive_captures.cend() ); } - auto set_negative_tags(std::vector negative_tags) -> void { - m_negative_tags = std::move(negative_tags); + auto set_negative_captures(std::vector negative_captures) -> void { + m_negative_captures = std::move(negative_captures); } /** @@ -108,13 +110,15 @@ class RegexAST { * @param nfa * @param end_state */ - auto - add_to_nfa_with_negative_tags(Nfa* nfa, TypedNfaState* end_state) const -> void { - // Handle negative tags as: - // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state - if (false == m_negative_tags.empty()) { - auto* 
state_with_negative_tagged_transition - = nfa->new_state_with_negative_tagged_transition(m_negative_tags, end_state); + auto add_to_nfa_with_negative_captures(Nfa* nfa, TypedNfaState* end_state) const + -> void { + // Handle negative captures as: + // root --(regex)--> state_with_negative_tagged_transition --(negative captures)--> + // end_state + if (false == m_negative_captures.empty()) { + auto* state_with_negative_tagged_transition{ + nfa->new_state_with_negative_tagged_transition(m_negative_captures, end_state) + }; add_to_nfa(nfa, state_with_negative_tagged_transition); } else { add_to_nfa(nfa, end_state); @@ -127,27 +131,29 @@ class RegexAST { RegexAST(RegexAST&& rhs) noexcept = delete; auto operator=(RegexAST&& rhs) noexcept -> RegexAST& = delete; - [[nodiscard]] auto serialize_negative_tags() const -> std::u32string { - if (m_negative_tags.empty()) { + [[nodiscard]] auto serialize_negative_captures() const -> std::u32string { + if (m_negative_captures.empty()) { return U""; } - auto const transformed_negative_tags - = m_negative_tags | std::ranges::views::transform([](Tag const* tag) { - return fmt::format("<~{}>", tag->get_name()); - }); - auto const negative_tags_string - = fmt::format("{}", fmt::join(transformed_negative_tags, "")); + auto const transformed_negative_captures{ + m_negative_captures | std::ranges::views::transform([](Capture const* capture) { + return fmt::format("<~{}>", capture->get_name()); + }) + }; + auto const negative_captures_string{ + fmt::format("{}", fmt::join(transformed_negative_captures, "")) + }; return fmt::format( U"{}", - std::u32string(negative_tags_string.begin(), negative_tags_string.end()) + std::u32string(negative_captures_string.begin(), negative_captures_string.end()) ); } private: - std::vector m_subtree_positive_tags; - std::vector m_negative_tags; + std::vector m_subtree_positive_captures; + std::vector m_negative_captures; }; /** @@ -624,8 +630,8 @@ class RegexASTMultiplication : public RegexAST { /** * 
Represents a capture group AST node. * NOTE: - * - `m_tag` is always expected to be non-null. - * - `m_group_regex_ast` is always expected to be non-null. + * - `m_capture` must be non-null as it represents the capture group being matched. + * - `m_capture_regex_ast` must be non-null as it contains the regex pattern for the capture group. * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ template @@ -634,34 +640,36 @@ class RegexASTCapture : public RegexAST { ~RegexASTCapture() override = default; /** - * @param group_regex_ast - * @param tag - * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. + * @param capture_regex_ast + * @param capture + * @throw std::invalid_argument if `capture_regex_ast` or `capture` are `nullptr`. */ RegexASTCapture( - std::unique_ptr> group_regex_ast, - std::unique_ptr tag + std::unique_ptr> capture_regex_ast, + std::unique_ptr capture ) - : m_group_regex_ast{( - nullptr == group_regex_ast - ? throw std::invalid_argument("Group regex AST cannot be null") - : std::move(group_regex_ast) - )}, - m_tag{nullptr == tag ? 
throw std::invalid_argument("Tag cannot be null") - : std::move(tag)} { - RegexAST::set_subtree_positive_tags( - m_group_regex_ast->get_subtree_positive_tags() + : m_capture_regex_ast{std::move(capture_regex_ast)}, + m_capture{std::move(capture)} { + if (nullptr == m_capture_regex_ast) { + throw std::invalid_argument("Group regex AST cannot be null"); + } + if (nullptr == m_capture) { + throw std::invalid_argument("Capture cannot be null"); + } + + RegexAST::set_subtree_positive_captures( + m_capture_regex_ast->get_subtree_positive_captures() ); - RegexAST::add_subtree_positive_tags({m_tag.get()}); + RegexAST::add_subtree_positive_captures({m_capture.get()}); } RegexASTCapture(RegexASTCapture const& rhs) : RegexAST{rhs}, - m_group_regex_ast{ - std::unique_ptr>(rhs.m_group_regex_ast->clone()) + m_capture_regex_ast{ + std::unique_ptr>(rhs.m_capture_regex_ast->clone()) }, - m_tag{std::make_unique(*rhs.m_tag)} { - RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); + m_capture{std::make_unique(*rhs.m_capture)} { + RegexAST::set_subtree_positive_captures(rhs.get_subtree_positive_captures()); } /** @@ -679,7 +687,7 @@ class RegexASTCapture : public RegexAST { */ auto set_possible_inputs_to_true(std::array& is_possible_input ) const -> void override { - m_group_regex_ast->set_possible_inputs_to_true(is_possible_input); + m_capture_regex_ast->set_possible_inputs_to_true(is_possible_input); } /** @@ -688,7 +696,7 @@ class RegexASTCapture : public RegexAST { * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void override { - m_group_regex_ast->remove_delimiters_from_wildcard(delimiters); + m_capture_regex_ast->remove_delimiters_from_wildcard(delimiters); } /** @@ -701,21 +709,23 @@ class RegexASTCapture : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } + [[nodiscard]] auto get_capture_name() 
const -> std::string_view { + return m_capture->get_name(); + } - [[nodiscard]] auto get_group_regex_ast( + [[nodiscard]] auto get_capture_regex_ast( ) const -> std::unique_ptr> const& { - return m_group_regex_ast; + return m_capture_regex_ast; } private: - std::unique_ptr> m_group_regex_ast; - std::unique_ptr m_tag; + std::unique_ptr> m_capture_regex_ast; + std::unique_ptr m_capture; }; template [[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { - return fmt::format(U"{}", RegexAST::serialize_negative_tags()); + return fmt::format(U"{}", RegexAST::serialize_negative_captures()); } template @@ -732,7 +742,7 @@ template return fmt::format( U"{}{}", static_cast(m_character), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -763,7 +773,7 @@ template return fmt::format( U"{}{}", std::u32string(digits_string.begin(), digits_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -774,17 +784,18 @@ RegexASTOr::RegexASTOr( ) : m_left(std::move(left)), m_right(std::move(right)) { - m_left->set_negative_tags(m_right->get_subtree_positive_tags()); - m_right->set_negative_tags(m_left->get_subtree_positive_tags()); - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + m_left->set_negative_captures(m_right->get_subtree_positive_captures()); + m_right->set_negative_captures(m_left->get_subtree_positive_captures()); + RegexAST::set_subtree_positive_captures(m_left->get_subtree_positive_captures()); + RegexAST::add_subtree_positive_captures(m_right->get_subtree_positive_captures() + ); } template void RegexASTOr::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { - m_left->add_to_nfa_with_negative_tags(nfa, end_state); - m_right->add_to_nfa_with_negative_tags(nfa, end_state); + m_left->add_to_nfa_with_negative_captures(nfa, end_state); + 
m_right->add_to_nfa_with_negative_captures(nfa, end_state); } template @@ -793,7 +804,7 @@ template U"({})|({}){}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -804,8 +815,9 @@ RegexASTCat::RegexASTCat( ) : m_left(std::move(left)), m_right(std::move(right)) { - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_captures(m_left->get_subtree_positive_captures()); + RegexAST::add_subtree_positive_captures(m_right->get_subtree_positive_captures() + ); } template @@ -813,9 +825,9 @@ void RegexASTCat::add_to_nfa(Nfa* nfa, TypedNfaSta const { TypedNfaState* saved_root = nfa->get_root(); TypedNfaState* intermediate_state = nfa->new_state(); - m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); + m_left->add_to_nfa_with_negative_captures(nfa, intermediate_state); nfa->set_root(intermediate_state); - m_right->add_to_nfa_with_negative_tags(nfa, end_state); + m_right->add_to_nfa_with_negative_captures(nfa, end_state); nfa->set_root(saved_root); } @@ -825,7 +837,7 @@ template U"{}{}{}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? 
m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -838,7 +850,8 @@ RegexASTMultiplication::RegexASTMultiplication( : m_operand(std::move(operand)), m_min(min), m_max(max) { - RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_captures(m_operand->get_subtree_positive_captures( + )); } template @@ -852,27 +865,27 @@ void RegexASTMultiplication::add_to_nfa( } else { for (uint32_t i = 1; i < m_min; i++) { TypedNfaState* intermediate_state = nfa->new_state(); - m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); + m_operand->add_to_nfa_with_negative_captures(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add_to_nfa_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_captures(nfa, end_state); } if (is_infinite()) { nfa->set_root(end_state); - m_operand->add_to_nfa_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_captures(nfa, end_state); } else if (m_max > m_min) { if (m_min != 0) { TypedNfaState* intermediate_state = nfa->new_state(); - m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); + m_operand->add_to_nfa_with_negative_captures(nfa, intermediate_state); nfa->set_root(intermediate_state); } for (uint32_t i = m_min + 1; i < m_max; ++i) { - m_operand->add_to_nfa_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_captures(nfa, end_state); TypedNfaState* intermediate_state = nfa->new_state(); - m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); + m_operand->add_to_nfa_with_negative_captures(nfa, intermediate_state); nfa->set_root(intermediate_state); } - m_operand->add_to_nfa_with_negative_tags(nfa, end_state); + m_operand->add_to_nfa_with_negative_captures(nfa, end_state); } nfa->set_root(saved_root); } @@ -887,7 +900,7 @@ template nullptr != m_operand ? 
m_operand->serialize() : U"null", std::u32string(min_string.begin(), min_string.end()), is_infinite() ? U"inf" : std::u32string(max_string.begin(), max_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } @@ -900,7 +913,7 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf // +---------------------+ // | `m_root` | // +---------------------+ - // | `m_tag` start + // | `m_capture` start ID // | (positive tagged start transition) // v // +---------------------+ @@ -910,16 +923,16 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf // | (epsilon transition) // v // +---------------------+ - // | `m_group_regex_ast` | + // |`m_capture_regex_ast`| // | (nested NFA) | // +---------------------+ - // | `m_negative_tags` + // | `m_negative_captures` // | (negative tagged transition) // v // +---------------------+ // | `capture_end_state` | // +---------------------+ - // | `m_tag` end + // | `m_capture` end ID // | (positive tagged end transition) // v // +---------------------+ @@ -927,24 +940,26 @@ auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNf // +---------------------+ auto [capture_start_state, capture_end_state] = nfa->new_start_and_end_states_with_positive_tagged_transitions( - m_tag.get(), + m_capture.get(), dest_state ); auto* initial_root = nfa->get_root(); nfa->set_root(capture_start_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, capture_end_state); + m_capture_regex_ast->add_to_nfa_with_negative_captures(nfa, capture_end_state); nfa->set_root(initial_root); } template [[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { - auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); + auto const capture_name_u32{ + std::u32string(m_capture->get_name().cbegin(), m_capture->get_name().cend()) + }; return fmt::format( U"({})<{}>{}", - m_group_regex_ast->serialize(), - tag_name_u32, - RegexAST::serialize_negative_tags() + 
m_capture_regex_ast->serialize(), + capture_name_u32, + RegexAST::serialize_negative_captures() ); } @@ -1100,7 +1115,7 @@ template U"[{}{}]{}", m_negate ? U"^" : U"", ranges_serialized, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_captures() ); } } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 43315b2a..07b7bb92 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -1,33 +1,27 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION #define LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION -#include #include #include -#include #include #include +#include +#include #include -#include +#include namespace log_surgeon::finite_automata { /** - * Represents an NFA transition indicating that a capture group has been matched. - * NOTE: `m_tag` is always expected to be non-null. + * Represents an NFA transition indicating that a tag ID has been matched. * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ template class PositiveTaggedTransition { public: - /** - * @param tag - * @param dest_state - * @throw std::invalid_argument if `tag` is `nullptr`. - */ - PositiveTaggedTransition(Tag const* tag, TypedNfaState const* dest_state) - : m_tag{nullptr == tag ? 
throw std::invalid_argument("Tag cannot be null") : tag}, + PositiveTaggedTransition(tag_id_t const tag_id, TypedNfaState const* dest_state) + : m_tag_id{tag_id}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } @@ -39,38 +33,26 @@ class PositiveTaggedTransition { */ [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { - auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { - return std::nullopt; + if (state_ids.contains(m_dest_state)) { + return fmt::format("{}[{}]", state_ids.at(m_dest_state), m_tag_id); } - return fmt::format("{}[{}]", state_id_it->second, m_tag->get_name()); + return std::nullopt; } private: - Tag const* m_tag; + tag_id_t m_tag_id; TypedNfaState const* m_dest_state; }; /** - * Represents an NFA transition indicating that a capture group has been unmatched. - * NOTE: All tags in `m_tags` are always expected to be non-null. + * Represents an NFA transition indicating that multiple tags have been unmatched. * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ template class NegativeTaggedTransition { public: - /** - * @param tags - * @param dest_state - * @throw std::invalid_argument if any elements in `tags` is `nullptr`. 
- */ - NegativeTaggedTransition(std::vector tags, TypedNfaState const* dest_state) - : m_tags{[&tags] { - if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { - throw std::invalid_argument("Tags cannot contain null elements"); - } - return std::move(tags); - }()}, + NegativeTaggedTransition(std::vector tag_ids, TypedNfaState const* dest_state) + : m_tag_ids{std::move(tag_ids)}, m_dest_state{dest_state} {} [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } @@ -82,18 +64,14 @@ class NegativeTaggedTransition { */ [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { - auto const state_id_it = state_ids.find(m_dest_state); - if (state_id_it == state_ids.end()) { - return std::nullopt; + if (state_ids.contains(m_dest_state)) { + return fmt::format("{}[{}]", state_ids.at(m_dest_state), fmt::join(m_tag_ids, ",")); } - - auto const tag_names = m_tags | std::ranges::views::transform(&Tag::get_name); - - return fmt::format("{}[{}]", state_id_it->second, fmt::join(tag_names, ",")); + return std::nullopt; } private: - std::vector m_tags; + std::vector m_tag_ids; TypedNfaState const* m_dest_state; }; } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/types.hpp b/src/log_surgeon/types.hpp new file mode 100644 index 00000000..8260e530 --- /dev/null +++ b/src/log_surgeon/types.hpp @@ -0,0 +1,13 @@ +#ifndef LOG_SURGEON_ALIASES_HPP +#define LOG_SURGEON_ALIASES_HPP + +#include + +namespace log_surgeon { +using capture_id_t = uint32_t; +using reg_id_t = uint32_t; +using rule_id_t = uint32_t; +using tag_id_t = uint32_t; +} // namespace log_surgeon + +#endif // LOG_SURGEON_ALIASES_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 652ecebc..d2cf9d00 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,14 +2,14 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + 
../src/log_surgeon/finite_automata/Capture.hpp + ../src/log_surgeon/finite_automata/Nfa.hpp + ../src/log_surgeon/finite_automata/NfaState.hpp ../src/log_surgeon/finite_automata/PrefixTree.cpp ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp - ../src/log_surgeon/finite_automata/Nfa.hpp - ../src/log_surgeon/finite_automata/NfaState.hpp ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/StateType.hpp - ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/Lalr1Parser.cpp ../src/log_surgeon/Lalr1Parser.hpp @@ -22,9 +22,18 @@ set( ../src/log_surgeon/SchemaParser.hpp ../src/log_surgeon/Token.cpp ../src/log_surgeon/Token.hpp + ../src/log_surgeon/types.hpp + ../src/log_surgeon/UniqueIdGenerator.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-nfa.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) +set( + SOURCES_TESTS + test-lexer.cpp + test-nfa.cpp + test-prefix-tree.cpp + test-register-handler.cpp + test-capture.cpp +) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-capture.cpp b/tests/test-capture.cpp new file mode 100644 index 00000000..224bc03c --- /dev/null +++ b/tests/test-capture.cpp @@ -0,0 +1,36 @@ +#include + +#include + +#include + +using log_surgeon::finite_automata::Capture; + +TEST_CASE("Capture operations", "[Capture]") { + SECTION("Basic name retrieval works correctly") { + Capture const capture{"uID"}; + REQUIRE("uID" == capture.get_name()); + } + + SECTION("Empty capture name is handled correctly") { + Capture const empty_capture{""}; + REQUIRE(empty_capture.get_name().empty()); + } + + SECTION("Special characters in capture names are preserved") { + Capture const special_capture{"user.id-123_@"}; + REQUIRE("user.id-123_@" == special_capture.get_name()); + } + + 
SECTION("Copy constructor works correctly") { + Capture assign_capture{"target"}; + assign_capture = Capture{"new_source"}; + REQUIRE("new_source" == assign_capture.get_name()); + } + + SECTION("Move constructor works correctly") { + Capture original_capture{"source"}; + Capture const moved_capture{std::move(original_capture)}; + REQUIRE("source" == moved_capture.get_name()); + } +} diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index 48b2185c..ffe7220b 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -1,17 +1,27 @@ #include +#include #include +#include #include #include -#include +#include #include -#include +#include #include +#include +#include #include #include +#include +using log_surgeon::lexers::ByteLexer; +using log_surgeon::Schema; +using log_surgeon::SchemaAST; +using log_surgeon::SymbolId; using std::codecvt_utf8; +using std::make_unique; using std::string; using std::string_view; using std::u32string; @@ -49,9 +59,28 @@ auto test_regex_ast(string_view var_schema, u32string const& expected_serialized */ [[nodiscard]] auto u32string_to_string(u32string const& u32_str) -> string; +/** + * Creates a lexer with a constant set of delimiters (space and newline) and the given schema. + * The delimiters are used to separate tokens in the input. + * @param schema_ast The schema variables are used to set the lexer's symbol mappings. + * @return The lexer. + */ +[[nodiscard]] auto create_lexer(std::unique_ptr schema_ast) -> ByteLexer; + +/** + * Lexes the given input and verifies the output is a token for the given rule name, followed by the + * end of input token. + * + * @param lexer The lexer to scan the input with. + * @param input The input to lex. + * @param rule_name The expected symbol to match.
+ */ +auto test_scanning_input(ByteLexer& lexer, std::string_view input, std::string_view rule_name) + -> void; + auto test_regex_ast(string_view const var_schema, u32string const& expected_serialized_ast) -> void { - log_surgeon::Schema schema; + Schema schema; schema.add_variable(var_schema, -1); auto const schema_ast = schema.release_schema_ast_ptr(); @@ -67,11 +96,80 @@ auto u32string_to_string(u32string const& u32_str) -> string { wstring_convert, char32_t> converter; return converter.to_bytes(u32_str.data(), u32_str.data() + u32_str.size()); } + +auto create_lexer(std::unique_ptr schema_ast) -> ByteLexer { + vector const delimiters{' ', '\n'}; + + ByteLexer lexer; + lexer.add_delimiters(delimiters); + + vector lexer_delimiters; + for (uint32_t i{0}; i < log_surgeon::cSizeOfByte; ++i) { + if (lexer.is_delimiter(i)) { + lexer_delimiters.push_back(i); + } + } + + lexer.m_symbol_id.emplace(log_surgeon::cTokenEnd, static_cast(SymbolId::TokenEnd)); + lexer.m_symbol_id.emplace( + log_surgeon::cTokenUncaughtString, + static_cast(SymbolId::TokenUncaughtString) + ); + lexer.m_id_symbol.emplace(static_cast(SymbolId::TokenEnd), log_surgeon::cTokenEnd); + lexer.m_id_symbol.emplace( + static_cast(SymbolId::TokenUncaughtString), + log_surgeon::cTokenUncaughtString + ); + + for (auto const& m_schema_var : schema_ast->m_schema_vars) { + // For log-specific lexing: modify variable regex to contain a delimiter at the start. 
+ auto delimiter_group{make_unique(RegexASTGroupByte(lexer_delimiters))}; + auto* rule{dynamic_cast(m_schema_var.get())}; + rule->m_regex_ptr = make_unique( + std::move(delimiter_group), + std::move(rule->m_regex_ptr) + ); + if (false == lexer.m_symbol_id.contains(rule->m_name)) { + lexer.m_symbol_id.emplace(rule->m_name, lexer.m_symbol_id.size()); + lexer.m_id_symbol.emplace(lexer.m_symbol_id.at(rule->m_name), rule->m_name); + } + lexer.add_rule(lexer.m_symbol_id.at(rule->m_name), std::move(rule->m_regex_ptr)); + } + lexer.generate(); + return lexer; +} + +auto test_scanning_input(ByteLexer& lexer, std::string_view input, std::string_view rule_name) + -> void { + lexer.reset(); + + log_surgeon::ParserInputBuffer input_buffer; + string token_string{input}; + input_buffer.set_storage(token_string.data(), token_string.size(), 0, true); + lexer.prepend_start_of_file_char(input_buffer); + + log_surgeon::Token token; + auto error_code{lexer.scan(input_buffer, token)}; + REQUIRE(log_surgeon::ErrorCode::Success == error_code); + REQUIRE(nullptr != token.m_type_ids_ptr); + REQUIRE(1 == token.m_type_ids_ptr->size()); + REQUIRE(rule_name == lexer.m_id_symbol[token.m_type_ids_ptr->at(0)]); + REQUIRE(input == token.to_string_view()); + + error_code = lexer.scan(input_buffer, token); + REQUIRE(log_surgeon::ErrorCode::Success == error_code); + REQUIRE(nullptr != token.m_type_ids_ptr); + REQUIRE(1 == token.m_type_ids_ptr->size()); + REQUIRE(log_surgeon::cTokenEnd == lexer.m_id_symbol[token.m_type_ids_ptr->at(0)]); + REQUIRE(token.to_string_view().empty()); + + // TODO: Add verification of register values after implementing the DFA simulation. 
+} } // namespace TEST_CASE("Test the Schema class", "[Schema]") { SECTION("Add a number variable to schema") { - log_surgeon::Schema schema; + Schema schema; string const var_name = "myNumber"; string const var_schema = var_name + string(":") + string("123"); schema.add_variable(string_view(var_schema), -1); @@ -89,7 +187,7 @@ TEST_CASE("Test the Schema class", "[Schema]") { } SECTION("Add a capture variable to schema") { - log_surgeon::Schema schema; + Schema schema; std::string const var_name = "capture"; string const var_schema = var_name + string(":") + string("u(?[0-9]+)"); schema.add_variable(var_schema, -1); @@ -103,30 +201,31 @@ TEST_CASE("Test the Schema class", "[Schema]") { auto& schema_var_ast = dynamic_cast(*schema_var_ast_ptr); REQUIRE(var_name == schema_var_ast.m_name); - auto* regex_ast_cat_ptr = dynamic_cast(schema_var_ast.m_regex_ptr.get()); + auto const* regex_ast_cat_ptr + = dynamic_cast(schema_var_ast.m_regex_ptr.get()); REQUIRE(nullptr != regex_ast_cat_ptr); REQUIRE(nullptr != regex_ast_cat_ptr->get_left()); REQUIRE(nullptr != regex_ast_cat_ptr->get_right()); - auto* regex_ast_literal + auto const* regex_ast_literal = dynamic_cast(regex_ast_cat_ptr->get_left()); REQUIRE(nullptr != regex_ast_literal); REQUIRE('u' == regex_ast_literal->get_character()); - auto* regex_ast_capture + auto const* regex_ast_capture = dynamic_cast(regex_ast_cat_ptr->get_right()); REQUIRE(nullptr != regex_ast_capture); - REQUIRE("uID" == regex_ast_capture->get_group_name()); + REQUIRE("uID" == regex_ast_capture->get_capture_name()); - auto* regex_ast_multiplication_ast = dynamic_cast( - regex_ast_capture->get_group_regex_ast().get() + auto const* regex_ast_multiplication_ast = dynamic_cast( + regex_ast_capture->get_capture_regex_ast().get() ); REQUIRE(nullptr != regex_ast_multiplication_ast); REQUIRE(1 == regex_ast_multiplication_ast->get_min()); REQUIRE(0 == regex_ast_multiplication_ast->get_max()); REQUIRE(regex_ast_multiplication_ast->is_infinite()); - auto* 
regex_ast_group_ast + auto const* regex_ast_group_ast = dynamic_cast(regex_ast_multiplication_ast->get_operand().get() ); REQUIRE(false == regex_ast_group_ast->is_wildcard()); @@ -208,3 +307,57 @@ TEST_CASE("Test the Schema class", "[Schema]") { ); } } + +TEST_CASE("Test basic Lexer", "[Lexer]") { + constexpr string_view cVarName{"myVar"}; + constexpr string_view cVarSchema{"myVar:123"}; + constexpr string_view cTokenString1{"123"}; + constexpr string_view cTokenString2{"234"}; + + Schema schema; + schema.add_variable(cVarSchema, -1); + + ByteLexer lexer{create_lexer(std::move(schema.release_schema_ast_ptr()))}; + + test_scanning_input(lexer, cTokenString1, cVarName); + test_scanning_input(lexer, cTokenString2, log_surgeon::cTokenUncaughtString); +} + +TEST_CASE("Test Lexer with capture groups", "[Lexer]") { + constexpr string_view cVarName{"myVar"}; + constexpr string_view cCaptureName{"uid"}; + constexpr string_view cVarSchema{"myVar:userID=(?123)"}; + constexpr string_view cTokenString1{"userID=123"}; + constexpr string_view cTokenString2{"userID=234"}; + constexpr string_view cTokenString3{"123"}; + + Schema schema; + schema.add_variable(cVarSchema, -1); + + ByteLexer lexer{create_lexer(std::move(schema.release_schema_ast_ptr()))}; + + string const var_name{cVarName}; + REQUIRE(lexer.m_symbol_id.contains(var_name)); + + string const capture_name{cCaptureName}; + REQUIRE(lexer.m_symbol_id.contains(capture_name)); + + auto const optional_capture_ids{lexer.get_capture_ids_from_rule_id(lexer.m_symbol_id.at(var_name + ))}; + REQUIRE(optional_capture_ids.has_value()); + REQUIRE(1 == optional_capture_ids.value().size()); + REQUIRE(lexer.m_symbol_id.at(capture_name) == optional_capture_ids.value()[0]); + + auto const optional_tag_id_pair{ + lexer.get_tag_id_pair_from_capture_id(optional_capture_ids.value()[0]) + }; + REQUIRE(optional_tag_id_pair.has_value()); + REQUIRE(std::make_pair(0U, 1U) == optional_tag_id_pair.value()); + + // TODO: Add check for 
`get_reg_id_from_tag_id` and `get_reg_ids_from_capture_id` when TDFA's + // determinization is implemented. + + test_scanning_input(lexer, cTokenString1, cVarName); + test_scanning_input(lexer, cTokenString2, log_surgeon::cTokenUncaughtString); + test_scanning_input(lexer, cTokenString3, log_surgeon::cTokenUncaughtString); +} diff --git a/tests/test-nfa.cpp b/tests/test-nfa.cpp index 719a168e..2c4e3477 100644 --- a/tests/test-nfa.cpp +++ b/tests/test-nfa.cpp @@ -44,9 +44,10 @@ TEST_CASE("Test NFA", "[NFA]") { auto& capture_rule_ast = dynamic_cast(*schema_ast->m_schema_vars[0]); vector rules; rules.emplace_back(0, std::move(capture_rule_ast.m_regex_ptr)); - ByteNfa const nfa{std::move(rules)}; + ByteNfa const nfa{rules}; // Compare against expected output + // capture order(tags in brackets): letter1(0,1), letter2(2,3), letter(4,5), containerID(6,7) string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," @@ -54,18 +55,17 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transition={}\n"; expected_serialized_nfa += "1:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={3[letter]}," + "positive_tagged_start_transitions={3[4]}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa - += "2:byte_transitions={}," - "epsilon_transitions={}," - "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={}," - "negative_tagged_transition={4[letter1,letter2,letter,containerID]}\n"; + expected_serialized_nfa += "2:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={4[0,1,2,3,4,5,6,7]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={5[letter1],6[letter2]}," + "positive_tagged_start_transitions={5[0],6[2]}," 
"positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "4:accepting_tag=0,byte_transitions={}," @@ -86,27 +86,27 @@ TEST_CASE("Test NFA", "[NFA]") { expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={9[letter1]}," + "positive_tagged_end_transitions={9[1]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={10[letter2]}," + "positive_tagged_end_transitions={10[3]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "9:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={11[letter2]}\n"; + "negative_tagged_transition={11[2,3]}\n"; expected_serialized_nfa += "10:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," "positive_tagged_end_transitions={}," - "negative_tagged_transition={11[letter1]}\n"; + "negative_tagged_transition={11[0,1]}\n"; expected_serialized_nfa += "11:byte_transitions={}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={12[letter]}," + "positive_tagged_end_transitions={12[5]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "12:byte_transitions={B-->13}," "epsilon_transitions={}," @@ -115,7 +115,7 @@ TEST_CASE("Test NFA", "[NFA]") { "negative_tagged_transition={}\n"; expected_serialized_nfa += "13:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_start_transitions={14[containerID]}," + "positive_tagged_start_transitions={14[6]}," "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "14:byte_transitions={0-->15,1-->15,2-->15,3-->15,4-->15,5-->15,6-->" @@ -128,7 +128,7 @@ TEST_CASE("Test 
NFA", "[NFA]") { "15,7-->15,8-->15,9-->15}," "epsilon_transitions={}," "positive_tagged_start_transitions={}," - "positive_tagged_end_transitions={16[containerID]}," + "positive_tagged_end_transitions={16[7]}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "16:byte_transitions={C-->4}," "epsilon_transitions={}," diff --git a/tests/test-tag.cpp b/tests/test-tag.cpp deleted file mode 100644 index 41f8a2ef..00000000 --- a/tests/test-tag.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include - -#include - -using log_surgeon::finite_automata::Tag; - -TEST_CASE("Tag operations", "[Tag]") { - SECTION("Basic name retrieval works correctly") { - Tag const tag{"uID"}; - REQUIRE("uID" == tag.get_name()); - } - - SECTION("Empty tag name is handled correctly") { - Tag const empty_tag{""}; - REQUIRE(empty_tag.get_name().empty()); - } - - SECTION("Special characters in tag names are preserved") { - Tag const special_tag{"user.id-123_@"}; - REQUIRE("user.id-123_@" == special_tag.get_name()); - } - - SECTION("Copy constructor works correctly") { - Tag assign_tag{"target"}; - assign_tag = Tag{"new_source"}; - REQUIRE("new_source" == assign_tag.get_name()); - } - - SECTION("Move constructor works correctly") { - Tag original_tag{"source"}; - Tag moved_tag{std::move(original_tag)}; - REQUIRE("source" == moved_tag.get_name()); - } -}