y-scope · SharafMohamed · Mar 10, 2025 · Feb 25, 2025 · Feb 26, 2025 · Feb 26, 2025
@@ -72,13 +72,15 @@ set(SOURCE_FILES
     src/log_surgeon/finite_automata/Dfa.hpp
     src/log_surgeon/finite_automata/DfaState.hpp
     src/log_surgeon/finite_automata/DfaStatePair.hpp
+    src/log_surgeon/finite_automata/DfaTransition.hpp
     src/log_surgeon/finite_automata/Nfa.hpp
     src/log_surgeon/finite_automata/NfaSpontaneousTransition.hpp
     src/log_surgeon/finite_automata/NfaState.hpp
     src/log_surgeon/finite_automata/PrefixTree.cpp
     src/log_surgeon/finite_automata/PrefixTree.hpp
     src/log_surgeon/finite_automata/RegexAST.hpp
     src/log_surgeon/finite_automata/RegisterHandler.hpp
+    src/log_surgeon/finite_automata/RegisterOperation.hpp
     src/log_surgeon/finite_automata/StateType.hpp
     src/log_surgeon/finite_automata/TagOperation.hpp
     src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp

@@ -158,8 +158,8 @@ class Lexer {
      */
     [[nodiscard]] auto get_reg_id_from_tag_id(tag_id_t const tag_id
     ) const -> std::optional<reg_id_t> {
-        if (m_tag_to_reg_id.contains(tag_id)) {
-            return m_tag_to_reg_id.at(tag_id);
+        if (m_tag_to_final_reg_id.contains(tag_id)) {  // Only mapped tags have a register.
+            return m_tag_to_final_reg_id.at(tag_id);
         }
         return std::nullopt;  // No entry for this tag in the tag-to-register map.
     }
@@ -218,7 +218,7 @@ class Lexer {
     TypedDfaState const* m_prev_state{nullptr};
     std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
     std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
-    std::unordered_map<tag_id_t, reg_id_t> m_tag_to_reg_id;
+    std::map<tag_id_t, reg_id_t> m_tag_to_final_reg_id;
 };
 
 namespace lexers {

@@ -84,19 +84,20 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer,
             m_match_pos = prev_byte_buf_pos;
             m_match_line = m_line;
         }
-        auto* dest_state = state->get_dest_state(next_char);
+        auto const& optional_transition{state->get_transition(next_char)};
         if (next_char == '\n') {
             m_line++;
             if (m_has_delimiters && !m_match) {
-                dest_state = m_dfa->get_root()->get_dest_state(next_char);
+                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
+                )};
-                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
-                )};
+                auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+                if (false == root_transition.has_value()) {
+                    // Handle the case where there's no transition for newline
+                    continue;
+                }
+                auto const* dest_state{root_transition->get_dest_state()};
-                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
-                )};
+                auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+                if (false == root_transition.has_value()) {
+                    // Handle the case where there's no transition for newline
+                    continue;
+                }
+                auto const* dest_state{root_transition->get_dest_state()};
                 m_match = true;
                 m_type_ids = &(dest_state->get_matching_variable_ids());
                 m_start_pos = prev_byte_buf_pos;
                 m_match_pos = input_buffer.storage().pos();
                 m_match_line = m_line;
             }
         }
-        if (input_buffer.log_fully_consumed() || nullptr == dest_state) {
+        if (input_buffer.log_fully_consumed() || false == optional_transition.has_value()) {
             if (m_match) {
                 input_buffer.set_log_fully_consumed(false);
                 input_buffer.set_pos(m_match_pos);
@@ -165,7 +166,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer,
             state = m_dfa->get_root();
             continue;
         }
-        state = dest_state;
+        state = optional_transition->get_dest_state();
     }
 }
 
@@ -215,19 +216,20 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
             m_match_pos = prev_byte_buf_pos;
             m_match_line = m_line;
         }
-        TypedDfaState const* dest_state{state->get_dest_state(next_char)};
+        auto const& optional_transition{state->get_transition(next_char)};
         if (next_char == '\n') {
             m_line++;
             if (m_has_delimiters && !m_match) {
-                dest_state = m_dfa->get_root()->get_dest_state(next_char);
+                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
+                )};
-                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
-                )};
+                auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+                if (false == root_transition.has_value()) {
+                    // Handle the case where there's no transition for newline
+                    continue;
+                }
+                auto const* dest_state{root_transition->get_dest_state()};
-                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
-                )};
+                auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+                if (false == root_transition.has_value()) {
+                    // Handle the case where there's no transition for newline
+                    continue;
+                }
+                auto const* dest_state{root_transition->get_dest_state()};
                 m_match = true;
                 m_type_ids = &(dest_state->get_matching_variable_ids());
                 m_start_pos = prev_byte_buf_pos;
                 m_match_pos = input_buffer.storage().pos();
                 m_match_line = m_line;
             }
         }
-        if (input_buffer.log_fully_consumed() || nullptr == dest_state) {
+        if (input_buffer.log_fully_consumed() || false == optional_transition.has_value()) {
             assert(input_buffer.log_fully_consumed());
             if (!m_match || (m_match && m_match_pos != input_buffer.storage().pos())) {
                 token
@@ -243,7 +245,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
                 // BFS (keep track of m_type_ids)
                 if (wildcard == '?') {
                     for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
-                        auto* dest_state{state->get_dest_state(byte)};
+                        auto const* dest_state{state->get_transition(byte)->get_dest_state()};
-                        auto const* dest_state{state->get_transition(byte)->get_dest_state()};
+                        auto const& transition{state->get_transition(byte)};
+                        if (false == transition.has_value()) {
+                            token = Token{m_last_match_pos,
+                                          input_buffer.storage().pos(),
+                                          input_buffer.storage().get_active_buffer(),
+                                          input_buffer.storage().size(),
+                                          m_last_match_line,
+                                          &cTokenUncaughtStringTypes};
+                            return ErrorCode::Success;
+                        }
+                        auto const* dest_state{transition->get_dest_state()};
-                        auto const* dest_state{state->get_transition(byte)->get_dest_state()};
+                        auto const& transition{state->get_transition(byte)};
+                        if (false == transition.has_value()) {
+                            token = Token{m_last_match_pos,
+                                          input_buffer.storage().pos(),
+                                          input_buffer.storage().get_active_buffer(),
+                                          input_buffer.storage().size(),
+                                          m_last_match_line,
+                                          &cTokenUncaughtStringTypes};
+                            return ErrorCode::Success;
+                        }
+                        auto const* dest_state{transition->get_dest_state()};
                         if (false == dest_state->is_accepting()) {
                             token
                                     = Token{m_last_match_pos,
@@ -277,7 +279,14 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
                             if (m_is_delimiter[byte]) {
                                 continue;
                             }
-                            TypedDfaState const* dest_state{current_state->get_dest_state(byte)};
+                            auto const& optional_wildcard_transition{
+                                    current_state->get_transition(byte)
+                            };
+                            if (false == optional_wildcard_transition.has_value()) {
+                                unvisited_states.push(nullptr);
+                                continue;
+                            }
+                            auto const* dest_state{optional_wildcard_transition->get_dest_state()};
                             if (false == visited_states.contains(dest_state)) {
                                 unvisited_states.push(dest_state);
                             }
@@ -299,7 +308,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
                 return ErrorCode::Success;
             }
         }
-        state = dest_state;
+        state = optional_transition->get_dest_state();
     }
 }
 
@@ -337,7 +346,7 @@ void Lexer<TypedNfaState, TypedDfaState>::reset() {
 template <typename TypedNfaState, typename TypedDfaState>
 void Lexer<TypedNfaState, TypedDfaState>::prepend_start_of_file_char(ParserInputBuffer& input_buffer
 ) {
-    m_prev_state = m_dfa->get_root()->get_dest_state(utf8::cCharStartOfFile);
+    m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
-    m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
+    auto const& transition{m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)};
+    if (false == transition.has_value()) {
+        // Handle the case where there's no transition for start of file
+        m_prev_state = m_dfa->get_root();
+    } else {
+        m_prev_state = transition->get_dest_state();
+    }
-    m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
+    auto const& transition{m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)};
+    if (false == transition.has_value()) {
+        // Handle the case where there's no transition for start of file
+        m_prev_state = m_dfa->get_root();
+    } else {
+        m_prev_state = transition->get_dest_state();
+    }
     m_asked_for_more_data = true;
     m_start_pos = input_buffer.storage().pos();
     m_match_pos = input_buffer.storage().pos();
@@ -407,11 +416,7 @@ void Lexer<TypedNfaState, TypedDfaState>::generate() {
     m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState, TypedNfaState>>(nfa);
     auto const* state = m_dfa->get_root();
     for (uint32_t i = 0; i < cSizeOfByte; i++) {
-        if (nullptr != state->get_dest_state(i)) {
-            m_is_first_char[i] = true;
-        } else {
-            m_is_first_char[i] = false;
-        }
+        m_is_first_char[i] = state->get_transition(i).has_value();
     }
 }
 }  // namespace log_surgeon

@@ -4,10 +4,15 @@
 #include <cstdint>
 #include <map>
 #include <memory>
+#include <optional>
 #include <set>
 #include <stack>
 #include <vector>
 
+#include <fmt/core.h>
+#include <fmt/format.h>
+
+#include <log_surgeon/Constants.hpp>
 #include <log_surgeon/finite_automata/DfaStatePair.hpp>
 #include <log_surgeon/finite_automata/Nfa.hpp>
 
@@ -17,6 +22,12 @@ class Dfa {
 public:
     explicit Dfa(Nfa<TypedNfaState> const& nfa);
 
+    /**
+     * @return A string representation of the DFA.
+     * @return Forwards `DfaState::serialize`'s return value (std::nullopt) on failure.
+     */
+    [[nodiscard]] auto serialize() const -> std::optional<std::string>;
+
     /**
      * Creates a new DFA state based on a set of NFA states and adds it to `m_states`.
      * @param nfa_state_set The set of NFA states represented by this DFA state.
@@ -38,6 +49,12 @@ class Dfa {
     [[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set<uint32_t>;
 
 private:
+    /**
+     * @return A vector representing the traversal order of the DFA states using breadth-first
+     * search (BFS).
+     */
+    [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector<TypedDfaState const*>;
+
     std::vector<std::unique_ptr<TypedDfaState>> m_states;
 };
 
@@ -61,10 +78,10 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
         auto set = unmarked_sets.top();
         unmarked_sets.pop();
         auto* dfa_state = dfa_states.at(set);
-        std::map<uint32_t, StateSet> ascii_transitions_map;
+        std::map<uint8_t, StateSet> ascii_transitions_map;
         // map<Interval, StateSet> transitions_map;
         for (auto const* s0 : set) {
-            for (uint32_t i = 0; i < cSizeOfByte; i++) {
+            for (uint16_t i{0}; i < cSizeOfByte; ++i) {
                 for (auto* const s1 : s0->get_byte_transitions(i)) {
                     StateSet closure = s1->epsilon_closure();
                     ascii_transitions_map[i].insert(closure.begin(), closure.end());
@@ -83,9 +100,9 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
             }
             return state;
         };
-        for (auto const& kv : ascii_transitions_map) {
-            auto* dest_state = next_dfa_state(kv.second);
-            dfa_state->add_byte_transition(kv.first, dest_state);
+        for (auto const& [byte, nfa_state_set] : ascii_transitions_map) {
+            auto* dest_state{next_dfa_state(nfa_state_set)};
+            dfa_state->add_byte_transition(byte, {{}, dest_state});
         }
         // TODO: add this for the utf8 case
     }
@@ -125,6 +142,60 @@ auto Dfa<TypedDfaState, TypedNfaState>::get_intersect(Dfa const* dfa_in
     }
     return schema_types;
 }
+
+template <typename TypedDfaState, typename TypedNfaState>
+auto Dfa<TypedDfaState, TypedNfaState>::get_bfs_traversal_order(
+) const -> std::vector<TypedDfaState const*> {
+    std::queue<TypedDfaState const*> state_queue;  // Discovered states awaiting expansion.
+    std::unordered_set<TypedDfaState const*> visited_states;  // Every state discovered so far.
+    std::vector<TypedDfaState const*> visited_order;  // Result: states in dequeue order.
+    visited_states.reserve(m_states.size());
+    visited_order.reserve(m_states.size());
+
+    auto try_add_to_queue_and_visited  // Enqueues a state only the first time it is seen.
+            = [&state_queue, &visited_states](TypedDfaState const* dest_state) {
+                  if (visited_states.insert(dest_state).second) {  // True only for a new state.
+                      state_queue.push(dest_state);
+                  }
+              };
+
+    try_add_to_queue_and_visited(get_root());  // The traversal is seeded with the root state.
+    while (false == state_queue.empty()) {
+        auto const* current_state = state_queue.front();
+        visited_order.push_back(current_state);  // Record the state when it is dequeued.
+        state_queue.pop();
+        // TODO: Handle the utf8 case
+        for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) {  // Probe byte values in ascending order.
+            auto const& transition{current_state->get_transition(idx)};
+            if (transition.has_value()) {  // Bytes with no transition are skipped.
+                auto const* dest_state{transition->get_dest_state()};
+                try_add_to_queue_and_visited(dest_state);
+            }
+        }
+    }
+    return visited_order;
+}
+
+template <typename TypedDfaState, typename TypedNfaState>
+auto Dfa<TypedDfaState, TypedNfaState>::serialize() const -> std::optional<std::string> {
+    auto const traversal_order = get_bfs_traversal_order();  // Fixes the order states are emitted.
+
+    std::unordered_map<TypedDfaState const*, uint32_t> state_ids;
+    state_ids.reserve(traversal_order.size());
+    for (auto const* state : traversal_order) {
+        state_ids.emplace(state, state_ids.size());  // ID = the state's index in the BFS order.
+    }
+
+    std::vector<std::string> serialized_states;
+    for (auto const* state : traversal_order) {
+        auto const optional_serialized_state{state->serialize(state_ids)};
+        if (false == optional_serialized_state.has_value()) {
+            return std::nullopt;  // One failed state aborts the whole serialization.
+        }
+        serialized_states.emplace_back(optional_serialized_state.value());
+    }
+    return fmt::format("{}\n", fmt::join(serialized_states, "\n"));  // Newline after each state.
+}
 }  // namespace log_surgeon::finite_automata
 
 #endif  // LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP