-
Notifications
You must be signed in to change notification settings - Fork 10
feat: Add RegisterOperation for TDFA into DFA transitions.
#89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 31 commits
c42a214
228e8b7
765598f
34a7f5f
2d64cc5
0e41cfb
5f7dce1
aa1ce5f
8eef207
415be20
939f0ab
7cbc7ee
3da3300
d70d4ef
1da6391
345655d
12e06d9
45dd3a4
8f8da2a
37aa5f8
a97103e
30db002
82e2195
59292da
32ad10d
0586f90
03ae78e
ffa78b1
e5b1cce
3b1dafc
0b397e2
11cb2a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -84,19 +84,20 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer, | |||||||||||||||||||||||||
| m_match_pos = prev_byte_buf_pos; | ||||||||||||||||||||||||||
| m_match_line = m_line; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| auto* dest_state = state->get_dest_state(next_char); | ||||||||||||||||||||||||||
| auto const& optional_transition{state->get_transition(next_char)}; | ||||||||||||||||||||||||||
| if (next_char == '\n') { | ||||||||||||||||||||||||||
| m_line++; | ||||||||||||||||||||||||||
| if (m_has_delimiters && !m_match) { | ||||||||||||||||||||||||||
| dest_state = m_dfa->get_root()->get_dest_state(next_char); | ||||||||||||||||||||||||||
| auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state( | ||||||||||||||||||||||||||
| )}; | ||||||||||||||||||||||||||
|
Comment on lines
+91
to
+92
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Missing transition validity check. The code directly accesses the destination state from a transition without first checking that the transition exists using `has_value()`.
- auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
- )};
+ auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+ if (false == root_transition.has_value()) {
+ // Handle the case where there's no transition for newline
+ continue;
+ }
+ auto const* dest_state{root_transition->get_dest_state()};
📝 Committable suggestion
Suggested change
|
||||||||||||||||||||||||||
| m_match = true; | ||||||||||||||||||||||||||
| m_type_ids = &(dest_state->get_matching_variable_ids()); | ||||||||||||||||||||||||||
| m_start_pos = prev_byte_buf_pos; | ||||||||||||||||||||||||||
| m_match_pos = input_buffer.storage().pos(); | ||||||||||||||||||||||||||
| m_match_line = m_line; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| if (input_buffer.log_fully_consumed() || nullptr == dest_state) { | ||||||||||||||||||||||||||
| if (input_buffer.log_fully_consumed() || false == optional_transition.has_value()) { | ||||||||||||||||||||||||||
| if (m_match) { | ||||||||||||||||||||||||||
| input_buffer.set_log_fully_consumed(false); | ||||||||||||||||||||||||||
| input_buffer.set_pos(m_match_pos); | ||||||||||||||||||||||||||
|
|
@@ -165,7 +166,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer, | |||||||||||||||||||||||||
| state = m_dfa->get_root(); | ||||||||||||||||||||||||||
| continue; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| state = dest_state; | ||||||||||||||||||||||||||
| state = optional_transition->get_dest_state(); | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
|
|
@@ -215,19 +216,20 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard( | |||||||||||||||||||||||||
| m_match_pos = prev_byte_buf_pos; | ||||||||||||||||||||||||||
| m_match_line = m_line; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| TypedDfaState const* dest_state{state->get_dest_state(next_char)}; | ||||||||||||||||||||||||||
| auto const& optional_transition{state->get_transition(next_char)}; | ||||||||||||||||||||||||||
| if (next_char == '\n') { | ||||||||||||||||||||||||||
| m_line++; | ||||||||||||||||||||||||||
| if (m_has_delimiters && !m_match) { | ||||||||||||||||||||||||||
| dest_state = m_dfa->get_root()->get_dest_state(next_char); | ||||||||||||||||||||||||||
| auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state( | ||||||||||||||||||||||||||
| )}; | ||||||||||||||||||||||||||
|
Comment on lines
+223
to
+224
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Missing transition validity check. Similar to the earlier instance, the code directly accesses the destination state from a transition without checking if it exists, which could lead to undefined behavior.
- auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
- )};
+ auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+ if (false == root_transition.has_value()) {
+ // Handle the case where there's no transition for newline
+ continue;
+ }
+ auto const* dest_state{root_transition->get_dest_state()};
📝 Committable suggestion
Suggested change
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so I guess There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea the transition should always exist for those two cases. |
||||||||||||||||||||||||||
| m_match = true; | ||||||||||||||||||||||||||
| m_type_ids = &(dest_state->get_matching_variable_ids()); | ||||||||||||||||||||||||||
| m_start_pos = prev_byte_buf_pos; | ||||||||||||||||||||||||||
| m_match_pos = input_buffer.storage().pos(); | ||||||||||||||||||||||||||
| m_match_line = m_line; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| if (input_buffer.log_fully_consumed() || nullptr == dest_state) { | ||||||||||||||||||||||||||
| if (input_buffer.log_fully_consumed() || false == optional_transition.has_value()) { | ||||||||||||||||||||||||||
| assert(input_buffer.log_fully_consumed()); | ||||||||||||||||||||||||||
| if (!m_match || (m_match && m_match_pos != input_buffer.storage().pos())) { | ||||||||||||||||||||||||||
| token | ||||||||||||||||||||||||||
|
|
@@ -243,7 +245,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard( | |||||||||||||||||||||||||
| // BFS (keep track of m_type_ids) | ||||||||||||||||||||||||||
| if (wildcard == '?') { | ||||||||||||||||||||||||||
| for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { | ||||||||||||||||||||||||||
| auto* dest_state{state->get_dest_state(byte)}; | ||||||||||||||||||||||||||
| auto const* dest_state{state->get_transition(byte)->get_dest_state()}; | ||||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Missing transition validity check in wildcard code. The code directly accesses the destination state from a transition without checking that it exists using `has_value()`.
- auto const* dest_state{state->get_transition(byte)->get_dest_state()};
+ auto const& transition{state->get_transition(byte)};
+ if (false == transition.has_value()) {
+ token = Token{m_last_match_pos,
+ input_buffer.storage().pos(),
+ input_buffer.storage().get_active_buffer(),
+ input_buffer.storage().size(),
+ m_last_match_line,
+ &cTokenUncaughtStringTypes};
+ return ErrorCode::Success;
+ }
+ auto const* dest_state{transition->get_dest_state()};
📝 Committable suggestion
Suggested change
|
||||||||||||||||||||||||||
| if (false == dest_state->is_accepting()) { | ||||||||||||||||||||||||||
| token | ||||||||||||||||||||||||||
| = Token{m_last_match_pos, | ||||||||||||||||||||||||||
|
|
@@ -277,7 +279,14 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard( | |||||||||||||||||||||||||
| if (m_is_delimiter[byte]) { | ||||||||||||||||||||||||||
| continue; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| TypedDfaState const* dest_state{current_state->get_dest_state(byte)}; | ||||||||||||||||||||||||||
| auto const& optional_wildcard_transition{ | ||||||||||||||||||||||||||
| current_state->get_transition(byte) | ||||||||||||||||||||||||||
| }; | ||||||||||||||||||||||||||
| if (false == optional_wildcard_transition.has_value()) { | ||||||||||||||||||||||||||
| unvisited_states.push(nullptr); | ||||||||||||||||||||||||||
| continue; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| auto const* dest_state{optional_wildcard_transition->get_dest_state()}; | ||||||||||||||||||||||||||
| if (false == visited_states.contains(dest_state)) { | ||||||||||||||||||||||||||
| unvisited_states.push(dest_state); | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
@@ -299,7 +308,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard( | |||||||||||||||||||||||||
| return ErrorCode::Success; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| state = dest_state; | ||||||||||||||||||||||||||
| state = optional_transition->get_dest_state(); | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
|
|
@@ -337,7 +346,7 @@ void Lexer<TypedNfaState, TypedDfaState>::reset() { | |||||||||||||||||||||||||
| template <typename TypedNfaState, typename TypedDfaState> | ||||||||||||||||||||||||||
| void Lexer<TypedNfaState, TypedDfaState>::prepend_start_of_file_char(ParserInputBuffer& input_buffer | ||||||||||||||||||||||||||
| ) { | ||||||||||||||||||||||||||
| m_prev_state = m_dfa->get_root()->get_dest_state(utf8::cCharStartOfFile); | ||||||||||||||||||||||||||
| m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state(); | ||||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Missing transition validity check. The code directly accesses the destination state from a transition without first checking if the transition exists. This could lead to undefined behavior if there's no transition for the start-of-file character.
- m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
+ auto const& transition{m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)};
+ if (false == transition.has_value()) {
+ // Handle the case where there's no transition for start of file
+ m_prev_state = m_dfa->get_root();
+ } else {
+ m_prev_state = transition->get_dest_state();
+ }
📝 Committable suggestion
Suggested change
|
||||||||||||||||||||||||||
| m_asked_for_more_data = true; | ||||||||||||||||||||||||||
| m_start_pos = input_buffer.storage().pos(); | ||||||||||||||||||||||||||
| m_match_pos = input_buffer.storage().pos(); | ||||||||||||||||||||||||||
|
|
@@ -407,11 +416,7 @@ void Lexer<TypedNfaState, TypedDfaState>::generate() { | |||||||||||||||||||||||||
| m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState, TypedNfaState>>(nfa); | ||||||||||||||||||||||||||
| auto const* state = m_dfa->get_root(); | ||||||||||||||||||||||||||
| for (uint32_t i = 0; i < cSizeOfByte; i++) { | ||||||||||||||||||||||||||
| if (nullptr != state->get_dest_state(i)) { | ||||||||||||||||||||||||||
| m_is_first_char[i] = true; | ||||||||||||||||||||||||||
| } else { | ||||||||||||||||||||||||||
| m_is_first_char[i] = false; | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| m_is_first_char[i] = state->get_transition(i).has_value(); | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } // namespace log_surgeon | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,10 +4,15 @@ | |
| #include <cstdint> | ||
| #include <map> | ||
| #include <memory> | ||
| #include <optional> | ||
| #include <set> | ||
| #include <stack> | ||
| #include <vector> | ||
|
|
||
| #include <fmt/core.h> | ||
| #include <fmt/format.h> | ||
|
|
||
| #include <log_surgeon/Constants.hpp> | ||
| #include <log_surgeon/finite_automata/DfaStatePair.hpp> | ||
| #include <log_surgeon/finite_automata/Nfa.hpp> | ||
|
|
||
|
|
@@ -17,6 +22,12 @@ class Dfa { | |
| public: | ||
| explicit Dfa(Nfa<TypedNfaState> const& nfa); | ||
|
|
||
| /** | ||
| * @return A string representation of the DFA. | ||
| * @return Forwards `DfaState::serialize`'s return value (std::nullopt) on failure. | ||
| */ | ||
| [[nodiscard]] auto serialize() const -> std::optional<std::string>; | ||
|
|
||
| /** | ||
| * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. | ||
| * @param nfa_state_set The set of NFA states represented by this DFA state. | ||
|
|
@@ -38,6 +49,12 @@ class Dfa { | |
| [[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set<uint32_t>; | ||
|
|
||
| private: | ||
| /** | ||
| * @return A vector representing the traversal order of the DFA states using breadth-first | ||
| * search (BFS). | ||
| */ | ||
| [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector<TypedDfaState const*>; | ||
|
|
||
| std::vector<std::unique_ptr<TypedDfaState>> m_states; | ||
| }; | ||
|
|
||
|
|
@@ -61,10 +78,10 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) { | |
| auto set = unmarked_sets.top(); | ||
| unmarked_sets.pop(); | ||
| auto* dfa_state = dfa_states.at(set); | ||
| std::map<uint32_t, StateSet> ascii_transitions_map; | ||
| std::map<uint8_t, StateSet> ascii_transitions_map; | ||
| // map<Interval, StateSet> transitions_map; | ||
| for (auto const* s0 : set) { | ||
| for (uint32_t i = 0; i < cSizeOfByte; i++) { | ||
| for (uint16_t i{0}; i < cSizeOfByte; ++i) { | ||
| for (auto* const s1 : s0->get_byte_transitions(i)) { | ||
| StateSet closure = s1->epsilon_closure(); | ||
| ascii_transitions_map[i].insert(closure.begin(), closure.end()); | ||
|
|
@@ -83,9 +100,9 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) { | |
| } | ||
| return state; | ||
| }; | ||
| for (auto const& kv : ascii_transitions_map) { | ||
| auto* dest_state = next_dfa_state(kv.second); | ||
| dfa_state->add_byte_transition(kv.first, dest_state); | ||
| for (auto const& [byte, nfa_state_set] : ascii_transitions_map) { | ||
| auto* dest_state{next_dfa_state(nfa_state_set)}; | ||
| dfa_state->add_byte_transition(byte, {{}, dest_state}); | ||
| } | ||
| // TODO: add this for the utf8 case | ||
| } | ||
|
|
@@ -125,6 +142,60 @@ auto Dfa<TypedDfaState, TypedNfaState>::get_intersect(Dfa const* dfa_in | |
| } | ||
| return schema_types; | ||
| } | ||
|
|
||
| template <typename TypedDfaState, typename TypedNfaState> | ||
| auto Dfa<TypedDfaState, TypedNfaState>::get_bfs_traversal_order( | ||
| ) const -> std::vector<TypedDfaState const*> { | ||
| std::queue<TypedDfaState const*> state_queue; | ||
| std::unordered_set<TypedDfaState const*> visited_states; | ||
| std::vector<TypedDfaState const*> visited_order; | ||
| visited_states.reserve(m_states.size()); | ||
| visited_order.reserve(m_states.size()); | ||
|
|
||
| auto try_add_to_queue_and_visited | ||
| = [&state_queue, &visited_states](TypedDfaState const* dest_state) { | ||
| if (visited_states.insert(dest_state).second) { | ||
| state_queue.push(dest_state); | ||
| } | ||
| }; | ||
|
|
||
| try_add_to_queue_and_visited(get_root()); | ||
| while (false == state_queue.empty()) { | ||
| auto const* current_state = state_queue.front(); | ||
| visited_order.push_back(current_state); | ||
| state_queue.pop(); | ||
| // TODO: Handle the utf8 case | ||
| for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { | ||
|
Comment on lines
+167
to
+168
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
💡 Verification agent
🧩 Analysis chain
TODO comment should be addressed. The TODO comment about handling the UTF-8 case should be addressed in this implementation since the PR focuses on DFA transitions.
🏁 Script executed:
#!/bin/bash
# Check if there are other TODO comments related to UTF-8 in DFA-related files
rg "TODO.*utf8" src/log_surgeon/finite_automata/
Length of output: 378
Action Required: Address UTF-8 Handling TODOs
The DFA transition implementation still contains several TODO comments related to UTF-8 support. In particular, note these instances:
Given that this PR focuses on DFA transitions, please either implement the actual UTF-8 handling logic or provide a clear plan (e.g., a follow-up ticket) to address these comments consistently. This will help ensure that incomplete logic does not propagate in the codebase. |
||
| auto const& transition{current_state->get_transition(idx)}; | ||
| if (transition.has_value()) { | ||
| auto const* dest_state{transition->get_dest_state()}; | ||
| try_add_to_queue_and_visited(dest_state); | ||
| } | ||
| } | ||
| } | ||
| return visited_order; | ||
| } | ||
|
|
||
| template <typename TypedDfaState, typename TypedNfaState> | ||
| auto Dfa<TypedDfaState, TypedNfaState>::serialize() const -> std::optional<std::string> { | ||
| auto const traversal_order = get_bfs_traversal_order(); | ||
|
|
||
| std::unordered_map<TypedDfaState const*, uint32_t> state_ids; | ||
SharafMohamed marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| state_ids.reserve(traversal_order.size()); | ||
| for (auto const* state : traversal_order) { | ||
SharafMohamed marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| state_ids.emplace(state, state_ids.size()); | ||
| } | ||
|
|
||
| std::vector<std::string> serialized_states; | ||
| for (auto const* state : traversal_order) { | ||
| auto const optional_serialized_state{state->serialize(state_ids)}; | ||
| if (false == optional_serialized_state.has_value()) { | ||
| return std::nullopt; | ||
| } | ||
| serialized_states.emplace_back(optional_serialized_state.value()); | ||
| } | ||
| return fmt::format("{}\n", fmt::join(serialized_states, "\n")); | ||
| } | ||
| } // namespace log_surgeon::finite_automata | ||
|
|
||
| #endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP | ||
Uh oh!
There was an error while loading. Please reload this page.