Skip to content
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c42a214
Add register operations; Add DFA test.
SharafMohamed Feb 25, 2025
228e8b7
Merge branch 'main' into add-register-operations
SharafMohamed Feb 26, 2025
765598f
Fix capitalization.
SharafMohamed Feb 26, 2025
34a7f5f
Reserve state_ids size.
SharafMohamed Feb 27, 2025
2d64cc5
Fix typo in previous commit.
SharafMohamed Feb 27, 2025
0e41cfb
Rename lambda for clarity.
SharafMohamed Feb 27, 2025
5f7dce1
Use structured binding; Use uint8_t for ascii; Don't pass uint8_t by …
SharafMohamed Feb 27, 2025
aa1ce5f
Use reference for structured binding for the Nfa state set.
SharafMohamed Feb 27, 2025
8eef207
Lint.
SharafMohamed Feb 27, 2025
415be20
Have to use a size larger than the elements in the for loop otherwise…
SharafMohamed Feb 27, 2025
939f0ab
Use std::array.
SharafMohamed Feb 28, 2025
7cbc7ee
Use {}.
SharafMohamed Feb 28, 2025
3da3300
Use const.
SharafMohamed Feb 28, 2025
d70d4ef
Use {} again.
SharafMohamed Feb 28, 2025
1da6391
Move register operation type enum into RegisterOperation class.
SharafMohamed Feb 28, 2025
345655d
Add docstring to RegisterOperation.
SharafMohamed Feb 28, 2025
12e06d9
Fix typo.
SharafMohamed Feb 28, 2025
45dd3a4
Return nullopt for invalid case; Update docstring.
SharafMohamed Feb 28, 2025
8f8da2a
Update docstring for better clarity.
SharafMohamed Feb 28, 2025
37aa5f8
Add factory functions to RegisterOperation.
SharafMohamed Mar 1, 2025
a97103e
Make const.
SharafMohamed Mar 1, 2025
30db002
Propogate const change.
SharafMohamed Mar 1, 2025
82e2195
Return const&.
SharafMohamed Mar 1, 2025
59292da
Update src/log_surgeon/finite_automata/DfaTransition.hpp
SharafMohamed Mar 2, 2025
32ad10d
Add DfaTransition docstring.
SharafMohamed Mar 2, 2025
0586f90
Update docstring.
SharafMohamed Mar 4, 2025
03ae78e
Fix typo.
SharafMohamed Mar 4, 2025
ffa78b1
Make RegisterOperation::Type enum public.
SharafMohamed Mar 4, 2025
e5b1cce
Remove DfaTransition default constructor.
SharafMohamed Mar 4, 2025
3b1dafc
Remove UTF-8 case as its handled incorrectly.
SharafMohamed Mar 5, 2025
0b397e2
Explicit specialization of byte case.
SharafMohamed Mar 6, 2025
11cb2a7
Switch to unordered map.
SharafMohamed Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,15 @@ set(SOURCE_FILES
src/log_surgeon/finite_automata/Dfa.hpp
src/log_surgeon/finite_automata/DfaState.hpp
src/log_surgeon/finite_automata/DfaStatePair.hpp
src/log_surgeon/finite_automata/DfaTransition.hpp
src/log_surgeon/finite_automata/Nfa.hpp
src/log_surgeon/finite_automata/NfaSpontaneousTransition.hpp
src/log_surgeon/finite_automata/NfaState.hpp
src/log_surgeon/finite_automata/PrefixTree.cpp
src/log_surgeon/finite_automata/PrefixTree.hpp
src/log_surgeon/finite_automata/RegexAST.hpp
src/log_surgeon/finite_automata/RegisterHandler.hpp
src/log_surgeon/finite_automata/RegisterOperation.hpp
src/log_surgeon/finite_automata/StateType.hpp
src/log_surgeon/finite_automata/TagOperation.hpp
src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp
Expand Down
6 changes: 3 additions & 3 deletions src/log_surgeon/Lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ class Lexer {
*/
[[nodiscard]] auto get_reg_id_from_tag_id(tag_id_t const tag_id
) const -> std::optional<reg_id_t> {
if (m_tag_to_reg_id.contains(tag_id)) {
return m_tag_to_reg_id.at(tag_id);
if (m_tag_to_final_reg_id.contains(tag_id)) {
return m_tag_to_final_reg_id.at(tag_id);
}
return std::nullopt;
}
Expand Down Expand Up @@ -218,7 +218,7 @@ class Lexer {
TypedDfaState const* m_prev_state{nullptr};
std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
std::unordered_map<tag_id_t, reg_id_t> m_tag_to_reg_id;
std::map<tag_id_t, reg_id_t> m_tag_to_final_reg_id;
};

namespace lexers {
Expand Down
37 changes: 21 additions & 16 deletions src/log_surgeon/Lexer.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,19 +84,20 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer,
m_match_pos = prev_byte_buf_pos;
m_match_line = m_line;
}
auto* dest_state = state->get_dest_state(next_char);
auto const& optional_transition{state->get_transition(next_char)};
if (next_char == '\n') {
m_line++;
if (m_has_delimiters && !m_match) {
dest_state = m_dfa->get_root()->get_dest_state(next_char);
auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
)};
Comment on lines +91 to +92
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Missing transition validity check.

The code directly accesses the destination state from a transition without first checking if the transition exists using has_value(). This could lead to undefined behavior if there's no transition for the newline character.

-                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
-                )};
+                auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+                if (false == root_transition.has_value()) {
+                    // Handle the case where there's no transition for newline
+                    continue;
+                }
+                auto const* dest_state{root_transition->get_dest_state()};
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
)};
auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
if (false == root_transition.has_value()) {
// Handle the case where there's no transition for newline
continue;
}
auto const* dest_state{root_transition->get_dest_state()};

m_match = true;
m_type_ids = &(dest_state->get_matching_variable_ids());
m_start_pos = prev_byte_buf_pos;
m_match_pos = input_buffer.storage().pos();
m_match_line = m_line;
}
}
if (input_buffer.log_fully_consumed() || nullptr == dest_state) {
if (input_buffer.log_fully_consumed() || false == optional_transition.has_value()) {
if (m_match) {
input_buffer.set_log_fully_consumed(false);
input_buffer.set_pos(m_match_pos);
Expand Down Expand Up @@ -165,7 +166,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer,
state = m_dfa->get_root();
continue;
}
state = dest_state;
state = optional_transition->get_dest_state();
}
}

Expand Down Expand Up @@ -215,19 +216,20 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
m_match_pos = prev_byte_buf_pos;
m_match_line = m_line;
}
TypedDfaState const* dest_state{state->get_dest_state(next_char)};
auto const& optional_transition{state->get_transition(next_char)};
if (next_char == '\n') {
m_line++;
if (m_has_delimiters && !m_match) {
dest_state = m_dfa->get_root()->get_dest_state(next_char);
auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
)};
Comment on lines +223 to +224
Copy link

@coderabbitai coderabbitai bot Mar 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Missing transition validity check.

Similar to the earlier instance, the code directly accesses the destination state from a transition without checking if it exists, which could lead to undefined behavior.

-                auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
-                )};
+                auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
+                if (false == root_transition.has_value()) {
+                    // Handle the case where there's no transition for newline
+                    continue;
+                }
+                auto const* dest_state{root_transition->get_dest_state()};
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
auto const* dest_state{m_dfa->get_root()->get_transition(next_char)->get_dest_state(
)};
auto const& root_transition{m_dfa->get_root()->get_transition(next_char)};
if (false == root_transition.has_value()) {
// Handle the case where there's no transition for newline
continue;
}
auto const* dest_state{root_transition->get_dest_state()};

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I guess \n and ? always exist? @SharafMohamed

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea the transition should always exist for those two cases.

m_match = true;
m_type_ids = &(dest_state->get_matching_variable_ids());
m_start_pos = prev_byte_buf_pos;
m_match_pos = input_buffer.storage().pos();
m_match_line = m_line;
}
}
if (input_buffer.log_fully_consumed() || nullptr == dest_state) {
if (input_buffer.log_fully_consumed() || false == optional_transition.has_value()) {
assert(input_buffer.log_fully_consumed());
if (!m_match || (m_match && m_match_pos != input_buffer.storage().pos())) {
token
Expand All @@ -243,7 +245,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
// BFS (keep track of m_type_ids)
if (wildcard == '?') {
for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
auto* dest_state{state->get_dest_state(byte)};
auto const* dest_state{state->get_transition(byte)->get_dest_state()};
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Missing transition validity check in wildcard code.

The code directly accesses the destination state from a transition without checking if it exists using has_value(). This could cause a runtime error if there's no transition for a specific byte.

-                        auto const* dest_state{state->get_transition(byte)->get_dest_state()};
+                        auto const& transition{state->get_transition(byte)};
+                        if (false == transition.has_value()) {
+                            token = Token{m_last_match_pos,
+                                    input_buffer.storage().pos(),
+                                    input_buffer.storage().get_active_buffer(),
+                                    input_buffer.storage().size(),
+                                    m_last_match_line,
+                                    &cTokenUncaughtStringTypes};
+                            return ErrorCode::Success;
+                        }
+                        auto const* dest_state{transition->get_dest_state()};
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
auto const* dest_state{state->get_transition(byte)->get_dest_state()};
auto const& transition{state->get_transition(byte)};
if (false == transition.has_value()) {
token = Token{m_last_match_pos,
input_buffer.storage().pos(),
input_buffer.storage().get_active_buffer(),
input_buffer.storage().size(),
m_last_match_line,
&cTokenUncaughtStringTypes};
return ErrorCode::Success;
}
auto const* dest_state{transition->get_dest_state()};

if (false == dest_state->is_accepting()) {
token
= Token{m_last_match_pos,
Expand Down Expand Up @@ -277,7 +279,14 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
if (m_is_delimiter[byte]) {
continue;
}
TypedDfaState const* dest_state{current_state->get_dest_state(byte)};
auto const& optional_wildcard_transition{
current_state->get_transition(byte)
};
if (false == optional_wildcard_transition.has_value()) {
unvisited_states.push(nullptr);
continue;
}
auto const* dest_state{optional_wildcard_transition->get_dest_state()};
if (false == visited_states.contains(dest_state)) {
unvisited_states.push(dest_state);
}
Expand All @@ -299,7 +308,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan_with_wildcard(
return ErrorCode::Success;
}
}
state = dest_state;
state = optional_transition->get_dest_state();
}
}

Expand Down Expand Up @@ -337,7 +346,7 @@ void Lexer<TypedNfaState, TypedDfaState>::reset() {
template <typename TypedNfaState, typename TypedDfaState>
void Lexer<TypedNfaState, TypedDfaState>::prepend_start_of_file_char(ParserInputBuffer& input_buffer
) {
m_prev_state = m_dfa->get_root()->get_dest_state(utf8::cCharStartOfFile);
m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Missing transition validity check.

The code directly accesses the destination state from a transition without first checking if the transition exists. This could lead to undefined behavior if there's no transition for the start of file character.

-    m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
+    auto const& transition{m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)};
+    if (false == transition.has_value()) {
+        // Handle the case where there's no transition for start of file
+        m_prev_state = m_dfa->get_root();
+    } else {
+        m_prev_state = transition->get_dest_state();
+    }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
m_prev_state = m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)->get_dest_state();
auto const& transition{m_dfa->get_root()->get_transition(utf8::cCharStartOfFile)};
if (false == transition.has_value()) {
// Handle the case where there's no transition for start of file
m_prev_state = m_dfa->get_root();
} else {
m_prev_state = transition->get_dest_state();
}

m_asked_for_more_data = true;
m_start_pos = input_buffer.storage().pos();
m_match_pos = input_buffer.storage().pos();
Expand Down Expand Up @@ -407,11 +416,7 @@ void Lexer<TypedNfaState, TypedDfaState>::generate() {
m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState, TypedNfaState>>(nfa);
auto const* state = m_dfa->get_root();
for (uint32_t i = 0; i < cSizeOfByte; i++) {
if (nullptr != state->get_dest_state(i)) {
m_is_first_char[i] = true;
} else {
m_is_first_char[i] = false;
}
m_is_first_char[i] = state->get_transition(i).has_value();
}
}
} // namespace log_surgeon
Expand Down
81 changes: 76 additions & 5 deletions src/log_surgeon/finite_automata/Dfa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <set>
#include <stack>
#include <vector>

#include <fmt/core.h>
#include <fmt/format.h>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/DfaStatePair.hpp>
#include <log_surgeon/finite_automata/Nfa.hpp>

Expand All @@ -17,6 +22,12 @@ class Dfa {
public:
explicit Dfa(Nfa<TypedNfaState> const& nfa);

/**
* @return A string representation of the DFA.
* @return Forwards `DfaState::serialize`'s return value (std::nullopt) on failure.
*/
[[nodiscard]] auto serialize() const -> std::optional<std::string>;

/**
* Creates a new DFA state based on a set of NFA states and adds it to `m_states`.
* @param nfa_state_set The set of NFA states represented by this DFA state.
Expand All @@ -38,6 +49,12 @@ class Dfa {
[[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set<uint32_t>;

private:
/**
* @return A vector representing the traversal order of the DFA states using breadth-first
* search (BFS).
*/
[[nodiscard]] auto get_bfs_traversal_order() const -> std::vector<TypedDfaState const*>;

std::vector<std::unique_ptr<TypedDfaState>> m_states;
};

Expand All @@ -61,10 +78,10 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
auto set = unmarked_sets.top();
unmarked_sets.pop();
auto* dfa_state = dfa_states.at(set);
std::map<uint32_t, StateSet> ascii_transitions_map;
std::map<uint8_t, StateSet> ascii_transitions_map;
// map<Interval, StateSet> transitions_map;
for (auto const* s0 : set) {
for (uint32_t i = 0; i < cSizeOfByte; i++) {
for (uint16_t i{0}; i < cSizeOfByte; ++i) {
for (auto* const s1 : s0->get_byte_transitions(i)) {
StateSet closure = s1->epsilon_closure();
ascii_transitions_map[i].insert(closure.begin(), closure.end());
Expand All @@ -83,9 +100,9 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
}
return state;
};
for (auto const& kv : ascii_transitions_map) {
auto* dest_state = next_dfa_state(kv.second);
dfa_state->add_byte_transition(kv.first, dest_state);
for (auto const& [byte, nfa_state_set] : ascii_transitions_map) {
auto* dest_state{next_dfa_state(nfa_state_set)};
dfa_state->add_byte_transition(byte, {{}, dest_state});
}
// TODO: add this for the utf8 case
}
Expand Down Expand Up @@ -125,6 +142,60 @@ auto Dfa<TypedDfaState, TypedNfaState>::get_intersect(Dfa const* dfa_in
}
return schema_types;
}

template <typename TypedDfaState, typename TypedNfaState>
auto Dfa<TypedDfaState, TypedNfaState>::get_bfs_traversal_order(
) const -> std::vector<TypedDfaState const*> {
std::queue<TypedDfaState const*> state_queue;
std::unordered_set<TypedDfaState const*> visited_states;
std::vector<TypedDfaState const*> visited_order;
visited_states.reserve(m_states.size());
visited_order.reserve(m_states.size());

auto try_add_to_queue_and_visited
= [&state_queue, &visited_states](TypedDfaState const* dest_state) {
if (visited_states.insert(dest_state).second) {
state_queue.push(dest_state);
}
};

try_add_to_queue_and_visited(get_root());
while (false == state_queue.empty()) {
auto const* current_state = state_queue.front();
visited_order.push_back(current_state);
state_queue.pop();
// TODO: Handle the utf8 case
for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) {
Comment on lines +167 to +168
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

TODO comment should be addressed.

The TODO comment about handling the UTF-8 case should be addressed in this implementation since the PR focuses on DFA transitions.


🏁 Script executed:

#!/bin/bash
# Check if there are other TODO comments related to UTF-8 in DFA-related files
rg "TODO.*utf8" src/log_surgeon/finite_automata/

Length of output: 378


Action Required: Address UTF-8 Handling TODOs

The DFA transition implementation still contains several TODO comments related to UTF-8 support. In particular, note these instances:

  • In src/log_surgeon/finite_automata/Dfa.hpp, the lines with:
    • // TODO: Handle the utf8 case
    • // TODO: add this for the utf8 case (appearing twice)
  • In src/log_surgeon/finite_automata/Nfa.hpp:
    • // TODO: handle the utf8 case

Given that this PR focuses on DFA transitions, please either implement the actual UTF-8 handling logic or provide a clear plan (e.g. a follow-up ticket) to address these comments consistently. This will help ensure that incomplete logic does not propagate in the codebase.

auto const& transition{current_state->get_transition(idx)};
if (transition.has_value()) {
auto const* dest_state{transition->get_dest_state()};
try_add_to_queue_and_visited(dest_state);
}
}
}
return visited_order;
}

template <typename TypedDfaState, typename TypedNfaState>
auto Dfa<TypedDfaState, TypedNfaState>::serialize() const -> std::optional<std::string> {
auto const traversal_order = get_bfs_traversal_order();

std::unordered_map<TypedDfaState const*, uint32_t> state_ids;
state_ids.reserve(traversal_order.size());
for (auto const* state : traversal_order) {
state_ids.emplace(state, state_ids.size());
}

std::vector<std::string> serialized_states;
for (auto const* state : traversal_order) {
auto const optional_serialized_state{state->serialize(state_ids)};
if (false == optional_serialized_state.has_value()) {
return std::nullopt;
}
serialized_states.emplace_back(optional_serialized_state.value());
}
return fmt::format("{}\n", fmt::join(serialized_states, "\n"));
}
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP
Loading
Loading