Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c42a214
Add register operations; Add DFA test.
SharafMohamed Feb 25, 2025
228e8b7
Merge branch 'main' into add-register-operations
SharafMohamed Feb 26, 2025
765598f
Fix capitalization.
SharafMohamed Feb 26, 2025
34a7f5f
Reserve state_ids size.
SharafMohamed Feb 27, 2025
2d64cc5
Fix typo in previous commit.
SharafMohamed Feb 27, 2025
0e41cfb
Rename lambda for clarity.
SharafMohamed Feb 27, 2025
5f7dce1
Use structured binding; Use uint8_t for ascii; Don't pass uint8_t by …
SharafMohamed Feb 27, 2025
aa1ce5f
Use reference for structured binding for the Nfa state set.
SharafMohamed Feb 27, 2025
8eef207
Lint.
SharafMohamed Feb 27, 2025
415be20
Have to use a size larger than the elements in the for loop otherwise…
SharafMohamed Feb 27, 2025
939f0ab
Use std::array.
SharafMohamed Feb 28, 2025
7cbc7ee
Use {}.
SharafMohamed Feb 28, 2025
3da3300
Use const.
SharafMohamed Feb 28, 2025
d70d4ef
Use {} again.
SharafMohamed Feb 28, 2025
1da6391
Move register operation type enum into RegisterOperation class.
SharafMohamed Feb 28, 2025
345655d
Add docstring to RegisterOperation.
SharafMohamed Feb 28, 2025
12e06d9
Fix typo.
SharafMohamed Feb 28, 2025
45dd3a4
Return nullopt for invalid case; Update docstring.
SharafMohamed Feb 28, 2025
8f8da2a
Update docstring for better clarity.
SharafMohamed Feb 28, 2025
37aa5f8
Add factory functions to RegisterOperation.
SharafMohamed Mar 1, 2025
a97103e
Make const.
SharafMohamed Mar 1, 2025
30db002
Propagate const change.
SharafMohamed Mar 1, 2025
82e2195
Return const&.
SharafMohamed Mar 1, 2025
59292da
Update src/log_surgeon/finite_automata/DfaTransition.hpp
SharafMohamed Mar 2, 2025
32ad10d
Add DfaTransition docstring.
SharafMohamed Mar 2, 2025
0586f90
Update docstring.
SharafMohamed Mar 4, 2025
03ae78e
Fix typo.
SharafMohamed Mar 4, 2025
ffa78b1
Make RegisterOperation::Type enum public.
SharafMohamed Mar 4, 2025
e5b1cce
Remove DfaTransition default constructor.
SharafMohamed Mar 4, 2025
3b1dafc
Remove UTF-8 case as it's handled incorrectly.
SharafMohamed Mar 5, 2025
0b397e2
Explicit specialization of byte case.
SharafMohamed Mar 6, 2025
11cb2a7
Switch to unordered map.
SharafMohamed Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,15 @@ set(SOURCE_FILES
src/log_surgeon/finite_automata/Dfa.hpp
src/log_surgeon/finite_automata/DfaState.hpp
src/log_surgeon/finite_automata/DfaStatePair.hpp
src/log_surgeon/finite_automata/DfaTransition.hpp
src/log_surgeon/finite_automata/Nfa.hpp
src/log_surgeon/finite_automata/NfaSpontaneousTransition.hpp
src/log_surgeon/finite_automata/NfaState.hpp
src/log_surgeon/finite_automata/PrefixTree.cpp
src/log_surgeon/finite_automata/PrefixTree.hpp
src/log_surgeon/finite_automata/RegexAST.hpp
src/log_surgeon/finite_automata/RegisterHandler.hpp
src/log_surgeon/finite_automata/RegisterOperation.hpp
src/log_surgeon/finite_automata/StateType.hpp
src/log_surgeon/finite_automata/TagOperation.hpp
src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp
Expand Down
6 changes: 3 additions & 3 deletions src/log_surgeon/Lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ class Lexer {
*/
[[nodiscard]] auto get_reg_id_from_tag_id(tag_id_t const tag_id
) const -> std::optional<reg_id_t> {
if (m_tag_to_reg_id.contains(tag_id)) {
return m_tag_to_reg_id.at(tag_id);
if (m_tag_to_final_reg_id.contains(tag_id)) {
return m_tag_to_final_reg_id.at(tag_id);
}
return std::nullopt;
}
Expand Down Expand Up @@ -218,7 +218,7 @@ class Lexer {
TypedDfaState const* m_prev_state{nullptr};
std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
std::unordered_map<tag_id_t, reg_id_t> m_tag_to_reg_id;
std::map<tag_id_t, reg_id_t> m_tag_to_final_reg_id;
};

namespace lexers {
Expand Down
74 changes: 69 additions & 5 deletions src/log_surgeon/finite_automata/Dfa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
#include <stack>
#include <vector>

#include <fmt/core.h>
#include <fmt/format.h>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/DfaStatePair.hpp>
#include <log_surgeon/finite_automata/Nfa.hpp>

Expand All @@ -17,6 +21,11 @@ class Dfa {
public:
explicit Dfa(Nfa<TypedNfaState> const& nfa);

/**
* @return A string representation of the DFA.
*/
[[nodiscard]] auto serialize() const -> std::string;

/**
* Creates a new DFA state based on a set of NFA states and adds it to `m_states`.
* @param nfa_state_set The set of NFA states represented by this DFA state.
Expand All @@ -38,6 +47,12 @@ class Dfa {
[[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set<uint32_t>;

private:
/**
* @return A vector representing the traversal order of the DFA states using breadth-first
* search (BFS).
*/
[[nodiscard]] auto get_bfs_traversal_order() const -> std::vector<TypedDfaState const*>;

std::vector<std::unique_ptr<TypedDfaState>> m_states;
};

Expand All @@ -61,10 +76,10 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
auto set = unmarked_sets.top();
unmarked_sets.pop();
auto* dfa_state = dfa_states.at(set);
std::map<uint32_t, StateSet> ascii_transitions_map;
std::map<uint8_t, StateSet> ascii_transitions_map;
// map<Interval, StateSet> transitions_map;
for (auto const* s0 : set) {
for (uint32_t i = 0; i < cSizeOfByte; i++) {
for (uint16_t i{0}; i < cSizeOfByte; ++i) {
for (auto* const s1 : s0->get_byte_transitions(i)) {
StateSet closure = s1->epsilon_closure();
ascii_transitions_map[i].insert(closure.begin(), closure.end());
Expand All @@ -83,9 +98,9 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
}
return state;
};
for (auto const& kv : ascii_transitions_map) {
auto* dest_state = next_dfa_state(kv.second);
dfa_state->add_byte_transition(kv.first, dest_state);
for (auto const& [byte, nfa_state_set] : ascii_transitions_map) {
auto* dest_state{next_dfa_state(nfa_state_set)};
dfa_state->add_byte_transition(byte, {{}, dest_state});
}
// TODO: add this for the utf8 case
}
Expand Down Expand Up @@ -125,6 +140,55 @@ auto Dfa<TypedDfaState, TypedNfaState>::get_intersect(Dfa const* dfa_in
}
return schema_types;
}

template <typename TypedDfaState, typename TypedNfaState>
auto Dfa<TypedDfaState, TypedNfaState>::get_bfs_traversal_order(
) const -> std::vector<TypedDfaState const*> {
std::queue<TypedDfaState const*> state_queue;
std::unordered_set<TypedDfaState const*> visited_states;
std::vector<TypedDfaState const*> visited_order;
visited_states.reserve(m_states.size());
visited_order.reserve(m_states.size());

auto try_add_to_queue_and_visited
= [&state_queue, &visited_states](TypedDfaState const* dest_state) {
if (visited_states.insert(dest_state).second) {
state_queue.push(dest_state);
}
};

try_add_to_queue_and_visited(get_root());
while (false == state_queue.empty()) {
auto const* current_state = state_queue.front();
visited_order.push_back(current_state);
state_queue.pop();
// TODO: Handle the utf8 case
for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) {
Comment on lines +167 to +168
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

TODO comment should be addressed.

The TODO comment about handling the UTF-8 case should be addressed in this implementation since the PR focuses on DFA transitions.


🏁 Script executed:

#!/bin/bash
# Check if there are other TODO comments related to UTF-8 in DFA-related files
rg "TODO.*utf8" src/log_surgeon/finite_automata/

Length of output: 378


Action Required: Address UTF-8 Handling TODOs

The DFA transition implementation still contains several TODO comments related to UTF-8 support. In particular, note these instances:

  • In src/log_surgeon/finite_automata/Dfa.hpp, the lines with:
    • // TODO: Handle the utf8 case
    • // TODO: add this for the utf8 case (appearing twice)
  • In src/log_surgeon/finite_automata/Nfa.hpp:
    • // TODO: handle the utf8 case

Given that this PR focuses on DFA transitions, please either implement the actual UTF-8 handling logic or provide a clear plan (e.g. a follow-up ticket) to address these comments consistently. This will help ensure that incomplete logic does not propagate in the codebase.

auto const dest_state{current_state->get_dest_state(idx)};
if (nullptr != dest_state) {
try_add_to_queue_and_visited(dest_state);
}
}
}
return visited_order;
}

/**
 * Serializes the DFA by numbering its states in the order returned by
 * `get_bfs_traversal_order()` and emitting one line per state.
 * @return The serialized states, each followed by a newline.
 */
template <typename TypedDfaState, typename TypedNfaState>
auto Dfa<TypedDfaState, TypedNfaState>::serialize() const -> std::string {
    auto const bfs_order = get_bfs_traversal_order();

    // A state's id is the number of states that received an id before it.
    std::unordered_map<TypedDfaState const*, uint32_t> state_ids;
    state_ids.reserve(bfs_order.size());
    for (auto const* dfa_state : bfs_order) {
        auto const next_id{static_cast<uint32_t>(state_ids.size())};
        state_ids.emplace(dfa_state, next_id);
    }

    std::vector<std::string> state_lines;
    for (auto const* dfa_state : bfs_order) {
        state_lines.push_back(dfa_state->serialize(state_ids));
    }

    return fmt::format("{}\n", fmt::join(state_lines, "\n"));
}
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP
81 changes: 73 additions & 8 deletions src/log_surgeon/finite_automata/DfaState.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

#include <fmt/core.h>
#include <fmt/format.h>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/DfaTransition.hpp>
#include <log_surgeon/finite_automata/RegisterOperation.hpp>
#include <log_surgeon/finite_automata/StateType.hpp>
#include <log_surgeon/finite_automata/UnicodeIntervalTree.hpp>

Expand All @@ -24,7 +31,11 @@ class DfaState {
public:
using Tree = UnicodeIntervalTree<DfaState*>;

DfaState() { std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); }
DfaState() {
for (auto& transition : m_bytes_transition) {
transition = DfaTransition<state_type>{{}, nullptr};
}
}

auto add_matching_variable_id(uint32_t const variable_id) -> void {
m_matching_variable_ids.push_back(variable_id);
Expand All @@ -38,19 +49,31 @@ class DfaState {
return false == m_matching_variable_ids.empty();
}

auto add_byte_transition(uint8_t const& byte, DfaState* dest_state) -> void {
m_bytes_transition[byte] = dest_state;
/**
 * Stores the transition for `byte`, replacing any transition previously stored for that byte.
 * @param byte The byte value that selects the transition slot.
 * @param dfa_transition The transition to store. It is taken by value and moved into the table,
 * so its register-operation vector is not copied a second time.
 */
auto add_byte_transition(uint8_t const byte, DfaTransition<state_type> dfa_transition) -> void {
    m_bytes_transition[byte] = std::move(dfa_transition);
}

/**
 * Appends a register operation to this state's list of accepting operations.
 * @param reg_op The register operation to append.
 */
auto add_accepting_op(RegisterOperation const reg_op) -> void {
    m_accepting_ops.emplace_back(reg_op);
}

/**
* @param state_ids A map of states to their unique identifiers.
* @return A string representation of the DFA state.
*/
[[nodiscard]] auto serialize(std::unordered_map<DfaState const*, uint32_t> const& state_ids
) const -> std::string;

/**
* @param character The character (byte or utf8) to transition on.
* @return A pointer to the DFA state reached after transitioning on `character`.
* @return The destination DFA state reached after transitioning on `character`.
*/
[[nodiscard]] auto get_dest_state(uint32_t character) const -> DfaState const*;

private:
std::vector<uint32_t> m_matching_variable_ids;
DfaState* m_bytes_transition[cSizeOfByte];
std::vector<RegisterOperation> m_accepting_ops;
DfaTransition<state_type> m_bytes_transition[cSizeOfByte];
// NOTE: We don't need m_tree_transitions for the `state_type == StateType::Byte` case, so we
// use an empty class (`std::tuple<>`) in that case.
std::conditional_t<state_type == StateType::Utf8, Tree, std::tuple<>> m_tree_transitions;
Expand All @@ -59,20 +82,62 @@ class DfaState {
template <StateType state_type>
auto DfaState<state_type>::get_dest_state(uint32_t character) const -> DfaState const* {
if constexpr (StateType::Byte == state_type) {
return m_bytes_transition[character];
return m_bytes_transition[character].get_dest_state();
} else {
if (character < cSizeOfByte) {
return m_bytes_transition[character];
return m_bytes_transition[character].get_dest_state();
}
std::unique_ptr<std::vector<typename Tree::Data>> result
= m_tree_transitions.find(Interval(character, character));
assert(result->size() <= 1);
if (false == result->empty()) {
return result->front().m_value;
return result->front().m_value.get_dest_state();
}
return nullptr;
}
}

/**
 * Builds a string of the form
 * `<id>:[accepting_tags={...},accepting_operations={...},]byte_transitions={...}`, where the
 * accepting sections are only present when `is_accepting()` is true.
 * @param state_ids A map of states to their unique identifiers; looked up with `at`, so it must
 * contain this state.
 * @return A string representation of the DFA state.
 */
template <StateType state_type>
auto DfaState<state_type>::serialize(std::unordered_map<DfaState const*, uint32_t> const& state_ids
) const -> std::string {
    // Accepting operations that fail to serialize are left out of the output.
    std::vector<std::string> op_strings;
    for (auto const& accepting_op : m_accepting_ops) {
        if (auto op_string{accepting_op.serialize()}; op_string.has_value()) {
            op_strings.push_back(*op_string);
        }
    }

    std::string accepting_tags_string;
    std::string accepting_ops_string;
    if (is_accepting()) {
        accepting_tags_string
                = fmt::format("accepting_tags={{{}}},", fmt::join(m_matching_variable_ids, ","));
        accepting_ops_string
                = fmt::format("accepting_operations={{{}}},", fmt::join(op_strings, ","));
    }

    // Only transitions that serialize successfully are listed, each prefixed by its byte.
    std::vector<std::string> byte_transition_strings;
    for (uint32_t byte{0}; byte < cSizeOfByte; ++byte) {
        auto const serialized_transition{m_bytes_transition[byte].serialize(state_ids)};
        if (false == serialized_transition.has_value()) {
            continue;
        }
        byte_transition_strings.push_back(
                fmt::format("{}{}", static_cast<char>(byte), *serialized_transition)
        );
    }

    auto const state_id{state_ids.at(this)};
    return fmt::format(
            "{}:{}{}byte_transitions={{{}}}",
            state_id,
            accepting_tags_string,
            accepting_ops_string,
            fmt::join(byte_transition_strings, ",")
    );
}
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE
70 changes: 70 additions & 0 deletions src/log_surgeon/finite_automata/DfaTransition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#ifndef LOG_SURGEON_FINITE_AUTOMATA_DFATRANSITION_HPP
#define LOG_SURGEON_FINITE_AUTOMATA_DFATRANSITION_HPP

#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <fmt/core.h>
#include <fmt/format.h>

#include <log_surgeon/finite_automata/RegisterOperation.hpp>
#include <log_surgeon/finite_automata/StateType.hpp>

namespace log_surgeon::finite_automata {
template <StateType state_type>
class DfaState;

template <StateType state_type>
class DfaTransition {
public:
DfaTransition() = default;

DfaTransition(std::vector<RegisterOperation> reg_ops, DfaState<state_type>* dest_state)
: m_reg_ops{std::move(reg_ops)},
m_dest_state{dest_state} {}

[[nodiscard]] auto get_reg_ops() const -> std::vector<RegisterOperation> { return m_reg_ops; }

[[nodiscard]] auto get_dest_state() const -> DfaState<state_type>* { return m_dest_state; }

/**
* @param state_ids A map of states to their unique identifiers.
* @return A string representation of the DFA transition on success.
* @return Forwards `RegisterOperation::serialize`'s return value (std::nullopt) on failure.
* @return std::nullopt if `m_dest_state` is not in `statd_ids`.
*/
[[nodiscard]] auto serialize(
std::unordered_map<DfaState<state_type> const*, uint32_t> const& state_ids
) const -> std::optional<std::string>;

private:
std::vector<RegisterOperation> m_reg_ops;
DfaState<state_type>* m_dest_state{nullptr};
};

/**
 * Formats the transition as `-(<op>,<op>,...)->-<dest id>` without the extra dash, i.e.
 * `-(<ops>)-><dest id>`, using the id that `state_ids` assigns to the destination state.
 * @param state_ids A map of states to their unique identifiers.
 * @return The serialized transition, or std::nullopt if the destination state is not in
 * `state_ids` or any register operation fails to serialize.
 */
template <StateType state_type>
auto DfaTransition<state_type>::serialize(
        std::unordered_map<DfaState<state_type> const*, uint32_t> const& state_ids
) const -> std::optional<std::string> {
    // One lookup both checks membership and fetches the destination id.
    auto const dest_id_it{state_ids.find(m_dest_state)};
    if (state_ids.end() == dest_id_it) {
        return std::nullopt;
    }

    std::vector<std::string> op_strings;
    for (auto const& reg_op : m_reg_ops) {
        auto const op_string{reg_op.serialize()};
        if (false == op_string.has_value()) {
            // A single unserializable operation invalidates the whole transition.
            return std::nullopt;
        }
        op_strings.push_back(*op_string);
    }

    return fmt::format("-({})->{}", fmt::join(op_strings, ","), dest_id_it->second);
}
} // namespace log_surgeon::finite_automata

#endif // LOG_SURGEON_FINITE_AUTOMATA_DFATRANSITION_HPP
Loading
Loading