Skip to content

Commit c42a214

Browse files
committed
Add register operations; Add DFA test.
1 parent 91c5094 commit c42a214

9 files changed

Lines changed: 401 additions & 22 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,15 @@ set(SOURCE_FILES
7272
src/log_surgeon/finite_automata/Dfa.hpp
7373
src/log_surgeon/finite_automata/DfaState.hpp
7474
src/log_surgeon/finite_automata/DfaStatePair.hpp
75+
src/log_surgeon/finite_automata/DfaTransition.hpp
7576
src/log_surgeon/finite_automata/Nfa.hpp
7677
src/log_surgeon/finite_automata/NfaSpontaneousTransition.hpp
7778
src/log_surgeon/finite_automata/NfaState.hpp
7879
src/log_surgeon/finite_automata/PrefixTree.cpp
7980
src/log_surgeon/finite_automata/PrefixTree.hpp
8081
src/log_surgeon/finite_automata/RegexAST.hpp
8182
src/log_surgeon/finite_automata/RegisterHandler.hpp
83+
src/log_surgeon/finite_automata/RegisterOperation.hpp
8284
src/log_surgeon/finite_automata/StateType.hpp
8385
src/log_surgeon/finite_automata/TagOperation.hpp
8486
src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp

src/log_surgeon/Lexer.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ class Lexer {
158158
*/
159159
[[nodiscard]] auto get_reg_id_from_tag_id(tag_id_t const tag_id
160160
) const -> std::optional<reg_id_t> {
161-
if (m_tag_to_reg_id.contains(tag_id)) {
162-
return m_tag_to_reg_id.at(tag_id);
161+
if (m_tag_to_final_reg_id.contains(tag_id)) {
162+
return m_tag_to_final_reg_id.at(tag_id);
163163
}
164164
return std::nullopt;
165165
}
@@ -218,7 +218,7 @@ class Lexer {
218218
TypedDfaState const* m_prev_state{nullptr};
219219
std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
220220
std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
221-
std::unordered_map<tag_id_t, reg_id_t> m_tag_to_reg_id;
221+
std::map<tag_id_t, reg_id_t> m_tag_to_final_reg_id;
222222
};
223223

224224
namespace lexers {

src/log_surgeon/finite_automata/Dfa.hpp

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
#include <stack>
99
#include <vector>
1010

11+
#include <fmt/core.h>
12+
#include <fmt/format.h>
13+
14+
#include <log_surgeon/Constants.hpp>
1115
#include <log_surgeon/finite_automata/DfaStatePair.hpp>
1216
#include <log_surgeon/finite_automata/Nfa.hpp>
1317

@@ -17,6 +21,11 @@ class Dfa {
1721
public:
1822
explicit Dfa(Nfa<TypedNfaState> const& nfa);
1923

24+
/**
25+
* @return A string representation of the DFA.
26+
*/
27+
[[nodiscard]] auto serialize() const -> std::string;
28+
2029
/**
2130
* Creates a new DFA state based on a set of NFA states and adds it to `m_states`.
2231
* @param nfa_state_set The set of NFA states represented by this DFA state.
@@ -38,6 +47,12 @@ class Dfa {
3847
[[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set<uint32_t>;
3948

4049
private:
50+
/**
51+
* @return A vector representing the traversal order of the DFA states using breadth-first
52+
* search (BFS).
53+
*/
54+
[[nodiscard]] auto get_bfs_traversal_order() const -> std::vector<TypedDfaState const*>;
55+
4156
std::vector<std::unique_ptr<TypedDfaState>> m_states;
4257
};
4358

@@ -85,7 +100,7 @@ Dfa<TypedDfaState, TypedNfaState>::Dfa(Nfa<TypedNfaState> const& nfa) {
85100
};
86101
for (auto const& kv : ascii_transitions_map) {
87102
auto* dest_state = next_dfa_state(kv.second);
88-
dfa_state->add_byte_transition(kv.first, dest_state);
103+
dfa_state->add_byte_transition(kv.first, {{}, dest_state});
89104
}
90105
// TODO: add this for the utf8 case
91106
}
@@ -125,6 +140,54 @@ auto Dfa<TypedDfaState, TypedNfaState>::get_intersect(Dfa const* dfa_in
125140
}
126141
return schema_types;
127142
}
143+
144+
template <typename TypedDfaState, typename TypedNfaState>
145+
auto Dfa<TypedDfaState, TypedNfaState>::get_bfs_traversal_order(
146+
) const -> std::vector<TypedDfaState const*> {
147+
std::queue<TypedDfaState const*> state_queue;
148+
std::unordered_set<TypedDfaState const*> visited_states;
149+
std::vector<TypedDfaState const*> visited_order;
150+
visited_states.reserve(m_states.size());
151+
visited_order.reserve(m_states.size());
152+
153+
auto add_to_queue_and_visited
154+
= [&state_queue, &visited_states](TypedDfaState const* dest_state) {
155+
if (visited_states.insert(dest_state).second) {
156+
state_queue.push(dest_state);
157+
}
158+
};
159+
160+
add_to_queue_and_visited(get_root());
161+
while (false == state_queue.empty()) {
162+
auto const* current_state = state_queue.front();
163+
visited_order.push_back(current_state);
164+
state_queue.pop();
165+
// TODO: handle the utf8 case
166+
for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) {
167+
auto const dest_state{current_state->get_dest_state(idx)};
168+
if (nullptr != dest_state) {
169+
add_to_queue_and_visited(dest_state);
170+
}
171+
}
172+
}
173+
return visited_order;
174+
}
175+
176+
template <typename TypedDfaState, typename TypedNfaState>
177+
auto Dfa<TypedDfaState, TypedNfaState>::serialize() const -> std::string {
178+
auto const traversal_order = get_bfs_traversal_order();
179+
180+
std::unordered_map<TypedDfaState const*, uint32_t> state_ids;
181+
for (auto const* state : traversal_order) {
182+
state_ids.emplace(state, state_ids.size());
183+
}
184+
185+
std::vector<std::string> serialized_states;
186+
for (auto const* state : traversal_order) {
187+
serialized_states.emplace_back(state->serialize(state_ids));
188+
}
189+
return fmt::format("{}\n", fmt::join(serialized_states, "\n"));
190+
}
128191
} // namespace log_surgeon::finite_automata
129192

130193
#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP

src/log_surgeon/finite_automata/DfaState.hpp

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,18 @@
44
#include <cassert>
55
#include <cstdint>
66
#include <memory>
7+
#include <string>
78
#include <tuple>
89
#include <type_traits>
10+
#include <unordered_map>
911
#include <vector>
1012

13+
#include <fmt/core.h>
14+
#include <fmt/format.h>
15+
1116
#include <log_surgeon/Constants.hpp>
17+
#include <log_surgeon/finite_automata/DfaTransition.hpp>
18+
#include <log_surgeon/finite_automata/RegisterOperation.hpp>
1219
#include <log_surgeon/finite_automata/StateType.hpp>
1320
#include <log_surgeon/finite_automata/UnicodeIntervalTree.hpp>
1421

@@ -24,7 +31,11 @@ class DfaState {
2431
public:
2532
using Tree = UnicodeIntervalTree<DfaState*>;
2633

27-
DfaState() { std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); }
34+
DfaState() {
35+
for (auto& transition : m_bytes_transition) {
36+
transition = DfaTransition<state_type>{{}, nullptr};
37+
}
38+
}
2839

2940
auto add_matching_variable_id(uint32_t const variable_id) -> void {
3041
m_matching_variable_ids.push_back(variable_id);
@@ -38,19 +49,32 @@ class DfaState {
3849
return false == m_matching_variable_ids.empty();
3950
}
4051

41-
auto add_byte_transition(uint8_t const& byte, DfaState* dest_state) -> void {
42-
m_bytes_transition[byte] = dest_state;
52+
auto
53+
add_byte_transition(uint8_t const& byte, DfaTransition<state_type> dfa_transition) -> void {
54+
m_bytes_transition[byte] = dfa_transition;
55+
}
56+
57+
auto add_accepting_op(RegisterOperation const reg_op) -> void {
58+
m_accepting_ops.push_back(reg_op);
4359
}
4460

61+
/**
62+
* @param state_ids A map of states to their unique identifiers.
63+
* @return A string representation of the DFA state.
64+
*/
65+
[[nodiscard]] auto serialize(std::unordered_map<DfaState const*, uint32_t> const& state_ids
66+
) const -> std::string;
67+
4568
/**
4669
* @param character The character (byte or utf8) to transition on.
47-
* @return A pointer to the DFA state reached after transitioning on `character`.
70+
* @return The destination DFA state reached after transitioning on `character`.
4871
*/
4972
[[nodiscard]] auto get_dest_state(uint32_t character) const -> DfaState const*;
5073

5174
private:
5275
std::vector<uint32_t> m_matching_variable_ids;
53-
DfaState* m_bytes_transition[cSizeOfByte];
76+
std::vector<RegisterOperation> m_accepting_ops;
77+
DfaTransition<state_type> m_bytes_transition[cSizeOfByte];
5478
// NOTE: We don't need m_tree_transitions for the `state_type == StateType::Byte` case, so we
5579
// use an empty class (`std::tuple<>`) in that case.
5680
std::conditional_t<state_type == StateType::Utf8, Tree, std::tuple<>> m_tree_transitions;
@@ -59,20 +83,62 @@ class DfaState {
5983
template <StateType state_type>
6084
auto DfaState<state_type>::get_dest_state(uint32_t character) const -> DfaState const* {
6185
if constexpr (StateType::Byte == state_type) {
62-
return m_bytes_transition[character];
86+
return m_bytes_transition[character].get_dest_state();
6387
} else {
6488
if (character < cSizeOfByte) {
65-
return m_bytes_transition[character];
89+
return m_bytes_transition[character].get_dest_state();
6690
}
6791
std::unique_ptr<std::vector<typename Tree::Data>> result
6892
= m_tree_transitions.find(Interval(character, character));
6993
assert(result->size() <= 1);
7094
if (false == result->empty()) {
71-
return result->front().m_value;
95+
return result->front().m_value.get_dest_state();
7296
}
7397
return nullptr;
7498
}
7599
}
100+
101+
template <StateType state_type>
102+
auto DfaState<state_type>::serialize(std::unordered_map<DfaState const*, uint32_t> const& state_ids
103+
) const -> std::string {
104+
auto const accepting_tags_string = is_accepting()
105+
? fmt::format(
106+
"accepting_tags={{{}}},",
107+
fmt::join(m_matching_variable_ids, ",")
108+
)
109+
: "";
110+
111+
std::vector<std::string> accepting_op_strings;
112+
for (auto const& accepting_op : m_accepting_ops) {
113+
auto serialized_accepting_op{accepting_op.serialize()};
114+
if (serialized_accepting_op.has_value()) {
115+
accepting_op_strings.push_back(serialized_accepting_op.value());
116+
}
117+
}
118+
auto const accepting_ops_string = is_accepting() ? fmt::format(
119+
"accepting_operations={{{}}},",
120+
fmt::join(accepting_op_strings, ",")
121+
)
122+
: "";
123+
124+
std::vector<std::string> transition_strings;
125+
for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) {
126+
auto const byte_transition_string{m_bytes_transition[idx].serialize(state_ids)};
127+
if (byte_transition_string.has_value()) {
128+
transition_strings.push_back(
129+
fmt::format("{}{}", static_cast<char>(idx), byte_transition_string.value())
130+
);
131+
}
132+
}
133+
134+
return fmt::format(
135+
"{}:{}{}byte_transitions={{{}}}",
136+
state_ids.at(this),
137+
accepting_tags_string,
138+
accepting_ops_string,
139+
fmt::join(transition_strings, ",")
140+
);
141+
}
76142
} // namespace log_surgeon::finite_automata
77143

78144
#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE

src/log_surgeon/finite_automata/DfaTransition.hpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#ifndef LOG_SURGEON_FINITE_AUTOMATA_DFATRANSITION_HPP
2+
#define LOG_SURGEON_FINITE_AUTOMATA_DFATRANSITION_HPP
3+
4+
#include <cstdint>
5+
#include <optional>
6+
#include <string>
7+
#include <unordered_map>
8+
#include <utility>
9+
#include <vector>
10+
11+
#include <fmt/core.h>
12+
#include <fmt/format.h>
13+
14+
#include <log_surgeon/finite_automata/RegisterOperation.hpp>
15+
#include <log_surgeon/finite_automata/StateType.hpp>
16+
17+
namespace log_surgeon::finite_automata {
18+
template <StateType state_type>
19+
class DfaState;
20+
21+
template <StateType state_type>
22+
class DfaTransition {
23+
public:
24+
DfaTransition() = default;
25+
26+
DfaTransition(std::vector<RegisterOperation> reg_ops, DfaState<state_type>* dest_state)
27+
: m_reg_ops{std::move(reg_ops)},
28+
m_dest_state{dest_state} {}
29+
30+
[[nodiscard]] auto get_reg_ops() const -> std::vector<RegisterOperation> { return m_reg_ops; }
31+
32+
[[nodiscard]] auto get_dest_state() const -> DfaState<state_type>* { return m_dest_state; }
33+
34+
/**
35+
* @param state_ids A map of states to their unique identifiers.
36+
* @return A string representation of the DFA transition on success.
37+
* @return Forwards `RegisterOperation::serialize`'s return value (std::nullopt) on failure.
38+
* @return std::nullopt if `m_dest_state` is not in `state_ids`.
39+
*/
40+
[[nodiscard]] auto serialize(
41+
std::unordered_map<DfaState<state_type> const*, uint32_t> const& state_ids
42+
) const -> std::optional<std::string>;
43+
44+
private:
45+
std::vector<RegisterOperation> m_reg_ops;
46+
DfaState<state_type>* m_dest_state{nullptr};
47+
};
48+
49+
template <StateType state_type>
50+
auto DfaTransition<state_type>::serialize(
51+
std::unordered_map<DfaState<state_type> const*, uint32_t> const& state_ids
52+
) const -> std::optional<std::string> {
53+
if (false == state_ids.contains(m_dest_state)) {
54+
return std::nullopt;
55+
}
56+
57+
std::vector<std::string> transformed_ops;
58+
for (auto const& reg_op : m_reg_ops) {
59+
auto const optional_serialized_op{reg_op.serialize()};
60+
if (false == optional_serialized_op.has_value()) {
61+
return std::nullopt;
62+
}
63+
transformed_ops.push_back(optional_serialized_op.value());
64+
}
65+
66+
return fmt::format("-({})->{}", fmt::join(transformed_ops, ","), state_ids.at(m_dest_state));
67+
}
68+
} // namespace log_surgeon::finite_automata
69+
70+
#endif // LOG_SURGEON_FINITE_AUTOMATA_DFATRANSITION_HPP

src/log_surgeon/finite_automata/RegisterHandler.hpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@
55
#include <vector>
66

77
#include <log_surgeon/finite_automata/PrefixTree.hpp>
8+
#include <log_surgeon/types.hpp>
89

910
namespace log_surgeon::finite_automata {
10-
using register_id_t = uint32_t;
11-
1211
/**
1312
* The register handler maintains a prefix tree that is sufficient to represent all registers.
1413
* The register handler also contains a vector of registers, and performs the set, copy, and append
@@ -27,37 +26,36 @@ class RegisterHandler {
2726
return added_registers;
2827
}
2928

30-
auto add_register() -> register_id_t {
29+
auto add_register() -> reg_id_t {
3130
auto const prefix_tree_node_id{
3231
m_prefix_tree.insert(PrefixTree::cRootId, PrefixTree::cDefaultPos)
3332
};
3433
m_registers.emplace_back(prefix_tree_node_id);
3534
return m_registers.size() - 1;
3635
}
3736

38-
auto add_register(PrefixTree::id_t const prefix_tree_parent_node_id) -> register_id_t {
37+
auto add_register(PrefixTree::id_t const prefix_tree_parent_node_id) -> reg_id_t {
3938
auto const prefix_tree_node_id{
4039
m_prefix_tree.insert(prefix_tree_parent_node_id, PrefixTree::cDefaultPos)
4140
};
4241
m_registers.emplace_back(prefix_tree_node_id);
4342
return m_registers.size() - 1;
4443
}
4544

46-
auto set_register(register_id_t const reg_id, PrefixTree::position_t const position) -> void {
45+
auto set_register(reg_id_t const reg_id, PrefixTree::position_t const position) -> void {
4746
m_prefix_tree.set(m_registers.at(reg_id), position);
4847
}
4948

50-
auto copy_register(register_id_t const dest_reg_id, register_id_t const source_reg_id) -> void {
49+
auto copy_register(reg_id_t const dest_reg_id, reg_id_t const source_reg_id) -> void {
5150
m_registers.at(dest_reg_id) = m_registers.at(source_reg_id);
5251
}
5352

54-
auto
55-
append_position(register_id_t const reg_id, PrefixTree::position_t const position) -> void {
53+
auto append_position(reg_id_t const reg_id, PrefixTree::position_t const position) -> void {
5654
auto const node_id{m_registers.at(reg_id)};
5755
m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position);
5856
}
5957

60-
[[nodiscard]] auto get_reversed_positions(register_id_t const reg_id
58+
[[nodiscard]] auto get_reversed_positions(reg_id_t const reg_id
6159
) const -> std::vector<PrefixTree::position_t> {
6260
return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id));
6361
}

0 commit comments

Comments
 (0)