Skip to content

Commit 6904e51

Browse files
feat: Separate concept tags from captures and store capture-to-tag mapping in the lexer.
Co-authored-by: Lin Zhihao <[email protected]>
1 parent a53358d commit 6904e51

18 files changed

Lines changed: 616 additions & 283 deletions

CMakeLists.txt

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,20 @@ set(SOURCE_FILES
6868
src/log_surgeon/Constants.hpp
6969
src/log_surgeon/FileReader.cpp
7070
src/log_surgeon/FileReader.hpp
71+
src/log_surgeon/finite_automata/Capture.hpp
72+
src/log_surgeon/finite_automata/Dfa.hpp
73+
src/log_surgeon/finite_automata/DfaState.hpp
74+
src/log_surgeon/finite_automata/DfaStatePair.hpp
75+
src/log_surgeon/finite_automata/Nfa.hpp
76+
src/log_surgeon/finite_automata/NfaState.hpp
77+
src/log_surgeon/finite_automata/PrefixTree.cpp
78+
src/log_surgeon/finite_automata/PrefixTree.hpp
79+
src/log_surgeon/finite_automata/RegexAST.hpp
80+
src/log_surgeon/finite_automata/RegisterHandler.hpp
81+
src/log_surgeon/finite_automata/StateType.hpp
82+
src/log_surgeon/finite_automata/TaggedTransition.hpp
83+
src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp
84+
src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp
7185
src/log_surgeon/Lalr1Parser.cpp
7286
src/log_surgeon/Lalr1Parser.hpp
7387
src/log_surgeon/Lalr1Parser.tpp
@@ -93,20 +107,8 @@ set(SOURCE_FILES
93107
src/log_surgeon/SchemaParser.hpp
94108
src/log_surgeon/Token.cpp
95109
src/log_surgeon/Token.hpp
96-
src/log_surgeon/finite_automata/PrefixTree.cpp
97-
src/log_surgeon/finite_automata/PrefixTree.hpp
98-
src/log_surgeon/finite_automata/RegexAST.hpp
99-
src/log_surgeon/finite_automata/Dfa.hpp
100-
src/log_surgeon/finite_automata/DfaState.hpp
101-
src/log_surgeon/finite_automata/DfaStatePair.hpp
102-
src/log_surgeon/finite_automata/Nfa.hpp
103-
src/log_surgeon/finite_automata/NfaState.hpp
104-
src/log_surgeon/finite_automata/RegisterHandler.hpp
105-
src/log_surgeon/finite_automata/StateType.hpp
106-
src/log_surgeon/finite_automata/Tag.hpp
107-
src/log_surgeon/finite_automata/TaggedTransition.hpp
108-
src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp
109-
src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp
110+
src/log_surgeon/types.hpp
111+
src/log_surgeon/UniqueIdGenerator.hpp
110112
)
111113

112114
set(LCHIP_INSTALL_CONFIG_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/log_surgeon)

src/log_surgeon/Lexer.hpp

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,22 @@
22
#define LOG_SURGEON_LEXER_HPP
33

44
#include <array>
5-
#include <bitset>
65
#include <cstdint>
76
#include <memory>
7+
#include <optional>
88
#include <string>
99
#include <unordered_map>
10-
#include <unordered_set>
10+
#include <utility>
1111
#include <vector>
1212

1313
#include <log_surgeon/Constants.hpp>
1414
#include <log_surgeon/finite_automata/Dfa.hpp>
1515
#include <log_surgeon/finite_automata/DfaState.hpp>
16-
#include <log_surgeon/finite_automata/Nfa.hpp>
1716
#include <log_surgeon/finite_automata/RegexAST.hpp>
1817
#include <log_surgeon/LexicalRule.hpp>
1918
#include <log_surgeon/ParserInputBuffer.hpp>
2019
#include <log_surgeon/Token.hpp>
20+
#include <log_surgeon/types.hpp>
2121

2222
namespace log_surgeon {
2323
template <typename TypedNfaState, typename TypedDfaState>
@@ -35,13 +35,11 @@ class Lexer {
3535

3636
/**
3737
* Add lexical rule to the lexer's list of rules
38-
* @param id
39-
* @param regex
38+
* @param rule_id
39+
* @param rule
4040
*/
41-
auto add_rule(
42-
uint32_t const& id,
43-
std::unique_ptr<finite_automata::RegexAST<TypedNfaState>> rule
44-
) -> void;
41+
auto add_rule(rule_id_t rule_id, std::unique_ptr<finite_automata::RegexAST<TypedNfaState>> rule)
42+
-> void;
4543

4644
/**
4745
* Return regex pattern for a rule name
@@ -51,7 +49,8 @@ class Lexer {
5149
auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST<TypedNfaState>*;
5250

5351
/**
54-
* Generate DFA for lexer
52+
* Generate DFA for lexer.
53+
* @throw std::invalid_argument if `m_rules` contains multiple captures with the same name.
5554
*/
5655
auto generate() -> void;
5756

@@ -122,8 +121,75 @@ class Lexer {
122121
return m_dfa;
123122
}
124123

125-
std::unordered_map<std::string, uint32_t> m_symbol_id;
126-
std::unordered_map<uint32_t, std::string> m_id_symbol;
124+
/**
125+
* @param rule_id ID associated with a rule.
126+
* @return A vector of capture IDs corresponding to each rule that contain the variable on
127+
* success.
128+
* @return std::nullopt if the variable is never captured in any rule.
129+
*/
130+
[[nodiscard]] auto get_capture_ids_from_rule_id(rule_id_t const rule_id
131+
) const -> std::optional<std::vector<capture_id_t>> {
132+
if (m_rule_id_to_capture_ids.contains(rule_id)) {
133+
return m_rule_id_to_capture_ids.at(rule_id);
134+
}
135+
return std::nullopt;
136+
}
137+
138+
/**
139+
* @param capture_id ID associated with a capture within a rule.
140+
* @return The start and end tag of the capture on success.
141+
* @return std::nullopt if no capture is associated with the given capture ID.
142+
*/
143+
[[nodiscard]] auto get_tag_id_pair_from_capture_id(capture_id_t const capture_id
144+
) const -> std::optional<std::pair<tag_id_t, tag_id_t>> {
145+
if (m_capture_id_to_tag_id_pair.contains(capture_id)) {
146+
return m_capture_id_to_tag_id_pair.at(capture_id);
147+
}
148+
return std::nullopt;
149+
}
150+
151+
/**
152+
* @param tag_id ID associated with a tag.
153+
* @return The final register ID tracking the value of the tag ID during DFA simulation on
154+
* success.
155+
* @return std::nullopt if no tag is associated with the given tag ID.
156+
*/
157+
[[nodiscard]] auto get_reg_id_from_tag_id(tag_id_t const tag_id
158+
) const -> std::optional<reg_id_t> {
159+
if (m_tag_to_reg_id.contains(tag_id)) {
160+
return m_tag_to_reg_id.at(tag_id);
161+
}
162+
return std::nullopt;
163+
}
164+
165+
/**
166+
* @param capture_id ID associated with a capture within a rule.
167+
* @return The start and end final register IDs tracking the position of the capture on success.
168+
* @return std::nullopt if no capture is associated with the given capture ID.
169+
*/
170+
[[nodiscard]] auto get_reg_ids_from_capture_id(capture_id_t const capture_id
171+
) const -> std::optional<std::pair<reg_id_t, reg_id_t>> {
172+
auto const optional_tag_id_pair{get_tag_id_pair_from_capture_id(capture_id)};
173+
if (false == optional_tag_id_pair.has_value()) {
174+
return std::nullopt;
175+
}
176+
auto const [start_tag_id, end_tag_id]{optional_tag_id_pair.value()};
177+
178+
auto const optional_start_reg_id{get_reg_id_from_tag_id(start_tag_id)};
179+
if (false == optional_start_reg_id.has_value()) {
180+
return std::nullopt;
181+
}
182+
183+
auto const optional_end_reg_id{get_reg_id_from_tag_id(end_tag_id)};
184+
if (false == optional_end_reg_id.has_value()) {
185+
return std::nullopt;
186+
}
187+
188+
return {optional_start_reg_id.value(), optional_end_reg_id.value()};
189+
}
190+
191+
std::unordered_map<std::string, rule_id_t> m_symbol_id;
192+
std::unordered_map<rule_id_t, std::string> m_id_symbol;
127193

128194
private:
129195
/**
@@ -148,6 +214,9 @@ class Lexer {
148214
std::unique_ptr<finite_automata::Dfa<TypedDfaState>> m_dfa;
149215
bool m_asked_for_more_data{false};
150216
TypedDfaState const* m_prev_state{nullptr};
217+
std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
218+
std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
219+
std::unordered_map<tag_id_t, reg_id_t> m_tag_to_reg_id;
151220
};
152221

153222
namespace lexers {

src/log_surgeon/Lexer.tpp

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
#include <cassert>
55
#include <memory>
66
#include <stack>
7+
#include <stdexcept>
78
#include <string>
89
#include <vector>
910

1011
#include <log_surgeon/Constants.hpp>
1112
#include <log_surgeon/finite_automata/RegexAST.hpp>
13+
#include <log_surgeon/types.hpp>
1214

1315
/**
1416
* utf8 format (https://en.wikipedia.org/wiki/UTF-8)
@@ -358,17 +360,17 @@ void Lexer<TypedNfaState, TypedDfaState>::add_delimiters(std::vector<uint32_t> c
358360

359361
template <typename TypedNfaState, typename TypedDfaState>
360362
void Lexer<TypedNfaState, TypedDfaState>::add_rule(
361-
uint32_t const& id,
363+
rule_id_t const rule_id,
362364
std::unique_ptr<finite_automata::RegexAST<TypedNfaState>> rule
363365
) {
364-
m_rules.emplace_back(id, std::move(rule));
366+
m_rules.emplace_back(rule_id, std::move(rule));
365367
}
366368

367369
template <typename TypedNfaState, typename TypedDfaState>
368-
auto Lexer<TypedNfaState, TypedDfaState>::get_rule(uint32_t const variable_id
370+
auto Lexer<TypedNfaState, TypedDfaState>::get_rule(rule_id_t const rule_id
369371
) -> finite_automata::RegexAST<TypedNfaState>* {
370372
for (auto const& rule : m_rules) {
371-
if (rule.get_variable_id() == variable_id) {
373+
if (rule.get_variable_id() == rule_id) {
372374
return rule.get_regex();
373375
}
374376
}
@@ -377,8 +379,31 @@ auto Lexer<TypedNfaState, TypedDfaState>::get_rule(uint32_t const variable_id
377379

378380
template <typename TypedNfaState, typename TypedDfaState>
379381
void Lexer<TypedNfaState, TypedDfaState>::generate() {
380-
finite_automata::Nfa<TypedNfaState> nfa{std::move(m_rules)};
381-
// TODO: DFA ignores tags. E.g., treats "capture:user=(?<user_id>\d+)" as "capture:user=\d+"
382+
for (auto const& rule : m_rules) {
383+
for (auto const* capture : rule.get_captures()) {
384+
std::string const capture_name{capture->get_name()};
385+
if (m_symbol_id.contains(capture_name)) {
386+
throw std::invalid_argument("`m_rules` contains capture names that are not unique."
387+
);
388+
}
389+
auto const capture_id{m_symbol_id.size()};
390+
m_symbol_id.emplace(capture_name, capture_id);
391+
m_id_symbol.emplace(capture_id, capture_name);
392+
393+
auto const rule_id{rule.get_variable_id()};
394+
m_rule_id_to_capture_ids.try_emplace(rule_id);
395+
m_rule_id_to_capture_ids.at(rule_id).push_back(capture_id);
396+
}
397+
}
398+
399+
finite_automata::Nfa<TypedNfaState> nfa{m_rules};
400+
for (auto const& [capture, tag_id_pair] : nfa.get_capture_to_tag_id_pair()) {
401+
std::string const capture_name{capture->get_name()};
402+
auto const capture_id{m_symbol_id.at(capture_name)};
403+
m_capture_id_to_tag_id_pair.emplace(capture_id, tag_id_pair);
404+
}
405+
406+
// TODO: DFA ignores captures. E.g., treats "capture:user=(?<user_id>\d+)" as "capture:user=\d+"
382407
m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState>>(std::move(nfa));
383408
auto const* state = m_dfa->get_root();
384409
for (uint32_t i = 0; i < cSizeOfByte; i++) {

src/log_surgeon/LexicalRule.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ class LexicalRule {
2323
*/
2424
auto add_to_nfa(finite_automata::Nfa<TypedNfaState>* nfa) const -> void;
2525

26+
/**
 * @return The captures reported by this rule's regex AST through
 * `get_subtree_positive_captures()`; the vector is returned by const reference.
 */
[[nodiscard]] auto get_captures() const -> std::vector<finite_automata::Capture const*> const& {
27+
return m_regex->get_subtree_positive_captures();
28+
}
29+
2630
[[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; }
2731

2832
[[nodiscard]] auto get_regex() const -> finite_automata::RegexAST<TypedNfaState>* {
@@ -40,7 +44,7 @@ void LexicalRule<TypedNfaState>::add_to_nfa(finite_automata::Nfa<TypedNfaState>*
4044
auto* end_state = nfa->new_state();
4145
end_state->set_accepting(true);
4246
end_state->set_matching_variable_id(m_variable_id);
43-
m_regex->add_to_nfa_with_negative_tags(nfa, end_state);
47+
m_regex->add_to_nfa_with_negative_captures(nfa, end_state);
4448
}
4549
} // namespace log_surgeon
4650

src/log_surgeon/SchemaParser.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010
#include <log_surgeon/Constants.hpp>
1111
#include <log_surgeon/FileReader.hpp>
12+
#include <log_surgeon/finite_automata/Capture.hpp>
1213
#include <log_surgeon/finite_automata/RegexAST.hpp>
13-
#include <log_surgeon/finite_automata/Tag.hpp>
1414
#include <log_surgeon/Lalr1Parser.hpp>
1515
#include <log_surgeon/Lexer.hpp>
1616
#include <log_surgeon/utils.hpp>
@@ -167,7 +167,7 @@ static auto regex_capture_rule(NonTerminal const* m) -> std::unique_ptr<ParserAS
167167
auto& r6 = m->non_terminal_cast(5)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
168168
return std::make_unique<ParserValueRegex>(make_unique<RegexASTCaptureByte>(
169169
std::move(r6),
170-
std::make_unique<finite_automata::Tag>(r4->m_name)
170+
std::make_unique<finite_automata::Capture>(r4->m_name)
171171
));
172172
}
173173

@@ -202,7 +202,7 @@ static auto regex_or_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
202202
static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
203203
auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
204204

205-
// To handle negative tags we treat `R*` as `R+ | ∅`.
205+
// To handle negative captures we treat `R*` as `R+ | ∅`.
206206
return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
207207
make_unique<RegexASTEmptyByte>(),
208208
make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, 0)
@@ -248,7 +248,7 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr<ParserAST> {
248248
auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get<unique_ptr<RegexASTByte>>();
249249

250250
if (0 == min) {
251-
// To handle negative tags we treat `R*` as `R+ | ∅`.
251+
// To handle negative captures we treat `R*` as `R+ | ∅`.
252252
return make_unique<ParserValueRegex>(make_unique<RegexASTOrByte>(
253253
make_unique<RegexASTEmptyByte>(),
254254
make_unique<RegexASTMultiplicationByte>(std::move(r1), 1, max)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef LOG_SURGEON_UNIQUEIDGENERATOR_HPP
2+
#define LOG_SURGEON_UNIQUEIDGENERATOR_HPP
3+
4+
#include <cstdint>
5+
6+
namespace log_surgeon {
7+
/**
 * Hands out sequential IDs: the first call to `generate_id` returns 0 and every later call
 * returns the previous value plus one. Each generator instance keeps its own counter.
 */
class UniqueIdGenerator {
public:
    /**
     * @return The next unused ID for this generator.
     */
    [[nodiscard]] auto generate_id() -> uint32_t {
        uint32_t const next_id{m_current_id};
        ++m_current_id;
        return next_id;
    }

private:
    // The ID that the next call to `generate_id` will return.
    uint32_t m_current_id{0};
};
14+
} // namespace log_surgeon
15+
16+
#endif // LOG_SURGEON_UNIQUEIDGENERATOR_HPP
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
#ifndef LOG_SURGEON_FINITE_AUTOMATA_TAG
2-
#define LOG_SURGEON_FINITE_AUTOMATA_TAG
1+
#ifndef LOG_SURGEON_FINITE_AUTOMATA_CAPTURE
2+
#define LOG_SURGEON_FINITE_AUTOMATA_CAPTURE
33

44
#include <string>
55
#include <string_view>
66
#include <utility>
77

88
namespace log_surgeon::finite_automata {
9-
class Tag {
9+
class Capture {
1010
public:
11-
explicit Tag(std::string name) : m_name{std::move(name)} {}
11+
explicit Capture(std::string name) : m_name{std::move(name)} {}
1212

1313
[[nodiscard]] auto get_name() const -> std::string_view { return m_name; }
1414

@@ -17,4 +17,4 @@ class Tag {
1717
};
1818
} // namespace log_surgeon::finite_automata
1919

20-
#endif // LOG_SURGEON_FINITE_AUTOMATA_TAG
20+
#endif // LOG_SURGEON_FINITE_AUTOMATA_CAPTURE

src/log_surgeon/finite_automata/Dfa.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP
33

44
#include <cstdint>
5+
#include <map>
56
#include <memory>
67
#include <set>
78
#include <vector>

0 commit comments

Comments
 (0)