Skip to content

Commit 5df1981

Browse files
feat: Support non-unique capture names (fixes #180). (#193)
Co-authored-by: SharafMohamed <[email protected]>
1 parent 22a56bc commit 5df1981

9 files changed

Lines changed: 182 additions & 83 deletions

File tree

src/log_surgeon/Lexer.hpp

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
#include <array>
55
#include <cstdint>
6-
#include <map>
76
#include <memory>
87
#include <optional>
98
#include <set>
@@ -13,6 +12,7 @@
1312
#include <vector>
1413

1514
#include <log_surgeon/Constants.hpp>
15+
#include <log_surgeon/finite_automata/Capture.hpp>
1616
#include <log_surgeon/finite_automata/Dfa.hpp>
1717
#include <log_surgeon/finite_automata/DfaState.hpp>
1818
#include <log_surgeon/finite_automata/NfaState.hpp>
@@ -166,29 +166,29 @@ class Lexer {
166166
}
167167

168168
/**
169-
* Retrieves a list of capture IDs for a given rule ID.
170-
* These capture IDs correspond to the captures in the rule that were matched during lexing.
169+
* Retrieves a list of capture pointers for a given rule ID.
170+
* These pointers correspond to the captures in the rule that were matched during lexing.
171171
* @param rule_id The ID of the rule to search for captures.
172-
* @return A vector of capture IDs if the rule contains captures;
172+
* @return A vector of capture pointers if the rule contains captures;
173173
* @return std::nullopt if no captures are found for the rule.
174174
*/
175-
[[nodiscard]] auto get_capture_ids_from_rule_id(rule_id_t const rule_id) const
176-
-> std::optional<std::vector<capture_id_t>> {
177-
if (m_rule_id_to_capture_ids.contains(rule_id)) {
178-
return m_rule_id_to_capture_ids.at(rule_id);
175+
[[nodiscard]] auto get_captures_from_rule_id(rule_id_t const rule_id) const
176+
-> std::optional<std::vector<finite_automata::Capture const*>> {
177+
if (m_rule_id_to_capture.contains(rule_id)) {
178+
return m_rule_id_to_capture.at(rule_id);
179179
}
180180
return std::nullopt;
181181
}
182182

183183
/**
184-
* @param capture_id ID associated with a capture within a rule.
184+
* @param capture Pointer to the capture.
185185
* @return The start and end tag of the capture on success.
186-
* @return std::nullopt if no capture is associated with the given capture ID.
186+
* @return std::nullopt if no tags are associated with the given capture.
187187
*/
188-
[[nodiscard]] auto get_tag_id_pair_from_capture_id(capture_id_t const capture_id) const
188+
[[nodiscard]] auto get_tag_id_pair_from_capture(finite_automata::Capture const* const capture) const
189189
-> std::optional<std::pair<tag_id_t, tag_id_t>> {
190-
if (m_capture_id_to_tag_id_pair.contains(capture_id)) {
191-
return m_capture_id_to_tag_id_pair.at(capture_id);
190+
if (m_capture_to_tag_id_pair.contains(capture)) {
191+
return m_capture_to_tag_id_pair.at(capture);
192192
}
193193
return std::nullopt;
194194
}
@@ -209,14 +209,14 @@ class Lexer {
209209
}
210210

211211
/**
212-
* Retrieves the register IDs for the start and end tags associated with a given capture ID.
213-
* @param capture_id The ID of the capture to search for.
212+
* Retrieves the register IDs for the start and end tags associated with a given capture.
213+
* @param capture Pointer to the capture to search for.
214214
* @return A pair of register IDs corresponding to the start and end tags of the capture.
215215
* @return std::nullopt if no such capture is found.
216216
*/
217-
[[nodiscard]] auto get_reg_ids_from_capture_id(capture_id_t const capture_id) const
217+
[[nodiscard]] auto get_reg_ids_from_capture(finite_automata::Capture const* const capture) const
218218
-> std::optional<std::pair<reg_id_t, reg_id_t>> {
219-
auto const optional_tag_id_pair{get_tag_id_pair_from_capture_id(capture_id)};
219+
auto const optional_tag_id_pair{get_tag_id_pair_from_capture(capture)};
220220
if (false == optional_tag_id_pair.has_value()) {
221221
return std::nullopt;
222222
}
@@ -265,8 +265,10 @@ class Lexer {
265265
bool m_asked_for_more_data{false};
266266
TypedDfaState const* m_prev_state{nullptr};
267267
TypedDfaState const* m_state{nullptr};
268-
std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
269-
std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
268+
std::unordered_map<rule_id_t, std::vector<finite_automata::Capture const*>>
269+
m_rule_id_to_capture;
270+
std::unordered_map<finite_automata::Capture const*, std::pair<tag_id_t, tag_id_t>>
271+
m_capture_to_tag_id_pair;
270272
};
271273

272274
namespace lexers {

src/log_surgeon/Lexer.tpp

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <cassert>
55
#include <memory>
66
#include <stack>
7-
#include <stdexcept>
87
#include <string>
98
#include <vector>
109

@@ -438,28 +437,18 @@ auto Lexer<TypedNfaState, TypedDfaState>::get_highest_priority_rule(rule_id_t co
438437
template <typename TypedNfaState, typename TypedDfaState>
439438
void Lexer<TypedNfaState, TypedDfaState>::generate() {
440439
for (auto const& rule : m_rules) {
441-
for (auto const* capture : rule.get_captures()) {
442-
std::string const capture_name{capture->get_name()};
443-
if (m_symbol_id.contains(capture_name)) {
444-
throw std::invalid_argument(
445-
"`m_rules` contains capture names that are not unique."
446-
);
440+
auto const rule_id{rule.get_variable_id()};
441+
if (false == rule.get_captures().empty()) {
442+
auto& captures_vec{m_rule_id_to_capture.try_emplace(rule_id).first->second};
443+
for (auto const* capture : rule.get_captures()) {
444+
captures_vec.push_back(capture);
447445
}
448-
auto const capture_id{m_symbol_id.size()};
449-
m_symbol_id.emplace(capture_name, capture_id);
450-
m_id_symbol.emplace(capture_id, capture_name);
451-
452-
auto const rule_id{rule.get_variable_id()};
453-
m_rule_id_to_capture_ids.try_emplace(rule_id);
454-
m_rule_id_to_capture_ids.at(rule_id).push_back(capture_id);
455446
}
456447
}
457448

458449
finite_automata::Nfa<TypedNfaState> nfa{m_rules};
459450
for (auto const& [capture, tag_id_pair] : nfa.get_capture_to_tag_id_pair()) {
460-
std::string const capture_name{capture->get_name()};
461-
auto const capture_id{m_symbol_id.at(capture_name)};
462-
m_capture_id_to_tag_id_pair.emplace(capture_id, tag_id_pair);
451+
m_capture_to_tag_id_pair.emplace(capture, tag_id_pair);
463452
}
464453

465454
m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState, TypedNfaState>>(nfa);

src/log_surgeon/LogEvent.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,16 @@ auto LogEventView::get_logtype() const -> std::string {
6565
{
6666
logtype += token_view.release_delimiter();
6767
}
68-
if (auto const& optional_capture_ids{
69-
m_log_parser.m_lexer.get_capture_ids_from_rule_id(rule_id)
68+
if (auto const& optional_captures{
69+
m_log_parser.m_lexer.get_captures_from_rule_id(rule_id)
7070
};
71-
optional_capture_ids.has_value())
71+
optional_captures.has_value())
7272
{
7373
auto capture_view{token_view};
74-
auto const& capture_ids{optional_capture_ids.value()};
75-
for (auto const capture_id : capture_ids) {
74+
auto const& captures{optional_captures.value()};
75+
for (auto const capture : captures) {
7676
auto const& optional_reg_id_pair{
77-
m_log_parser.m_lexer.get_reg_ids_from_capture_id(capture_id)
77+
m_log_parser.m_lexer.get_reg_ids_from_capture(capture)
7878
};
7979
if (false == optional_reg_id_pair.has_value()) {
8080
continue;
@@ -86,7 +86,7 @@ auto LogEventView::get_logtype() const -> std::string {
8686
capture_view.get_reversed_reg_positions(optional_reg_id_pair->second)
8787
};
8888

89-
auto capture_name{m_log_parser.get_id_symbol(capture_id)};
89+
auto const& capture_name{capture->get_name()};
9090
if (false == start_positions.empty() && -1 < start_positions[0]
9191
&& false == end_positions.empty() && -1 < end_positions[0])
9292
{

src/log_surgeon/finite_automata/Capture.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,14 @@
22
#define LOG_SURGEON_FINITE_AUTOMATA_CAPTURE
33

44
#include <string>
5-
#include <string_view>
65
#include <utility>
76

87
namespace log_surgeon::finite_automata {
98
class Capture {
109
public:
1110
explicit Capture(std::string name) : m_name{std::move(name)} {}
1211

13-
[[nodiscard]] auto get_name() const -> std::string_view { return m_name; }
12+
[[nodiscard]] auto get_name() const -> std::string const& { return m_name; }
1413

1514
private:
1615
std::string m_name;

src/log_surgeon/finite_automata/Nfa.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#ifndef LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP
22
#define LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP
33

4-
#include <cstddef>
54
#include <cstdint>
65
#include <memory>
76
#include <optional>

src/log_surgeon/types.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <cstdint>
55

66
namespace log_surgeon {
7-
using capture_id_t = uint32_t;
87
using reg_id_t = uint32_t;
98
using rule_id_t = uint32_t;
109
using tag_id_t = uint32_t;

tests/test-buffer-parser.cpp

Lines changed: 101 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,33 +17,33 @@
1717
#include <fmt/format.h>
1818

1919
using log_surgeon::BufferParser;
20-
using log_surgeon::capture_id_t;
2120
using log_surgeon::ErrorCode;
2221
using log_surgeon::finite_automata::PrefixTree;
2322
using log_surgeon::rule_id_t;
2423
using log_surgeon::Schema;
2524
using log_surgeon::SymbolId;
25+
using std::pair;
2626
using std::string;
2727
using std::string_view;
2828
using std::unordered_map;
2929
using std::vector;
3030

3131
namespace {
3232
struct CapturePositions {
33-
std::vector<PrefixTree::position_t> m_start_positions;
34-
std::vector<PrefixTree::position_t> m_end_positions;
33+
vector<PrefixTree::position_t> m_start_positions;
34+
vector<PrefixTree::position_t> m_end_positions;
3535
};
3636

3737
struct ExpectedToken {
38-
std::string_view m_raw_string;
39-
std::string m_type;
40-
std::map<string, CapturePositions> m_captures;
38+
string_view m_raw_string;
39+
string m_type;
40+
vector<pair<string, CapturePositions>> m_captures;
4141
};
4242

4343
struct ExpectedEvent {
44-
std::string_view m_logtype;
45-
std::string_view m_timestamp_raw;
46-
std::vector<ExpectedToken> m_tokens;
44+
string_view m_logtype;
45+
string_view m_timestamp_raw;
46+
vector<ExpectedToken> m_tokens;
4747
};
4848

4949
/**
@@ -58,8 +58,8 @@ struct ExpectedEvent {
5858
*/
5959
auto parse_and_validate(
6060
BufferParser& buffer_parser,
61-
std::string_view input,
62-
std::vector<ExpectedEvent> const& expected_events
61+
string_view input,
62+
vector<ExpectedEvent> const& expected_events
6363
) -> void;
6464

6565
/**
@@ -70,8 +70,8 @@ auto parse_and_validate(
7070

7171
auto parse_and_validate(
7272
BufferParser& buffer_parser,
73-
std::string_view input,
74-
std::vector<ExpectedEvent> const& expected_events
73+
string_view input,
74+
vector<ExpectedEvent> const& expected_events
7575
) -> void {
7676
buffer_parser.reset();
7777

@@ -121,29 +121,32 @@ auto parse_and_validate(
121121

122122
if (false == expected_captures.empty()) {
123123
auto const& lexer{buffer_parser.get_log_parser().m_lexer};
124-
auto optional_capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
125-
REQUIRE(optional_capture_ids.has_value());
124+
auto optional_captures{lexer.get_captures_from_rule_id(token_type)};
125+
REQUIRE(optional_captures.has_value());
126126

127-
if (false == optional_capture_ids.has_value()) {
127+
if (false == optional_captures.has_value()) {
128128
return;
129129
}
130130

131-
for (auto const capture_id : optional_capture_ids.value()) {
132-
auto const capture_name{lexer.m_id_symbol.at(capture_id)};
133-
REQUIRE(expected_captures.contains(capture_name));
134-
auto optional_reg_ids{lexer.get_reg_ids_from_capture_id(capture_id)};
131+
REQUIRE(expected_captures.size() == optional_captures.value().size());
132+
for (uint32_t j{0}; j < optional_captures.value().size(); j++) {
133+
auto const capture{optional_captures.value()[j]};
134+
auto const [expected_name, expected_positions]{expected_captures[j]};
135+
REQUIRE(expected_name == capture->get_name());
136+
auto optional_reg_ids{lexer.get_reg_ids_from_capture(capture)};
135137
REQUIRE(optional_reg_ids.has_value());
136138
if (false == optional_reg_ids.has_value()) {
137139
return;
138140
}
139141
auto const [start_reg_id, end_reg_id]{optional_reg_ids.value()};
140-
auto const actual_start_positions{
141-
token.get_reversed_reg_positions(start_reg_id)
142-
};
142+
auto actual_start_positions{token.get_reversed_reg_positions(start_reg_id)};
143143
auto const actual_end_positions{token.get_reversed_reg_positions(end_reg_id)};
144144
auto const [expected_start_positions, expected_end_positions]{
145-
expected_captures.at(capture_name)
145+
expected_positions
146146
};
147+
// Note: Due to a known bug, the start positions also include the starts of failed
148+
// matches, so for now they must be truncated to the number of end positions.
149+
actual_start_positions.resize(actual_end_positions.size());
147150
REQUIRE(expected_start_positions == actual_start_positions);
148151
REQUIRE(expected_end_positions == actual_end_positions);
149152
}
@@ -1066,3 +1069,77 @@ TEST_CASE("multi_capture_two", "[BufferParser]") {
10661069

10671070
parse_and_validate(buffer_parser, cInput, {expected_event});
10681071
}
1072+
1073+
/**
1074+
* @ingroup test_buffer_parser_capture
1075+
* @brief Tests a multi-capture with non-unique names.
1076+
*
1077+
* This test verifies that a buffer_parser with multiple rules sharing non-unique capture names
1078+
* can be generated and used correctly.
1079+
*
1080+
* ### Schema Definition
1081+
* @code
1082+
* delimiters: \n\r[:,
1083+
* var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123)
1084+
* var2:(?<capture>[A-Za-z]+123) text text
1085+
* @endcode
1086+
*
1087+
* ### Input Example
1088+
* @code
1089+
* "Log is myCapture123 text anotherCapture123 and then another variable is capture123 text text"
1090+
* @endcode
1091+
*
1092+
* ### Expected Logtype
1093+
* @code
1094+
* "Log is <capture> text <capture> and then another variable is <capture> text text"
1095+
* @endcode
1096+
*
1097+
* ### Expected Tokenization
1098+
* @code
1099+
* "Log" -> uncaught string
1100+
* " is" -> uncaught string
1101+
* " myCapture123 text anotherCapture123" -> "var1"
1103+
* " and" -> uncaught string
1104+
* " then" -> uncaught string
1105+
* " another" -> uncaught string
1106+
* " variable" -> uncaught string
1107+
* " is" -> uncaught string
1108+
* " capture123 text text" -> "var2"
1110+
* @endcode
1111+
*/
1112+
TEST_CASE("multi_capture_non_unique_names", "[BufferParser]") {
1113+
constexpr string_view cDelimitersSchema{R"(delimiters: \n\r[:,)"};
1114+
constexpr string_view cVar1{R"(var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123))"};
1115+
constexpr string_view cVar2{R"(var2:(?<capture>[A-Za-z]+123) text text)"};
1116+
constexpr string_view cInput{"Log is myCapture123 text anotherCapture123 and then another "
1117+
"variable is capture123 text text"};
1118+
1119+
ExpectedEvent const expected_event{
1120+
.m_logtype{"Log is <capture> text <capture> and then another variable is <capture> "
1121+
"text text"},
1122+
.m_timestamp_raw{""},
1123+
.m_tokens{
1124+
{{"Log", "", {}},
1125+
{" is", "", {}},
1126+
{" myCapture123 text anotherCapture123", "var1",
1127+
{{{"capture", {{7}, {19}}},
1128+
{"capture", {{25}, {42}}}}}},
1129+
{" and", "", {}},
1130+
{" then", "", {}},
1131+
{" another", "", {}},
1132+
{" variable", "", {}},
1133+
{" is", "", {}},
1134+
{" capture123 text text", "var2", {{{"capture", {{72}, {82}}}}}}}
1135+
}
1136+
};
1137+
1138+
Schema schema;
1139+
schema.add_delimiters(cDelimitersSchema);
1140+
schema.add_variable(cVar1, -1);
1141+
schema.add_variable(cVar2, -1);
1142+
BufferParser buffer_parser{std::move(schema.release_schema_ast_ptr())};
1143+
1144+
parse_and_validate(buffer_parser, cInput, {expected_event});
1145+
}

0 commit comments

Comments
 (0)