1717#include < fmt/format.h>
1818
1919using log_surgeon::BufferParser;
20- using log_surgeon::capture_id_t ;
2120using log_surgeon::ErrorCode;
2221using log_surgeon::finite_automata::PrefixTree;
2322using log_surgeon::rule_id_t ;
2423using log_surgeon::Schema;
2524using log_surgeon::SymbolId;
25+ using std::pair;
2626using std::string;
2727using std::string_view;
2828using std::unordered_map;
2929using std::vector;
3030
3131namespace {
3232struct CapturePositions {
33- std:: vector<PrefixTree::position_t > m_start_positions;
34- std:: vector<PrefixTree::position_t > m_end_positions;
33+ vector<PrefixTree::position_t > m_start_positions;
34+ vector<PrefixTree::position_t > m_end_positions;
3535};
3636
3737struct ExpectedToken {
38- std:: string_view m_raw_string;
39- std:: string m_type;
40- std::map< string, CapturePositions> m_captures;
38+ string_view m_raw_string;
39+ string m_type;
40+ vector<pair< string, CapturePositions> > m_captures;
4141};
4242
4343struct ExpectedEvent {
44- std:: string_view m_logtype;
45- std:: string_view m_timestamp_raw;
46- std:: vector<ExpectedToken> m_tokens;
44+ string_view m_logtype;
45+ string_view m_timestamp_raw;
46+ vector<ExpectedToken> m_tokens;
4747};
4848
4949/* *
@@ -58,8 +58,8 @@ struct ExpectedEvent {
5858 */
5959auto parse_and_validate (
6060 BufferParser& buffer_parser,
61- std:: string_view input,
62- std:: vector<ExpectedEvent> const & expected_events
61+ string_view input,
62+ vector<ExpectedEvent> const & expected_events
6363) -> void;
6464
6565/* *
@@ -70,8 +70,8 @@ auto parse_and_validate(
7070
7171auto parse_and_validate (
7272 BufferParser& buffer_parser,
73- std:: string_view input,
74- std:: vector<ExpectedEvent> const & expected_events
73+ string_view input,
74+ vector<ExpectedEvent> const & expected_events
7575) -> void {
7676 buffer_parser.reset ();
7777
@@ -121,29 +121,32 @@ auto parse_and_validate(
121121
122122 if (false == expected_captures.empty ()) {
123123 auto const & lexer{buffer_parser.get_log_parser ().m_lexer };
124- auto optional_capture_ids {lexer.get_capture_ids_from_rule_id (token_type)};
125- REQUIRE (optional_capture_ids .has_value ());
124+ auto optional_captures {lexer.get_captures_from_rule_id (token_type)};
125+ REQUIRE (optional_captures .has_value ());
126126
127- if (false == optional_capture_ids .has_value ()) {
127+ if (false == optional_captures .has_value ()) {
128128 return ;
129129 }
130130
131- for (auto const capture_id : optional_capture_ids.value ()) {
132- auto const capture_name{lexer.m_id_symbol .at (capture_id)};
133- REQUIRE (expected_captures.contains (capture_name));
134- auto optional_reg_ids{lexer.get_reg_ids_from_capture_id (capture_id)};
131+ REQUIRE (expected_captures.size () == optional_captures.value ().size ());
132+ for (uint32_t j{0 }; j < optional_captures.value ().size (); j++) {
133+ auto const capture{optional_captures.value ()[j]};
134+ auto const [expected_name, expected_positions]{expected_captures[j]};
135+ REQUIRE (expected_name == capture->get_name ());
136+ auto optional_reg_ids{lexer.get_reg_ids_from_capture (capture)};
135137 REQUIRE (optional_reg_ids.has_value ());
136138 if (false == optional_reg_ids.has_value ()) {
137139 return ;
138140 }
139141 auto const [start_reg_id, end_reg_id]{optional_reg_ids.value ()};
140- auto const actual_start_positions{
141- token.get_reversed_reg_positions (start_reg_id)
142- };
142+ auto actual_start_positions{token.get_reversed_reg_positions (start_reg_id)};
143143 auto const actual_end_positions{token.get_reversed_reg_positions (end_reg_id)};
144144 auto const [expected_start_positions, expected_end_positions]{
145- expected_captures. at (capture_name)
145+ expected_positions
146146 };
147+ // Note: Known bug that start positions contain failed match starts as well, so
148+ // currently it must be truncated.
149+ actual_start_positions.resize (actual_end_positions.size ());
147150 REQUIRE (expected_start_positions == actual_start_positions);
148151 REQUIRE (expected_end_positions == actual_end_positions);
149152 }
@@ -1066,3 +1069,77 @@ TEST_CASE("multi_capture_two", "[BufferParser]") {
10661069
10671070 parse_and_validate (buffer_parser, cInput, {expected_event});
10681071}
1072+
1073+ /* *
1074+ * @ingroup test_buffer_parser_capture
1075+ * @brief Tests a multi-capture with non-unique names.
1076+ *
1077+ * This test verifies that a buffer_parser with multiple capture rules with non-unique capture rules
1078+ * can be generated and used correctly.
1079+ *
1080+ * ### Schema Definition
1081+ * @code
1082+ * delimiters: \n\r[:,
1083+ * var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123)
1084+ * var2:(?<capture>[A-Za-z]+123) text text
1085+ * @endcode
1086+ *
1087+ * ### Input Example
1088+ * @code
1089+ * "Log is myCapture123 text anotherCapture123 and then another variable is capture123 text text"
1090+ * @endcode
1091+ *
1092+ * ### Expected Logtype
1093+ * @code
1094+ * "Log is <capture> text <capture> and then another variable is <capture> text text"
1095+ * @endcode
1096+ *
1097+ * ### Expected Tokenization
1098+ * @code
1099+ * "Log" -> uncaught string
1100+ * " is" -> uncaught string
1101+ * " " -> uncaught string
1102+ * "myCapture123 text anotherCapture123" -> "var1"
1103+ * " and" -> uncaught string
1104+ * " then" -> uncaught string
1105+ * " another" -> uncaught string
1106+ * " variable" -> uncaught string
1107+ * " is" -> uncaught string
1108+ * " " -> uncaught string
1109+ * "capture123 text text" -> "var2"
1110+ * @endcode
1111+ */
1112+ TEST_CASE (" multi_capture_non_unique_names" , " [BufferParser]" ) {
1113+ constexpr string_view cDelimitersSchema{R"( delimiters: \n\r[:,)" };
1114+ constexpr string_view cVar1{R"( var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123))" };
1115+ constexpr string_view cVar2{R"( var2:(?<capture>[A-Za-z]+123) text text)" };
1116+ constexpr string_view cInput{" Log is myCapture123 text anotherCapture123 and then another "
1117+ " variable is capture123 text text" };
1118+
1119+ ExpectedEvent const expected_event{
1120+ .m_logtype {" Log is <capture> text <capture> and then another variable is <capture> "
1121+ " text text" },
1122+ .m_timestamp_raw {" " },
1123+ .m_tokens {
1124+ {{" Log" , " " , {}},
1125+ {" is" , " " , {}},
1126+ {" myCapture123 text anotherCapture123" , " var1" ,
1127+ {{{" capture" , {{7 }, {19 }}},
1128+ {" capture" , {{25 }, {42 }}}}}},
1129+ {" and" , " " , {}},
1130+ {" then" , " " , {}},
1131+ {" another" , " " , {}},
1132+ {" variable" , " " , {}},
1133+ {" is" , " " , {}},
1134+ {" capture123 text text" , " var2" , {{{" capture" , {{72 }, {82 }}}}}}}
1135+ }
1136+ };
1137+
1138+ Schema schema;
1139+ schema.add_delimiters (cDelimitersSchema);
1140+ schema.add_variable (cVar1, -1 );
1141+ schema.add_variable (cVar2, -1 );
1142+ BufferParser buffer_parser{std::move (schema.release_schema_ast_ptr ())};
1143+
1144+ parse_and_validate (buffer_parser, cInput, {expected_event});
1145+ }
0 commit comments