feat: Support non-unique capture names (fixes #180). #193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

SharafMohamed merged 10 commits into y-scope:main from SharafMohamed:non-unique-capture-names

Nov 27, 2025

src/log_surgeon/Lexer.hpp

SharafMohamed marked this conversation as resolved.

Show resolved Hide resolved

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -3,7 +3,6 @@
  
    #include <array>

    #include <cstdint>

    #include <map>

    #include <memory>

    #include <optional>

    #include <set>

    @@ -13,6 +12,7 @@
  
    #include <vector>

    #include <log_surgeon/Constants.hpp>

    #include <log_surgeon/finite_automata/Capture.hpp>

    #include <log_surgeon/finite_automata/Dfa.hpp>

    #include <log_surgeon/finite_automata/DfaState.hpp>

    #include <log_surgeon/finite_automata/NfaState.hpp>

    @@ -166,29 +166,29 @@ class Lexer {
  
        }

        /**

         * Retrieves a list of capture IDs for a given rule ID.

         * These capture IDs correspond to the captures in the rule that were matched during lexing.

         * Retrieves a list of capture pointers for a given rule ID.

         * These pointers correspond to the captures in the rule that were matched during lexing.

         * @param rule_id The ID of the rule to search for captures.

         * @return A vector of capture IDs if the rule contains captures;

         * @return A vector of capture pointers if the rule contains captures;

         * @return std::nullopt if no captures are found for the rule.

         */

        [[nodiscard]] auto get_capture_ids_from_rule_id(rule_id_t const rule_id) const

                -> std::optional<std::vector<capture_id_t>> {

            if (m_rule_id_to_capture_ids.contains(rule_id)) {

                return m_rule_id_to_capture_ids.at(rule_id);

        [[nodiscard]] auto get_captures_from_rule_id(rule_id_t const rule_id) const

                -> std::optional<std::vector<finite_automata::Capture const*>> {

            if (m_rule_id_to_capture.contains(rule_id)) {

                return m_rule_id_to_capture.at(rule_id);

            }

            return std::nullopt;

        }

        /**

         * @param capture_id ID associated with a capture within a rule.

         * @param capture Pointer to the capture.

         * @return The start and end tag of the capture on success.

         * @return std::nullopt if no capture is associated with the given capture ID.

         * @return std::nullopt if no tags are associated with the given capture.

         */

        [[nodiscard]] auto get_tag_id_pair_from_capture_id(capture_id_t const capture_id) const

        [[nodiscard]] auto get_tag_id_pair_from_capture(finite_automata::Capture const* const capture) const

                -> std::optional<std::pair<tag_id_t, tag_id_t>> {

            if (m_capture_id_to_tag_id_pair.contains(capture_id)) {

                return m_capture_id_to_tag_id_pair.at(capture_id);

            if (m_capture_to_tag_id_pair.contains(capture)) {

                return m_capture_to_tag_id_pair.at(capture);

            }

            return std::nullopt;

        }

    @@ -209,14 +209,14 @@ class Lexer {
  
        }

        /**

         * Retrieves the register IDs for the start and end tags associated with a given capture ID.

         * @param capture_id The ID of the capture to search for.

         * Retrieves the register IDs for the start and end tags associated with a given capture.

         * @param capture Pointer to the capture to search for.

         * @return A pair of register IDs corresponding to the start and end tags of the capture.

         * @return std::nullopt if no such capture is found.

         */

        [[nodiscard]] auto get_reg_ids_from_capture_id(capture_id_t const capture_id) const

        [[nodiscard]] auto get_reg_ids_from_capture(finite_automata::Capture const* const capture) const

                -> std::optional<std::pair<reg_id_t, reg_id_t>> {

            auto const optional_tag_id_pair{get_tag_id_pair_from_capture_id(capture_id)};

            auto const optional_tag_id_pair{get_tag_id_pair_from_capture(capture)};

            if (false == optional_tag_id_pair.has_value()) {

                return std::nullopt;

            }

    @@ -265,8 +265,10 @@ class Lexer {
  
        bool m_asked_for_more_data{false};

        TypedDfaState const* m_prev_state{nullptr};

        TypedDfaState const* m_state{nullptr};

        std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;

        std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;

        std::unordered_map<rule_id_t, std::vector<finite_automata::Capture const*>>

                m_rule_id_to_capture;

        std::unordered_map<finite_automata::Capture const*, std::pair<tag_id_t, tag_id_t>>

                m_capture_to_tag_id_pair;

    };

    namespace lexers {

src/log_surgeon/Lexer.tpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -4,7 +4,6 @@ @@
     #include <cassert>
     #include <memory>
     #include <stack>
-    #include <stdexcept>
     #include <string>
     #include <vector>
@@ Expand Down Expand Up @@
     template <typename TypedNfaState, typename TypedDfaState>
     void Lexer<TypedNfaState, TypedDfaState>::generate() {
         for (auto const& rule : m_rules) {
-            for (auto const* capture : rule.get_captures()) {
-                std::string const capture_name{capture->get_name()};
-                if (m_symbol_id.contains(capture_name)) {
-                    throw std::invalid_argument(
-                            "`m_rules` contains capture names that are not unique."
-                    );
+            auto const rule_id{rule.get_variable_id()};
+            if (false == rule.get_captures().empty()) {
+                auto& captures_vec{m_rule_id_to_capture.try_emplace(rule_id).first->second};
+                for (auto const* capture : rule.get_captures()) {
+                    captures_vec.push_back(capture);
                 }
-                auto const capture_id{m_symbol_id.size()};
-                m_symbol_id.emplace(capture_name, capture_id);
-                m_id_symbol.emplace(capture_id, capture_name);
-                auto const rule_id{rule.get_variable_id()};
-                m_rule_id_to_capture_ids.try_emplace(rule_id);
-                m_rule_id_to_capture_ids.at(rule_id).push_back(capture_id);
             }
         }
         finite_automata::Nfa<TypedNfaState> nfa{m_rules};
         for (auto const& [capture, tag_id_pair] : nfa.get_capture_to_tag_id_pair()) {
-            std::string const capture_name{capture->get_name()};
-            auto const capture_id{m_symbol_id.at(capture_name)};
-            m_capture_id_to_tag_id_pair.emplace(capture_id, tag_id_pair);
+            m_capture_to_tag_id_pair.emplace(capture, tag_id_pair);
         }
         m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState, TypedNfaState>>(nfa);
@@ Expand Down @@

src/log_surgeon/LogEvent.cpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -65,16 +65,16 @@ auto LogEventView::get_logtype() const -> std::string {
  
                {

                    logtype += token_view.release_delimiter();

                }

                if (auto const& optional_capture_ids{

                            m_log_parser.m_lexer.get_capture_ids_from_rule_id(rule_id)

                if (auto const& optional_captures{

                            m_log_parser.m_lexer.get_captures_from_rule_id(rule_id)

                    };

                    optional_capture_ids.has_value())

                    optional_captures.has_value())

                {

                    auto capture_view{token_view};

                    auto const& capture_ids{optional_capture_ids.value()};

                    for (auto const capture_id : capture_ids) {

                    auto const& captures{optional_captures.value()};

                    for (auto const capture : captures) {

                        auto const& optional_reg_id_pair{

                                m_log_parser.m_lexer.get_reg_ids_from_capture_id(capture_id)

                                m_log_parser.m_lexer.get_reg_ids_from_capture(capture)

                        };

                        if (false == optional_reg_id_pair.has_value()) {

                            continue;

    @@ -86,7 +86,7 @@ auto LogEventView::get_logtype() const -> std::string {
  
                                capture_view.get_reversed_reg_positions(optional_reg_id_pair->second)

                        };

                        auto capture_name{m_log_parser.get_id_symbol(capture_id)};

                        auto const& capture_name{capture->get_name()};

                        if (false == start_positions.empty() && -1 < start_positions[0]

                            && false == end_positions.empty() && -1 < end_positions[0])

                        {

src/log_surgeon/finite_automata/Capture.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -2,15 +2,14 @@ @@
     #define LOG_SURGEON_FINITE_AUTOMATA_CAPTURE
     #include <string>
-    #include <string_view>
     #include <utility>
     namespace log_surgeon::finite_automata {
     class Capture {
     public:
         explicit Capture(std::string name) : m_name{std::move(name)} {}
-        [[nodiscard]] auto get_name() const -> std::string_view { return m_name; }
+        [[nodiscard]] auto get_name() const -> std::string const& { return m_name; }
     private:
         std::string m_name;
@@ Expand Down @@

src/log_surgeon/finite_automata/Nfa.hpp

-Original file line number
+Diff line change
@@ -1,7 +1,6 @@
     #ifndef LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP
     #define LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP
-    #include <cstddef>
     #include <cstdint>
     #include <memory>
     #include <optional>
@@ Expand Down @@

src/log_surgeon/types.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -4,7 +4,6 @@ @@
     #include <cstdint>
     namespace log_surgeon {
-    using capture_id_t = uint32_t;
     using reg_id_t = uint32_t;
     using rule_id_t = uint32_t;
     using tag_id_t = uint32_t;
@@ Expand Down @@

tests/test-buffer-parser.cpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -17,33 +17,33 @@
  
    #include <fmt/format.h>

    using log_surgeon::BufferParser;

    using log_surgeon::capture_id_t;

    using log_surgeon::ErrorCode;

    using log_surgeon::finite_automata::PrefixTree;

    using log_surgeon::rule_id_t;

    using log_surgeon::Schema;

    using log_surgeon::SymbolId;

    using std::pair;

    using std::string;

    using std::string_view;

    using std::unordered_map;

    using std::vector;

    namespace {

    struct CapturePositions {

        std::vector<PrefixTree::position_t> m_start_positions;

        std::vector<PrefixTree::position_t> m_end_positions;

        vector<PrefixTree::position_t> m_start_positions;

        vector<PrefixTree::position_t> m_end_positions;

    };

    struct ExpectedToken {

        std::string_view m_raw_string;

        std::string m_type;

        std::map<string, CapturePositions> m_captures;

        string_view m_raw_string;

        string m_type;

        vector<pair<string, CapturePositions>> m_captures;

    };

    struct ExpectedEvent {

        std::string_view m_logtype;

        std::string_view m_timestamp_raw;

        std::vector<ExpectedToken> m_tokens;

        string_view m_logtype;

        string_view m_timestamp_raw;

        vector<ExpectedToken> m_tokens;

    };

    /**

    @@ -58,8 +58,8 @@ struct ExpectedEvent {
  
     */

    auto parse_and_validate(

            BufferParser& buffer_parser,

            std::string_view input,

            std::vector<ExpectedEvent> const& expected_events

            string_view input,

            vector<ExpectedEvent> const& expected_events

    ) -> void;

    /**

    @@ -70,8 +70,8 @@ auto parse_and_validate(
  
    auto parse_and_validate(

            BufferParser& buffer_parser,

            std::string_view input,

            std::vector<ExpectedEvent> const& expected_events

            string_view input,

            vector<ExpectedEvent> const& expected_events

    ) -> void {

        buffer_parser.reset();

    @@ -121,29 +121,32 @@ auto parse_and_validate(
  
                if (false == expected_captures.empty()) {

                    auto const& lexer{buffer_parser.get_log_parser().m_lexer};

                    auto optional_capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};

                    REQUIRE(optional_capture_ids.has_value());

                    auto optional_captures{lexer.get_captures_from_rule_id(token_type)};

                    REQUIRE(optional_captures.has_value());

                    if (false == optional_capture_ids.has_value()) {

                    if (false == optional_captures.has_value()) {

                        return;

                    }

                    for (auto const capture_id : optional_capture_ids.value()) {

                        auto const capture_name{lexer.m_id_symbol.at(capture_id)};

                        REQUIRE(expected_captures.contains(capture_name));

                        auto optional_reg_ids{lexer.get_reg_ids_from_capture_id(capture_id)};

                    REQUIRE(expected_captures.size() == optional_captures.value().size());

                    for (uint32_t j{0}; j < optional_captures.value().size(); j++) {

                        auto const capture{optional_captures.value()[j]};

                        auto const [expected_name, expected_positions]{expected_captures[j]};

                        REQUIRE(expected_name == capture->get_name());

                        auto optional_reg_ids{lexer.get_reg_ids_from_capture(capture)};

                        REQUIRE(optional_reg_ids.has_value());

                        if (false == optional_reg_ids.has_value()) {

                            return;

                        }

                        auto const [start_reg_id, end_reg_id]{optional_reg_ids.value()};

                        auto const actual_start_positions{

                                token.get_reversed_reg_positions(start_reg_id)

                        };

                        auto actual_start_positions{token.get_reversed_reg_positions(start_reg_id)};

                        auto const actual_end_positions{token.get_reversed_reg_positions(end_reg_id)};

                        auto const [expected_start_positions, expected_end_positions]{

                                expected_captures.at(capture_name)

                                expected_positions

                        };

                        // Note: Known bug that start positions contain failed match starts as well, so

                        // currently it must be truncated.

                        actual_start_positions.resize(actual_end_positions.size());

                        REQUIRE(expected_start_positions == actual_start_positions);

                        REQUIRE(expected_end_positions == actual_end_positions);

                    }

    @@ -1066,3 +1069,77 @@ TEST_CASE("multi_capture_two", "[BufferParser]") {
  
        parse_and_validate(buffer_parser, cInput, {expected_event});

    }

    /**

     * @ingroup test_buffer_parser_capture

     * @brief Tests a multi-capture with non-unique names.

     *

     * This test verifies that a buffer_parser with multiple capture rules with non-unique capture names

     * can be generated and used correctly.

     *

     * ### Schema Definition

     * @code

     * delimiters: \n\r[:,

     * var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123)

     * var2:(?<capture>[A-Za-z]+123) text text

     * @endcode

     *

     * ### Input Example

     * @code

     * "Log is myCapture123 text anotherCapture123 and then another variable is capture123 text text"

     * @endcode

     *

     * ### Expected Logtype

     * @code

     * "Log is <capture> text <capture> and then another variable is <capture> text text"

     * @endcode

     *

     * ### Expected Tokenization

     * @code

     * "Log" -> uncaught string

     * " is" -> uncaught string

     * " myCapture123 text anotherCapture123" -> "var1"

     * " and" -> uncaught string

     * " then" -> uncaught string

     * " another" -> uncaught string

     * " variable" -> uncaught string

     * " is" -> uncaught string

     * " capture123 text text" -> "var2"

     * @endcode

     */

    // Checks that a schema whose rules reuse the capture name "capture" (twice inside var1 and once
    // more in var2) can be built into a BufferParser and that parsing cInput yields expected_event.
    TEST_CASE("multi_capture_non_unique_names", "[BufferParser]") {

        constexpr string_view cDelimitersSchema{R"(delimiters: \n\r[:,)"};

        // var1 repeats the capture name within a single rule; var2 reuses it in a second rule.
        constexpr string_view cVar1{R"(var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123))"};

        constexpr string_view cVar2{R"(var2:(?<capture>[A-Za-z]+123) text text)"};

        constexpr string_view cInput{"Log is myCapture123 text anotherCapture123 and then another "

                                     "variable is capture123 text text"};

        // Each expected raw string below includes its leading space.
        // Captures are given as {name, {{start positions}, {end positions}}}; the positions are
        // offsets into cInput, with each end being one past the capture's last character (e.g.
        // "myCapture123" occupies offsets 7-18, so its end is 19).
        // The captures of a token are listed in order because parse_and_validate compares them
        // index by index against the captures the lexer reports for the rule.
        ExpectedEvent const expected_event{

                .m_logtype{"Log is <capture> text <capture> and then another variable is <capture> "

                           "text text"},

                .m_timestamp_raw{""},

                .m_tokens{

                        {{"Log", "", {}},

                         {" is", "", {}},

                         {" myCapture123 text anotherCapture123", "var1",

                          {{{"capture", {{7}, {19}}},

                            {"capture", {{25}, {42}}}}}},

                         {" and", "", {}},

                         {" then", "", {}},

                         {" another", "", {}},

                         {" variable", "", {}},

                         {" is", "", {}},

                         {" capture123 text text", "var2", {{{"capture", {{72}, {82}}}}}}}

                }

        };

        // Build the schema from the delimiter line and both variable rules, then hand its AST to
        // the parser under test.
        Schema schema;

        schema.add_delimiters(cDelimitersSchema);

        schema.add_variable(cVar1, -1);

        schema.add_variable(cVar2, -1);

        BufferParser buffer_parser{std::move(schema.release_schema_ast_ptr())};

        parse_and_validate(buffer_parser, cInput, {expected_event});

    }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: Support non-unique capture names (fixes #180). #193

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

Uh oh!