Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 21 additions & 19 deletions src/log_surgeon/Lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

#include <array>
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <set>
Expand All @@ -13,6 +12,7 @@
#include <vector>

#include <log_surgeon/Constants.hpp>
#include <log_surgeon/finite_automata/Capture.hpp>
#include <log_surgeon/finite_automata/Dfa.hpp>
#include <log_surgeon/finite_automata/DfaState.hpp>
#include <log_surgeon/finite_automata/NfaState.hpp>
Expand Down Expand Up @@ -166,29 +166,29 @@ class Lexer {
}

/**
* Retrieves a list of capture IDs for a given rule ID.
* These capture IDs correspond to the captures in the rule that were matched during lexing.
* Retrieves a list of capture pointers for a given rule ID.
* These pointers correspond to the captures in the rule that were matched during lexing.
* @param rule_id The ID of the rule to search for captures.
* @return A vector of capture IDs if the rule contains captures;
* @return A vector of capture pointers if the rule contains captures;
* @return std::nullopt if no captures are found for the rule.
*/
[[nodiscard]] auto get_capture_ids_from_rule_id(rule_id_t const rule_id) const
-> std::optional<std::vector<capture_id_t>> {
if (m_rule_id_to_capture_ids.contains(rule_id)) {
return m_rule_id_to_capture_ids.at(rule_id);
[[nodiscard]] auto get_captures_from_rule_id(rule_id_t const rule_id) const
-> std::optional<std::vector<finite_automata::Capture const*>> {
if (m_rule_id_to_capture.contains(rule_id)) {
return m_rule_id_to_capture.at(rule_id);
}
return std::nullopt;
}

/**
* @param capture_id ID associated with a capture within a rule.
* @param capture Pointer to the capture.
* @return The start and end tag of the capture on success.
* @return std::nullopt if no capture is associated with the given capture ID.
* @return std::nullopt if no tags are associated with the given capture.
*/
[[nodiscard]] auto get_tag_id_pair_from_capture_id(capture_id_t const capture_id) const
[[nodiscard]] auto get_tag_id_pair_from_capture(finite_automata::Capture const* const capture) const
-> std::optional<std::pair<tag_id_t, tag_id_t>> {
if (m_capture_id_to_tag_id_pair.contains(capture_id)) {
return m_capture_id_to_tag_id_pair.at(capture_id);
if (m_capture_to_tag_id_pair.contains(capture)) {
return m_capture_to_tag_id_pair.at(capture);
}
return std::nullopt;
}
Expand All @@ -209,14 +209,14 @@ class Lexer {
}

/**
* Retrieves the register IDs for the start and end tags associated with a given capture ID.
* @param capture_id The ID of the capture to search for.
* Retrieves the register IDs for the start and end tags associated with a given capture.
* @param capture Pointer to the capture to search for.
* @return A pair of register IDs corresponding to the start and end tags of the capture.
* @return std::nullopt if no such capture is found.
*/
[[nodiscard]] auto get_reg_ids_from_capture_id(capture_id_t const capture_id) const
[[nodiscard]] auto get_reg_ids_from_capture(finite_automata::Capture const* const capture) const
-> std::optional<std::pair<reg_id_t, reg_id_t>> {
auto const optional_tag_id_pair{get_tag_id_pair_from_capture_id(capture_id)};
auto const optional_tag_id_pair{get_tag_id_pair_from_capture(capture)};
if (false == optional_tag_id_pair.has_value()) {
return std::nullopt;
}
Expand Down Expand Up @@ -265,8 +265,10 @@ class Lexer {
bool m_asked_for_more_data{false};
TypedDfaState const* m_prev_state{nullptr};
TypedDfaState const* m_state{nullptr};
std::unordered_map<rule_id_t, std::vector<capture_id_t>> m_rule_id_to_capture_ids;
std::unordered_map<capture_id_t, std::pair<tag_id_t, tag_id_t>> m_capture_id_to_tag_id_pair;
std::unordered_map<rule_id_t, std::vector<finite_automata::Capture const*>>
m_rule_id_to_capture;
std::unordered_map<finite_automata::Capture const*, std::pair<tag_id_t, tag_id_t>>
m_capture_to_tag_id_pair;
};

namespace lexers {
Expand Down
23 changes: 6 additions & 17 deletions src/log_surgeon/Lexer.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <cassert>
#include <memory>
#include <stack>
#include <stdexcept>
#include <string>
#include <vector>

Expand Down Expand Up @@ -438,28 +437,18 @@ auto Lexer<TypedNfaState, TypedDfaState>::get_highest_priority_rule(rule_id_t co
template <typename TypedNfaState, typename TypedDfaState>
void Lexer<TypedNfaState, TypedDfaState>::generate() {
for (auto const& rule : m_rules) {
for (auto const* capture : rule.get_captures()) {
std::string const capture_name{capture->get_name()};
if (m_symbol_id.contains(capture_name)) {
throw std::invalid_argument(
"`m_rules` contains capture names that are not unique."
);
auto const rule_id{rule.get_variable_id()};
if (false == rule.get_captures().empty()) {
auto& captures_vec{m_rule_id_to_capture.try_emplace(rule_id).first->second};
for (auto const* capture : rule.get_captures()) {
captures_vec.push_back(capture);
}
auto const capture_id{m_symbol_id.size()};
m_symbol_id.emplace(capture_name, capture_id);
m_id_symbol.emplace(capture_id, capture_name);

auto const rule_id{rule.get_variable_id()};
m_rule_id_to_capture_ids.try_emplace(rule_id);
m_rule_id_to_capture_ids.at(rule_id).push_back(capture_id);
}
}

finite_automata::Nfa<TypedNfaState> nfa{m_rules};
for (auto const& [capture, tag_id_pair] : nfa.get_capture_to_tag_id_pair()) {
std::string const capture_name{capture->get_name()};
auto const capture_id{m_symbol_id.at(capture_name)};
m_capture_id_to_tag_id_pair.emplace(capture_id, tag_id_pair);
m_capture_to_tag_id_pair.emplace(capture, tag_id_pair);
}

m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState, TypedNfaState>>(nfa);
Expand Down
14 changes: 7 additions & 7 deletions src/log_surgeon/LogEvent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,16 +65,16 @@ auto LogEventView::get_logtype() const -> std::string {
{
logtype += token_view.release_delimiter();
}
if (auto const& optional_capture_ids{
m_log_parser.m_lexer.get_capture_ids_from_rule_id(rule_id)
if (auto const& optional_captures{
m_log_parser.m_lexer.get_captures_from_rule_id(rule_id)
};
optional_capture_ids.has_value())
optional_captures.has_value())
{
auto capture_view{token_view};
auto const& capture_ids{optional_capture_ids.value()};
for (auto const capture_id : capture_ids) {
auto const& captures{optional_captures.value()};
for (auto const capture : captures) {
auto const& optional_reg_id_pair{
m_log_parser.m_lexer.get_reg_ids_from_capture_id(capture_id)
m_log_parser.m_lexer.get_reg_ids_from_capture(capture)
};
if (false == optional_reg_id_pair.has_value()) {
continue;
Expand All @@ -86,7 +86,7 @@ auto LogEventView::get_logtype() const -> std::string {
capture_view.get_reversed_reg_positions(optional_reg_id_pair->second)
};

auto capture_name{m_log_parser.get_id_symbol(capture_id)};
auto const& capture_name{capture->get_name()};
if (false == start_positions.empty() && -1 < start_positions[0]
&& false == end_positions.empty() && -1 < end_positions[0])
{
Expand Down
3 changes: 1 addition & 2 deletions src/log_surgeon/finite_automata/Capture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
#define LOG_SURGEON_FINITE_AUTOMATA_CAPTURE

#include <string>
#include <string_view>
#include <utility>

namespace log_surgeon::finite_automata {
class Capture {
public:
explicit Capture(std::string name) : m_name{std::move(name)} {}

[[nodiscard]] auto get_name() const -> std::string_view { return m_name; }
[[nodiscard]] auto get_name() const -> std::string const& { return m_name; }

private:
std::string m_name;
Expand Down
1 change: 0 additions & 1 deletion src/log_surgeon/finite_automata/Nfa.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#ifndef LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP
#define LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
Expand Down
1 change: 0 additions & 1 deletion src/log_surgeon/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <cstdint>

namespace log_surgeon {
using capture_id_t = uint32_t;
using reg_id_t = uint32_t;
using rule_id_t = uint32_t;
using tag_id_t = uint32_t;
Expand Down
125 changes: 101 additions & 24 deletions tests/test-buffer-parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,33 @@
#include <fmt/format.h>

using log_surgeon::BufferParser;
using log_surgeon::capture_id_t;
using log_surgeon::ErrorCode;
using log_surgeon::finite_automata::PrefixTree;
using log_surgeon::rule_id_t;
using log_surgeon::Schema;
using log_surgeon::SymbolId;
using std::pair;
using std::string;
using std::string_view;
using std::unordered_map;
using std::vector;

namespace {
struct CapturePositions {
std::vector<PrefixTree::position_t> m_start_positions;
std::vector<PrefixTree::position_t> m_end_positions;
vector<PrefixTree::position_t> m_start_positions;
vector<PrefixTree::position_t> m_end_positions;
};

struct ExpectedToken {
std::string_view m_raw_string;
std::string m_type;
std::map<string, CapturePositions> m_captures;
string_view m_raw_string;
string m_type;
vector<pair<string, CapturePositions>> m_captures;
};

struct ExpectedEvent {
std::string_view m_logtype;
std::string_view m_timestamp_raw;
std::vector<ExpectedToken> m_tokens;
string_view m_logtype;
string_view m_timestamp_raw;
vector<ExpectedToken> m_tokens;
};

/**
Expand All @@ -58,8 +58,8 @@ struct ExpectedEvent {
*/
auto parse_and_validate(
BufferParser& buffer_parser,
std::string_view input,
std::vector<ExpectedEvent> const& expected_events
string_view input,
vector<ExpectedEvent> const& expected_events
) -> void;

/**
Expand All @@ -70,8 +70,8 @@ auto parse_and_validate(

auto parse_and_validate(
BufferParser& buffer_parser,
std::string_view input,
std::vector<ExpectedEvent> const& expected_events
string_view input,
vector<ExpectedEvent> const& expected_events
) -> void {
buffer_parser.reset();

Expand Down Expand Up @@ -121,29 +121,32 @@ auto parse_and_validate(

if (false == expected_captures.empty()) {
auto const& lexer{buffer_parser.get_log_parser().m_lexer};
auto optional_capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
REQUIRE(optional_capture_ids.has_value());
auto optional_captures{lexer.get_captures_from_rule_id(token_type)};
REQUIRE(optional_captures.has_value());

if (false == optional_capture_ids.has_value()) {
if (false == optional_captures.has_value()) {
return;
}

for (auto const capture_id : optional_capture_ids.value()) {
auto const capture_name{lexer.m_id_symbol.at(capture_id)};
REQUIRE(expected_captures.contains(capture_name));
auto optional_reg_ids{lexer.get_reg_ids_from_capture_id(capture_id)};
REQUIRE(expected_captures.size() == optional_captures.value().size());
for (uint32_t j{0}; j < optional_captures.value().size(); j++) {
auto const capture{optional_captures.value()[j]};
auto const [expected_name, expected_positions]{expected_captures[j]};
REQUIRE(expected_name == capture->get_name());
auto optional_reg_ids{lexer.get_reg_ids_from_capture(capture)};
REQUIRE(optional_reg_ids.has_value());
if (false == optional_reg_ids.has_value()) {
return;
}
auto const [start_reg_id, end_reg_id]{optional_reg_ids.value()};
auto const actual_start_positions{
token.get_reversed_reg_positions(start_reg_id)
};
auto actual_start_positions{token.get_reversed_reg_positions(start_reg_id)};
auto const actual_end_positions{token.get_reversed_reg_positions(end_reg_id)};
auto const [expected_start_positions, expected_end_positions]{
expected_captures.at(capture_name)
expected_positions
};
// Note: Known bug that start positions contain failed match starts as well, so
// currently it must be truncated.
actual_start_positions.resize(actual_end_positions.size());
REQUIRE(expected_start_positions == actual_start_positions);
REQUIRE(expected_end_positions == actual_end_positions);
}
Expand Down Expand Up @@ -1066,3 +1069,77 @@ TEST_CASE("multi_capture_two", "[BufferParser]") {

parse_and_validate(buffer_parser, cInput, {expected_event});
}

/**
* @ingroup test_buffer_parser_capture
* @brief Tests a multi-capture with non-unique names.
*
* This test verifies that a buffer_parser whose rules contain captures with non-unique names (both
* within a single rule and across rules) can be generated and used correctly.
*
* ### Schema Definition
* @code
* delimiters: \n\r[:,
* var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123)
* var2:(?<capture>[A-Za-z]+123) text text
* @endcode
*
* ### Input Example
* @code
* "Log is myCapture123 text anotherCapture123 and then another variable is capture123 text text"
* @endcode
*
* ### Expected Logtype
* @code
* "Log is <capture> text <capture> and then another variable is <capture> text text"
* @endcode
*
* ### Expected Tokenization
* @code
* "Log" -> uncaught string
* " is" -> uncaught string
* " myCapture123 text anotherCapture123" -> "var1"
* " and" -> uncaught string
* " then" -> uncaught string
* " another" -> uncaught string
* " variable" -> uncaught string
* " is" -> uncaught string
* " capture123 text text" -> "var2"
* @endcode
*/
TEST_CASE("multi_capture_non_unique_names", "[BufferParser]") {
    // Schema under test: the capture name "capture" is used twice in `var1` and once in `var2`.
    constexpr string_view cDelimitersSchema{R"(delimiters: \n\r[:,)"};
    constexpr string_view cVar1{R"(var1:(?<capture>[A-Za-z]+123) text (?<capture>[A-Za-z]+123))"};
    constexpr string_view cVar2{R"(var2:(?<capture>[A-Za-z]+123) text text)"};

    Schema schema;
    schema.add_delimiters(cDelimitersSchema);
    schema.add_variable(cVar1, -1);
    schema.add_variable(cVar2, -1);
    BufferParser buffer_parser{std::move(schema.release_schema_ast_ptr())};

    // Input and the logtype it is expected to produce.
    constexpr string_view cInput{"Log is myCapture123 text anotherCapture123 and then another "
                                 "variable is capture123 text text"};
    constexpr string_view cExpectedLogtype{
            "Log is <capture> text <capture> and then another variable is <capture> "
            "text text"
    };

    // Tokens that match no variable have an empty type and no captures.
    auto const uncaught_token = [](string_view const raw_string) -> ExpectedToken {
        return {raw_string, "", {}};
    };

    // Expected captures, listed in the order they appear in each rule. Each entry holds the
    // capture name together with its expected start and end positions.
    vector<pair<string, CapturePositions>> const var1_captures{
            {"capture", CapturePositions{.m_start_positions{7}, .m_end_positions{19}}},
            {"capture", CapturePositions{.m_start_positions{25}, .m_end_positions{42}}}
    };
    vector<pair<string, CapturePositions>> const var2_captures{
            {"capture", CapturePositions{.m_start_positions{72}, .m_end_positions{82}}}
    };

    vector<ExpectedToken> const expected_tokens{
            uncaught_token("Log"),
            uncaught_token(" is"),
            ExpectedToken{" myCapture123 text anotherCapture123", "var1", var1_captures},
            uncaught_token(" and"),
            uncaught_token(" then"),
            uncaught_token(" another"),
            uncaught_token(" variable"),
            uncaught_token(" is"),
            ExpectedToken{" capture123 text text", "var2", var2_captures}
    };

    ExpectedEvent const expected_event{
            .m_logtype = cExpectedLogtype,
            .m_timestamp_raw = "",
            .m_tokens = expected_tokens
    };

    parse_and_validate(buffer_parser, cInput, {expected_event});
}
Loading
Loading