Skip to content

Commit 193e1f9

Browse files
authored
feat: Add API for cleanly retrieving the capture group matches within a LogEvent's Token; Use the API in LogEventView::get_logtype to correctly only substitute leaf capture group matches. (#205)
1 parent 09f8a25 commit 193e1f9

9 files changed

Lines changed: 377 additions & 52 deletions

File tree

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ message(STATUS "Found fmt ${fmt_VERSION}.")
5252
find_package(Microsoft.GSL 4.0.0 REQUIRED)
5353
message(STATUS "Found Microsoft.GSL ${Microsoft.GSL_VERSION}.")
5454

55+
find_package(ystdlib 0.1.0 REQUIRED)
56+
message(STATUS "Found ystdlib ${ystdlib_VERSION}.")
57+
5558
if(log_surgeon_ENABLE_TESTS)
5659
find_package(Catch2 3.8.1 REQUIRED)
5760
message(STATUS "Found Catch2 ${Catch2_VERSION}.")
@@ -137,6 +140,7 @@ target_link_libraries(
137140
PUBLIC
138141
fmt::fmt
139142
Microsoft.GSL::GSL
143+
ystdlib::error_handling
140144
)
141145

142146
target_include_directories(

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ while (false == parser.done()) {
5555
if (ErrorCode err{parser.parse_next_event()}; ErrorCode::Success != err) {
5656
throw runtime_error("Parsing Failed");
5757
}
58+
LogEventView const& event{parser.get_log_parser().get_log_event_view()};
5859

5960
// Get and print the timestamp
6061
Token* timestamp{event.get_timestamp()};
@@ -63,7 +64,7 @@ while (false == parser.done()) {
6364
}
6465

6566
// Get and print the log-level
66-
auto const& loglevels = event.get_variables(*loglevel_id);
67+
auto const& loglevels{event.get_variables(*loglevel_id)};
6768
if (false == loglevels.empty()) {
6869
// In case there are multiple matches, just get the first one
6970
cout << "loglevel:" << loglevels[0]->to_string_view() << endl;
@@ -72,8 +73,7 @@ while (false == parser.done()) {
7273
// Other analysis...
7374

7475
// Print the entire event
75-
LogEventView const& event = parser.get_log_parser().get_log_event_view();
76-
cout << event->to_string() << endl;
76+
cout << event.to_string() << endl;
7777
}
7878
```
7979

@@ -91,6 +91,7 @@ Requirements:
9191
* [GSL] >= 4.0.0
9292
* [Task] >= 3.38
9393
* [uv] >= 0.7.10
94+
* [ystdlib-cpp] >= 0.1.0
9495

9596
To build and install the project to `$HOME/.local`:
9697

@@ -193,3 +194,4 @@ The following are issues we're aware of and working on:
193194
[GSL]: https://github.com/microsoft/GSL
194195
[Task]: https://taskfile.dev/
195196
[uv]: https://docs.astral.sh/uv
197+
[ystdlib-cpp]: https://github.com/y-scope/ystdlib-cpp

cmake/log_surgeon-config.cmake.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ if(@Microsoft.GSL_FOUND@)
1010
find_dependency(Microsoft.GSL)
1111
endif()
1212

13+
if(@ystdlib_FOUND@)
14+
find_dependency(ystdlib)
15+
endif()
16+
1317
set_and_check(log_surgeon_INCLUDE_DIR "@PACKAGE_LOG_SURGEON_INSTALL_INCLUDE_DIR@")
1418

1519
check_required_components(log_surgeon)

src/.clang-format

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ IncludeCategories:
99
Priority: 4
1010

1111
# External library headers. Update when adding new libraries.
12-
- Regex: "^<(fmt|gsl)"
12+
- Regex: "^<(fmt|gsl|ystdlib)"
1313
Priority: 3
1414

1515
# C system headers

src/log_surgeon/LogEvent.cpp

Lines changed: 120 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
11
#include "LogEvent.hpp"
22

3+
#include <cstddef>
34
#include <cstdint>
5+
#include <iterator>
46
#include <memory>
57
#include <optional>
8+
#include <set>
9+
#include <stdexcept>
610
#include <string>
711
#include <vector>
812

13+
#include <ystdlib/error_handling/ErrorCode.hpp>
14+
#include <ystdlib/error_handling/Result.hpp>
15+
916
#include <log_surgeon/Constants.hpp>
17+
#include <log_surgeon/finite_automata/Capture.hpp>
1018
#include <log_surgeon/LogParser.hpp>
1119
#include <log_surgeon/LogParserOutputBuffer.hpp>
1220
#include <log_surgeon/Token.hpp>
@@ -57,51 +65,104 @@ auto LogEventView::get_logtype() const -> std::string {
5765
auto token_view{m_log_output_buffer->get_mutable_token(i)};
5866
auto const rule_id{token_view.get_type_ids()->at(0)};
5967
if (static_cast<uint32_t>(SymbolId::TokenUncaughtString) == rule_id) {
60-
logtype += token_view.to_string_view();
68+
logtype.append(token_view.to_string_view());
69+
continue;
70+
}
71+
72+
bool is_first_token{};
73+
if (m_log_output_buffer->has_header()) {
74+
is_first_token = 0 == i;
6175
} else {
62-
bool is_first_token;
63-
if (m_log_output_buffer->has_header()) {
64-
is_first_token = 0 == i;
65-
} else {
66-
is_first_token = 1 == i;
67-
}
68-
if (static_cast<uint32_t>(SymbolId::TokenNewline) != rule_id && false == is_first_token)
69-
{
70-
logtype += token_view.release_delimiter();
71-
}
72-
auto const& optional_captures{m_log_parser.m_lexer.get_captures_from_rule_id(rule_id)};
73-
if (optional_captures.has_value()) {
74-
auto capture_view{token_view};
75-
auto const& captures{optional_captures.value()};
76-
for (auto const capture : captures) {
77-
auto const [reg_start_id, reg_end_id]{
78-
m_log_parser.m_lexer.get_reg_ids_from_capture(capture)
79-
};
80-
auto const start_positions{
81-
capture_view.get_reversed_reg_positions(reg_start_id)
82-
};
83-
auto const end_positions{capture_view.get_reversed_reg_positions(reg_end_id)};
84-
85-
auto const& capture_name{capture->get_name()};
86-
if (false == start_positions.empty() && -1 < start_positions[0]
87-
&& false == end_positions.empty() && -1 < end_positions[0])
88-
{
89-
capture_view.set_end_pos(start_positions[0]);
90-
logtype.append(capture_view.to_string_view());
91-
logtype.append("<" + capture_name + ">");
92-
capture_view.set_start_pos(end_positions[0]);
93-
}
94-
}
95-
capture_view.set_end_pos(token_view.get_end_pos());
96-
logtype.append(capture_view.to_string_view());
97-
} else {
98-
logtype += "<" + m_log_parser.get_id_symbol(rule_id) + ">";
76+
is_first_token = 1 == i;
77+
}
78+
if (static_cast<uint32_t>(SymbolId::TokenNewline) != rule_id && false == is_first_token) {
79+
logtype += token_view.release_delimiter();
80+
}
81+
82+
auto const matches{get_capture_matches(token_view)};
83+
if (matches.has_error()) {
84+
logtype.append("<" + m_log_parser.get_id_symbol(rule_id) + ">");
85+
continue;
86+
}
87+
auto prev_end_pos{token_view.get_start_pos()};
88+
for (auto const& match : matches.value()) {
89+
if (match.m_leaf) {
90+
logtype.append(
91+
token_view.get_sub_token(prev_end_pos, match.m_pos.m_start).to_string_view()
92+
);
93+
logtype.append("<" + match.m_capture->get_name() + ">");
94+
prev_end_pos = match.m_pos.m_end;
9995
}
10096
}
97+
logtype.append(
98+
token_view.get_sub_token(prev_end_pos, token_view.get_end_pos()).to_string_view()
99+
);
101100
}
102101
return logtype;
103102
}
104103

104+
auto LogEventView::get_capture_matches(Token const& root_var) const
105+
-> ystdlib::error_handling::Result<std::vector<Token::CaptureMatch>> {
106+
auto captures{
107+
get_log_parser().m_lexer.get_captures_from_rule_id(root_var.get_type_ids()->at(0))
108+
};
109+
if (false == captures.has_value()) {
110+
return LogEventErrorCode{LogEventErrorCodeEnum::NoCaptureGroups};
111+
}
112+
113+
auto cmp{[](Token::CaptureMatch const& a, Token::CaptureMatch const& b) -> bool {
114+
if (a.m_pos.m_start != b.m_pos.m_start) {
115+
return a.m_pos.m_start < b.m_pos.m_start;
116+
}
117+
return a.m_pos.m_end > b.m_pos.m_end;
118+
}};
119+
std::set<Token::CaptureMatch, decltype(cmp)> ordered_matches;
120+
for (auto const* const capture : captures.value()) {
121+
auto position{get_capture_position(root_var, capture)};
122+
if (position.has_error()) {
123+
if (LogEventErrorCode{LogEventErrorCodeEnum::NoCaptureGroupMatch} == position.error()) {
124+
continue;
125+
}
126+
return position.error();
127+
}
128+
ordered_matches.emplace(capture, position.value(), true);
129+
}
130+
if (ordered_matches.empty()) {
131+
return {{}};
132+
}
133+
134+
std::vector<Token::CaptureMatch> matches;
135+
matches.reserve(ordered_matches.size());
136+
auto const last_match{std::prev(ordered_matches.end())};
137+
for (auto match{ordered_matches.begin()}; match != last_match; ++match) {
138+
auto next_match{std::next(match)};
139+
auto leaf{false};
140+
if (match->m_pos.m_end <= next_match->m_pos.m_start) {
141+
leaf = true;
142+
}
143+
matches.emplace_back(match->m_capture, match->m_pos, leaf);
144+
}
145+
matches.emplace_back(last_match->m_capture, last_match->m_pos, true);
146+
return matches;
147+
}
148+
149+
auto LogEventView::get_capture_position(
150+
Token const& root_var,
151+
finite_automata::Capture const* const& capture
152+
) const -> ystdlib::error_handling::Result<Token::CaptureMatchPosition> {
153+
auto const [start_reg_id, end_reg_id]{
154+
get_log_parser().m_lexer.get_reg_ids_from_capture(capture)
155+
};
156+
auto const start_positions{root_var.get_reversed_reg_positions(start_reg_id)};
157+
auto const end_positions{root_var.get_reversed_reg_positions(end_reg_id)};
158+
if (start_positions.empty() || 0 > start_positions[0] || end_positions.empty()
159+
|| 0 > end_positions[0])
160+
{
161+
return LogEventErrorCode{LogEventErrorCodeEnum::NoCaptureGroupMatch};
162+
}
163+
return {start_positions[0], end_positions[0]};
164+
}
165+
105166
LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()} {
106167
set_multiline(src.is_multiline());
107168
m_log_output_buffer->set_has_header(src.m_log_output_buffer->has_header());
@@ -147,3 +208,24 @@ LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()}
147208
}
148209
}
149210
} // namespace log_surgeon
211+
212+
using log_surgeon::LogEventErrorCodeEnum;
213+
214+
using LogEventErrorCategory = ystdlib::error_handling::ErrorCategory<LogEventErrorCodeEnum>;
215+
216+
template <>
217+
auto LogEventErrorCategory::name() const noexcept -> char const* {
218+
return "log_surgeon::LogEvent";
219+
}
220+
221+
template <>
222+
auto LogEventErrorCategory::message(LogEventErrorCodeEnum error_enum) const -> std::string {
223+
switch (error_enum) {
224+
case LogEventErrorCodeEnum::NoCaptureGroups:
225+
return "LogEvent NoCaptureGroups";
226+
case LogEventErrorCodeEnum::NoCaptureGroupMatch:
227+
return "LogEvent NoCaptureGroupMatch";
228+
default:
229+
return "Unrecognized LogEventErrorCode";
230+
}
231+
}

src/log_surgeon/LogEvent.hpp

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
#ifndef LOG_SURGEON_LOG_EVENT_HPP
22
#define LOG_SURGEON_LOG_EVENT_HPP
33

4+
#include <cstddef>
5+
#include <cstdint>
46
#include <memory>
7+
#include <optional>
58
#include <string>
69
#include <vector>
710

11+
#include <ystdlib/error_handling/ErrorCode.hpp>
12+
#include <ystdlib/error_handling/Result.hpp>
13+
14+
#include <log_surgeon/finite_automata/Capture.hpp>
815
#include <log_surgeon/LogParserOutputBuffer.hpp>
916
#include <log_surgeon/Token.hpp>
1017

@@ -93,13 +100,16 @@ class LogEventView {
93100
[[nodiscard]] auto to_string() const -> std::string;
94101

95102
/**
96-
* Constructs a user friendly/readable representation of the log event's
97-
* logtype. A logtype is essentially the static text of a log event with the
98-
* variable components replaced with their name. Therefore, two separate log
99-
* events from the same logging source code may have the same logtype.
103+
* Constructs a user friendly/readable representation of the log event's logtype. A logtype is
104+
* essentially the static text of a log event with the variable components replaced with their
105+
* name. Therefore, two separate log events from the same logging source code may have the same
106+
* logtype.
107+
*
108+
* If a schema variable may contain capture groups, any leaf capture group matches will be
109+
* replaced with their name while all other text is treated as static text.
100110
* @return The logtype of the log.
101111
*/
102-
auto get_logtype() const -> std::string;
112+
[[nodiscard]] auto get_logtype() const -> std::string;
103113

104114
/**
105115
* Adds a Token to the array of tokens of a particular token type.
@@ -115,6 +125,37 @@ class LogEventView {
115125
m_log_var_occurrences[token_type_id].push_back(token_ptr);
116126
}
117127

128+
/**
129+
* Retrieves the position of the match for `capture` within `root_var`. `root_var` must be the
130+
* root parent variable containing `capture`.
131+
* @param root_var The parent log surgeon schema variable for `capture`.
132+
* @param capture The capture group type.
133+
* @return A result containing a `Token::CaptureMatchPosition` on success, or an error code indicating the
134+
* failure:
135+
* - LogEventErrorCodeEnum::NoCaptureGroupMatch if `root_var` contains no valid match positions
136+
* for `capture`.
137+
*/
138+
[[nodiscard]] auto get_capture_position(
139+
Token const& root_var,
140+
finite_automata::Capture const* const& capture
141+
) const -> ystdlib::error_handling::Result<Token::CaptureMatchPosition>;
142+
143+
/**
144+
* Returns the capture group matches within `root_var` sorted by their appearance within the
145+
* text, with parent capture groups appearing before their children. More formally, they are
146+
* sorted by increasing start position and then by decreasing end position.
147+
*
148+
* Since capture groups can only overlap when nested and cannot span across the boundary of
149+
* another group, a capture group is a leaf if its end position is less than or equal to the start position of
150+
* the next capture group (or it is the last capture group).
151+
* @param root_var The root variable to get the capture groups from.
152+
* @return A result containing the sorted capture group matches (empty if no matches were
153+
* found), or an error code indicating the failure:
154+
* - LogEventErrorCodeEnum::NoCaptureGroups if no capture groups exist for `root_var`.
155+
*/
156+
[[nodiscard]] auto get_capture_matches(log_surgeon::Token const& root_var) const
157+
-> ystdlib::error_handling::Result<std::vector<Token::CaptureMatch>>;
158+
118159
// TODO: have LogParser own the output buffer as a LogEventView is already
119160
// tied to a single log parser
120161
std::unique_ptr<LogParserOutputBuffer> m_log_output_buffer;
@@ -143,6 +184,15 @@ class LogEvent : public LogEventView {
143184
private:
144185
std::vector<char> m_buffer;
145186
};
187+
188+
enum class LogEventErrorCodeEnum : uint8_t {
189+
NoCaptureGroups,
190+
NoCaptureGroupMatch
191+
};
192+
193+
using LogEventErrorCode = ystdlib::error_handling::ErrorCode<LogEventErrorCodeEnum>;
146194
} // namespace log_surgeon
147195

196+
YSTDLIB_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(log_surgeon::LogEventErrorCodeEnum);
197+
148198
#endif // LOG_SURGEON_LOG_EVENT_HPP

0 commit comments

Comments
 (0)