Skip to content

Commit e4e3540

Browse files
committed
Merge branch 'main' into test-reader-parser
2 parents 682e871 + b1c4fea commit e4e3540

File tree

10 files changed

+190
-95
lines changed

10 files changed

+190
-95
lines changed

src/log_surgeon/BufferParser.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ BufferParser::parse_next_event(char* buf, size_t size, size_t& offset, bool fini
3232
if (0 != m_log_parser.get_log_event_view().m_log_output_buffer->pos()) {
3333
offset = m_log_parser.get_log_event_view()
3434
.m_log_output_buffer->get_token(0)
35-
.m_start_pos;
35+
.get_start_pos();
3636
}
3737
reset();
3838
return error_code;

src/log_surgeon/Lalr1Parser.tpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ namespace {
2020
MatchedSymbol& curr_symbol = symbols.top();
2121
std::visit(
2222
Overloaded{
23-
[&line_num](Token& token) { line_num = token.m_line; },
23+
[&line_num](Token& token) { line_num = token.get_line_num(); },
2424
[&symbols](NonTerminal& m) {
2525
for (size_t i{0}; i < m.get_production()->m_body.size(); ++i) {
2626
symbols.push(m.move_symbol(i));
@@ -548,7 +548,9 @@ template <typename TypedNfaState, typename TypedDfaState>
548548
auto Lalr1Parser<TypedNfaState, TypedDfaState>::get_input_until_next_newline(Token* error_token)
549549
-> std::string {
550550
std::string rest_of_line;
551-
bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd);
551+
bool next_is_end_token{
552+
error_token->get_type_ids()->at(0) == static_cast<uint32_t>(SymbolId::TokenEnd)
553+
};
552554
bool next_has_newline = (error_token->to_string().find('\n') != std::string::npos)
553555
|| (error_token->to_string().find('\r') != std::string::npos);
554556
while (!next_has_newline && !next_is_end_token) {
@@ -557,7 +559,8 @@ auto Lalr1Parser<TypedNfaState, TypedDfaState>::get_input_until_next_newline(Tok
557559
|| (token.to_string().find('\r') != std::string::npos);
558560
if (!next_has_newline) {
559561
rest_of_line += token.to_string();
560-
next_is_end_token = (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd);
562+
next_is_end_token
563+
= token.get_type_ids()->at(0) == static_cast<uint32_t>(SymbolId::TokenEnd);
561564
}
562565
}
563566
rest_of_line += "\n";
@@ -581,7 +584,9 @@ auto Lalr1Parser<TypedNfaState, TypedDfaState>::report_error() -> std::string {
581584
error_indicator += " ";
582585
}
583586
error_indicator += "^\n";
584-
if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd && consumed_input.empty()) {
587+
if (token.get_type_ids()->at(0) == static_cast<uint32_t>(SymbolId::TokenEnd)
588+
&& consumed_input.empty())
589+
{
585590
error_type = "empty file";
586591
error_indicator = "^\n";
587592
} else {
@@ -667,7 +672,7 @@ auto Lalr1Parser<TypedNfaState, TypedDfaState>::get_next_symbol() -> Token {
667672
template <typename TypedNfaState, typename TypedDfaState>
668673
auto Lalr1Parser<TypedNfaState, TypedDfaState>::parse_advance(Token& next_token, bool* accept)
669674
-> bool {
670-
for (auto const type : *next_token.m_type_ids_ptr) {
675+
for (auto const type : *next_token.get_type_ids()) {
671676
if (parse_symbol(type, next_token, accept)) {
672677
return *accept;
673678
}
@@ -718,12 +723,12 @@ auto Lalr1Parser<TypedNfaState, TypedDfaState>::parse_symbol(
718723
m_parse_stack_matches.pop();
719724
}
720725
if (reduce->m_semantic_rule != nullptr) {
721-
if (0 == m_next_token->m_start_pos) {
726+
if (0 == m_next_token->get_start_pos()) {
722727
m_input_buffer.set_consumed_pos(
723728
m_input_buffer.storage().size() - 1
724729
);
725730
} else {
726-
m_input_buffer.set_consumed_pos(m_next_token->m_start_pos - 1);
731+
m_input_buffer.set_consumed_pos(m_next_token->get_start_pos() - 1);
727732
}
728733
matched_non_terminal.set_parser_ast(
729734
reduce->m_semantic_rule(&matched_non_terminal)

src/log_surgeon/Lexer.tpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer)
6060
input_buffer.storage().size(),
6161
m_match_line,
6262
m_type_ids,
63-
std::move(m_dfa->release_reg_handler())
63+
m_dfa->release_reg_handler()
6464
};
6565
return {ErrorCode::Success, token};
6666
}
@@ -141,7 +141,7 @@ auto Lexer<TypedNfaState, TypedDfaState>::scan(ParserInputBuffer& input_buffer)
141141
input_buffer.storage().size(),
142142
m_match_line,
143143
m_type_ids,
144-
std::move(m_dfa->release_reg_handler())
144+
m_dfa->release_reg_handler()
145145
};
146146
return {ErrorCode::Success, token};
147147
}

src/log_surgeon/LogEvent.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,14 @@ auto LogEventView::get_logtype() const -> std::string {
5656
}
5757
for (uint32_t i{1}; i < m_log_output_buffer->pos(); ++i) {
5858
auto token_view{m_log_output_buffer->get_mutable_token(i)};
59-
auto const rule_id{token_view.m_type_ids_ptr->at(0)};
59+
auto const rule_id{token_view.get_type_ids()->at(0)};
6060
if (static_cast<uint32_t>(SymbolId::TokenUncaughtString) == rule_id) {
6161
logtype += token_view.to_string_view();
6262
} else {
6363
bool const is_first_token{false == m_log_output_buffer->has_timestamp() && 1 == i};
6464
if (static_cast<uint32_t>(SymbolId::TokenNewline) != rule_id && false == is_first_token)
6565
{
66-
logtype += token_view.get_delimiter();
67-
token_view.m_start_pos++;
66+
logtype += token_view.release_delimiter();
6867
}
6968
if (auto const& optional_capture_ids{
7069
m_log_parser.m_lexer.get_capture_ids_from_rule_id(rule_id)
@@ -91,13 +90,13 @@ auto LogEventView::get_logtype() const -> std::string {
9190
if (false == start_positions.empty() && -1 < start_positions[0]
9291
&& false == end_positions.empty() && -1 < end_positions[0])
9392
{
94-
capture_view.m_end_pos = start_positions[0];
93+
capture_view.set_end_pos(start_positions[0]);
9594
logtype.append(capture_view.to_string_view());
9695
logtype.append("<" + capture_name + ">");
97-
capture_view.m_start_pos = end_positions[0];
96+
capture_view.set_start_pos(end_positions[0]);
9897
}
9998
}
100-
capture_view.m_end_pos = token_view.m_end_pos;
99+
capture_view.set_end_pos(token_view.get_end_pos());
101100
logtype.append(capture_view.to_string_view());
102101
} else {
103102
logtype += "<" + m_log_parser.get_id_symbol(rule_id) + ">";
@@ -140,14 +139,14 @@ LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()}
140139
m_buffer.data(),
141140
buffer_size,
142141
0,
143-
token.m_type_ids_ptr
142+
token.get_type_ids()
144143
};
145144
m_log_output_buffer->set_curr_token(copied_token);
146145
m_log_output_buffer->advance_to_next_token();
147146
}
148147
for (uint32_t i = 0; i < get_log_output_buffer()->pos(); i++) {
149148
Token& token = get_log_output_buffer()->get_mutable_token(i);
150-
auto const& token_types = *token.m_type_ids_ptr;
149+
auto const& token_types{*token.get_type_ids()};
151150
add_token(token_types[0], &token);
152151
}
153152
}

src/log_surgeon/LogParser.cpp

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -137,39 +137,37 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode {
137137
}
138138
next_token = optional_next_token.value();
139139
if (false == output_buffer->has_timestamp()
140-
&& next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenNewlineTimestamp)
140+
&& next_token.get_type_ids()->at(0)
141+
== static_cast<uint32_t>(SymbolId::TokenNewlineTimestamp))
141142
{
142143
// TODO: combine the below with found_start_of_next_message
143144
// into 1 function
144145
// Increment by 1 because the '\n' character is not part of the
145146
// next log message
146147
m_start_of_log_message = next_token;
147-
if (m_start_of_log_message.m_start_pos == m_start_of_log_message.m_buffer_size - 1)
148-
{
149-
m_start_of_log_message.m_start_pos = 0;
150-
} else {
151-
m_start_of_log_message.m_start_pos++;
152-
}
148+
m_start_of_log_message.increment_start_pos();
153149
// make a message with just the '\n' character
154-
next_token.m_end_pos = next_token.m_start_pos + 1;
155-
next_token.m_type_ids_ptr
156-
= &Lexer<ByteNfaState, ByteDfaState>::cTokenUncaughtStringTypes;
150+
next_token.set_end_pos(next_token.get_next_pos());
151+
next_token.set_type_ids(
152+
&Lexer<ByteNfaState, ByteDfaState>::cTokenUncaughtStringTypes
153+
);
157154
output_buffer->set_token(1, next_token);
158155
output_buffer->set_pos(2);
159-
m_input_buffer.set_consumed_pos(next_token.m_start_pos);
156+
m_input_buffer.set_consumed_pos(next_token.get_start_pos());
160157
m_has_start_of_log = true;
161158
parsing_action = ParsingAction::Compress;
162159
return ErrorCode::Success;
163160
}
164161
}
165-
if (next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd) {
162+
if (next_token.get_type_ids()->at(0) == static_cast<uint32_t>(SymbolId::TokenEnd)) {
166163
output_buffer->set_token(0, next_token);
167164
output_buffer->set_pos(1);
168165
parsing_action = ParsingAction::CompressAndFinish;
169166
return ErrorCode::Success;
170167
}
171-
if (next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenFirstTimestamp
172-
|| next_token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenNewlineTimestamp)
168+
if (next_token.get_type_ids()->at(0) == static_cast<uint32_t>(SymbolId::TokenFirstTimestamp)
169+
|| next_token.get_type_ids()->at(0)
170+
== static_cast<uint32_t>(SymbolId::TokenNewlineTimestamp))
173171
{
174172
output_buffer->set_has_timestamp(true);
175173
output_buffer->set_token(0, next_token);
@@ -189,11 +187,11 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode {
189187
}
190188
Token next_token{optional_next_token.value()};
191189
output_buffer->set_curr_token(next_token);
192-
auto token_type = next_token.m_type_ids_ptr->at(0);
190+
auto token_type{next_token.get_type_ids()->at(0)};
193191
bool found_start_of_next_message
194192
= (output_buffer->has_timestamp()
195193
&& token_type == (uint32_t)SymbolId::TokenNewlineTimestamp)
196-
|| (!output_buffer->has_timestamp() && next_token.get_char(0) == '\n'
194+
|| (false == output_buffer->has_timestamp() && next_token.get_delimiter() == "\n"
197195
&& token_type != (uint32_t)SymbolId::TokenNewline);
198196
if (token_type == (uint32_t)SymbolId::TokenEnd) {
199197
parsing_action = ParsingAction::CompressAndFinish;
@@ -202,7 +200,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode {
202200
if (false == output_buffer->has_timestamp()
203201
&& token_type == (uint32_t)SymbolId::TokenNewline)
204202
{
205-
m_input_buffer.set_consumed_pos(output_buffer->get_curr_token().m_end_pos);
203+
m_input_buffer.set_consumed_pos(output_buffer->get_curr_token().get_end_pos());
206204
output_buffer->advance_to_next_token();
207205
parsing_action = ParsingAction::Compress;
208206
return ErrorCode::Success;
@@ -211,22 +209,13 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode {
211209
// increment by 1 because the '\n' character is not part of the next
212210
// log message
213211
m_start_of_log_message = output_buffer->get_curr_token();
214-
if (m_start_of_log_message.m_start_pos == m_start_of_log_message.m_buffer_size - 1) {
215-
m_start_of_log_message.m_start_pos = 0;
216-
} else {
217-
m_start_of_log_message.m_start_pos++;
218-
}
212+
auto const consumed_pos{m_start_of_log_message.increment_start_pos()};
219213
// make the last token of the current message the '\n' character
220214
Token curr_token = output_buffer->get_curr_token();
221-
curr_token.m_end_pos = curr_token.m_start_pos + 1;
222-
curr_token.m_type_ids_ptr
223-
= &Lexer<ByteNfaState, ByteDfaState>::cTokenUncaughtStringTypes;
215+
curr_token.set_end_pos(curr_token.get_next_pos());
216+
curr_token.set_type_ids(&Lexer<ByteNfaState, ByteDfaState>::cTokenUncaughtStringTypes);
224217
output_buffer->set_curr_token(curr_token);
225-
if (0 == m_start_of_log_message.m_start_pos) {
226-
m_input_buffer.set_consumed_pos(m_input_buffer.storage().size() - 1);
227-
} else {
228-
m_input_buffer.set_consumed_pos(m_start_of_log_message.m_start_pos - 1);
229-
}
218+
m_input_buffer.set_consumed_pos(consumed_pos);
230219
m_has_start_of_log = true;
231220
output_buffer->advance_to_next_token();
232221
parsing_action = ParsingAction::Compress;
@@ -255,7 +244,7 @@ auto LogParser::generate_log_event_view_metadata() -> void {
255244
uint32_t first_newline_pos{0};
256245
for (uint32_t i = start; i < m_log_event_view->m_log_output_buffer->pos(); i++) {
257246
Token* token = &m_log_event_view->m_log_output_buffer->get_mutable_token(i);
258-
m_log_event_view->add_token(token->m_type_ids_ptr->at(0), token);
247+
m_log_event_view->add_token(token->get_type_ids()->at(0), token);
259248
if (token->get_delimiter() == "\n" && first_newline_pos == 0) {
260249
first_newline_pos = i;
261250
}

src/log_surgeon/SchemaParser.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ static auto schema_var_rule(NonTerminal* m) -> unique_ptr<SchemaVarAST> {
132132
return make_unique<SchemaVarAST>(
133133
identifier_ast.m_name,
134134
std::move(m->non_terminal_cast(3).get_parser_ast().get<unique_ptr<RegexASTByte>>()),
135-
m->token_cast(2).m_line
135+
m->token_cast(2).get_line_num()
136136
);
137137
}
138138

src/log_surgeon/Token.cpp

Lines changed: 57 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,78 @@
11
#include "Token.hpp"
22

3-
#include <algorithm>
3+
#include <cstddef>
44
#include <string>
55
#include <string_view>
66

77
namespace log_surgeon {
8-
auto Token::to_string() -> std::string {
9-
if (m_start_pos <= m_end_pos) {
10-
return {m_buffer + m_start_pos, m_buffer + m_end_pos};
11-
}
12-
if (m_wrap_around_string.empty()) {
13-
m_wrap_around_string = std::string{m_buffer + m_start_pos, m_buffer + m_buffer_size}
14-
+ std::string{m_buffer, m_buffer + m_end_pos};
8+
auto Token::get_cached_string() -> std::string const& {
9+
if (m_cached_string.empty()) {
10+
if (get_start_pos() <= get_end_pos()) {
11+
auto const token{m_buffer.subspan(get_start_pos(), get_end_pos() - get_start_pos())};
12+
m_cached_string = std::string{token.begin(), token.end()};
13+
} else {
14+
auto const token_start{
15+
m_buffer.subspan(get_start_pos(), get_buffer_size() - get_start_pos())
16+
};
17+
auto const token_end{m_buffer.subspan(0, get_end_pos())};
18+
m_cached_string = std::string{token_start.begin(), token_start.end()}
19+
+ std::string{token_end.begin(), token_end.end()};
20+
}
1521
}
16-
return {m_wrap_around_string};
22+
return m_cached_string;
23+
}
24+
25+
// Returns the token's text as an owning std::string, copied from the string
// reference that get_cached_string() returns.
auto Token::to_string() -> std::string {
26+
return {get_cached_string()};
1727
}
1828

1929
auto Token::to_string_view() -> std::string_view {
20-
if (m_start_pos <= m_end_pos) {
21-
return {m_buffer + m_start_pos, m_end_pos - m_start_pos};
22-
}
23-
if (m_wrap_around_string.empty()) {
24-
m_wrap_around_string = std::string{m_buffer + m_start_pos, m_buffer + m_buffer_size}
25-
+ std::string{m_buffer, m_buffer + m_end_pos};
30+
if (get_start_pos() <= get_end_pos()) {
31+
auto const token{m_buffer.subspan(get_start_pos(), get_end_pos() - get_start_pos())};
32+
return {token.begin(), token.end()};
2633
}
27-
return {m_wrap_around_string};
34+
return {get_cached_string()};
35+
}
36+
37+
// Returns a one-character string holding the element of m_buffer located at
// the token's start position. The token itself is not modified.
// NOTE(review): assumes get_start_pos() is a valid index into m_buffer — confirm.
auto Token::get_delimiter() const -> std::string {
38+
auto const delim{m_buffer.subspan(get_start_pos(), 1)};
39+
return {delim.begin(), delim.end()};
2840
}
2941

30-
auto Token::get_char(uint8_t i) const -> char {
31-
if (m_start_pos + i < m_buffer_size) {
32-
return m_buffer[m_start_pos + i];
42+
auto Token::get_length() const -> size_t {
43+
if (get_start_pos() <= get_end_pos()) {
44+
return get_end_pos() - get_start_pos();
3345
}
34-
return m_buffer[i - (m_buffer_size - m_start_pos)];
46+
return get_buffer_size() - get_start_pos() + get_end_pos();
3547
}
3648

37-
auto Token::get_delimiter() const -> std::string {
38-
return {m_buffer + m_start_pos, m_buffer + m_start_pos + 1};
49+
// Reads the element of m_buffer at the token's start position, then advances
// the start position with increment_start_pos(), and returns the element that
// was read. After the call the token no longer begins at that element.
auto Token::release_delimiter() -> char {
50+
// Capture the value first; the start position changes on the next statement.
auto const delim{m_buffer[get_start_pos()]};
51+
increment_start_pos();
52+
return delim;
53+
}
54+
55+
// Sets the token's start position to `pos`.
// m_cached_string is cleared first: get_cached_string() only rebuilds the
// string when it is empty, so clearing makes the next call rebuild it from the
// new positions.
auto Token::set_start_pos(size_t pos) -> void {
56+
m_cached_string.clear();
57+
m_start_pos = pos;
58+
}
59+
60+
// Sets the token's end position to `pos`.
// m_cached_string is cleared first: get_cached_string() only rebuilds the
// string when it is empty, so clearing makes the next call rebuild it from the
// new positions.
auto Token::set_end_pos(size_t pos) -> void {
61+
m_cached_string.clear();
62+
m_end_pos = pos;
63+
}
64+
65+
// Moves the token's start position to the value returned by get_next_pos()
// and returns the start position the token had BEFORE the move.
// The update goes through set_start_pos(), so m_cached_string is cleared too.
auto Token::increment_start_pos() -> size_t {
66+
auto const old_start_pos{get_start_pos()};
67+
set_start_pos(get_next_pos());
68+
return old_start_pos;
3969
}
4070

41-
auto Token::get_length() const -> uint32_t {
42-
if (m_start_pos <= m_end_pos) {
43-
return m_end_pos - m_start_pos;
71+
auto Token::get_next_pos() const -> size_t {
72+
auto next_pos{get_start_pos() + 1};
73+
if (next_pos == get_buffer_size()) {
74+
next_pos = 0;
4475
}
45-
return m_buffer_size - m_start_pos + m_end_pos;
76+
return next_pos;
4677
}
4778
} // namespace log_surgeon

0 commit comments

Comments
 (0)