Skip to content

Commit f9d40ee

Browse files
committed
[logfile] build content map for compressed files
1 parent 306d1db commit f9d40ee

File tree

4 files changed

+147
-26
lines changed

4 files changed

+147
-26
lines changed

src/line_buffer.cc

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,9 @@ line_buffer::load_next_buffer()
709709
count = BZ2_bzread(bz_file,
710710
scratch,
711711
std::min((size_t) seek_to, sizeof(scratch)));
712+
if (count <= 0) {
713+
break;
714+
}
712715
seek_to -= count;
713716
}
714717
rc = BZ2_bzread(bz_file,
@@ -1224,7 +1227,8 @@ line_buffer::load_next_line(file_range prev_line)
12241227
if (start_iter != this->lb_line_starts.end()) {
12251228
auto next_line_iter = start_iter + 1;
12261229

1227-
// log_debug("found offset %d %d", buffer_offset, *start_iter);
1230+
// log_debug("found offset %d %d", buffer_offset,
1231+
// *start_iter);
12281232
if (next_line_iter != this->lb_line_starts.end()) {
12291233
auto start_index = std::distance(
12301234
this->lb_line_starts.begin(), start_iter);
@@ -1432,7 +1436,8 @@ line_buffer::read_range(file_range fr, scan_direction dir)
14321436
}
14331437

14341438
Result<auto_buffer, std::string>
1435-
line_buffer::peek_range(file_range fr)
1439+
line_buffer::peek_range(file_range fr,
1440+
lnav::enums::bitset<peek_options> options)
14361441
{
14371442
static const std::string SHORT_READ_MSG = "short read";
14381443

@@ -1448,7 +1453,9 @@ line_buffer::peek_range(file_range fr)
14481453
if (rc == -1) {
14491454
return Err(lnav::from_errno().message());
14501455
}
1451-
if (rc != fr.fr_size) {
1456+
if (!options.is_set<peek_options::allow_short_read>()
1457+
&& rc != fr.fr_size)
1458+
{
14521459
return Err(SHORT_READ_MSG);
14531460
}
14541461
buf.resize(rc);
@@ -1466,7 +1473,13 @@ line_buffer::peek_range(file_range fr)
14661473
return Err(lnav::from_errno().message());
14671474
}
14681475
if (rc != fr.fr_size) {
1469-
return Err(SHORT_READ_MSG);
1476+
this->lb_file_size = gi->strm.total_out;
1477+
log_info("fd(%d): set file size to %llu",
1478+
this->lb_fd.get(),
1479+
this->lb_file_size);
1480+
if (!options.is_set<peek_options::allow_short_read>()) {
1481+
return Err(SHORT_READ_MSG);
1482+
}
14701483
}
14711484
buf.resize(rc);
14721485
return Ok(std::move(buf));
@@ -1503,6 +1516,9 @@ line_buffer::peek_range(file_range fr)
15031516
count = BZ2_bzread(bz_file,
15041517
scratch,
15051518
std::min((size_t) seek_to, sizeof(scratch)));
1519+
if (count <= 0) {
1520+
break;
1521+
}
15061522
seek_to -= count;
15071523
}
15081524
auto rc = BZ2_bzread(bz_file, buf.data(), fr.fr_size);
@@ -1513,7 +1529,14 @@ line_buffer::peek_range(file_range fr)
15131529
return Err(lnav::from_errno().message());
15141530
}
15151531
if (rc != fr.fr_size) {
1516-
return Err(SHORT_READ_MSG);
1532+
if (options.is_set<peek_options::allow_short_read>()) {
1533+
this->lb_file_size = fr.fr_offset + fr.fr_size;
1534+
log_info("fd(%d): set file size to %llu",
1535+
this->lb_fd.get(),
1536+
this->lb_file_size);
1537+
} else {
1538+
return Err(SHORT_READ_MSG);
1539+
}
15171540
}
15181541
buf.resize(rc);
15191542
return Ok(std::move(buf));
@@ -1524,7 +1547,7 @@ line_buffer::peek_range(file_range fr)
15241547
if (rc == -1) {
15251548
return Err(lnav::from_errno().message());
15261549
}
1527-
if (rc != fr.fr_size) {
1550+
if (!options.is_set<peek_options::allow_short_read>() && rc != fr.fr_size) {
15281551
return Err(SHORT_READ_MSG);
15291552
}
15301553
buf.resize(rc);

src/line_buffer.hh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,12 @@ public:
213213
Result<shared_buffer_ref, std::string> read_range(
214214
file_range fr, scan_direction dir = scan_direction::forward);
215215

216-
Result<auto_buffer, std::string> peek_range(file_range fr);
216+
enum class peek_options : uint8_t {
217+
allow_short_read,
218+
};
219+
220+
Result<auto_buffer, std::string> peek_range(
221+
file_range fr, lnav::enums::bitset<peek_options> options = {});
217222

218223
file_range get_available();
219224

src/logfile.cc

Lines changed: 111 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -482,16 +482,112 @@ logfile::build_content_map()
482482
static auto op = lnav_operation{"build_content_map"};
483483

484484
auto op_guard = lnav_opid_guard::internal(op);
485+
486+
log_info("%s: trying to build content map",
487+
this->lf_filename_as_string.c_str());
488+
if (this->lf_line_buffer.is_compressed()) {
489+
auto skip_size = file_off_t{512 * 1024};
490+
auto read_size = file_ssize_t{64 * 1024};
491+
pattern_locks line_locks;
492+
scan_batch_context sbc_tmp{
493+
this->lf_allocator,
494+
line_locks,
495+
};
496+
497+
auto peek_range = file_range{
498+
0,
499+
read_size,
500+
};
501+
while (true) {
502+
auto last_peek = peek_range;
503+
peek_range.fr_offset += skip_size;
504+
log_debug(" content map peek %lld:%lld",
505+
peek_range.fr_offset,
506+
peek_range.fr_size);
507+
auto peek_res = this->lf_line_buffer.peek_range(
508+
peek_range,
509+
{
510+
line_buffer::peek_options::allow_short_read,
511+
});
512+
if (peek_res.isErr()) {
513+
log_error(" content map peek failed -- %s",
514+
peek_res.unwrapErr().c_str());
515+
break;
516+
}
517+
518+
auto buf = peek_res.unwrap();
519+
if (buf.empty()) {
520+
log_info("file size %lld",
521+
this->lf_line_buffer.get_file_size());
522+
if (this->lf_line_buffer.get_file_size() == -1) {
523+
log_info("skipped past end, reversing");
524+
skip_size = peek_range.fr_size;
525+
peek_range = last_peek;
526+
continue;
527+
}
528+
log_info(" reached end of file");
529+
break;
530+
}
531+
auto buf_sf = to_string_fragment(buf);
532+
auto split_res = buf_sf.split_pair(string_fragment::tag1{'\n'});
533+
if (!split_res) {
534+
log_warning(" cannot find start of line at %lld",
535+
peek_range.fr_offset);
536+
continue;
537+
}
538+
539+
auto [_junk, line_start_sf] = split_res.value();
540+
while (!line_start_sf.empty()) {
541+
auto utf8_res = is_utf8(line_start_sf, '\n');
542+
if (!utf8_res.usr_remaining) {
543+
log_warning(" cannot find end of line at %lld",
544+
peek_range.fr_offset + line_start_sf.sf_begin);
545+
break;
546+
}
547+
auto line_len = utf8_res.remaining_ptr() - line_start_sf.data();
548+
shared_buffer tmp_sb;
549+
shared_buffer_ref tmp_sbr;
550+
551+
tmp_sbr.share(tmp_sb, line_start_sf.data(), line_len);
552+
553+
auto map_line_fr = file_range{
554+
peek_range.fr_offset + line_start_sf.sf_begin,
555+
line_len,
556+
};
557+
map_line_fr.fr_metadata.m_has_ansi = utf8_res.usr_has_ansi;
558+
map_line_fr.fr_metadata.m_valid_utf = utf8_res.is_valid();
559+
auto map_li = line_info{map_line_fr};
560+
map_li.li_utf8_scan_result = utf8_res;
561+
std::vector<logline> tmp_index;
562+
auto scan_res = this->lf_format->scan(
563+
*this, tmp_index, map_li, tmp_sbr, sbc_tmp);
564+
if (scan_res.is<log_format::scan_match>()) {
565+
auto line_time = tmp_index.front()
566+
.get_time<std::chrono::microseconds>();
567+
this->lf_content_map.emplace_back(content_map_entry{
568+
map_line_fr,
569+
line_time,
570+
});
571+
log_info(" adding content map entry %lld - %s",
572+
map_line_fr.fr_offset,
573+
lnav::to_rfc3339_string(line_time).c_str());
574+
if (skip_size < 1024 * 1024 * 1024) {
575+
skip_size *= 2;
576+
}
577+
break;
578+
}
579+
line_start_sf = utf8_res.usr_remaining.value();
580+
}
581+
}
582+
}
583+
485584
auto retval = rebuild_result_t::NO_NEW_LINES;
486585
auto full_size = this->get_content_size();
487586

488587
this->lf_lower_bound_entry = std::nullopt;
489588
this->lf_upper_bound_entry = std::nullopt;
490-
this->lf_file_size_at_map_time = full_size;
491589

492-
log_info("%s: finding content layout (full_size=%lld)",
493-
this->lf_filename_as_string.c_str(),
494-
full_size);
590+
log_info(" finding content layout (full_size=%lld)", full_size);
495591
if (this->lf_options.loo_time_range.has_lower_bound()
496592
&& this->lf_options.loo_time_range.tr_begin
497593
> this->lf_index.front().get_time<std::chrono::microseconds>()
@@ -506,17 +602,15 @@ logfile::build_content_map()
506602
file_range{first_line_offset, full_size - first_line_offset},
507603
ll->get_time<std::chrono::microseconds>(),
508604
};
509-
log_info("%s: lower bound is within current index, erasing %ld lines",
510-
this->lf_filename_as_string.c_str(),
605+
log_info(" lower bound is within current index, erasing %ld lines",
511606
std::distance(this->lf_index.cbegin(), ll));
512607
this->lf_index_size = first_line_offset;
513608
this->lf_index.clear();
514609
retval = rebuild_result_t::NEW_ORDER;
515610
}
516611

517612
if (this->lf_index_size == full_size) {
518-
log_trace("%s: file has already been scanned, no need to peek",
519-
this->lf_filename_as_string.c_str());
613+
log_trace(" file has already been scanned, no need to peek");
520614
const auto& last_line = this->lf_index.back();
521615
auto last_line_offset = last_line.get_offset();
522616
this->lf_upper_bound_entry = content_map_entry{
@@ -527,29 +621,29 @@ logfile::build_content_map()
527621
&& this->lf_options.loo_time_range.tr_begin
528622
> this->lf_index.back().get_time<std::chrono::microseconds>())
529623
{
530-
log_info("%s: lower bound is past content",
531-
this->lf_filename_as_string.c_str());
624+
log_info(" lower bound is past content");
532625
this->lf_index.clear();
533626
retval = rebuild_result_t::NEW_ORDER;
534627
}
628+
this->lf_file_size_at_map_time = full_size;
535629
return retval;
536630
}
537631

538632
auto end_entry_opt
539633
= this->find_content_map_entry(full_size, map_read_upper_bound{});
540634
if (!end_entry_opt.is<map_entry_found>()) {
541635
log_warning(
542-
"%s: skipping content map since the last message could not be "
543-
"found",
544-
this->get_filename().c_str());
636+
" skipping content map since the last message could not be "
637+
"found");
545638
return retval;
546639
}
547640

548641
auto end_entry = end_entry_opt.get<map_entry_found>().mef_entry;
549-
log_info("found content end: %llu %s",
642+
log_info(" found content end: %llu %s",
550643
end_entry.cme_range.fr_offset,
551644
lnav::to_rfc3339_string(to_timeval(end_entry.cme_time)).c_str());
552645
this->lf_upper_bound_entry = end_entry;
646+
this->lf_file_size_at_map_time = full_size;
553647

554648
if (this->lf_options.loo_time_range.has_lower_bound()) {
555649
if (this->lf_options.loo_time_range.tr_begin > end_entry.cme_time) {
@@ -560,8 +654,7 @@ logfile::build_content_map()
560654
.get_time<std::chrono::microseconds>())
561655
{
562656
auto offset = full_size / 2;
563-
log_debug("%s: searching for lower bound %lld",
564-
this->lf_filename_as_string.c_str(),
657+
log_debug(" searching for lower bound %lld",
565658
this->lf_options.loo_time_range.tr_begin.count());
566659
auto low_entry_opt = this->find_content_map_entry(
567660
offset,
@@ -570,7 +663,7 @@ logfile::build_content_map()
570663
});
571664
if (low_entry_opt.is<map_entry_found>()) {
572665
auto low_entry = low_entry_opt.get<map_entry_found>().mef_entry;
573-
log_info("found content start: %llu %s",
666+
log_info(" found content start: %llu %s",
574667
low_entry.cme_range.fr_offset,
575668
lnav::to_rfc3339_string(to_timeval(low_entry.cme_time))
576669
.c_str());
@@ -1368,8 +1461,7 @@ logfile::rebuild_index(std::optional<ui_clock::time_point> deadline)
13681461
} else {
13691462
limit = 100;
13701463
}
1371-
} else if (!this->lf_line_buffer.is_compressed()
1372-
&& this->lf_options.loo_detect_format
1464+
} else if (this->lf_options.loo_detect_format
13731465
&& (!has_format || !this->lf_upper_bound_entry))
13741466
{
13751467
limit = 1000;

src/logfile.hh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ private:
603603
std::chrono::microseconds cme_time;
604604
};
605605
file_size_t lf_file_size_at_map_time{0};
606+
std::vector<content_map_entry> lf_content_map;
606607
std::optional<content_map_entry> lf_lower_bound_entry;
607608
std::optional<content_map_entry> lf_upper_bound_entry;
608609
std::optional<file_size_t> lf_upper_bound_size;

0 commit comments

Comments
 (0)