From e0ff4434a937ef0e29f6d5e676a1e9873ef4ff98 Mon Sep 17 00:00:00 2001 From: Tim Stack Date: Sat, 24 Jun 2023 23:12:29 -0700 Subject: [PATCH] [logfile] report utf error location Related to #1156 --- src/base/is_utf8.hh | 2 +- src/line_buffer.cc | 9 +++++---- src/line_buffer.hh | 4 ++-- src/lnav.cc | 16 ++++++++++++++++ src/logfile.cc | 37 +++++++++++++++++++++++++++++-------- test/drive_data_scanner.cc | 2 +- test/drive_line_buffer.cc | 5 +++-- 7 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/base/is_utf8.hh b/src/base/is_utf8.hh index ad25819b..56a959f5 100644 --- a/src/base/is_utf8.hh +++ b/src/base/is_utf8.hh @@ -37,7 +37,7 @@ struct utf8_scan_result { const char* usr_message{nullptr}; size_t usr_faulty_bytes{0}; - string_fragment usr_valid_frag; + string_fragment usr_valid_frag{string_fragment::invalid()}; nonstd::optional usr_remaining; bool usr_has_ansi{false}; diff --git a/src/line_buffer.cc b/src/line_buffer.cc index e9c05d94..f370c023 100644 --- a/src/line_buffer.cc +++ b/src/line_buffer.cc @@ -1084,8 +1084,7 @@ line_buffer::load_next_line(file_range prev_line) if (lf != nullptr) { lf -= 1; } - retval.li_valid_utf = scan_res.is_valid(); - retval.li_has_ansi = scan_res.usr_has_ansi; + retval.li_utf8_scan_result = scan_res; } auto got_new_data = old_retval_size != retval.li_file_range.fr_size; @@ -1174,8 +1173,10 @@ line_buffer::load_next_line(file_range prev_line) (int) retval.li_partial); #endif - retval.li_file_range.fr_metadata.m_has_ansi = retval.li_has_ansi; - retval.li_file_range.fr_metadata.m_valid_utf = retval.li_valid_utf; + retval.li_file_range.fr_metadata.m_has_ansi + = retval.li_utf8_scan_result.usr_has_ansi; + retval.li_file_range.fr_metadata.m_valid_utf + = retval.li_utf8_scan_result.is_valid(); return Ok(retval); } diff --git a/src/line_buffer.hh b/src/line_buffer.hh index ecd9e6c8..e0d32185 100644 --- a/src/line_buffer.hh +++ b/src/line_buffer.hh @@ -45,6 +45,7 @@ #include "base/auto_fd.hh" #include "base/auto_mem.hh" 
#include "base/file_range.hh" +#include "base/is_utf8.hh" #include "base/lnav_log.hh" #include "base/result.h" #include "safe/safe.h" @@ -53,8 +54,7 @@ struct line_info { file_range li_file_range; bool li_partial{false}; - bool li_valid_utf{true}; - bool li_has_ansi{false}; + utf8_scan_result li_utf8_scan_result{}; }; /** diff --git a/src/lnav.cc b/src/lnav.cc index 6388a2ce..36623ab8 100644 --- a/src/lnav.cc +++ b/src/lnav.cc @@ -3205,6 +3205,22 @@ SELECT tbl_name FROM sqlite_master WHERE sql LIKE 'CREATE VIRTUAL TABLE%' return EXIT_FAILURE; } + for (const auto& lf : lnav_data.ld_active_files.fc_files) { + for (const auto& note : lf->get_notes()) { + switch (note.first) { + case logfile::note_type::not_utf: { + auto um = lnav::console::user_message::error( + note.second); + lnav::console::print(stderr, um); + break; + } + + default: + break; + } + } + } + for (auto& pair : cmd_results) { if (pair.first.isErr()) { lnav::console::print(stderr, pair.first.unwrapErr()); diff --git a/src/logfile.cc b/src/logfile.cc index d057cf2a..32aa196f 100644 --- a/src/logfile.cc +++ b/src/logfile.cc @@ -380,8 +380,9 @@ logfile::process_prefix(shared_buffer_ref& sbr, auto& last_line = this->lf_index.back(); last_line.set_valid_utf(last_line.is_valid_utf() - && li.li_valid_utf); - last_line.set_has_ansi(last_line.has_ansi() || li.li_has_ansi); + && li.li_utf8_scan_result.is_valid()); + last_line.set_has_ansi(last_line.has_ansi() + || li.li_utf8_scan_result.usr_has_ansi); } if (prescan_size > 0 && this->lf_index.size() >= prescan_size && prescan_time != this->lf_index[prescan_size - 1].get_time()) @@ -437,8 +438,8 @@ logfile::process_prefix(shared_buffer_ref& sbr, last_level, last_mod, last_opid); - this->lf_index.back().set_valid_utf(li.li_valid_utf); - this->lf_index.back().set_has_ansi(li.li_has_ansi); + this->lf_index.back().set_valid_utf(li.li_utf8_scan_result.is_valid()); + this->lf_index.back().set_has_ansi(li.li_utf8_scan_result.usr_has_ansi); } return retval; @@ -608,19 
+609,37 @@ logfile::rebuild_index(nonstd::optional deadline) } prev_range = li.li_file_range; - if (!this->lf_options.loo_non_utf_is_visible && !li.li_valid_utf) { + if (this->lf_format == nullptr + && !this->lf_options.loo_non_utf_is_visible + && !li.li_utf8_scan_result.is_valid()) + { log_info("file is not utf, hiding: %s", this->lf_filename.c_str()); this->lf_indexing = false; this->lf_options.loo_is_visible = false; + auto note_text = fmt::format( + FMT_STRING("not indexing non-UTF-8 file -- line: " + "{}; column: {}; error: {}"), + this->lf_index.size() + 1, + li.li_utf8_scan_result.usr_valid_frag.sf_end, + li.li_utf8_scan_result.usr_message); this->lf_notes.writeAccess()->emplace(note_type::not_utf, - "hiding non-UTF-8 file"); + note_text); if (this->lf_logfile_observer != nullptr) { this->lf_logfile_observer->logfile_indexing( this->shared_from_this(), 0, 0); } break; } + if (this->lf_format != nullptr + && !li.li_utf8_scan_result.is_valid()) + { + log_warning("%s: invalid UTF-8 detected at %zu:%d -- %s", + this->lf_filename.c_str(), + this->lf_index.size() + 1, + li.li_utf8_scan_result.usr_valid_frag.sf_end, + li.li_utf8_scan_result.usr_message); + } size_t old_size = this->lf_index.size(); @@ -641,7 +660,7 @@ logfile::rebuild_index(nonstd::optional deadline) .unwrapOr(text_format_t::TF_UNKNOWN); log_debug("setting text format to %d", this->lf_text_format); } - if (!li.li_valid_utf + if (!li.li_utf8_scan_result.is_valid() && this->lf_text_format != text_format_t::TF_MARKDOWN && this->lf_text_format != text_format_t::TF_LOG) { @@ -661,7 +680,9 @@ logfile::rebuild_index(nonstd::optional deadline) auto sbr = read_result.unwrap(); sbr.rtrim(is_line_ending); - if (li.li_valid_utf && li.li_has_ansi) { + if (li.li_utf8_scan_result.is_valid() + && li.li_utf8_scan_result.usr_has_ansi) + { sbr.erase_ansi(); } diff --git a/test/drive_data_scanner.cc b/test/drive_data_scanner.cc index f159c431..a7a0fb71 100644 --- a/test/drive_data_scanner.cc +++ 
b/test/drive_data_scanner.cc @@ -157,9 +157,9 @@ main(int argc, char* argv[]) auto& root_formats = log_format::get_root_formats(); std::vector>::iterator iter; - std::vector index; if (is_log) { + std::vector index; logfile_open_options loo; auto open_res = logfile::open(argv[lpc], loo); auto lf = open_res.unwrap(); diff --git a/test/drive_line_buffer.cc b/test/drive_line_buffer.cc index 75acba36..b283b9c4 100644 --- a/test/drive_line_buffer.cc +++ b/test/drive_line_buffer.cc @@ -167,14 +167,15 @@ main(int argc, char* argv[]) auto sbr = read_result.unwrap(); - if (!li.li_valid_utf) { + if (!li.li_utf8_scan_result.is_valid()) { scrub_to_utf8(sbr.get_writable_data(), sbr.length()); } printf("%.*s", (int) sbr.length(), sbr.get_data()); if ((off_t) (li.li_file_range.fr_offset + li.li_file_range.fr_size) - < offset) { + < offset) + { printf("\n"); } last_range = li.li_file_range;