/** * Copyright (c) 2022, Timothy Stack * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Timothy Stack nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "document.sections.hh" #include "base/enum_util.hh" #include "base/itertools.hh" #include "base/lnav_log.hh" #include "base/opt_util.hh" #include "data_scanner.hh" namespace lnav { namespace document { nonstd::optional hier_node::lookup_child(section_key_t key) const { return make_optional_from_nullable(key.match( [this](const std::string& str) -> hier_node* { auto iter = this->hn_named_children.find(str); if (iter != this->hn_named_children.end()) { return iter->second; } return nullptr; }, [this](size_t index) -> hier_node* { if (index < this->hn_children.size()) { return this->hn_children[index].get(); } return nullptr; })); } nonstd::optional hier_node::lookup_path(const hier_node* root, const std::vector& path) { auto retval = make_optional_from_nullable(root); for (const auto& comp : path) { if (!retval) { break; } retval = retval.value()->lookup_child(comp); } if (!retval) { return nonstd::nullopt; } return retval; } struct metadata_builder { std::vector mb_intervals; std::vector mb_type_intervals; std::unique_ptr mb_root_node; std::set mb_indents; metadata to_metadata() && { return { std::move(this->mb_intervals), std::move(this->mb_root_node), std::move(this->mb_type_intervals), std::move(this->mb_indents), }; } }; static void discover_metadata_int(const attr_line_t& al, metadata_builder& mb) { const auto& orig_attrs = al.get_attrs(); auto headers = orig_attrs | lnav::itertools::filter_in([](const string_attr& attr) { if (attr.sa_type != &VC_ROLE) { return false; } auto role = attr.sa_value.get(); switch (role) { case role_t::VCR_H1: case role_t::VCR_H2: case role_t::VCR_H3: case role_t::VCR_H4: case role_t::VCR_H5: case role_t::VCR_H6: return true; default: return false; } }) | lnav::itertools::sort_by(&string_attr::sa_range); // Remove headers from quoted text for (const auto& orig_attr : orig_attrs) { if (orig_attr.sa_type == &VC_ROLE && orig_attr.sa_value.get() == role_t::VCR_QUOTED_TEXT) { remove_string_attr(headers, orig_attr.sa_range); } } auto& intervals = mb.mb_intervals; struct open_interval_t { open_interval_t(uint32_t level, file_off_t start, section_key_t id) : oi_level(level), oi_start(start), oi_id(std::move(id)) { } int32_t oi_level; file_off_t oi_start; section_key_t oi_id; std::unique_ptr oi_node{std::make_unique()}; }; std::vector open_intervals; auto root_node = std::make_unique(); for (const auto& hdr_attr : headers) { auto role = hdr_attr.sa_value.get(); auto role_num = lnav::enums::to_underlying(role) - lnav::enums::to_underlying(role_t::VCR_H1); std::vector new_open_intervals; for (auto& oi : open_intervals) { if (oi.oi_level >= role_num) { // close out this section intervals.emplace_back( oi.oi_start, hdr_attr.sa_range.lr_start - 1, oi.oi_id); auto* node_ptr = oi.oi_node.get(); auto* parent_node = oi.oi_node->hn_parent; if (parent_node != nullptr) { parent_node->hn_children.emplace_back( std::move(oi.oi_node)); parent_node->hn_named_children.insert({ oi.oi_id.get(), node_ptr, }); } } else { new_open_intervals.emplace_back(std::move(oi)); } } auto* parent_node = new_open_intervals.empty() ? root_node.get() : new_open_intervals.back().oi_node.get(); new_open_intervals.emplace_back(role_num, hdr_attr.sa_range.lr_start, al.get_substring(hdr_attr.sa_range)); new_open_intervals.back().oi_node->hn_parent = parent_node; new_open_intervals.back().oi_node->hn_start = hdr_attr.sa_range.lr_start; open_intervals = std::move(new_open_intervals); } for (auto& oi : open_intervals) { // close out this section intervals.emplace_back(oi.oi_start, al.length(), oi.oi_id); auto* node_ptr = oi.oi_node.get(); auto* parent_node = oi.oi_node->hn_parent; if (parent_node == nullptr) { root_node = std::move(oi.oi_node); } else { parent_node->hn_children.emplace_back(std::move(oi.oi_node)); parent_node->hn_named_children.insert({ oi.oi_id.get(), node_ptr, }); } } for (auto& interval : intervals) { auto start_off_iter = find_string_attr_containing( orig_attrs, &SA_ORIGIN_OFFSET, interval.start); if (start_off_iter != orig_attrs.end()) { interval.start += start_off_iter->sa_value.get(); } auto stop_off_iter = find_string_attr_containing( orig_attrs, &SA_ORIGIN_OFFSET, interval.stop - 1); if (stop_off_iter != orig_attrs.end()) { interval.stop += stop_off_iter->sa_value.get(); } } hier_node::depth_first(root_node.get(), [&orig_attrs](hier_node* node) { auto off_opt = get_string_attr(orig_attrs, &SA_ORIGIN_OFFSET, node->hn_start); if (off_opt) { node->hn_start += off_opt.value()->sa_value.get(); } }); if (!root_node->hn_children.empty() || !root_node->hn_named_children.empty()) { mb.mb_root_node = std::move(root_node); } } metadata discover_metadata(const attr_line_t& al) { metadata_builder mb; discover_metadata_int(al, mb); return std::move(mb).to_metadata(); } class structure_walker { public: explicit structure_walker(attr_line_t& al, line_range lr, text_format_t tf) : sw_line(al), sw_range(lr), sw_text_format(tf), sw_scanner(string_fragment::from_str_range( al.get_string(), lr.lr_start, lr.lr_end)) { this->sw_interval_state.resize(1); this->sw_hier_nodes.push_back(std::make_unique()); } metadata walk() { metadata_builder mb; size_t garbage_count = 0; while (garbage_count < 1000) { auto tokenize_res = this->sw_scanner.tokenize2(this->sw_text_format); if (!tokenize_res) { break; } auto dt = tokenize_res->tr_token; element el(tokenize_res->tr_token, tokenize_res->tr_capture); #if 0 log_debug("tok %s %s", data_scanner::token2name(dt), tokenize_res->to_string().c_str()); #endif if (dt != DT_WHITE) { this->sw_at_start = false; } switch (dt) { case DT_XML_DECL_TAG: case DT_XML_EMPTY_TAG: this->sw_values.emplace_back(el); break; case DT_COMMENT: this->sw_type_intervals.emplace_back( el.e_capture.c_begin, el.e_capture.c_end, section_types_t::comment); this->sw_line.get_attrs().emplace_back( line_range{ this->sw_range.lr_start + el.e_capture.c_begin, this->sw_range.lr_start + el.e_capture.c_end, }, VC_ROLE.value(role_t::VCR_COMMENT)); break; case DT_XML_OPEN_TAG: this->flush_values(); this->sw_interval_state.back().is_start = el.e_capture.c_begin; this->sw_interval_state.back().is_line_number = this->sw_line_number; this->sw_interval_state.back().is_name = tokenize_res->to_string(); this->sw_depth += 1; this->sw_interval_state.resize(this->sw_depth + 1); this->sw_hier_nodes.push_back( std::make_unique()); break; case DT_XML_CLOSE_TAG: { auto term = this->flush_values(); if (this->sw_depth > 0) { if (term) { this->append_child_node(term); } this->sw_interval_state.pop_back(); this->sw_hier_stage = std::move(this->sw_hier_nodes.back()); this->sw_hier_nodes.pop_back(); } this->append_child_node(el.e_capture); if (this->sw_depth > 0) { this->sw_depth -= 1; } this->flush_values(); break; } case DT_H1: { this->sw_line.get_attrs().emplace_back( line_range{ this->sw_range.lr_start + el.e_capture.c_begin + 1, this->sw_range.lr_start + el.e_capture.c_end - 1, }, VC_ROLE.value(role_t::VCR_H1)); this->sw_line_number += 2; break; } case DT_LCURLY: case DT_LSQUARE: case DT_LPAREN: { this->flush_values(); // this->append_child_node(term); this->sw_depth += 1; this->sw_interval_state.back().is_start = el.e_capture.c_begin; this->sw_interval_state.back().is_line_number = this->sw_line_number; this->sw_interval_state.resize(this->sw_depth + 1); this->sw_hier_nodes.push_back( std::make_unique()); break; } case DT_RCURLY: case DT_RSQUARE: case DT_RPAREN: { auto term = this->flush_values(); if (this->sw_depth > 0) { this->append_child_node(term); this->sw_depth -= 1; this->sw_interval_state.pop_back(); this->sw_hier_stage = std::move(this->sw_hier_nodes.back()); this->sw_hier_nodes.pop_back(); if (this->sw_interval_state.back().is_start) { data_scanner::capture_t obj_cap = { static_cast(this->sw_interval_state.back() .is_start.value()), el.e_capture.c_end, }; auto sf = this->sw_scanner.to_string_fragment(obj_cap); if (!sf.find('\n')) { this->sw_hier_stage->hn_named_children.clear(); this->sw_hier_stage->hn_children.clear(); while (!this->sw_intervals.empty() && this->sw_intervals.back().start > obj_cap.c_begin) { this->sw_intervals.pop_back(); } } } } this->sw_values.emplace_back(el); break; } case DT_COMMA: if (this->sw_depth > 0) { auto term = this->flush_values(); this->append_child_node(term); } break; case DT_LINE: this->sw_line_number += 1; this->sw_at_start = true; break; case DT_WHITE: if (this->sw_at_start) { size_t indent_size = 0; for (auto ch : tokenize_res->to_string_fragment()) { if (ch == '\t') { do { indent_size += 1; } while (indent_size % 8); } else { indent_size += 1; } } this->sw_indents.insert(indent_size); this->sw_at_start = false; } break; default: if (dt == DT_GARBAGE) { garbage_count += 1; } if (dt == DT_QUOTED_STRING) { auto quoted_sf = tokenize_res->to_string_fragment(); if (quoted_sf.find('\n')) { this->sw_type_intervals.emplace_back( el.e_capture.c_begin, el.e_capture.c_end, section_types_t::multiline_string); this->sw_line.get_attrs().emplace_back( line_range{ this->sw_range.lr_start + el.e_capture.c_begin, this->sw_range.lr_start + el.e_capture.c_end, }, VC_ROLE.value(role_t::VCR_STRING)); } } this->sw_values.emplace_back(el); break; } } this->flush_values(); if (this->sw_hier_stage != nullptr) { this->sw_hier_stage->hn_parent = this->sw_hier_nodes.back().get(); this->sw_hier_nodes.back()->hn_children.push_back( std::move(this->sw_hier_stage)); } this->sw_hier_stage = std::move(this->sw_hier_nodes.back()); this->sw_hier_nodes.pop_back(); if (this->sw_hier_stage->hn_children.size() == 1 && this->sw_hier_stage->hn_named_children.empty()) { this->sw_hier_stage = std::move(this->sw_hier_stage->hn_children.front()); this->sw_hier_stage->hn_parent = nullptr; } if (!this->sw_indents.empty()) { auto low_indent_iter = this->sw_indents.begin(); if (*low_indent_iter == 1) { // adding guides for small indents is noisy, drop for now this->sw_indents.clear(); } else { auto lcm = *low_indent_iter; for (auto indent_iter = this->sw_indents.begin(); indent_iter != this->sw_indents.end();) { if ((*indent_iter % lcm) == 0) { ++indent_iter; } else { indent_iter = this->sw_indents.erase(indent_iter); } } } } mb.mb_root_node = std::move(this->sw_hier_stage); mb.mb_intervals = std::move(this->sw_intervals); mb.mb_type_intervals = std::move(this->sw_type_intervals); mb.mb_indents = std::move(this->sw_indents); discover_metadata_int(this->sw_line, mb); return std::move(mb).to_metadata(); } private: struct element { element(data_token_t token, data_scanner::capture_t& cap) : e_token(token), e_capture(cap) { } data_token_t e_token; data_scanner::capture_t e_capture; }; struct interval_state { nonstd::optional is_start; size_t is_line_number{0}; std::string is_name; }; nonstd::optional flush_values() { nonstd::optional last_key; nonstd::optional retval; if (!this->sw_values.empty()) { if (!this->sw_interval_state.back().is_start) { this->sw_interval_state.back().is_start = this->sw_values.front().e_capture.c_begin; this->sw_interval_state.back().is_line_number = this->sw_line_number; } retval = this->sw_values.back().e_capture; } for (const auto& el : this->sw_values) { switch (el.e_token) { case DT_SYMBOL: case DT_CONSTANT: case DT_WORD: case DT_QUOTED_STRING: last_key = el.e_capture; break; case DT_COLON: case DT_EQUALS: if (last_key) { this->sw_interval_state.back().is_name = this->sw_scanner .to_string_fragment(last_key.value()) .to_string(); if (!this->sw_interval_state.back().is_name.empty()) { this->sw_interval_state.back().is_start = static_cast( last_key.value().c_begin); this->sw_interval_state.back().is_line_number = this->sw_line_number; } last_key = nonstd::nullopt; } break; default: break; } } this->sw_values.clear(); return retval; } void append_child_node(nonstd::optional terminator) { auto& ivstate = this->sw_interval_state.back(); if (!ivstate.is_start || !terminator || this->sw_depth == 0) { ivstate.is_start = nonstd::nullopt; ivstate.is_line_number = 0; ivstate.is_name.clear(); return; } auto new_node = this->sw_hier_stage != nullptr ? std::move(this->sw_hier_stage) : std::make_unique(); auto iv_start = ivstate.is_start.value(); auto iv_stop = static_cast(terminator.value().c_end); auto* top_node = this->sw_hier_nodes.back().get(); auto new_key = ivstate.is_name.empty() ? lnav::document::section_key_t{top_node->hn_children.size()} : lnav::document::section_key_t{ivstate.is_name}; this->sw_intervals.emplace_back(iv_start, iv_stop, new_key); auto* retval = new_node.get(); new_node->hn_parent = top_node; new_node->hn_start = this->sw_intervals.back().start; new_node->hn_line_number = ivstate.is_line_number; if (!ivstate.is_name.empty()) { top_node->hn_named_children.insert({ ivstate.is_name, retval, }); } top_node->hn_children.emplace_back(std::move(new_node)); ivstate.is_start = nonstd::nullopt; ivstate.is_line_number = 0; ivstate.is_name.clear(); } attr_line_t& sw_line; line_range sw_range; text_format_t sw_text_format; data_scanner sw_scanner; int sw_depth{0}; size_t sw_line_number{0}; bool sw_at_start{true}; std::set sw_indents; std::vector sw_values{}; std::vector sw_interval_state; std::vector sw_intervals; std::vector sw_type_intervals; std::vector> sw_hier_nodes; std::unique_ptr sw_hier_stage; }; metadata discover_structure(attr_line_t& al, struct line_range lr, text_format_t tf) { return structure_walker(al, lr, tf).walk(); } std::vector metadata::possibility_provider(const std::vector& path) { std::vector retval; auto curr_node = lnav::document::hier_node::lookup_path( this->m_sections_root.get(), path); if (curr_node) { auto* parent_node = curr_node.value()->hn_parent; if (parent_node != nullptr) { for (const auto& sibling : parent_node->hn_named_children) { retval.template emplace_back(sibling.first); } } } return retval; } } // namespace document } // namespace lnav