Mirror of https://github.com/tstack/lnav, synced 2024-11-17 15:29:40 +00:00
Commit 740b827901: This is a partial fix for handling ANSI escapes in parts of log messages that would prevent regexes from matching. Still more work to do. Related to #1057
1072 lines
34 KiB
C++
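The commit above concerns ANSI escape sequences embedded in log messages, which can keep ordinary regexes from matching the underlying text. As a rough, minimal sketch of that underlying problem (assuming only the C++ standard library; this is not lnav's implementation, which instead appears to tokenize the escapes, see the DT_CSI cases in the file below), stripping CSI sequences before matching could look like:

// Illustrative sketch only, not part of data_parser.cc.
#include <regex>
#include <string>

std::string strip_csi_escapes(const std::string& line)
{
    // CSI sequences look like: ESC '[' <parameter bytes> <final byte>,
    // e.g. "\x1b[31m" to switch the terminal to red text.
    static const std::regex csi_re("\x1b\\[[0-9;]*[A-Za-z]");

    return std::regex_replace(line, csi_re, "");
}

With the escapes removed (or tokenized away), a pattern written against the plain message text can match again.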
/**
 * Copyright (c) 2007-2012, Timothy Stack
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * * Neither the name of Timothy Stack nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <algorithm>

#include "data_parser.hh"

#include "config.h"
#include "spookyhash/SpookyV2.h"

data_format data_parser::FORMAT_SEMI("semi", DT_COMMA, DT_SEMI);
data_format data_parser::FORMAT_COMMA("comma", DT_INVALID, DT_COMMA);
data_format data_parser::FORMAT_PLAIN("plain", DT_INVALID, DT_INVALID);

data_parser::data_parser(data_scanner* ds)
    : dp_errors("dp_errors", __FILE__, __LINE__),
      dp_pairs("dp_pairs", __FILE__, __LINE__), dp_msg_format(nullptr),
      dp_msg_format_begin(ds->get_init_offset()), dp_scanner(ds)
{
    if (TRACE_FILE != nullptr) {
        fprintf(TRACE_FILE, "input %s\n", ds->get_input().to_string().c_str());
    }
}

void
data_parser::pairup(data_parser::schema_id_t* schema,
                    data_parser::element_list_t& pairs_out,
                    data_parser::element_list_t& in_list,
                    int group_depth)
{
    element_list_t ELEMENT_LIST_T(el_stack), ELEMENT_LIST_T(free_row),
        ELEMENT_LIST_T(key_comps), ELEMENT_LIST_T(value),
        ELEMENT_LIST_T(prefix);
    SpookyHash context;

    require(in_list.el_format.df_name != nullptr);

    POINT_TRACE("pairup_start");

    FORMAT_TRACE(in_list);

    for (auto iter = in_list.begin(); iter != in_list.end(); ++iter) {
        if (iter->e_token == DNT_GROUP) {
            element_list_t ELEMENT_LIST_T(group_pairs);

            this->pairup(
                nullptr, group_pairs, *iter->e_sub_elements, group_depth + 1);
            if (!group_pairs.empty()) {
                iter->assign_elements(group_pairs);
            }
        }

        if (in_list.el_format.df_prefix_terminator != DT_INVALID) {
            if (iter->e_token == in_list.el_format.df_prefix_terminator) {
                in_list.el_format.df_prefix_terminator = DT_INVALID;
            } else {
                el_stack.PUSH_BACK(*iter);
            }
        } else if (iter->e_token == in_list.el_format.df_terminator) {
            this->end_of_value(
                el_stack, key_comps, value, in_list, group_depth);

            key_comps.PUSH_BACK(*iter);
        } else if (iter->e_token == in_list.el_format.df_qualifier) {
            value.SPLICE(
                value.end(), key_comps, key_comps.begin(), key_comps.end());
            strip(value, element_is_space{});
            if (!value.empty()) {
                el_stack.PUSH_BACK(element(value, DNT_VALUE));
            }
        } else if (iter->e_token == in_list.el_format.df_separator) {
            auto key_iter = key_comps.end();
            bool found = false, key_is_values = true;

            if (!key_comps.empty()) {
                do {
                    --key_iter;
                    if (key_iter->e_token == in_list.el_format.df_appender) {
                        ++key_iter;
                        value.SPLICE(value.end(),
                                     key_comps,
                                     key_comps.begin(),
                                     key_iter);
                        key_comps.POP_FRONT();
                        found = true;
                    } else if (key_iter->e_token
                               == in_list.el_format.df_terminator)
                    {
                        std::vector<element> key_copy;

                        value.SPLICE(value.end(),
                                     key_comps,
                                     key_comps.begin(),
                                     key_iter);
                        key_comps.POP_FRONT();
                        strip(key_comps, element_is_space{});
                        if (key_comps.empty()) {
                            key_iter = key_comps.end();
                        } else {
                            key_iter = key_comps.begin();
                        }
                        found = true;
                    }
                    if (key_iter != key_comps.end()) {
                        switch (key_iter->e_token) {
                            case DT_WORD:
                            case DT_SYMBOL:
                                key_is_values = false;
                                break;
                            default:
                                break;
                        }
                    }
                } while (key_iter != key_comps.begin() && !found);
            }
            if (!found && !el_stack.empty() && !key_comps.empty()) {
                element_list_t::iterator value_iter;

                if (el_stack.size() > 1
                    && in_list.el_format.df_appender != DT_INVALID
                    && in_list.el_format.df_terminator != DT_INVALID)
                {
                    /* If we're expecting a terminator and haven't found it */
                    /* then this is part of the value. */
                    continue;
                }

                value.SPLICE(
                    value.end(), key_comps, key_comps.begin(), key_comps.end());
                value_iter = value.end();
                std::advance(value_iter, -1);
                key_comps.SPLICE(
                    key_comps.begin(), value, value_iter, value.end());
                key_comps.resize(1);
            }

            strip(value, element_is_space{});
            value.remove_if(element_if(DT_COMMA));
            if (!value.empty()) {
                el_stack.PUSH_BACK(element(value, DNT_VALUE));
            }
            strip(key_comps, element_is_space{});
            if (!key_comps.empty()) {
                if (key_is_values) {
                    el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
                } else {
                    el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
                }
            }
            key_comps.CLEAR();
            value.CLEAR();
        } else {
            key_comps.PUSH_BACK(*iter);
        }

        POINT_TRACE("pairup_loop");
    }

    POINT_TRACE("pairup_eol");

    CONSUMED_TRACE(in_list);

    // Only perform the free-row logic at the top level; if we're in a group,
    // assume it is a list.
    if (group_depth < 1 && el_stack.empty()) {
        free_row.SPLICE(
            free_row.begin(), key_comps, key_comps.begin(), key_comps.end());
    } else {
        this->end_of_value(el_stack, key_comps, value, in_list, group_depth);
    }

    POINT_TRACE("pairup_stack");

    context.Init(0, 0);
    while (!el_stack.empty()) {
        auto kv_iter = el_stack.begin();
        if (kv_iter->e_token == DNT_VALUE) {
            if (pairs_out.empty()) {
                free_row.PUSH_BACK(el_stack.front());
            } else {
                element_list_t ELEMENT_LIST_T(free_pair_subs);
                struct element blank;

                blank.e_capture.c_begin = blank.e_capture.c_end
                    = el_stack.front().e_capture.c_begin;
                blank.e_token = DNT_KEY;
                free_pair_subs.PUSH_BACK(blank);
                free_pair_subs.PUSH_BACK(el_stack.front());
                pairs_out.PUSH_BACK(element(free_pair_subs, DNT_PAIR));
            }
        }
        if (kv_iter->e_token != DNT_KEY) {
            el_stack.POP_FRONT();
            continue;
        }

        ++kv_iter;
        if (kv_iter == el_stack.end()) {
            el_stack.POP_FRONT();
            continue;
        }

        element_list_t ELEMENT_LIST_T(pair_subs);

        if (schema != nullptr) {
            size_t key_len;
            const char* key_val
                = this->get_element_string(el_stack.front(), key_len);
            context.Update(key_val, key_len);
        }

        while (!free_row.empty()) {
            element_list_t ELEMENT_LIST_T(free_pair_subs);
            struct element blank;

            blank.e_capture.c_begin = blank.e_capture.c_end
                = free_row.front().e_capture.c_begin;
            blank.e_token = DNT_KEY;
            free_pair_subs.PUSH_BACK(blank);
            free_pair_subs.PUSH_BACK(free_row.front());
            pairs_out.PUSH_BACK(element(free_pair_subs, DNT_PAIR));
            free_row.POP_FRONT();
        }

        bool has_value = false;

        if (kv_iter->e_token == DNT_VALUE) {
            ++kv_iter;
            has_value = true;
        }

        pair_subs.SPLICE(
            pair_subs.begin(), el_stack, el_stack.begin(), kv_iter);

        if (!has_value) {
            element_list_t ELEMENT_LIST_T(blank_value);
            struct element blank;

            blank.e_token = DT_QUOTED_STRING;
            blank.e_capture.c_begin = blank.e_capture.c_end
                = pair_subs.front().e_capture.c_end;
            if (blank.e_capture.c_begin >= 0
                && blank.e_capture.c_begin
                    < this->dp_scanner->get_input().sf_end)
            {
                switch (this->dp_scanner->to_string_fragment(blank.e_capture)
                            .front())
                {
                    case '=':
                    case ':':
                        blank.e_capture.c_begin += 1;
                        blank.e_capture.c_end += 1;
                        break;
                }
            }
            blank_value.PUSH_BACK(blank);
            pair_subs.PUSH_BACK(element(blank_value, DNT_VALUE));
        }

        pairs_out.PUSH_BACK(element(pair_subs, DNT_PAIR));
    }

    if (pairs_out.size() == 1) {
        element& pair = pairs_out.front();
        element& evalue = pair.e_sub_elements->back();

        if (evalue.e_token == DNT_VALUE && evalue.e_sub_elements != nullptr
            && evalue.e_sub_elements->size() > 1)
        {
            element_list_t::iterator next_sub;

            next_sub = pair.e_sub_elements->begin();
            ++next_sub;
            prefix.SPLICE(prefix.begin(),
                          *pair.e_sub_elements,
                          pair.e_sub_elements->begin(),
                          next_sub);
            free_row.CLEAR();
            free_row.SPLICE(free_row.begin(),
                            *evalue.e_sub_elements,
                            evalue.e_sub_elements->begin(),
                            evalue.e_sub_elements->end());
            pairs_out.CLEAR();
            context.Init(0, 0);
        }
    }

    if (group_depth >= 1 && pairs_out.empty() && !free_row.empty()) {
        pairs_out.SWAP(free_row);
    }

    if (pairs_out.empty() && !free_row.empty()) {
        while (!free_row.empty()) {
            switch (free_row.front().e_token) {
                case DNT_GROUP:
                case DNT_VALUE:
                case DT_EMAIL:
                case DT_CONSTANT:
                case DT_NUMBER:
                case DT_SYMBOL:
                case DT_HEX_NUMBER:
                case DT_OCTAL_NUMBER:
                case DT_VERSION_NUMBER:
                case DT_QUOTED_STRING:
                case DT_IPV4_ADDRESS:
                case DT_IPV6_ADDRESS:
                case DT_MAC_ADDRESS:
                case DT_HEX_DUMP:
                case DT_XML_DECL_TAG:
                case DT_XML_OPEN_TAG:
                case DT_XML_CLOSE_TAG:
                case DT_XML_EMPTY_TAG:
                case DT_UUID:
                case DT_URL:
                case DT_PATH:
                case DT_DATE:
                case DT_TIME:
                case DT_PERCENTAGE: {
                    element_list_t ELEMENT_LIST_T(pair_subs);
                    struct element blank;

                    blank.e_capture.c_begin = blank.e_capture.c_end
                        = free_row.front().e_capture.c_begin;
                    blank.e_token = DNT_KEY;
                    pair_subs.PUSH_BACK(blank);
                    pair_subs.PUSH_BACK(free_row.front());
                    pairs_out.PUSH_BACK(element(pair_subs, DNT_PAIR));

                    // Throw something into the hash so that the number of
                    // columns is significant.  I don't think we want to
                    // use the token ID since some column values might vary
                    // between rows.
                    context.Update(" ", 1);
                } break;

                default: {
                    size_t key_len;
                    const char* key_val
                        = this->get_element_string(free_row.front(), key_len);

                    context.Update(key_val, key_len);
                } break;
            }

            free_row.POP_FRONT();
        }
    }

    if (!prefix.empty()) {
        element_list_t ELEMENT_LIST_T(pair_subs);
        struct element blank;

        blank.e_capture.c_begin = blank.e_capture.c_end
            = prefix.front().e_capture.c_begin;
        blank.e_token = DNT_KEY;
        pair_subs.PUSH_BACK(blank);
        pair_subs.PUSH_BACK(prefix.front());
        pairs_out.PUSH_FRONT(element(pair_subs, DNT_PAIR));
    }

    if (schema != nullptr) {
        context.Final(schema->out(0), schema->out(1));
    }

    if (schema != nullptr && this->dp_msg_format != nullptr) {
        for (auto& fiter : pairs_out) {
            *(this->dp_msg_format) += this->get_string_up_to_value(fiter);
            this->dp_msg_format->append("#");
        }
        if ((size_t) this->dp_msg_format_begin
            < this->dp_scanner->get_input().length())
        {
            auto last = this->dp_scanner->get_input().substr(
                this->dp_msg_format_begin);

            switch (last.front()) {
                case '\'':
                case '"':
                    last.sf_begin += 1;
                    break;
            }
            *(this->dp_msg_format) += last.to_string();
        }
    }

    if (pairs_out.size() > 1000) {
        pairs_out.resize(1000);
    }
}

void
data_parser::discover_format()
{
    std::stack<discover_format_state> state_stack;
    this->dp_group_token.push_back(DT_INVALID);
    this->dp_group_stack.resize(1);

    state_stack.push(discover_format_state());
    while (true) {
        auto tok_res = this->dp_scanner->tokenize2();
        if (!tok_res) {
            break;
        }

        element elem;
        elem.e_token = tok_res->tr_token;
        elem.e_capture = tok_res->tr_inner_capture;

        require(elem.e_capture.c_begin >= 0);
        require(elem.e_capture.c_end >= 0);

        state_stack.top().update_for_element(elem);
        switch (elem.e_token) {
            case DT_LPAREN:
            case DT_LANGLE:
            case DT_LCURLY:
            case DT_LSQUARE:
                this->dp_group_token.push_back(elem.e_token);
                this->dp_group_stack.emplace_back("_anon_", __FILE__, __LINE__);
                state_stack.push(discover_format_state());
                break;

            case DT_EMPTY_CONTAINER: {
                auto& curr_group = this->dp_group_stack.back();
                auto empty_list = element_list_t("_anon_", __FILE__, __LINE__);
                discover_format_state dfs;

                dfs.finalize();

                empty_list.el_format = dfs.dfs_format;
                curr_group.PUSH_BACK(element());

                auto& empty = curr_group.back();
                empty.e_capture.c_begin = elem.e_capture.c_begin + 1;
                empty.e_capture.c_end = elem.e_capture.c_begin + 1;
                empty.e_token = DNT_GROUP;
                empty.assign_elements(empty_list);
                break;
            }

            case DT_RPAREN:
            case DT_RANGLE:
            case DT_RCURLY:
            case DT_RSQUARE:
                if (this->dp_group_token.back() == (elem.e_token - 1)) {
                    this->dp_group_token.pop_back();

                    auto riter = this->dp_group_stack.rbegin();
                    ++riter;
                    state_stack.top().finalize();
                    this->dp_group_stack.back().el_format
                        = state_stack.top().dfs_format;
                    state_stack.pop();
                    if (!this->dp_group_stack.back().empty()) {
                        (*riter).PUSH_BACK(
                            element(this->dp_group_stack.back(), DNT_GROUP));
                    } else {
                        (*riter).PUSH_BACK(element());
                        riter->back().e_capture.c_begin
                            = elem.e_capture.c_begin;
                        riter->back().e_capture.c_end = elem.e_capture.c_begin;
                        riter->back().e_token = DNT_GROUP;
                        riter->back().assign_elements(
                            this->dp_group_stack.back());
                    }
                    this->dp_group_stack.pop_back();
                } else {
                    this->dp_group_stack.back().PUSH_BACK(elem);
                }
                break;

            default:
                this->dp_group_stack.back().PUSH_BACK(elem);
                break;
        }
    }

    while (this->dp_group_stack.size() > 1) {
        this->dp_group_token.pop_back();

        auto riter = this->dp_group_stack.rbegin();
        ++riter;
        if (!this->dp_group_stack.back().empty()) {
            state_stack.top().finalize();
            this->dp_group_stack.back().el_format
                = state_stack.top().dfs_format;
            state_stack.pop();
            (*riter).PUSH_BACK(element(this->dp_group_stack.back(), DNT_GROUP));
        }
        this->dp_group_stack.pop_back();
    }

    state_stack.top().finalize();
    this->dp_group_stack.back().el_format = state_stack.top().dfs_format;
}

void
data_parser::end_of_value(data_parser::element_list_t& el_stack,
                          data_parser::element_list_t& key_comps,
                          data_parser::element_list_t& value,
                          const data_parser::element_list_t& in_list,
                          int group_depth)
{
    key_comps.remove_if(element_if(in_list.el_format.df_terminator));
    key_comps.remove_if(element_if(DT_COMMA));
    value.remove_if(element_if(in_list.el_format.df_terminator));
    value.remove_if(element_if(DT_COMMA));
    strip(key_comps, element_is_space{});
    strip(value, element_is_space{});
    if ((el_stack.empty() || el_stack.back().e_token != DNT_KEY)
        && value.empty() && key_comps.size() > 1
        && (key_comps.front().e_token == DT_WORD
            || key_comps.front().e_token == DT_SYMBOL))
    {
        element_list_t::iterator key_iter, key_end;
        bool found_value = false;
        int word_count = 0;
        key_iter = key_comps.begin();
        key_end = key_comps.begin();
        for (; key_iter != key_comps.end(); ++key_iter) {
            if (key_iter->e_token == DT_WORD || key_iter->e_token == DT_SYMBOL)
            {
                word_count += 1;
                if (found_value) {
                    key_end = key_comps.begin();
                }
            } else if (key_iter->e_token == DT_WHITE
                       || key_iter->e_token == DT_CSI)
            {
            } else {
                if (!found_value) {
                    key_end = key_iter;
                }
                found_value = true;
            }
        }
        if (word_count != 1) {
            key_end = key_comps.begin();
        }
        value.SPLICE(value.end(), key_comps, key_end, key_comps.end());
        strip(key_comps, element_is_space{});
        if (!key_comps.empty()) {
            el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
        }
        key_comps.CLEAR();
    } else {
        value.SPLICE(
            value.end(), key_comps, key_comps.begin(), key_comps.end());
    }
    strip(value, element_is_space{});
    strip(value, element_if(DT_COLON));
    strip(value, element_is_space{});
    if (!value.empty()) {
        if (value.size() == 2 && value.back().e_token == DNT_GROUP) {
            element_list_t ELEMENT_LIST_T(group_pair);

            group_pair.PUSH_BACK(element(value, DNT_PAIR));
            el_stack.PUSH_BACK(element(group_pair, DNT_VALUE));
        } else {
            el_stack.PUSH_BACK(element(value, DNT_VALUE));
        }
    }
    value.CLEAR();
}

void
data_parser::parse()
{
    this->discover_format();

    this->pairup(
        &this->dp_schema_id, this->dp_pairs, this->dp_group_stack.front());
}

std::string
data_parser::get_element_string(const data_parser::element& elem) const
{
    return this->dp_scanner->to_string_fragment(elem.e_capture).to_string();
}

std::string
data_parser::get_string_up_to_value(const data_parser::element& elem)
{
    const element& val_elem
        = elem.e_token == DNT_PAIR ? elem.e_sub_elements->back() : elem;

    if (this->dp_msg_format_begin <= val_elem.e_capture.c_begin) {
        auto leading_and_key = data_scanner::capture_t(
            this->dp_msg_format_begin, val_elem.e_capture.c_begin);
        auto str = this->dp_scanner->get_input().data();
        if (leading_and_key.length() >= 2) {
            switch (str[leading_and_key.c_end - 1]) {
                case '\'':
                case '"':
                    leading_and_key.c_end -= 1;
                    switch (str[leading_and_key.c_end - 1]) {
                        case 'r':
                        case 'u':
                            leading_and_key.c_end -= 1;
                            break;
                    }
                    break;
            }
            switch (str[leading_and_key.c_begin]) {
                case '\'':
                case '"':
                    leading_and_key.c_begin += 1;
                    break;
            }
        }
        this->dp_msg_format_begin = val_elem.e_capture.c_end;
        return this->dp_scanner->to_string_fragment(leading_and_key)
            .to_string();
    } else {
        this->dp_msg_format_begin = val_elem.e_capture.c_end;
    }
    return "";
}

const char*
data_parser::get_element_string(const data_parser::element& elem,
                                size_t& len_out)
{
    len_out = elem.e_capture.length();
    return this->dp_scanner->to_string_fragment(elem.e_capture).data();
}

void
data_parser::print(FILE* out, data_parser::element_list_t& el)
{
    fprintf(out,
            " %s\n",
            this->dp_scanner->get_input().to_string().c_str());
    for (auto& iter : el) {
        iter.print(out, *this->dp_scanner);
    }
}

FILE* data_parser::TRACE_FILE;

data_format_state_t
dfs_prefix_next(data_format_state_t state, data_token_t next_token)
{
    data_format_state_t retval = state;

    switch (state) {
        case DFS_INIT:
            switch (next_token) {
                case DT_PATH:
                case DT_COLON:
                case DT_EQUALS:
                case DT_CONSTANT:
                case DT_EMAIL:
                case DT_WORD:
                case DT_SYMBOL:
                case DT_OCTAL_NUMBER:
                case DT_HEX_NUMBER:
                case DT_NUMBER:
                case DT_WHITE:
                case DT_CSI:
                case DT_LSQUARE:
                case DT_RSQUARE:
                case DT_LANGLE:
                case DT_RANGLE:
                case DT_EMPTY_CONTAINER:
                    break;

                default:
                    retval = DFS_ERROR;
                    break;
            }
            break;

        case DFS_EXPECTING_SEP:
        case DFS_ERROR:
            retval = DFS_ERROR;
            break;

        default:
            break;
    }

    return retval;
}

data_format_state_t
dfs_semi_next(data_format_state_t state, data_token_t next_token)
{
    data_format_state_t retval = state;

    switch (state) {
        case DFS_INIT:
            switch (next_token) {
                case DT_COMMA:
                case DT_SEMI:
                    retval = DFS_ERROR;
                    break;

                default:
                    retval = DFS_KEY;
                    break;
            }
            break;

        case DFS_KEY:
            switch (next_token) {
                case DT_COLON:
                case DT_EQUALS:
                    retval = DFS_VALUE;
                    break;

                case DT_SEMI:
                    retval = DFS_ERROR;
                    break;

                default:
                    break;
            }
            break;

        case DFS_VALUE:
            switch (next_token) {
                case DT_SEMI:
                    retval = DFS_INIT;
                    break;

                default:
                    break;
            }
            break;

        case DFS_EXPECTING_SEP:
        case DFS_ERROR:
            retval = DFS_ERROR;
            break;
    }

    return retval;
}

data_format_state_t
dfs_comma_next(data_format_state_t state, data_token_t next_token)
{
    data_format_state_t retval = state;

    switch (state) {
        case DFS_INIT:
            switch (next_token) {
                case DT_COMMA:
                    break;

                case DT_SEMI:
                    retval = DFS_ERROR;
                    break;

                default:
                    retval = DFS_KEY;
                    break;
            }
            break;

        case DFS_KEY:
            switch (next_token) {
                case DT_COLON:
                case DT_EQUALS:
                    retval = DFS_VALUE;
                    break;

                case DT_COMMA:
                    retval = DFS_INIT;
                    break;

                case DT_WORD:
                    retval = DFS_EXPECTING_SEP;
                    break;

                case DT_SEMI:
                    retval = DFS_ERROR;
                    break;

                default:
                    break;
            }
            break;

        case DFS_EXPECTING_SEP:
            switch (next_token) {
                case DT_COLON:
                case DT_EQUALS:
                case DT_LPAREN:
                case DT_LCURLY:
                case DT_LSQUARE:
                case DT_LANGLE:
                    retval = DFS_VALUE;
                    break;

                case DT_EMPTY_CONTAINER:
                    retval = DFS_INIT;
                    break;

                case DT_COMMA:
                case DT_SEMI:
                    retval = DFS_ERROR;
                    break;

                default:
                    break;
            }
            break;

        case DFS_VALUE:
            switch (next_token) {
                case DT_COMMA:
                    retval = DFS_INIT;
                    break;

                case DT_COLON:
                case DT_EQUALS:
                    retval = DFS_ERROR;
                    break;

                default:
                    break;
            }
            break;

        case DFS_ERROR:
            retval = DFS_ERROR;
            break;
    }

    return retval;
}

data_parser::element::element()
    : e_capture(-1, -1), e_token(DT_INVALID), e_sub_elements(nullptr)
{
}

data_parser::element::element(data_parser::element_list_t& subs,
                              data_token_t token,
                              bool assign_subs_elements)
    : e_capture(subs.front().e_capture.c_begin, subs.back().e_capture.c_end),
      e_token(token), e_sub_elements(nullptr)
{
    if (assign_subs_elements) {
        this->assign_elements(subs);
    }
}

data_parser::element::element(const data_parser::element& other)
{
    /* require(other.e_sub_elements == nullptr); */

    this->e_capture = other.e_capture;
    this->e_token = other.e_token;
    this->e_sub_elements = nullptr;
    if (other.e_sub_elements != nullptr) {
        this->assign_elements(*other.e_sub_elements);
    }
}

data_parser::element::~element()
{
    delete this->e_sub_elements;
    this->e_sub_elements = nullptr;
}

data_parser::element&
data_parser::element::operator=(const data_parser::element& other)
{
    this->e_capture = other.e_capture;
    this->e_token = other.e_token;
    this->e_sub_elements = nullptr;
    if (other.e_sub_elements != nullptr) {
        this->assign_elements(*other.e_sub_elements);
    }
    return *this;
}

void
data_parser::element::assign_elements(data_parser::element_list_t& subs)
{
    if (this->e_sub_elements == nullptr) {
        this->e_sub_elements = new element_list_t("_sub_", __FILE__, __LINE__);
        this->e_sub_elements->el_format = subs.el_format;
    }
    this->e_sub_elements->SWAP(subs);
    this->update_capture();
}

void
data_parser::element::update_capture()
{
    if (this->e_sub_elements != nullptr && !this->e_sub_elements->empty()) {
        this->e_capture.c_begin
            = this->e_sub_elements->front().e_capture.c_begin;
        this->e_capture.c_end = this->e_sub_elements->back().e_capture.c_end;
    }
}

const data_parser::element&
data_parser::element::get_pair_value() const
{
    require(this->e_token == DNT_PAIR);

    return this->e_sub_elements->back();
}

data_token_t
data_parser::element::value_token() const
{
    data_token_t retval = DT_INVALID;

    if (this->e_token == DNT_VALUE) {
        if (this->e_sub_elements != nullptr
            && this->e_sub_elements->size() == 1)
        {
            retval = this->e_sub_elements->front().e_token;
        } else {
            retval = DT_SYMBOL;
        }
    } else {
        retval = this->e_token;
    }
    return retval;
}

const data_parser::element&
data_parser::element::get_value_elem() const
{
    if (this->e_token == DNT_VALUE) {
        if (this->e_sub_elements != nullptr
            && this->e_sub_elements->size() == 1)
        {
            return this->e_sub_elements->front();
        }
    }
    return *this;
}

const data_parser::element&
data_parser::element::get_pair_elem() const
{
    if (this->e_token == DNT_VALUE) {
        return this->e_sub_elements->front();
    }
    return *this;
}

void
data_parser::element::print(FILE* out, data_scanner& ds, int offset) const
{
    int lpc;

    if (this->e_sub_elements != nullptr) {
        for (auto& e_sub_element : *this->e_sub_elements) {
            e_sub_element.print(out, ds, offset + 1);
        }
    }

    fprintf(out,
            "%4s %3d:%-3d ",
            data_scanner::token2name(this->e_token),
            this->e_capture.c_begin,
            this->e_capture.c_end);
    for (lpc = 0; lpc < this->e_capture.c_end; lpc++) {
        if (lpc == this->e_capture.c_begin) {
            fputc('^', out);
        } else if (lpc == (this->e_capture.c_end - 1)) {
            fputc('^', out);
        } else if (lpc > this->e_capture.c_begin) {
            fputc('-', out);
        } else {
            fputc(' ', out);
        }
    }
    for (; lpc < (int) ds.get_input().length(); lpc++) {
        fputc(' ', out);
    }

    std::string sub = ds.to_string_fragment(this->e_capture).to_string();
    fprintf(out, " %s\n", sub.c_str());
}

data_parser::discover_format_state::discover_format_state()
    : dfs_prefix_state(DFS_INIT), dfs_semi_state(DFS_INIT),
      dfs_comma_state(DFS_INIT)
{
    memset(this->dfs_hist, 0, sizeof(this->dfs_hist));
}

void
data_parser::discover_format_state::update_for_element(
    const data_parser::element& elem)
{
    this->dfs_prefix_state
        = dfs_prefix_next(this->dfs_prefix_state, elem.e_token);
    this->dfs_semi_state = dfs_semi_next(this->dfs_semi_state, elem.e_token);
    this->dfs_comma_state = dfs_comma_next(this->dfs_comma_state, elem.e_token);
    if (this->dfs_prefix_state != DFS_ERROR) {
        if (this->dfs_semi_state == DFS_ERROR) {
            this->dfs_semi_state = DFS_INIT;
        }
        if (this->dfs_comma_state == DFS_ERROR) {
            this->dfs_comma_state = DFS_INIT;
        }
    }
    this->dfs_hist[elem.e_token] += 1;
}

void
data_parser::discover_format_state::finalize()
{
    data_token_t qualifier = this->dfs_format.df_qualifier;
    data_token_t separator = this->dfs_format.df_separator;
    data_token_t prefix_term = this->dfs_format.df_prefix_terminator;

    this->dfs_format = FORMAT_PLAIN;
    if (this->dfs_hist[DT_EQUALS]) {
        qualifier = DT_COLON;
        separator = DT_EQUALS;
    }

    if (this->dfs_semi_state != DFS_ERROR && this->dfs_hist[DT_SEMI]) {
        this->dfs_format = FORMAT_SEMI;
    } else if (this->dfs_comma_state != DFS_ERROR) {
        this->dfs_format = FORMAT_COMMA;
        if (separator == DT_COLON && this->dfs_hist[DT_COMMA] > 0) {
            if (!((this->dfs_hist[DT_COLON] == this->dfs_hist[DT_COMMA])
                  || ((this->dfs_hist[DT_COLON] - 1)
                      == this->dfs_hist[DT_COMMA])))
            {
                separator = DT_INVALID;
                if (this->dfs_hist[DT_COLON] == 1) {
                    prefix_term = DT_COLON;
                }
            }
        }
    }

    this->dfs_format.df_qualifier = qualifier;
    this->dfs_format.df_separator = separator;
    this->dfs_format.df_prefix_terminator = prefix_term;
}