/** * Copyright (c) 2007-2012, Timothy Stack * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Timothy Stack nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __data_parser_hh #define __data_parser_hh #include #include #include #include #include #include #include "yajlpp.hh" #include "pcrepp.hh" #include "byte_array.hh" #include "data_scanner.hh" /** * Switch to the 'parser' view mode when the user hits ';' so they * can easily see what columns are available. 
* * select * from logline; * select itemfrom(csv_key, 0) from logline; * select itemfrom(csv_key, -1) from logline; * select itemfrom(dict_key, "key") from logline; * select itemfrom(dict_key, "key[0]") from logline; * select itemfrom(csv_key, 0:3) from logline; support slices? * * Add a command to create a logline table with a given name so the user can * do joins across the tables: * * create-logline-table sudo_logline * select * from logline, sudo_logline where sudo_logline.COMMAND=logline.COMMAND; * * select timestamp / 60 as minute, sc_status, count(*) from access_log * group by minute, sc_status * order by minute, sc_status desc; * (use group_concat() here?) * * The "itemfrom()" function parses the group and lets you specify an * expression to query the contents. * * For 'report-on' command: * 'report-on PWD' * select PWD,count(*) as amount from logline group by PWD order by amount desc; * 'report-on num_col num_col2' * select avg(num_col),stddev(num_col),... from logline; * Instead of a command, we should automatically create views with the * relevant select statements. * * Add a tojson() aggregate function to sqlite: * select foo,tojson(bar) group by foo; * * 1 ["a", "b", "c"] * 2 ["d", "e", "f"] * * We should automatically detect sqlite files provided on the command line * and attach the database. * * add a 'metadata' view that has all the metadata crud (sql tables/log) * * Add support for sqlite_log that writes to a temp file and is displayed in the * metadata view. * * Add a function that bookmarks all lines in the log view based on line_numbers * in the sql query result. * * select line_number from logline where A="b"; * hit 'y/Y' to move forward and backwards through sql results * hit 'R' key and all the lines are bookmarked * * add path manipulation functions like basename, dirname, splitext * * use the vt52_curses emulation to embed an editor for editing queries. 
For * example, you could hit 'ctrl+;' and it would split the window in half with * the bottom being used for nano. When the file was written, lnav should * notice and do a 'prepare' on the sql to make sure it is correct. * * Maybe add other tables for accessing lnav state. For example, you could do * a query to find log lines of interest and then insert their line numbers * into the 'bookmarks' table to create new user bookmarks. */ #define ELEMENT_LIST_T(var) var("" #var, __FILE__, __LINE__) #define PUSH_BACK(elem) push_back(elem, __FILE__, __LINE__) #define POP_FRONT(elem) pop_front(__FILE__, __LINE__) #define POP_BACK(elem) pop_back(__FILE__, __LINE__) #define SPLICE(pos, other, first, last) splice(pos, other, first, last, \ __FILE__, __LINE__) template void strip(Container &container, UnaryPredicate p) { while (!container.empty() && p(container.front())) { container.POP_FRONT(); } while (!container.empty() && p(container.back())) { container.POP_BACK(); } } enum data_format_state_t { DFS_ERROR = -1, DFS_INIT, DFS_KEY, DFS_EXPECTING_SEP, DFS_VALUE, }; struct data_format { data_format(const char *name, data_token_t appender = DT_INVALID, data_token_t terminator = DT_INVALID) : df_name(name), df_appender(appender), df_terminator(terminator) {}; const char * df_name; const data_token_t df_appender; const data_token_t df_terminator; }; data_format_state_t dfs_prefix_next(data_format_state_t state, data_token_t next_token); data_format_state_t dfs_semi_next(data_format_state_t state, data_token_t next_token); data_format_state_t dfs_comma_next(data_format_state_t state, data_token_t next_token); #define LIST_INIT_TRACE \ do { \ if (TRACE_FILE != NULL) { \ fprintf(TRACE_FILE, \ "%p %s:%d %s %s\n", \ this, \ fn, line, \ __func__, \ varname); \ } \ } while (false) #define LIST_DEINIT_TRACE \ do { \ if (TRACE_FILE != NULL) { \ fprintf(TRACE_FILE, \ "%p %s:%d %s\n", \ this, \ fn, line, \ __func__); \ } \ } while (false) #define ELEMENT_TRACE \ do { \ if (TRACE_FILE != 
NULL) { \ fprintf(TRACE_FILE, \ "%p %s:%d %s %s %d:%d\n", \ this, \ fn, line, \ __func__, \ data_scanner::token2name(elem.e_token), \ elem.e_capture.c_begin, \ elem.e_capture.c_end); \ } \ } while (false) #define LIST_TRACE \ do { \ if (TRACE_FILE != NULL) { \ fprintf(TRACE_FILE, \ "%p %s:%d %s\n", \ this, \ fn, line, \ __func__); \ } \ } while (false) #define SPLICE_TRACE \ do { \ if (TRACE_FILE != NULL) { \ fprintf(TRACE_FILE, \ "%p %s:%d %s %d %p %d:%d\n", \ this, \ fn, line, \ __func__, \ (int)std::distance(this->begin(), pos), \ &other, \ (int)std::distance(other.begin(), first), \ (int)std::distance(last, other.end())); \ } \ } while (false); #define POINT_TRACE(name) \ do { \ if (TRACE_FILE) { \ fprintf(TRACE_FILE, \ "0x0 %s:%d point %s\n", \ __FILE__, __LINE__, \ name); \ } \ } while (false); class data_parser { public: static data_format FORMAT_SEMI; static data_format FORMAT_COMMA; static data_format FORMAT_PLAIN; static FILE *TRACE_FILE; typedef byte_array schema_id_t; struct element; /* typedef std::list element_list_t; */ class element_list_t : public std::list { public: element_list_t(const char *varname, const char *fn, int line) { LIST_INIT_TRACE; } element_list_t() { const char *varname = "_anon2_"; const char *fn = __FILE__; int line = __LINE__; LIST_INIT_TRACE; }; ~element_list_t() { const char *fn = __FILE__; int line = __LINE__; LIST_DEINIT_TRACE; }; void push_back(const element &elem, const char *fn, int line) { ELEMENT_TRACE; this->std::list::push_back(elem); }; void pop_front(const char *fn, int line) { LIST_TRACE; this->std::list::pop_front(); }; void pop_back(const char *fn, int line) { LIST_TRACE; this->std::list::pop_back(); }; void splice(iterator pos, element_list_t &other, iterator first, iterator last, const char *fn, int line) { SPLICE_TRACE; this->std::list::splice(pos, other, first, last); } }; struct element { element() : e_token(DT_INVALID), e_sub_elements(NULL) { }; element(element_list_t &subs, data_token_t token, bool 
assign_subs_elements = true) : e_capture(subs.front().e_capture.c_begin, subs.back().e_capture.c_end), e_token(token), e_sub_elements(NULL) { if (assign_subs_elements) { this->assign_elements(subs); } }; element(const element &other) { /* assert(other.e_sub_elements == NULL); */ this->e_capture = other.e_capture; this->e_token = other.e_token; this->e_sub_elements = NULL; if (other.e_sub_elements != NULL) { this->assign_elements(*other.e_sub_elements); } }; ~element() { if (this->e_sub_elements != NULL) { delete this->e_sub_elements; this->e_sub_elements = NULL; } }; element & operator=(const element &other) { this->e_capture = other.e_capture; this->e_token = other.e_token; this->e_sub_elements = NULL; if (other.e_sub_elements != NULL) { this->assign_elements(*other.e_sub_elements); } return *this; }; void assign_elements(element_list_t &subs) { if (this->e_sub_elements == NULL) { this->e_sub_elements = new element_list_t("_sub_", __FILE__, __LINE__); } this->e_sub_elements->swap(subs); this->update_capture(); }; void update_capture(void) { if (this->e_sub_elements != NULL) { this->e_capture.c_begin = this->e_sub_elements->front().e_capture.c_begin; this->e_capture.c_end = this->e_sub_elements->back().e_capture.c_end; } }; const element & get_pair_value(void) const { assert(this->e_token == DNT_PAIR); return this->e_sub_elements->back(); }; data_token_t value_token(void) const { data_token_t retval = DT_INVALID; if (this->e_token == DNT_VALUE) { if (this->e_sub_elements != NULL && this->e_sub_elements->size() == 1) { retval = this->e_sub_elements->front().e_token; } } else { retval = this->e_token; } return retval; }; void print(FILE *out, pcre_input &pi, int offset = 0) { int lpc; if (this->e_sub_elements != NULL) { for (element_list_t::iterator iter2 = this->e_sub_elements->begin(); iter2 != this->e_sub_elements->end(); ++iter2) { iter2->print(out, pi, offset + 1); } } fprintf(out, "%4s %3d:%-3d ", data_scanner::token2name(this->e_token), 
this->e_capture.c_begin, this->e_capture.c_end); for (lpc = 0; lpc < this->e_capture.c_end; lpc++) { if (lpc == this->e_capture.c_begin) { fputc('^', out); } else if (lpc == (this->e_capture.c_end - 1)) { fputc('^', out); } else if (lpc > this->e_capture.c_begin) { fputc('-', out); } else{ fputc(' ', out); } } for (; lpc < (int)pi.pi_length; lpc++) { fputc(' ', out); } std::string sub = pi.get_substr(&this->e_capture); fprintf(out, " %s\n", sub.c_str()); }; pcre_context::capture_t e_capture; data_token_t e_token; element_list_t * e_sub_elements; }; struct element_cmp { bool operator()(data_token_t token, const element &elem) const { return token == elem.e_token || token == DT_ANY; }; bool operator()(const element &elem, data_token_t token) const { return (*this)(token, elem); }; }; struct element_if { element_if(data_token_t token) : ei_token(token) { }; bool operator()(const element &a) const { return a.e_token == this->ei_token; }; private: data_token_t ei_token; }; data_parser(data_scanner *ds) : dp_errors("dp_errors", __FILE__, __LINE__), dp_pairs("dp_pairs", __FILE__, __LINE__), dp_format(NULL), dp_scanner(ds) { if (TRACE_FILE != NULL) { fprintf(TRACE_FILE, "input %s\n", ds->get_input().get_string()); } }; void pairup(schema_id_t *schema, element_list_t &pairs_out, element_list_t &in_list) { element_list_t ELEMENT_LIST_T(el_stack), ELEMENT_LIST_T(free_row), ELEMENT_LIST_T(key_comps), ELEMENT_LIST_T(value), ELEMENT_LIST_T(prefix); SHA_CTX context; POINT_TRACE("pairup_start"); for (element_list_t::iterator iter = in_list.begin(); iter != in_list.end(); ++iter) { if (iter->e_token == DNT_GROUP) { element_list_t ELEMENT_LIST_T(group_pairs); this->pairup(NULL, group_pairs, *iter->e_sub_elements); if (!group_pairs.empty()) { iter->assign_elements(group_pairs); } } if (iter->e_token == this->dp_format->df_terminator) { std::vector key_copy; value.SPLICE(value.end(), key_comps, key_comps.begin(), key_comps.end()); 
value.remove_if(element_if(this->dp_format->df_terminator)); strip(value, element_if(DT_WHITE)); value.remove_if(element_if(DT_COMMA)); if (!value.empty()) { el_stack.PUSH_BACK(element(value, DNT_VALUE)); } value.clear(); key_comps.PUSH_BACK(*iter); } else if (iter->e_token == DT_SEPARATOR) { element_list_t::iterator key_iter = key_comps.end(); bool found = false; do { --key_iter; if (key_iter->e_token == this->dp_format->df_appender) { ++key_iter; value.SPLICE(value.end(), key_comps, key_comps.begin(), key_iter); key_comps.POP_FRONT(); found = true; } else if (key_iter->e_token == this->dp_format->df_terminator) { std::vector key_copy; value.SPLICE(value.end(), key_comps, key_comps.begin(), key_iter); ++key_iter; key_comps.POP_FRONT(); strip(key_comps, element_if(DT_WHITE)); found = true; } } while (key_iter != key_comps.begin() && !found); if (!found && !el_stack.empty() && !key_comps.empty()) { element_list_t::iterator value_iter; if (el_stack.size() > 1 && this->dp_format->df_appender != DT_INVALID && this->dp_format->df_terminator != DT_INVALID) { /* If we're expecting a terminator and haven't found it */ /* then this is part of the value. 
*/ continue; } value.SPLICE(value.end(), key_comps, key_comps.begin(), key_comps.end()); value_iter = value.end(); std::advance(value_iter, -1); key_comps.SPLICE(key_comps.begin(), value, value_iter, value.end()); key_comps.resize(1); } strip(value, element_if(DT_WHITE)); value.remove_if(element_if(DT_COMMA)); if (!value.empty()) { el_stack.PUSH_BACK(element(value, DNT_VALUE)); } strip(key_comps, element_if(DT_WHITE)); if (!key_comps.empty()) { el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false)); } key_comps.clear(); value.clear(); } else { key_comps.PUSH_BACK(*iter); } POINT_TRACE("pairup_loop"); } POINT_TRACE("pairup_eol"); if (el_stack.empty()) { free_row.SPLICE(free_row.begin(), key_comps, key_comps.begin(), key_comps.end()); } else { value.SPLICE(value.begin(), key_comps, key_comps.begin(), key_comps.end()); value.remove_if(element_if(this->dp_format->df_terminator)); strip(value, element_if(DT_WHITE)); value.remove_if(element_if(DT_COMMA)); if (!value.empty()) { el_stack.PUSH_BACK(element(value, DNT_VALUE)); } } POINT_TRACE("pairup_stack"); SHA_Init(&context); while (!el_stack.empty()) { element_list_t::iterator kv_iter = el_stack.begin(); if (kv_iter->e_token == DNT_VALUE) { if (pairs_out.empty()) { free_row.PUSH_BACK(el_stack.front()); } else { element_list_t ELEMENT_LIST_T(free_pair_subs); struct element blank; blank.e_capture.c_begin = blank.e_capture.c_end = el_stack.front().e_capture. 
c_begin; blank.e_token = DNT_KEY; free_pair_subs.PUSH_BACK(blank); free_pair_subs.PUSH_BACK(el_stack.front()); pairs_out.PUSH_BACK(element(free_pair_subs, DNT_PAIR)); } } if (kv_iter->e_token != DNT_KEY) { el_stack.POP_FRONT(); continue; } ++kv_iter; if (kv_iter == el_stack.end()) { el_stack.POP_FRONT(); continue; } if (kv_iter->e_token != DNT_VALUE) { el_stack.POP_FRONT(); continue; } std::string key_val = this->get_element_string(el_stack.front()); element_list_t ELEMENT_LIST_T(pair_subs); if (schema != NULL) { SHA_Update(&context, key_val.c_str(), key_val.length()); } while (!free_row.empty()) { element_list_t ELEMENT_LIST_T(free_pair_subs); struct element blank; blank.e_capture.c_begin = blank.e_capture.c_end = free_row.front().e_capture. c_begin; blank.e_token = DNT_KEY; free_pair_subs.PUSH_BACK(blank); free_pair_subs.PUSH_BACK(free_row.front()); pairs_out.PUSH_BACK(element(free_pair_subs, DNT_PAIR)); free_row.POP_FRONT(); } ++kv_iter; pair_subs.SPLICE(pair_subs.begin(), el_stack, el_stack.begin(), kv_iter); pairs_out.PUSH_BACK(element(pair_subs, DNT_PAIR)); } if (pairs_out.size() == 1) { element &pair = pairs_out.front(); element &value = pair.e_sub_elements->back(); if (value.e_token == DNT_VALUE && value.e_sub_elements != NULL && value.e_sub_elements->size() > 1) { element_list_t::iterator next_sub; next_sub = pair.e_sub_elements->begin(); ++next_sub; prefix.SPLICE(prefix.begin(), *pair.e_sub_elements, pair.e_sub_elements->begin(), next_sub); free_row.clear(); free_row.SPLICE(free_row.begin(), *value.e_sub_elements, value.e_sub_elements->begin(), value.e_sub_elements->end()); pairs_out.clear(); SHA_Init(&context); } } if (pairs_out.empty() && !free_row.empty()) { while (!free_row.empty()) { switch (free_row.front().e_token) { case DNT_GROUP: case DNT_VALUE: case DT_EMAIL: case DT_CONSTANT: case DT_NUMBER: case DT_SYMBOL: case DT_HEX_NUMBER: case DT_OCTAL_NUMBER: case DT_VERSION_NUMBER: case DT_QUOTED_STRING: case DT_IPV4_ADDRESS: case DT_IPV6_ADDRESS: case 
DT_MAC_ADDRESS: case DT_UUID: case DT_URL: case DT_PATH: case DT_TIME: case DT_PERCENTAGE: { element_list_t ELEMENT_LIST_T(pair_subs); struct element blank; blank.e_capture.c_begin = blank.e_capture.c_end = free_row.front().e_capture. c_begin; blank.e_token = DNT_KEY; pair_subs.PUSH_BACK(blank); pair_subs.PUSH_BACK(free_row.front()); pairs_out.PUSH_BACK(element(pair_subs, DNT_PAIR)); } break; default: { std::string key_val = this->get_element_string( free_row.front()); SHA_Update(&context, key_val.c_str(), key_val.length()); } break; } free_row.POP_FRONT(); } } if (!prefix.empty()) { element_list_t ELEMENT_LIST_T(pair_subs); struct element blank; blank.e_capture.c_begin = blank.e_capture.c_end = prefix.front().e_capture.c_begin; blank.e_token = DNT_KEY; pair_subs.PUSH_BACK(blank); pair_subs.PUSH_BACK(prefix.front()); pairs_out.push_front(element(pair_subs, DNT_PAIR)); } if (schema != NULL) { SHA_Final(this->dp_schema_id.ba_data, &context); } }; void discover_format(void) { pcre_context_static<30> pc; int hist[DT_TERMINAL_MAX]; struct element elem; this->dp_group_token.push_back(DT_INVALID); this->dp_group_stack.resize(1); data_format_state_t prefix_state = DFS_INIT; data_format_state_t semi_state = DFS_INIT; data_format_state_t comma_state = DFS_INIT; memset(hist, 0, sizeof(hist)); while (this->dp_scanner->tokenize(pc, elem.e_token)) { pcre_context::iterator pc_iter; pc_iter = std::find_if(pc.begin(), pc.end(), capture_if_not(-1)); assert(pc_iter != pc.end()); elem.e_capture = *pc_iter; assert(elem.e_capture.c_begin != -1); assert(elem.e_capture.c_end != -1); prefix_state = dfs_prefix_next(prefix_state, elem.e_token); semi_state = dfs_semi_next(semi_state, elem.e_token); comma_state = dfs_comma_next(comma_state, elem.e_token); if (prefix_state != DFS_ERROR) { if (semi_state == DFS_ERROR) { semi_state = DFS_INIT; } if (comma_state == DFS_ERROR) { comma_state = DFS_INIT; } } hist[elem.e_token] += 1; switch (elem.e_token) { case DT_LPAREN: case DT_LANGLE: case 
DT_LCURLY: case DT_LSQUARE: this->dp_group_token.push_back(elem.e_token); this->dp_group_stack.push_back(element_list_t("_anon_", __FILE__, __LINE__)); break; case DT_RPAREN: case DT_RANGLE: case DT_RCURLY: case DT_RSQUARE: if (this->dp_group_token.back() == (elem.e_token - 1)) { this->dp_group_token.pop_back(); std::list::reverse_iterator riter = this->dp_group_stack.rbegin(); ++riter; if (!this->dp_group_stack.back().empty()) { (*riter).PUSH_BACK(element(this->dp_group_stack.back(), DNT_GROUP)); } this->dp_group_stack.pop_back(); } else { this->dp_group_stack.back().PUSH_BACK(elem); } break; default: this->dp_group_stack.back().PUSH_BACK(elem); break; } } while (this->dp_group_stack.size() > 1) { this->dp_group_token.pop_back(); std::list::reverse_iterator riter = this->dp_group_stack.rbegin(); ++riter; if (!this->dp_group_stack.back().empty()) { (*riter).PUSH_BACK(element(this->dp_group_stack.back(), DNT_GROUP)); } this->dp_group_stack.pop_back(); } if (semi_state != DFS_ERROR && hist[DT_SEMI]) { this->dp_format = &FORMAT_SEMI; } else if (comma_state != DFS_ERROR) { this->dp_format = &FORMAT_COMMA; } else { this->dp_format = &FORMAT_PLAIN; } }; void parse(void) { this->discover_format(); this->pairup(&this->dp_schema_id, this->dp_pairs, this->dp_group_stack.front()); for (element_list_t::iterator iter = this->dp_pairs.begin(); iter != this->dp_pairs.end(); ++iter) { if (iter->e_token == DNT_PAIR) { element_list_t &pair_subs = *iter->e_sub_elements; std::string key_val = this->get_element_string( pair_subs.front()); } } }; std::string get_element_string(const element &elem) const { pcre_input &pi = this->dp_scanner->get_input(); return pi.get_substr(&elem.e_capture); }; void print(FILE *out, element_list_t &el) { fprintf(out, " %s\n", this->dp_scanner->get_input().get_string()); for (element_list_t::iterator iter = el.begin(); iter != el.end(); ++iter) { iter->print(out, this->dp_scanner->get_input()); } }; std::vector dp_group_token; std::list dp_group_stack; 
element_list_t dp_errors; element_list_t dp_pairs; schema_id_t dp_schema_id; data_format * dp_format; private: data_scanner *dp_scanner; }; #endif