lnav/src/data_scanner.cc

350 lines
11 KiB
C++
Raw Normal View History

/**
 * Copyright (c) 2007-2012, Timothy Stack
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * * Neither the name of Timothy Stack nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
2011-06-20 05:30:10 +00:00
2011-06-13 14:46:03 +00:00
#include "config.h"
#include <arpa/inet.h>
2013-06-10 13:39:52 +00:00
#include <netinet/in.h>
#include <sys/socket.h>
2011-06-13 14:46:03 +00:00
#include "pcrepp.hh"
#include "data_scanner.hh"
using namespace std;
static struct {
const char *name;
2013-05-28 04:35:00 +00:00
pcrepp pcre;
} MATCHERS[DT_TERMINAL_MAX] = {
{ "quot", pcrepp("\\A(?:(?:u|r)?\"((?:\\\\.|[^\"])+)\"|"
"(?:u|r)?'((?:\\\\.|[^'])+)')"), },
2013-07-23 12:55:08 +00:00
{ "url", pcrepp("\\A([\\w]+://[^\\s'\"\\[\\](){}]+[/a-zA-Z0-9\\-=&])"),
2013-05-28 04:35:00 +00:00
},
2013-06-16 01:07:50 +00:00
{ "path", pcrepp("\\A((?:/|\\./|\\.\\./)[\\w\\.\\-_\\~/]*)"),
},
2013-05-28 04:35:00 +00:00
{ "mac", pcrepp(
"\\A([0-9a-fA-F][0-9a-fA-F](?::[0-9a-fA-F][0-9a-fA-F]){5})(?!:)"), },
{ "date",
pcrepp("\\A("
"\\d{4}/\\d{1,2}/\\d{1,2}|"
"\\d{4}-\\d{1,2}-\\d{1,2}|"
"\\d{2}/\\w{3}/\\d{4}"
")T?"), },
2013-05-28 04:35:00 +00:00
{ "time", pcrepp(
"\\A([\\s\\d]\\d:\\d\\d(?:(?!:\\d)|:\\d\\d(?:[\\.,]\\d{3,6})?Z?))\\b"), },
2013-06-13 05:36:31 +00:00
/* { "qual", pcrepp("\\A([^\\s:=]+:[^\\s:=,]+(?!,)(?::[^\\s:=,]+)*)"), }, */
{ "ipv6", pcrepp("\\A(::|[:\\da-fA-F\\.]+[a-fA-F\\d](?:%\\w+)?)"), },
2014-03-09 19:55:02 +00:00
{ "hexd", pcrepp(
"\\A([0-9a-fA-F][0-9a-fA-F](?::[0-9a-fA-F][0-9a-fA-F])+)"), },
2013-05-28 04:35:00 +00:00
{ "xmlt", pcrepp(
"\\A(<\\??[\\w:]+\\s*(?:[\\w:]+(?:\\s*=\\s*"
"(?:\"((?:\\\\.|[^\"])+)\"|'((?:\\\\.|[^'])+)'|[^>]+)"
"))*\\s*(?:/|\\?)>)"), },
{ "xmlo", pcrepp(
"\\A(<[\\w:]+\\s*(?:[\\w:]+(?:\\s*=\\s*"
"(?:\"((?:\\\\.|[^\"])+)\"|'((?:\\\\.|[^'])+)'|[^>]+)"
"))*\\s*>)"), },
{ "xmlc", pcrepp("\\A(</[\\w:]+\\s*>)"), },
{ "coln", pcrepp("\\A(:)"),
},
{ "eq", pcrepp("\\A(=)"),
2013-05-28 04:35:00 +00:00
},
{ "comm", pcrepp("\\A(,)"),
2013-05-28 04:35:00 +00:00
},
{ "semi", pcrepp("\\A(;)"),
2013-05-28 04:35:00 +00:00
},
{ "lcurly", pcrepp("\\A({)"),
2013-05-28 04:35:00 +00:00
},
{ "rcurly", pcrepp("\\A(})"),
2013-05-28 04:35:00 +00:00
},
{ "lsquare", pcrepp("\\A(\\[)"),
2013-05-28 04:35:00 +00:00
},
{ "rsquare", pcrepp("\\A(\\])"),
2013-05-28 04:35:00 +00:00
},
{ "lparen", pcrepp("\\A(\\()"),
2013-05-28 04:35:00 +00:00
},
{ "rparen", pcrepp("\\A(\\))"),
2013-05-28 04:35:00 +00:00
},
{ "langle", pcrepp("\\A(\\<)"),
2013-05-28 04:35:00 +00:00
},
{ "rangle", pcrepp("\\A(\\>)"),
2013-05-28 04:35:00 +00:00
},
{ "ipv4", pcrepp("\\A("
"(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\\.){3}"
"(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(?![\\d]))"),
2013-05-28 04:35:00 +00:00
},
2013-05-28 04:35:00 +00:00
{ "uuid", pcrepp(
2013-06-16 01:07:50 +00:00
"\\A([0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12})"), },
2013-05-28 04:35:00 +00:00
2014-03-11 10:49:47 +00:00
{ "vers", pcrepp(
"\\A("
"[0-9]+(?:\\.[0-9]+\\w*){2,}(?:-\\w+)?|"
"[0-9]+(?:\\.[0-9]+\\w*)+(?<!\\d[eE])-\\w+?"
2014-03-11 10:49:47 +00:00
")\\b"),
2013-05-28 04:35:00 +00:00
},
{ "oct", pcrepp("\\A(-?0[0-7]+\\b)"),
2013-05-28 04:35:00 +00:00
},
{ "pcnt", pcrepp("\\A(-?[0-9]+(\\.[0-9]+)?[ ]*%\\b)"),
2013-05-28 04:35:00 +00:00
},
{ "num", pcrepp("\\A(-?[0-9]+(\\.[0-9]+)?([eE][\\-+][0-9]+)?)"
2013-06-16 01:07:50 +00:00
"\\b(?![\\._\\-][a-zA-Z])"), },
2013-06-15 01:41:31 +00:00
{ "hex", pcrepp("\\A(-?(?:0x|[0-9])[0-9a-fA-F]+)"
2013-06-16 01:07:50 +00:00
"\\b(?![\\._\\-][a-zA-Z])"), },
2013-05-28 04:35:00 +00:00
2013-06-16 01:07:50 +00:00
{ "mail", pcrepp(
"\\A([a-zA-Z0-9\\._%+-]+@[a-zA-Z0-9\\.-]+\\.[a-zA-Z]+)\\b"), },
{ "cnst",
pcrepp("\\A(true|True|TRUE|false|False|FALSE|None|null)\\b") },
2013-05-28 04:35:00 +00:00
{ "word", pcrepp(
2013-06-16 01:07:50 +00:00
"\\A([a-zA-Z][a-z']+(?=[\\s\\(\\)!\\*:;'\\\"\\?,]|[\\.\\!,\\?]\\s|$))"),
},
{ "sym", pcrepp(
2013-06-16 01:07:50 +00:00
"\\A([^\";\\s:=,\\(\\)\\{\\}\\[\\]\\+#!@%\\^&\\*'\\?<>\\~`\\|\\\\]+"
"(?:::[^\";\\s:=,\\(\\)\\{\\}\\[\\]\\+#!@%\\^&\\*'\\?<>\\~`\\|\\\\]+)*)"),
2013-05-28 04:35:00 +00:00
},
{ "line", pcrepp("\\A(\r?\n|\r|;)"),
2013-05-28 04:35:00 +00:00
},
{ "wspc", pcrepp("\\A([ \\r\\t\\n]+)"),
2013-05-28 04:35:00 +00:00
},
{ "dot", pcrepp("\\A(\\.)"),
2013-05-28 04:35:00 +00:00
},
{ "gbg", pcrepp("\\A(.)"),
2013-05-28 04:35:00 +00:00
},
2011-06-13 14:46:03 +00:00
};
/**
 * Short display names for the non-terminal token types.
 *
 * data_scanner::token2name() indexes this array with (token - DNT_KEY), so
 * entry 0 names DNT_KEY and the array holds DNT_MAX - DNT_KEY slots.
 * NOTE(review): the remaining entries presumably follow the declaration
 * order of the DNT_* enumerators -- confirm against the enum before
 * reordering or inserting names here.
 */
const char *DNT_NAMES[DNT_MAX - DNT_KEY] = {
"key",
"pair",
"val",
"row",
"unit",
"meas",
"var",
"rang",
"dt",
"grp",
};
/**
 * Map a token type to its short display name.
 *
 * @param token The token type to look up.
 * @return "inv" for negative values, the MATCHERS table name for terminal
 * tokens, "any" for DT_ANY and the DNT_NAMES entry for non-terminal tokens.
 * Any other value is also reported as "inv" rather than indexing outside
 * of DNT_NAMES.
 */
const char *data_scanner::token2name(data_token_t token)
{
    if (token < 0) {
        return "inv";
    }
    if (token < DT_TERMINAL_MAX) {
        return MATCHERS[token].name;
    }
    if (token == DT_ANY) {
        return "any";
    }
    /*
     * Only values in [DNT_KEY, DNT_MAX) have a slot in DNT_NAMES; guard the
     * lookup so an unexpected value cannot read out of bounds.
     */
    if (DNT_KEY <= token && token < DNT_MAX) {
        return DNT_NAMES[token - DNT_KEY];
    }
    return "inv";
}
/**
 * Scan forward for the terminator of a quoted string, honoring backslash
 * escapes.
 *
 * @param str The buffer being scanned.
 * @param start In: index of the first character to examine.  Out: on
 * success, the index just past the terminator; if the buffer ends with a
 * dangling backslash, the index of that backslash; otherwise the index
 * where scanning stopped (length, unless start was already beyond it).
 * @param length The number of valid characters in str.
 * @param term The terminating quote character.
 * @return True if an unescaped terminator was found before length.
 */
static
bool find_string_end(const char *str, size_t &start, size_t length, char term)
{
    while (start < length) {
        const char ch = str[start];

        if (ch == term) {
            ++start;
            return true;
        }
        if (ch == '\\') {
            /* A backslash with nothing after it cannot escape anything. */
            if (length - start <= 1) {
                return false;
            }
            /* Step over the backslash; the escaped char is skipped below. */
            ++start;
        }
        ++start;
    }

    return false;
}
/**
 * Record a one-character match located at the input's current offset.
 *
 * Capture 0 and capture 1 are both set to the range
 * [pi_offset, pi_offset + 1), the capture count is set to two and the
 * input's next offset is moved just past the character.
 *
 * @param pc The context that receives the captures.
 * @param pi The input whose pi_offset marks the matched character.
 */
static
void single_char_capture(pcre_context &pc, pcre_input &pi)
{
    /* Advance the scanner first; the captures below end at this offset. */
    pi.pi_next_offset = pi.pi_offset + 1;

    pc.all()->c_begin = pi.pi_offset;
    pc.all()->c_end = pi.pi_next_offset;
    /* The sub-capture is the same single character as the whole match. */
    pc.all()[1] = *pc.all();
    pc.set_count(2);
}
2011-06-13 14:46:03 +00:00
bool data_scanner::tokenize(pcre_context &pc, data_token_t &token_out)
{
const char *str = this->ds_pcre_input.get_string();
2013-06-16 01:07:50 +00:00
pcre_input &pi = this->ds_pcre_input;
int lpc;
2011-06-13 14:46:03 +00:00
token_out = data_token_t(-1);
2013-05-28 04:35:00 +00:00
if (this->ds_pcre_input.pi_next_offset > this->ds_pcre_input.pi_length) {
2013-05-28 04:35:00 +00:00
return false;
}
else if (this->ds_pcre_input.pi_next_offset ==
2013-05-28 04:35:00 +00:00
this->ds_pcre_input.pi_length) {
this->ds_pcre_input.pi_next_offset += 1;
token_out = DT_LINE;
return false;
}
2011-06-13 14:46:03 +00:00
for (lpc = 0; lpc < DT_TERMINAL_MAX; lpc++) {
2013-05-28 04:35:00 +00:00
switch (lpc) {
case DT_QUOTED_STRING: {
pcre_input &pi = this->ds_pcre_input;
const char *str = pi.get_string();
size_t str_start, str_end;
bool found = false;
2014-03-09 19:55:02 +00:00
2013-05-28 04:35:00 +00:00
pi.pi_offset = pi.pi_next_offset;
str_end = str_start = pi.pi_offset + 1;
switch (str[pi.pi_offset]) {
case 'u':
case 'r':
if (pi.pi_offset + 1 < pi.pi_length &&
(str[pi.pi_offset + 1] == '\'' ||
str[pi.pi_offset + 1] == '\"')) {
str_start += 1;
str_end += 1;
found = find_string_end(str,
str_end,
pi.pi_length,
2014-03-09 19:55:02 +00:00
str[pi.pi_offset + 1]);
2013-05-28 04:35:00 +00:00
}
break;
case '\'':
case '\"':
found = find_string_end(str,
str_end,
pi.pi_length,
str[pi.pi_offset]);
break;
}
if (found) {
token_out = data_token_t(DT_QUOTED_STRING);
pi.pi_next_offset = str_end;
pc.all()[0].c_begin = pi.pi_offset;
pc.all()[0].c_end = str_end;
pc.all()[1].c_begin = str_start;
pc.all()[1].c_end = str_end - 1;
pc.set_count(2);
return true;
}
}
break;
case DT_COLON: {
pi.pi_offset = pi.pi_next_offset;
if (str[pi.pi_offset] == ':') {
token_out = data_token_t(DT_COLON);
single_char_capture(pc, pi);
return true;
}
}
break;
case DT_EQUALS: {
pi.pi_offset = pi.pi_next_offset;
if (str[pi.pi_offset] == '=') {
token_out = data_token_t(DT_EQUALS);
single_char_capture(pc, pi);
return true;
}
}
break;
case DT_COMMA: {
pi.pi_offset = pi.pi_next_offset;
if (str[pi.pi_offset] == ',') {
2013-06-16 01:07:50 +00:00
token_out = data_token_t(DT_COMMA);
single_char_capture(pc, pi);
return true;
}
}
break;
case DT_SEMI: {
pi.pi_offset = pi.pi_next_offset;
if (str[pi.pi_offset] == ';') {
2013-06-16 01:07:50 +00:00
token_out = data_token_t(DT_SEMI);
single_char_capture(pc, pi);
return true;
2013-05-28 04:35:00 +00:00
}
}
break;
default:
if (MATCHERS[lpc].pcre.match(pc, this->ds_pcre_input,
PCRE_ANCHORED)) {
switch (lpc) {
2014-03-10 14:32:22 +00:00
case DT_IPV6_ADDRESS:
if (pc.all()->length() <= INET6_ADDRSTRLEN) {
char in6str[INET6_ADDRSTRLEN];
char buf[sizeof(struct in6_addr)];
this->ds_pcre_input.get_substr(pc.all(), in6str);
if (inet_pton(AF_INET6, in6str, buf) == 1) {
token_out = data_token_t(lpc);
return true;
}
}
this->ds_pcre_input.pi_next_offset =
this->ds_pcre_input.pi_offset;
break;
2013-05-28 04:35:00 +00:00
default:
token_out = data_token_t(lpc);
return true;
}
2013-05-28 04:35:00 +00:00
}
break;
}
2011-06-13 14:46:03 +00:00
}
ensure((0 <= token_out && token_out < DT_TERMINAL_MAX));
2011-06-13 14:46:03 +00:00
return true;
2011-06-13 14:46:03 +00:00
}