2013-05-03 06:02:03 +00:00
|
|
|
/**
|
|
|
|
* Copyright (c) 2007-2012, Timothy Stack
|
|
|
|
*
|
|
|
|
* All rights reserved.
|
2013-05-28 04:35:00 +00:00
|
|
|
*
|
2013-05-03 06:02:03 +00:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
2013-05-28 04:35:00 +00:00
|
|
|
*
|
2013-05-03 06:02:03 +00:00
|
|
|
* * Redistributions of source code must retain the above copyright notice, this
|
|
|
|
* list of conditions and the following disclaimer.
|
|
|
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
|
|
* and/or other materials provided with the distribution.
|
|
|
|
* * Neither the name of Timothy Stack nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
2013-05-28 04:35:00 +00:00
|
|
|
*
|
2013-05-03 06:02:03 +00:00
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
|
|
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
|
|
|
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
|
|
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
2011-06-20 05:30:10 +00:00
|
|
|
|
2011-06-13 14:46:03 +00:00
|
|
|
#include "config.h"
|
|
|
|
|
2013-05-24 14:55:56 +00:00
|
|
|
#include <arpa/inet.h>
|
2013-06-10 13:39:52 +00:00
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <sys/socket.h>
|
2013-05-24 14:55:56 +00:00
|
|
|
|
2011-06-13 14:46:03 +00:00
|
|
|
#include "pcrepp.hh"
|
|
|
|
#include "data_scanner.hh"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
2011-06-18 20:42:07 +00:00
|
|
|
static struct {
|
|
|
|
const char *name;
|
2013-05-28 04:35:00 +00:00
|
|
|
pcrepp pcre;
|
2011-06-18 20:42:07 +00:00
|
|
|
} MATCHERS[DT_TERMINAL_MAX] = {
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "quot", pcrepp("\\A(?:(?:u|r)?\"((?:\\\\.|[^\"])+)\"|"
|
|
|
|
"(?:u|r)?'((?:\\\\.|[^'])+)')"), },
|
|
|
|
{ "url", pcrepp("\\A([\\w]+://[^\\s'\"\\[\\](){}]+[a-zA-Z0-9\\-=&])"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "path", pcrepp("\\A((?:/|\\./|\\.\\./)[\\w\\.\\-_\\~/]*)"), },
|
2013-05-28 04:35:00 +00:00
|
|
|
{ "mac", pcrepp(
|
2013-06-12 04:10:59 +00:00
|
|
|
"\\A([0-9a-fA-F][0-9a-fA-F](?::[0-9a-fA-F][0-9a-fA-F]){5})"), },
|
2013-05-28 04:35:00 +00:00
|
|
|
{ "time", pcrepp(
|
2013-06-13 05:36:31 +00:00
|
|
|
"\\A(\\d?\\d:\\d\\d(:\\d\\d)?(:\\d\\d)?([,.]\\d{3})?)\\b"), }, /* XXX be more specific */
|
|
|
|
/* { "qual", pcrepp("\\A([^\\s:=]+:[^\\s:=,]+(?!,)(?::[^\\s:=,]+)*)"), }, */
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "ipv6", pcrepp("\\A(::|[:\\da-fA-f\\.]+[a-fA-f\\d])"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "sep", pcrepp("\\A(:|=)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "comm", pcrepp("\\A(,)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "semi", pcrepp("\\A(;)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "lcurly", pcrepp("\\A({)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "rcurly", pcrepp("\\A(})"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "lsquare", pcrepp("\\A(\\[)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "rsquare", pcrepp("\\A(\\])"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "lparen", pcrepp("\\A(\\()"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "rparen", pcrepp("\\A(\\))"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "langle", pcrepp("\\A(\\<)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "rangle", pcrepp("\\A(\\>)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "ipv4", pcrepp("\\A(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
{ "uuid", pcrepp(
|
2013-06-12 04:10:59 +00:00
|
|
|
"\\A([0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12})"), },
|
2013-05-28 04:35:00 +00:00
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "vers", pcrepp("\\A([0-9]+(?:\\.[0-9]+){2,}\\b)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "oct", pcrepp("\\A(-?0[0-7]+\\b)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "pcnt", pcrepp("\\A(-?[0-9]+(\\.[0-9]+)?[ ]*%\\b)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-15 01:41:31 +00:00
|
|
|
{ "num", pcrepp("\\A(-?[0-9]+(\\.[0-9]+)?([eE][-+][0-9]+)?)"
|
|
|
|
"\\b(?![\\._\\-][a-zA-Z])"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-15 01:41:31 +00:00
|
|
|
{ "hex", pcrepp("\\A(-?(?:0x|[0-9])[0-9a-fA-F]+)"
|
|
|
|
"\\b(?![\\._\\-][a-zA-Z])"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-13 05:36:31 +00:00
|
|
|
{ "mail", pcrepp("\\A([a-zA-Z0-9\\._%+-]+@[a-zA-Z0-9\\.-]+\\.[a-zA-Z]+)\\b"), },
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "cnst", pcrepp("\\A(true|True|TRUE|false|False|FALSE|None|null)\\b") },
|
2013-05-28 04:35:00 +00:00
|
|
|
{ "word", pcrepp(
|
2013-06-12 04:10:59 +00:00
|
|
|
"\\A([a-zA-Z][a-z']+(?=[\\s\\(\\)!\\*:;'\\\"\\?,]|[\\.\\!,\\?]\\s|$))"), },
|
|
|
|
{ "sym", pcrepp(
|
2013-06-15 01:41:31 +00:00
|
|
|
"\\A([^\";\\s:=,\\(\\)\\{\\}\\[\\]\\+#!@%\\^&\\*'\\?<>\\~`\\|\\\\]+"
|
|
|
|
"(?:::[^\";\\s:=,\\(\\)\\{\\}\\[\\]\\+#!@%\\^&\\*'\\?<>\\~`\\|\\\\]+)*)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "line", pcrepp("\\A(\r?\n|\r|;)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "wspc", pcrepp("\\A([ \r\t]+)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "dot", pcrepp("\\A(\\.)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
{ "gbg", pcrepp("\\A(.)"),
|
2013-05-28 04:35:00 +00:00
|
|
|
},
|
2011-06-13 14:46:03 +00:00
|
|
|
};
|
|
|
|
|
2013-05-24 14:55:56 +00:00
|
|
|
/**
 * Short names for the non-terminal token types.  token2name() indexes this
 * table with (token - DNT_KEY), so the first entry names DNT_KEY and the
 * remaining entries follow in order.
 */
const char *DNT_NAMES[DNT_MAX - DNT_KEY] = {
    "key",  "pair", "val",  "row",  "unit",
    "meas", "var",  "rang", "date", "grp",
};
|
|
|
|
|
|
|
|
/**
 * Map a token type to its short, human-readable name.
 *
 * @param token The token type to look up.
 * @return "inv" for negative values, the MATCHERS name for terminal tokens,
 * "any" for DT_ANY, and the DNT_NAMES entry for tokens in the range
 * [DNT_KEY, DNT_MAX).  Any other value also yields "inv".
 */
const char *data_scanner::token2name(data_token_t token)
{
    if (token < 0) {
        return "inv";
    }
    if (token < DT_TERMINAL_MAX) {
        return MATCHERS[token].name;
    }
    if (token == DT_ANY) {
        return "any";
    }
    if (DNT_KEY <= token && token < DNT_MAX) {
        return DNT_NAMES[token - DNT_KEY];
    }

    /*
     * DNT_NAMES only has (DNT_MAX - DNT_KEY) entries, so a value outside
     * that range must not be used as an index; report it as invalid rather
     * than reading past either end of the array.
     */
    return "inv";
}
|
|
|
|
|
2013-05-24 14:55:56 +00:00
|
|
|
/**
 * Scan forward for the character that closes a quoted string.
 *
 * A backslash escapes the character that follows it, so an escaped
 * terminator does not end the string.
 *
 * @param str The buffer being scanned.
 * @param start In: the index at which to begin scanning.  Out: on success,
 * the index just past the terminator.  On failure it is left at the trailing
 * backslash that ended the scan, or at/after the end of the buffer.
 * @param length The number of characters available in str.
 * @param term The terminating character to look for.
 * @return True if an unescaped terminator was found before length.
 */
static bool find_string_end(const char *str, size_t &start, size_t length, char term)
{
    while (start < length) {
        const char ch = str[start];

        if (ch == term) {
            ++start;
            return true;
        }

        if (ch == '\\') {
            /* A backslash with nothing after it cannot escape anything. */
            if (start + 1 >= length) {
                return false;
            }
            /* Step over the backslash; the escaped character is skipped below. */
            ++start;
        }

        ++start;
    }

    return false;
}
|
|
|
|
|
2013-06-12 04:10:59 +00:00
|
|
|
static
|
|
|
|
void single_char_capture(pcre_context &pc, pcre_input &pi)
|
|
|
|
{
|
|
|
|
pc.all()[0].c_begin = pi.pi_offset;
|
|
|
|
pc.all()[0].c_end = pi.pi_offset + 1;
|
|
|
|
pc.all()[1] = pc.all()[0];
|
|
|
|
pc.set_count(2);
|
|
|
|
pi.pi_next_offset = pi.pi_offset + 1;
|
|
|
|
}
|
|
|
|
|
2011-06-13 14:46:03 +00:00
|
|
|
/**
 * Extract the next token from the scanner's input.
 *
 * The terminal token types are tried in order, starting from the input's
 * pi_next_offset, and the first one that matches wins.  Quoted strings,
 * separators, commas and semicolons are recognized by hand; every other
 * type is matched with its pattern from MATCHERS.
 *
 * @param pc Receives the captures for the token: capture 0 is the whole
 * token and capture 1 is the text of interest (for a quoted string, the
 * contents without the prefix and quotes).
 * @param token_out Receives the type of the token.  It is set to -1 on
 * entry, and to DT_LINE when the scan position is exactly at the end of
 * the input.
 * @return True if a token was produced; false if the scan position is at or
 * past the end of the input.
 */
bool data_scanner::tokenize(pcre_context &pc, data_token_t &token_out)
{
    pcre_input &pi  = this->ds_pcre_input;
    const char *str = pi.get_string();
    int         lpc;

    token_out = data_token_t(-1);

    if (pi.pi_next_offset > pi.pi_length) {
        return false;
    }
    else if (pi.pi_next_offset == pi.pi_length) {
        /*
         * Exactly at the end: report DT_LINE once and step past the end so
         * that the next call takes the branch above.
         */
        pi.pi_next_offset += 1;
        token_out = DT_LINE;

        return false;
    }

    for (lpc = 0; lpc < DT_TERMINAL_MAX; lpc++) {
        switch (lpc) {
        case DT_QUOTED_STRING: {
            size_t str_start, str_end;
            bool   found = false;

            pi.pi_offset = pi.pi_next_offset;
            /* Assume an unprefixed string: contents begin after the quote. */
            str_end = str_start = pi.pi_offset + 1;
            switch (str[pi.pi_offset]) {
            case 'u':
            case 'r':
                if (pi.pi_offset + 1 < pi.pi_length &&
                    (str[pi.pi_offset + 1] == '\'' ||
                     str[pi.pi_offset + 1] == '\"')) {
                    /* Skip the prefix letter as well as the opening quote. */
                    str_start += 1;
                    str_end   += 1;
                    /*
                     * The terminator is the quote character that follows
                     * the prefix, not the prefix letter itself.
                     */
                    found = find_string_end(str,
                                            str_end,
                                            pi.pi_length,
                                            str[pi.pi_offset + 1]);
                }
                break;

            case '\'':
            case '\"':
                found = find_string_end(str,
                                        str_end,
                                        pi.pi_length,
                                        str[pi.pi_offset]);
                break;
            }
            if (found) {
                token_out            = data_token_t(DT_QUOTED_STRING);
                pi.pi_next_offset    = str_end;
                pc.all()[0].c_begin  = pi.pi_offset;
                pc.all()[0].c_end    = str_end;
                pc.all()[1].c_begin  = str_start;
                /* str_end is one past the closing quote. */
                pc.all()[1].c_end    = str_end - 1;
                pc.set_count(2);
                return true;
            }
        }
        break;

        case DT_SEPARATOR: {
            pi.pi_offset = pi.pi_next_offset;

            if (str[pi.pi_offset] == ':' ||
                str[pi.pi_offset] == '=') {
                token_out = data_token_t(DT_SEPARATOR);
                single_char_capture(pc, pi);
                return true;
            }
        }
        break;

        case DT_COMMA: {
            pi.pi_offset = pi.pi_next_offset;

            if (str[pi.pi_offset] == ',') {
                token_out = data_token_t(DT_COMMA);
                single_char_capture(pc, pi);
                return true;
            }
        }
        break;

        case DT_SEMI: {
            pi.pi_offset = pi.pi_next_offset;

            if (str[pi.pi_offset] == ';') {
                token_out = data_token_t(DT_SEMI);
                single_char_capture(pc, pi);
                return true;
            }
        }
        break;

        default:
            if (MATCHERS[lpc].pcre.match(pc, pi, PCRE_ANCHORED)) {
                switch (lpc) {
                case DT_IPV6_ADDRESS: {
                    std::string addr = pi.get_substr(pc.all());
                    char        buf[sizeof(struct in6_addr)];

                    /*
                     * The pattern is only a rough filter; accept the token
                     * only if the text really parses as an IPv6 address.
                     */
                    if (inet_pton(AF_INET6, addr.c_str(), buf) == 1) {
                        token_out = data_token_t(lpc);
                        return true;
                    }
                    /* Undo the match so the remaining types start here. */
                    pi.pi_next_offset = pi.pi_offset;
                    break;
                }

                default:
                    token_out = data_token_t(lpc);
                    return true;
                }
            }
            break;
        }
    }

    assert((0 <= token_out && token_out < DT_TERMINAL_MAX));

    return true;
}
|