mirror of https://github.com/tstack/lnav
[perf] improve initial indexing times
parent
2e10ca09d0
commit
2589345e5c
@ -0,0 +1,298 @@
|
||||
/*
|
||||
* is_utf8 is distributed under the following terms:
|
||||
*
|
||||
* Copyright (c) 2013 Palard Julien. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "is_utf8.hh"
|
||||
|
||||
/*
|
||||
Check if the given unsigned char * is a valid utf-8 sequence.
|
||||
|
||||
Return value :
|
||||
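If a newline is reached first, its zero-based index is returned and *message is NULL; if the whole buffer is valid and holds no newline, -1 is returned.
|
||||
Else the zero-based index of the first byte of the ill-formed sequence is returned and *message describes the error.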
|
||||
|
||||
Source:
|
||||
http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf
|
||||
page 124, 3.9 "Unicode Encoding Forms", "UTF-8"
|
||||
|
||||
|
||||
Table 3-7. Well-Formed UTF-8 Byte Sequences
|
||||
-----------------------------------------------------------------------------
|
||||
| Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
|
||||
| U+0000..U+007F | 00..7F | | | |
|
||||
| U+0080..U+07FF | C2..DF | 80..BF | | |
|
||||
| U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
|
||||
| U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
|
||||
| U+D000..U+D7FF | ED | 80..9F | 80..BF | |
|
||||
| U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
|
||||
| U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
|
||||
| U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
|
||||
| U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
Returns the position of the first erroneous byte, and stores in
|
||||
`faulty_bytes` the number of actually existing bytes taking part in this error.
|
||||
*/
|
||||
/*
 * One multi-byte row of the "Well-Formed UTF-8 Byte Sequences" table,
 * together with the diagnostics reported when a sequence starting with a
 * lead byte from this row is ill-formed.
 */
struct is_utf8_rule {
    unsigned char lead_min;   /* lowest lead byte covered by this row */
    unsigned char lead_max;   /* highest lead byte covered by this row */
    size_t trailing;          /* number of bytes that must follow the lead */
    unsigned char second_min; /* lowest valid 2nd byte */
    unsigned char second_max; /* highest valid 2nd byte */
    const char *bad_byte[3];  /* message for an invalid 2nd / 3rd / 4th byte */
    const char *truncated;    /* message when the buffer ends too early */
};

/* Rows are listed in ascending lead-byte order; unused bad_byte slots are null. */
static const struct is_utf8_rule IS_UTF8_RULES[] = {
    {0xC2, 0xDF, 1, 0x80, 0xBF,
     {"After a first byte between C2 and DF, expecting a 2nd byte between 80 and BF"},
     "After a first byte between C2 and DF, expecting a 2nd byte."},
    {0xE0, 0xE0, 2, 0xA0, 0xBF,
     {"After a first byte of E0, expecting a 2nd byte between A0 and BF.",
      "After a first byte of E0, expecting a 3rd byte between 80 and BF."},
     "After a first byte of E0, expecting two following bytes."},
    {0xE1, 0xEC, 2, 0x80, 0xBF,
     {"After a first byte between E1 and EC, expecting the 2nd byte between 80 and BF.",
      "After a first byte between E1 and EC, expecting the 3rd byte between 80 and BF."},
     "After a first byte between E1 and EC, expecting two following bytes."},
    {0xED, 0xED, 2, 0x80, 0x9F,
     {"After a first byte of ED, expecting 2nd byte between 80 and 9F.",
      "After a first byte of ED, expecting 3rd byte between 80 and BF."},
     "After a first byte of ED, expecting two following bytes."},
    {0xEE, 0xEF, 2, 0x80, 0xBF,
     {"After a first byte between EE and EF, expecting 2nd byte between 80 and BF.",
      "After a first byte between EE and EF, expecting 3rd byte between 80 and BF."},
     "After a first byte between EE and EF, expecting two following bytes."},
    {0xF0, 0xF0, 3, 0x90, 0xBF,
     {"After a first byte of F0, expecting 2nd byte between 90 and BF.",
      "After a first byte of F0, expecting 3rd byte between 80 and BF.",
      "After a first byte of F0, expecting 4th byte between 80 and BF."},
     "After a first byte of F0, expecting three following bytes."},
    {0xF1, 0xF3, 3, 0x80, 0xBF,
     {"After a first byte of F1, F2, or F3, expecting a 2nd byte between 80 and BF.",
      "After a first byte of F1, F2, or F3, expecting a 3rd byte between 80 and BF.",
      "After a first byte of F1, F2, or F3, expecting a 4th byte between 80 and BF."},
     "After a first byte of F1, F2, or F3, expecting three following bytes."},
    {0xF4, 0xF4, 3, 0x80, 0x8F,
     {"After a first byte of F4, expecting 2nd byte between 80 and 8F.",
      "After a first byte of F4, expecting 3rd byte between 80 and BF.",
      "After a first byte of F4, expecting 4th byte between 80 and BF."},
     "After a first byte of F4, expecting three following bytes."},
};

/*
 * Validate `len` bytes of `str` as UTF-8, stopping at the first newline.
 *
 * str          - bytes to check
 * len          - number of bytes available in `str`
 * message      - receives NULL on success/newline, otherwise a static
 *                description of the problem
 * faulty_bytes - receives 0 on success/newline, otherwise the number of
 *                bytes (counted from the returned index) involved in the
 *                error; a truncated sequence always reports 1
 *
 * Returns -1 when every byte is well-formed and no newline was seen, the
 * index of the newline when one is reached at a sequence boundary, or the
 * index of the lead byte of the first ill-formed sequence.
 */
ssize_t is_utf8(unsigned char *str, size_t len, const char **message, int *faulty_bytes)
{
    const size_t rule_count = sizeof(IS_UTF8_RULES) / sizeof(IS_UTF8_RULES[0]);
    size_t i = 0;

    *message = nullptr;
    *faulty_bytes = 0;
    while (i < len) {
        const unsigned char lead = str[i];

        /* A newline ends the scan; the caller gets its position. */
        if (lead == '\n') {
            *message = nullptr;
            return i;
        }

        /* 00..7F: plain ASCII, nothing more to check. */
        if (lead <= 0x7F) {
            i += 1;
            continue;
        }

        /* Find the table row whose lead-byte range contains this byte. */
        const struct is_utf8_rule *rule = nullptr;
        for (size_t r = 0; r < rule_count; r++) {
            if (lead >= IS_UTF8_RULES[r].lead_min &&
                lead <= IS_UTF8_RULES[r].lead_max) {
                rule = &IS_UTF8_RULES[r];
                break;
            }
        }
        if (rule == nullptr) {
            /* 80..C1 and F5..FF can never start a sequence. */
            *message = "Expecting bytes in the following ranges: 00..7F C2..F4.";
            *faulty_bytes = 1;
            return i;
        }

        /* All trailing bytes must be present before any of them is read. */
        if (i + rule->trailing >= len) {
            *message = rule->truncated;
            *faulty_bytes = 1;
            return i;
        }

        /* Only the 2nd byte has a row-specific range; the rest are 80..BF. */
        for (size_t k = 1; k <= rule->trailing; k++) {
            const unsigned char lo = (k == 1) ? rule->second_min : 0x80;
            const unsigned char hi = (k == 1) ? rule->second_max : 0xBF;
            const unsigned char byte = str[i + k];

            if (byte < lo || byte > hi) {
                *message = rule->bad_byte[k - 1];
                *faulty_bytes = (int) (k + 1);
                return i;
            }
        }

        i += rule->trailing + 1;
    }

    return -1;
}
|
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* is_utf8 is distributed under the following terms:
|
||||
*
|
||||
* Copyright (c) 2013 Palard Julien. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _IS_UTF8_H
|
||||
#define _IS_UTF8_H
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/**
 * Validate a byte buffer as UTF-8, stopping at the first newline.
 *
 * @param str          Bytes to check.
 * @param len          Number of bytes available in str.
 * @param message      Set to NULL, or to a static description of the error.
 * @param faulty_bytes Set to 0, or to the number of bytes involved in the error.
 * @return -1 if all len bytes are well-formed and no newline was seen;
 *         otherwise the zero-based index of the newline (*message is NULL)
 *         or of the first byte of the ill-formed sequence (*message is set).
 */
ssize_t is_utf8(unsigned char *str, size_t len, const char **message, int *faulty_bytes);
|
||||
|
||||
#endif /* _IS_UTF8_H */
|
@ -0,0 +1,590 @@
|
||||
/* Generated by re2c 1.1.1 on Tue Oct 16 06:58:50 2018 */
|
||||
#line 1 "../../lnav2/src/log_level_re.re"
|
||||
/**
|
||||
* Copyright (c) 2018, Timothy Stack
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
* * Neither the name of Timothy Stack nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "log_level.hh"
|
||||
|
||||
/*
 * Map a textual log level to a log_level_t value.
 *
 * NOTE: this function body is re2c output; change log_level_re.re and
 * regenerate rather than editing the state machine below by hand.
 *
 * levelstr - text to scan
 * len      - number of bytes to scan, or -1 to use strlen(levelstr)
 * exact    - not referenced anywhere in this body
 *
 * A one-character string, or one whose second character is a space, is first
 * handed to abbrev2level(). Otherwise the text is scanned from left to right
 * for one of the level keywords (matched case-insensitively); the first match
 * decides the result. LEVEL_UNKNOWN is returned when a NUL byte or the end of
 * the input is reached at a point where a new match would start.
 */
log_level_t string2level(const char *levelstr, ssize_t len, bool exact)
|
||||
{
|
||||
log_level_t retval = LEVEL_UNKNOWN;
|
||||
|
||||
if (len == (ssize_t)-1) {
|
||||
len = strlen(levelstr);
|
||||
}
|
||||
|
||||
if (((len == 1) || ((len > 1) && (levelstr[1] == ' '))) &&
|
||||
(retval = abbrev2level(levelstr, 1)) != LEVEL_UNKNOWN) {
|
||||
return retval;
|
||||
}
|
||||
|
||||
# define YYCTYPE unsigned char
|
||||
# define RET(tok) { \
|
||||
return tok; \
|
||||
}
|
||||
|
||||
const YYCTYPE *YYCURSOR = (const unsigned char *) levelstr;
|
||||
const YYCTYPE *YYLIMIT = (const unsigned char *) levelstr + len;
|
||||
const YYCTYPE *YYMARKER = YYCURSOR;
|
||||
const YYCTYPE *debug_level = nullptr;
|
||||
|
||||
/* YYPEEK yields 0 once the cursor reaches YYLIMIT, so running off the end of
 * the input is handled exactly like reading a NUL byte. */
# define YYPEEK() (YYCURSOR < YYLIMIT ? *YYCURSOR : 0)
|
||||
# define YYSKIP() ++YYCURSOR
|
||||
# define YYBACKUP() YYMARKER = YYCURSOR
|
||||
# define YYRESTORE() YYCURSOR = YYMARKER
|
||||
/* Tags record the byte just before the cursor. */
# define YYSTAGP(x) x = YYCURSOR - 1
|
||||
|
||||
const unsigned char *yyt1;
|
||||
loop:
|
||||
|
||||
#line 73 "log_level_re.cc"
|
||||
{
|
||||
YYCTYPE yych;
|
||||
unsigned int yyaccept = 0;
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 0x00: goto yy2;
|
||||
case 'C':
|
||||
case 'c': goto yy6;
|
||||
case 'D':
|
||||
case 'd': goto yy7;
|
||||
case 'E':
|
||||
case 'e': goto yy8;
|
||||
case 'F':
|
||||
case 'f': goto yy9;
|
||||
case 'I':
|
||||
case 'i': goto yy10;
|
||||
case 'N':
|
||||
case 'n': goto yy11;
|
||||
case 'S':
|
||||
case 's': goto yy12;
|
||||
case 'T':
|
||||
case 't': goto yy13;
|
||||
case 'W':
|
||||
case 'w': goto yy14;
|
||||
default: goto yy4;
|
||||
}
|
||||
yy2:
|
||||
YYSKIP ();
|
||||
#line 75 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_UNKNOWN); }
|
||||
#line 104 "log_level_re.cc"
|
||||
yy4:
|
||||
YYSKIP ();
|
||||
yy5:
|
||||
#line 102 "../../lnav2/src/log_level_re.re"
|
||||
{ goto loop; }
|
||||
#line 110 "log_level_re.cc"
|
||||
yy6:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy15;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy7:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'E':
|
||||
case 'e': goto yy17;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy8:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy18;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy9:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'A':
|
||||
case 'a': goto yy19;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy10:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'N':
|
||||
case 'n': goto yy20;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy11:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'O':
|
||||
case 'o': goto yy21;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy12:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'E':
|
||||
case 'e': goto yy22;
|
||||
case 'T':
|
||||
case 't': goto yy23;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy13:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy24;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy14:
|
||||
yyaccept = 0;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'A':
|
||||
case 'a': goto yy25;
|
||||
default: goto yy5;
|
||||
}
|
||||
yy15:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'I':
|
||||
case 'i': goto yy26;
|
||||
default: goto yy16;
|
||||
}
|
||||
/* yy16: a partial keyword failed to complete; rewind to the last backup point
 * and resume at the outcome selected by yyaccept. */
yy16:
|
||||
YYRESTORE ();
|
||||
switch (yyaccept) {
|
||||
case 0: goto yy5;
|
||||
case 1: goto yy29;
|
||||
default: goto yy48;
|
||||
}
|
||||
yy17:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'B':
|
||||
case 'b': goto yy27;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy18:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy28;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy19:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'T':
|
||||
case 't': goto yy30;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy20:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'F':
|
||||
case 'f': goto yy31;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy21:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'T':
|
||||
case 't': goto yy32;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy22:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'V':
|
||||
case 'v': goto yy33;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy23:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'A':
|
||||
case 'a': goto yy34;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy24:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'A':
|
||||
case 'a': goto yy35;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy25:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy36;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy26:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'T':
|
||||
case 't': goto yy37;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy27:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'U':
|
||||
case 'u': goto yy38;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy28:
|
||||
yyaccept = 1;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'O':
|
||||
case 'o': goto yy39;
|
||||
default: goto yy29;
|
||||
}
|
||||
yy29:
|
||||
#line 98 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_ERROR); }
|
||||
#line 319 "log_level_re.cc"
|
||||
yy30:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'A':
|
||||
case 'a': goto yy40;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy31:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'O':
|
||||
case 'o': goto yy41;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy32:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'I':
|
||||
case 'i': goto yy43;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy33:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'E':
|
||||
case 'e': goto yy44;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy34:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'T':
|
||||
case 't': goto yy45;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy35:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'C':
|
||||
case 'c': goto yy46;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy36:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'N':
|
||||
case 'n': goto yy47;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy37:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'I':
|
||||
case 'i': goto yy49;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy38:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'G':
|
||||
case 'g': goto yy50;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy39:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy52;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy40:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'L':
|
||||
case 'l': goto yy53;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy41:
|
||||
YYSKIP ();
|
||||
#line 94 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_INFO); }
|
||||
#line 412 "log_level_re.cc"
|
||||
yy43:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'C':
|
||||
case 'c': goto yy55;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy44:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'R':
|
||||
case 'r': goto yy56;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy45:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'S':
|
||||
case 's': goto yy57;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy46:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'E':
|
||||
case 'e': goto yy59;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy47:
|
||||
yyaccept = 2;
|
||||
YYSKIP ();
|
||||
YYBACKUP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'I':
|
||||
case 'i': goto yy61;
|
||||
default: goto yy48;
|
||||
}
|
||||
yy48:
|
||||
#line 97 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_WARNING); }
|
||||
#line 458 "log_level_re.cc"
|
||||
yy49:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'C':
|
||||
case 'c': goto yy62;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy50:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5': goto yy63;
|
||||
default:
|
||||
YYSTAGP (yyt1);
|
||||
goto yy51;
|
||||
}
|
||||
yy51:
|
||||
debug_level = yyt1;
|
||||
#line 77 "../../lnav2/src/log_level_re.re"
|
||||
{
|
||||
if (debug_level == nullptr) {
|
||||
RET(LEVEL_DEBUG);
|
||||
}
|
||||
switch (*debug_level) {
|
||||
case '2':
|
||||
RET(LEVEL_DEBUG2);
|
||||
case '3':
|
||||
RET(LEVEL_DEBUG3);
|
||||
case '4':
|
||||
RET(LEVEL_DEBUG4);
|
||||
case '5':
|
||||
RET(LEVEL_DEBUG5);
|
||||
default:
|
||||
RET(LEVEL_DEBUG);
|
||||
}
|
||||
}
|
||||
#line 499 "log_level_re.cc"
|
||||
yy52:
|
||||
YYSKIP ();
|
||||
goto yy29;
|
||||
yy53:
|
||||
YYSKIP ();
|
||||
#line 101 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_FATAL); }
|
||||
#line 507 "log_level_re.cc"
|
||||
yy55:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'E':
|
||||
case 'e': goto yy64;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy56:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'E':
|
||||
case 'e': goto yy66;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy57:
|
||||
YYSKIP ();
|
||||
#line 96 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_STATS); }
|
||||
#line 528 "log_level_re.cc"
|
||||
yy59:
|
||||
YYSKIP ();
|
||||
#line 76 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_TRACE); }
|
||||
#line 533 "log_level_re.cc"
|
||||
yy61:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'N':
|
||||
case 'n': goto yy68;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy62:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'A':
|
||||
case 'a': goto yy69;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy63:
|
||||
YYSKIP ();
|
||||
YYSTAGP (yyt1);
|
||||
goto yy51;
|
||||
yy64:
|
||||
YYSKIP ();
|
||||
#line 95 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_INFO); }
|
||||
#line 558 "log_level_re.cc"
|
||||
yy66:
|
||||
YYSKIP ();
|
||||
#line 100 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_CRITICAL); }
|
||||
#line 563 "log_level_re.cc"
|
||||
yy68:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'G':
|
||||
case 'g': goto yy70;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy69:
|
||||
YYSKIP ();
|
||||
yych = YYPEEK ();
|
||||
switch (yych) {
|
||||
case 'L':
|
||||
case 'l': goto yy71;
|
||||
default: goto yy16;
|
||||
}
|
||||
yy70:
|
||||
YYSKIP ();
|
||||
goto yy48;
|
||||
yy71:
|
||||
YYSKIP ();
|
||||
#line 99 "../../lnav2/src/log_level_re.re"
|
||||
{ RET(LEVEL_CRITICAL); }
|
||||
#line 587 "log_level_re.cc"
|
||||
}
|
||||
#line 104 "../../lnav2/src/log_level_re.re"
|
||||
|
||||
}
|
@ -0,0 +1,105 @@
|
||||
/**
|
||||
* Copyright (c) 2018, Timothy Stack
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
* * Neither the name of Timothy Stack nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "log_level.hh"
|
||||
|
||||
/*
 * Map a textual log level to a log_level_t value (re2c source).
 *
 * levelstr - text to scan
 * len      - number of bytes to scan, or -1 to use strlen(levelstr)
 * exact    - not referenced anywhere in this body
 *
 * A one-character string, or one whose second character is a space, is first
 * handed to abbrev2level(). Otherwise the rules in the re2c block below are
 * applied repeatedly: a keyword match returns its level, any other byte is
 * skipped via `goto loop`, and a NUL byte (which YYPEEK also yields at
 * YYLIMIT) returns LEVEL_UNKNOWN.
 */
log_level_t string2level(const char *levelstr, ssize_t len, bool exact)
|
||||
{
|
||||
log_level_t retval = LEVEL_UNKNOWN;
|
||||
|
||||
if (len == (ssize_t)-1) {
|
||||
len = strlen(levelstr);
|
||||
}
|
||||
|
||||
if (((len == 1) || ((len > 1) && (levelstr[1] == ' '))) &&
|
||||
(retval = abbrev2level(levelstr, 1)) != LEVEL_UNKNOWN) {
|
||||
return retval;
|
||||
}
|
||||
|
||||
# define YYCTYPE unsigned char
|
||||
# define RET(tok) { \
|
||||
return tok; \
|
||||
}
|
||||
|
||||
const YYCTYPE *YYCURSOR = (const unsigned char *) levelstr;
|
||||
const YYCTYPE *YYLIMIT = (const unsigned char *) levelstr + len;
|
||||
const YYCTYPE *YYMARKER = YYCURSOR;
|
||||
const YYCTYPE *debug_level = nullptr;
|
||||
|
||||
/* Custom input API for re2c: YYPEEK yields 0 once the cursor reaches YYLIMIT,
 * so the end of the input is matched by the EOF rule. */
# define YYPEEK() (YYCURSOR < YYLIMIT ? *YYCURSOR : 0)
|
||||
# define YYSKIP() ++YYCURSOR
|
||||
# define YYBACKUP() YYMARKER = YYCURSOR
|
||||
# define YYRESTORE() YYCURSOR = YYMARKER
|
||||
/* Tags record the byte just before the cursor. */
# define YYSTAGP(x) x = YYCURSOR - 1
|
||||
|
||||
/*!stags:re2c format = 'const unsigned char *@@;'; */
|
||||
loop:
|
||||
/*!re2c
|
||||
re2c:yyfill:enable = 0;
|
||||
re2c:flags:input = custom;
|
||||
|
||||
EOF = "\x00";
|
||||
|
||||
EOF { RET(LEVEL_UNKNOWN); }
|
||||
'trace' { RET(LEVEL_TRACE); }
|
||||
'debug' [2-5]? @debug_level {
|
||||
if (debug_level == nullptr) {
|
||||
RET(LEVEL_DEBUG);
|
||||
}
|
||||
switch (*debug_level) {
|
||||
case '2':
|
||||
RET(LEVEL_DEBUG2);
|
||||
case '3':
|
||||
RET(LEVEL_DEBUG3);
|
||||
case '4':
|
||||
RET(LEVEL_DEBUG4);
|
||||
case '5':
|
||||
RET(LEVEL_DEBUG5);
|
||||
default:
|
||||
RET(LEVEL_DEBUG);
|
||||
}
|
||||
}
|
||||
'info' { RET(LEVEL_INFO); }
|
||||
'notice' { RET(LEVEL_INFO); }
|
||||
'stats' { RET(LEVEL_STATS); }
|
||||
'warn'|'warning' { RET(LEVEL_WARNING); }
|
||||
'err'|'error' { RET(LEVEL_ERROR); }
|
||||
'critical' { RET(LEVEL_CRITICAL); }
|
||||
'severe' { RET(LEVEL_CRITICAL); }
|
||||
'fatal' { RET(LEVEL_FATAL); }
|
||||
* { goto loop; }
|
||||
|
||||
*/
|
||||
}
|
@ -0,0 +1,237 @@
|
||||
/**
|
||||
* https://github.com/lemire/fastvalidate-utf-8
|
||||
*/
|
||||
|
||||
#ifndef SIMDUTF8CHECK_H
|
||||
#define SIMDUTF8CHECK_H
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include "lnav_log.hh"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* legal utf-8 byte sequence
|
||||
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
|
||||
*
|
||||
* Code Points 1st 2nd 3rd 4th
|
||||
* U+0000..U+007F 00..7F
|
||||
* U+0080..U+07FF C2..DF 80..BF
|
||||
* U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
* U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
* U+D000..U+D7FF ED 80..9F 80..BF
|
||||
* U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
||||
*
|
||||
*/
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
static inline void checkSmallerThan0xF4(__m128i current_bytes,
                                        __m128i *has_error)
{
    /*
     * Unsigned saturating subtraction clamps at zero, so a lane stays
     * non-zero only where the input byte is greater than 0xF4.
     */
    const __m128i max_byte = _mm_set1_epi8(0xF4);
    const __m128i excess = _mm_subs_epu8(current_bytes, max_byte);

    /* Fold the offending lanes into the running error accumulator. */
    *has_error = _mm_or_si128(*has_error, excess);
}
|
||||
|
||||
static inline __m128i continuationLengths(__m128i high_nibbles)
{
    /*
     * 16-entry lookup table selected per lane by the value in
     * high_nibbles: the value stored for each nibble pattern.
     */
    const __m128i length_by_nibble = _mm_setr_epi8(
        1, 1, 1, 1, 1, 1, 1, 1, /* 0xxx (ASCII) */
        0, 0, 0, 0,             /* 10xx (continuation) */
        2, 2,                   /* 110x */
        3,                      /* 1110 */
        4);                     /* 1111, next should be 0 (not checked here) */

    return _mm_shuffle_epi8(length_by_nibble, high_nibbles);
}
|
||||
|
||||
static inline __m128i carryContinuations(__m128i initial_lengths,
                                         __m128i previous_carries)
{
    const __m128i one = _mm_set1_epi8(1);
    const __m128i two = _mm_set1_epi8(2);

    /*
     * Shift the lengths one lane towards higher indices, pulling the last
     * lane of previous_carries into lane 0, then subtract 1 with unsigned
     * saturation and add the result to the lengths.
     */
    const __m128i shifted_by_one =
        _mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1);
    const __m128i right1 = _mm_subs_epu8(shifted_by_one, one);
    const __m128i sum = _mm_add_epi8(initial_lengths, right1);

    /* Same again over two lanes, subtracting 2, applied to the partial sum. */
    const __m128i shifted_by_two =
        _mm_alignr_epi8(sum, previous_carries, 16 - 2);
    const __m128i right2 = _mm_subs_epu8(shifted_by_two, two);

    return _mm_add_epi8(sum, right2);
}
|
||||
|
||||
static inline void checkContinuations(__m128i initial_lengths,
                                      __m128i carries,
                                      __m128i *has_error)
{
    /*
     * A lane is in error when it overlaps (carry > length while length > 0)
     * or underlaps (carry <= length while length == 0); both cases reduce
     * to the two comparisons below producing the same mask.
     */
    const __m128i carry_exceeds_length =
        _mm_cmpgt_epi8(carries, initial_lengths);
    const __m128i length_is_positive =
        _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128());
    const __m128i overunder =
        _mm_cmpeq_epi8(carry_exceeds_length, length_is_positive);

    *has_error = _mm_or_si128(*has_error, overunder);
}
|
||||
|
||||
// Upper bound on the byte that follows certain lead bytes:
//   after 0xED the next byte may be at most 0x9F
//   after 0xF4 the next byte may be at most 0x8F
// off1_current_bytes is the input shifted by one lane, so each lane holds the
// byte preceding the one in current_bytes.  The follower has to be a
// continuation byte, i.e. its sign bit is set, so a signed compare suffices.
static inline void checkFirstContinuationMax(__m128i current_bytes,
                                             __m128i off1_current_bytes,
                                             __m128i *has_error)
{
    // Lead byte 0xED: follower above 0x9F is rejected.
    const __m128i after_ED =
        _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8((char) 0xED));
    const __m128i above_9F =
        _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8((char) 0x9F));
    const __m128i bad_after_ED = _mm_and_si128(above_9F, after_ED);

    // Lead byte 0xF4: follower above 0x8F is rejected.
    const __m128i after_F4 =
        _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8((char) 0xF4));
    const __m128i above_8F =
        _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8((char) 0x8F));
    const __m128i bad_after_F4 = _mm_and_si128(above_8F, after_F4);

    const __m128i bad_follow = _mm_or_si128(bad_after_ED, bad_after_F4);
    *has_error = _mm_or_si128(*has_error, bad_follow);
}
|
||||
|
||||
// Rejects overlong encodings.  The high nibble of the preceding byte selects
// a pair of minimums; an error is raised only when BOTH the preceding byte
// and the current byte fall below their minimum:
//
//   high nibble of      preceding       current
//   preceding byte      byte            byte
//   C               =>  < C2        &&  true
//   E               =>  < E1        &&  < A0
//   F               =>  < F1        &&  < 90
//   anything else       false       &&  false
//
// All comparisons are signed, so -128 as a minimum can never be undercut
// ("false") and 127 is undercut by every byte except 0x7F ("true").
static inline void checkOverlong(__m128i current_bytes,
                                 __m128i off1_current_bytes,
                                 __m128i hibits,
                                 __m128i previous_hibits,
                                 __m128i *has_error)
{
    // Minimum allowed value of the preceding (lead) byte, by its high nibble.
    const __m128i lead_min_table = _mm_setr_epi8(
        -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128,  // 10xx => false
        0xC2, -128,              // 110x
        0xE1,                    // 1110
        0xF1);
    // Minimum allowed value of the current byte, by the lead's high nibble.
    const __m128i follow_min_table = _mm_setr_epi8(
        -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128,  // 10xx => false
        127, 127,                // 110x => true
        0xA0,                    // 1110
        0x90);

    // High nibbles shifted by one lane: lane i describes byte i - 1.
    const __m128i lead_hibits =
        _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);

    const __m128i lead_mins = _mm_shuffle_epi8(lead_min_table, lead_hibits);
    const __m128i lead_too_small =
        _mm_cmpgt_epi8(lead_mins, off1_current_bytes);

    const __m128i follow_mins = _mm_shuffle_epi8(follow_min_table, lead_hibits);
    const __m128i follow_too_small =
        _mm_cmpgt_epi8(follow_mins, current_bytes);

    const __m128i overlong = _mm_and_si128(lead_too_small, follow_too_small);
    *has_error = _mm_or_si128(*has_error, overlong);
}
|
||||
|
||||
// Per-chunk state produced while validating 16 bytes; the state of one chunk
// is fed into the validation of the next one.
struct processed_utf_bytes {
    // the 16 input bytes, unmodified
    __m128i rawbytes;
    // high nibble (upper 4 bits) of every input byte, one per lane
    __m128i high_nibbles;
    // per-lane count of bytes still owed to the sequence covering that lane
    __m128i carried_continuations;
};
|
||||
|
||||
static inline void count_nibbles(__m128i bytes,
|
||||
struct processed_utf_bytes *answer)
|
||||
{
|
||||
answer->rawbytes = bytes;
|
||||
answer->high_nibbles = _mm_and_si128(_mm_srli_epi16(bytes, 4),
|
||||
_mm_set1_epi8(0x0F));
|
||||
}
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
static struct processed_utf_bytes
|
||||
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
|
||||
__m128i *has_error)
|
||||
{
|
||||
struct processed_utf_bytes pb;
|
||||
count_nibbles(current_bytes, &pb);
|
||||
|
||||
checkSmallerThan0xF4(current_bytes, has_error);
|
||||
|
||||
__m128i initial_lengths = continuationLengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations = carryContinuations(
|
||||
initial_lengths,
|
||||
previous->carried_continuations);
|
||||
|
||||
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
|
||||
__m128i off1_current_bytes =
|
||||
_mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
|
||||
checkFirstContinuationMax(current_bytes, off1_current_bytes,
|
||||
has_error);
|
||||
|
||||
checkOverlong(current_bytes, off1_current_bytes,
|
||||
pb.high_nibbles, previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
|
||||
static bool validate_utf8_fast(const char *src, size_t len, ssize_t *len_out)
|
||||
{
|
||||
size_t i = 0, orig_len = len;
|
||||
__m128i has_error = _mm_setzero_si128();
|
||||
__m128i lfchars = _mm_set1_epi8('\n');
|
||||
__m128i lfresult = _mm_setzero_si128();
|
||||
struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
|
||||
.high_nibbles = _mm_setzero_si128(),
|
||||
.carried_continuations = _mm_setzero_si128()};
|
||||
if (len >= 16) {
|
||||
for (; i <= len - 16; i += 16) {
|
||||
__m128i current_bytes = _mm_loadu_si128(
|
||||
(const __m128i *) (src + i));
|
||||
previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
lfresult = _mm_cmpeq_epi8(current_bytes, lfchars);
|
||||
if (_mm_movemask_epi8(lfresult)) {
|
||||
for (; src[i] != '\n'; i++) {
|
||||
}
|
||||
len = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//last part
|
||||
if (i < len) {
|
||||
char buffer[16];
|
||||
memset(buffer, 0, 16);
|
||||
memcpy(buffer, src + i, len - i);
|
||||
__m128i current_bytes = _mm_loadu_si128((const __m128i *) (buffer));
|
||||
previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
for (; i < len && src[i] != '\n'; i++) {
|
||||
}
|
||||
} else {
|
||||
has_error = _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 1)),
|
||||
has_error);
|
||||
}
|
||||
|
||||
if (i < orig_len && src[i] == '\n') {
|
||||
*len_out = i;
|
||||
}
|
||||
|
||||
return _mm_testz_si128(has_error, has_error);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
Binary file not shown.
Loading…
Reference in New Issue