mirror of https://github.com/sonertari/SSLproxy
Use Aho-Corasick machines for substring matching
Now, the filter uses B-trees for exact string matching and Aho-Corasick machines for substring matching. B-trees and AC machines are exported to linked lists for debug logging only. Also, - Separate all_sites and all_ports filters from substring filters. They are not related with substring filters actually, and ACM keywords cannot be empty strings anyway. So now they should be handled separately too. - Improve debug logging of filtering rules. - Update unit tests accordingly, and improve. - Fix pxyconn_filter(), keep searching for a match in substring filters if exact match does not have a matching site rule. - Increase common names max len and tokens. weather.gov has 73 tokens. - Rename keyword to desc. - Update documentation. - Clean up.pull/48/head
parent
97117d4e50
commit
9d2e523cd0
@ -0,0 +1,367 @@
|
||||
|
||||
/*
|
||||
* Copyright 2017 Laurent Farhi
|
||||
* Contact: lfarhi@sfr.fr
|
||||
*
|
||||
* This file is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This file is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this file. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef __ACM_TEMPLATE__
|
||||
|
||||
# define __ACM_TEMPLATE__
|
||||
|
||||
/// User interface ************************************************************************
|
||||
|
||||
/// Texts and keywords are composed of symbols of type T.
|
||||
/// T can be any standard type (int, char, wchar_t, ...) or any user defined type (such as a structure).
|
||||
/// It can be declared and defined in global scope by:
|
||||
/// #include "aho_corasick_template_impl.h"
|
||||
/// ACM_DECLARE (T)
|
||||
/// ACM_DEFINE (T)
|
||||
///
|
||||
/// A destructor, and a copy constructor can be declared for type T if required.
|
||||
/// Type for destructor is: void (*destructor) (const T)
|
||||
# define DESTRUCTOR_TYPE(T) DESTROY_##T##_TYPE
|
||||
|
||||
/// Type for constructor is: T (*constructor) (const T)
|
||||
# define COPY_CONSTRUCTOR_TYPE(T) COPY_##T##_TYPE
|
||||
|
||||
/// Type for equality operator is: int (*equal_operator) (const T, const T)
|
||||
# define EQ_OPERATOR_TYPE(T) EQ_##T##_TYPE
|
||||
|
||||
/// SET_DESTRUCTOR optionally declares a destructor for type T.
|
||||
/// Example: SET_DESTRUCTOR (mytype, mydestructor);
|
||||
# define SET_DESTRUCTOR(T, destructor) do { DESTROY_##T = (destructor) ; } while (0)
|
||||
|
||||
/// SET_COPY_CONSTRUCTOR optionally declares a copy constructor for type T.
|
||||
/// Example: SET_COPY_CONSTRUCTOR (mytype, myconstructor);
|
||||
# define SET_COPY_CONSTRUCTOR(T, constructor) do { COPY_##T = (constructor) ; } while (0)
|
||||
|
||||
/// SET_EQ_OPERATOR optionally declares equality operator for type T.
|
||||
/// A user defined equality operator can be declared for type T if needed.
|
||||
/// A default equality operator (memcmp) is used otherwise.
|
||||
/// Example: static int nocaseeq (wchar_t k, wchar_t t) { return k == towlower (t); }
|
||||
/// SET_EQ_OPERATOR (wchar_t, nocaseeq);
|
||||
# define SET_EQ_OPERATOR(T, equal_operator) do { EQ_##T = (equal_operator) ; } while (0)
|
||||
|
||||
/// ACState (T) is the type of a Aho-Corasick state machine for type T
|
||||
# define ACState(T) ACState_##T
|
||||
|
||||
/// ACMachine (T) is the type of the Aho-Corasick finite state machine for type T
|
||||
# define ACMachine(T) ACMachine_##T
|
||||
|
||||
/// ACMachine (T) *ACM_create (T, [equality_operator], [copy constructor], [destructor])
|
||||
/// Creates a Aho-Corasick finite state machine for type T.
|
||||
/// @param [in] T type of symbols composing keywords and text to be parsed.
|
||||
/// @param [in, optional] equality_operator Equality operator of type EQ_OPERATOR_TYPE(T).
|
||||
/// @param [in, optional] copy constructor Copy constructor of type COPY_CONSTRUCTOR_TYPE(T).
|
||||
/// @param [in, optional] destructor Destructor of type DESTRUCTOR_TYPE(T).
|
||||
/// @returns A pointer to a Aho-Corasick machine for type T.
|
||||
/// Example: ACMachine (char) * M = ACM_create (char);
|
||||
/// Note: ACM_create accepts optional arguments thanks to the use of the VFUNC macro (see below).
|
||||
# define ACM_create(...) VFUNC(ACM_create, __VA_ARGS__)
|
||||
|
||||
/// void ACM_release (const ACMachine (T) *machine)
|
||||
/// Releases the resources of a Aho-Corasick machine created with ACM_create.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine to be released.
|
||||
/// Example: ACM_release (M);
|
||||
# define ACM_release(machine) (machine)->vtable->release ((machine))
|
||||
|
||||
/// Keyword (T) is the type of a keyword composed of symbols of type T.
|
||||
/// Example: Keyword (char) kw;
|
||||
# define Keyword(T) Keyword_##T
|
||||
|
||||
/// void ACM_KEYWORD_SET (Keyword(T) kw, T* array, size_t length)
|
||||
/// Initializes a keyword from an array of symbols
|
||||
/// @param [in] kw Keyword of symbols of type T.
|
||||
/// @param [in] array Array of symbols
|
||||
/// @param [in] length Length of the array
|
||||
/// Note: The array is NOT duplicated by ACM_KEYWORD_SET and should be allocated by the calling user program.
|
||||
/// Example: ACM_KEYWORD_SET (kw, "Duck", 4);
|
||||
# define ACM_KEYWORD_SET(keyword,symbols,length) do { ACM_MATCH_SYMBOLS (keyword) = (symbols); ACM_MATCH_LENGTH (keyword) = (length); } while (0)
|
||||
|
||||
/// int ACM_register_keyword(ACMachine(T) *machine, Keyword(T) kw, [void * value_ptr], [void (*destructor) (void *)])
|
||||
/// Registers a keyword in the Aho-Corasick machine.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine.
|
||||
/// @param [in] kw Keyword of symbols of type T to be registered.
|
||||
/// @param [in, optional] value_ptr Pointer to a previously allocated value to associate with keyword kw.
|
||||
/// @param [in, optional] destructor A destructor to be used to free the value pointed by value_ptr.
|
||||
/// The default destructor is the standard library function `free`.
|
||||
/// Use `0` if the allocated value need not be managed by the finite state machine
|
||||
/// (in case of automatic or static values).
|
||||
/// @return 1 if the keyword was successfully registered, 0 otherwise (if the keyword is empty).
|
||||
/// Note: When returning 0, the destructor, if any, is called on value, if any.
|
||||
/// Note: If the keyword is already registered in the machine, its associated value is forgotten and replaced by the new value.
|
||||
/// Note: Keyword kw is duplicated and can be released after its registration.
|
||||
/// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
|
||||
/// Note: The keyword is registered together with its rank.
|
||||
/// The rank of the registered keyword is the number of times ACM_register_keyword was previously called
|
||||
/// since the machine was created. The rank is a 0-based sequence number.
|
||||
/// This rank can later be retrieved by ACM_get_match.
|
||||
/// Example: ACM_register_keyword (M, kw);
|
||||
/// ACM_register_keyword (M, kw, calloc (1, sizeof (int)), free);
|
||||
# define ACM_register_keyword(...) VFUNC(ACM_register_keyword, __VA_ARGS__)
|
||||
|
||||
/// int ACM_is_registered_keyword (const ACMachine(T) * machine, Keyword(T) kw, [void **value_ptr])
|
||||
/// Checks whether a keyword is already registered in the machine.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine.
|
||||
/// @param [in] kw Keyword of symbols of type T to be checked.
|
||||
/// @param [out, optional] value_ptr *value_ptr is set to the pointer of the value associated to the keyword after the call.
|
||||
/// @return 1 if the keyword is registered in the machine, 0 otherwise.
|
||||
/// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
|
||||
# define ACM_is_registered_keyword(...) VFUNC(ACM_is_registered_keyword, __VA_ARGS__)
|
||||
|
||||
/// int ACM_unregister_keyword (ACMachine(T) *machine, Keyword(T) kw)
|
||||
/// Unregisters a keyword from the Aho-Corasick machine.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine.
|
||||
/// @param [in] kw Keyword of symbols of type T to be unregistered.
|
||||
/// @return 1 if the keyword was successfully unregistered, 0 otherwise (the keyword is not registered in the machine).
|
||||
/// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
|
||||
# define ACM_unregister_keyword(machine, keyword) (machine)->vtable->unregister_keyword ( (machine), (keyword))
|
||||
|
||||
/// size_t ACM_nb_keywords (const ACMachine(T) *machine)
|
||||
/// Returns the number of keywords registered in the machine.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine.
|
||||
/// @return The number of keywords registered in the machine.
|
||||
# define ACM_nb_keywords(machine) (machine)->vtable->nb_keywords ((machine))
|
||||
|
||||
/// MatchHolder (T) is the type of a match composed of symbols of type T.
|
||||
/// Example: MatchHolder (char) match;
|
||||
# define MatchHolder(T) MatchHolder_##T
|
||||
|
||||
/// size_t ACM_MATCH_LENGTH (MatchHolder(T) match)
|
||||
/// Returns the length of a matching keyword.
|
||||
/// @param [in] match A matching keyword.
|
||||
/// @return The length of the matching keyword.
|
||||
/// Note: This function can also be applied to a keyword of type Keyword(T).
|
||||
# define ACM_MATCH_LENGTH(match) ((match).length)
|
||||
|
||||
/// T* ACM_MATCH_SYMBOLS (MatchHolder(T) match)
|
||||
/// Returns the array to the symbols of a matching keyword.
|
||||
/// @param [in] match A matching keyword.
|
||||
/// @return The array to the symbols of the matching keyword.
|
||||
/// Note: This function can also be applied to a keyword of type Keyword(T).
|
||||
# define ACM_MATCH_SYMBOLS(match) ((match).letter)
|
||||
|
||||
/// size_t ACM_MATCH_UID (MatchHolder(T) match)
|
||||
/// Returns the unique id of a matching keyword.
|
||||
/// @param [in] match A matching keyword returned by a previous call to `ACM_get_match`.
|
||||
/// @return The unique id of the matching keyword.
|
||||
# define ACM_MATCH_UID(match) ((match).rank)
|
||||
|
||||
/// void ACM_foreach_keyword (const ACMachine(T) * machine, void (*operator) (MatchHolder(T) kw, void *value))
|
||||
/// Applies an operator to each registered keyword (by `ACM_register_keyword`) in the machine.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine.
|
||||
/// @param [in] operator Function of type void (*operator) (Keyword (T), void *)
|
||||
/// Note: The operator is called for each registered keyword and pointer to associated value successively.
|
||||
/// Note: The order in which the keywords are processed is unspecified.
|
||||
/// Example: static void print_match (MatchHolder (wchar_t) match, void *value) { /* user code here */ }
|
||||
/// ACM_foreach_keyword (M, print_match);
|
||||
# define ACM_foreach_keyword(machine, operator) (machine)->vtable->foreach_keyword ((machine), (operator))
|
||||
|
||||
/// const ACState (T) * ACM_reset (ACMachine(T) * machine)
|
||||
/// Get a valid state, ignoring all the symbols previously matched by ACM_match.
|
||||
/// @param [in] machine A pointer to a Aho-Corasick machine.
|
||||
/// @return A pointer to a valid Aho-Corasick machine state.
|
||||
/// Note: Several calls to ACM_reset on the same machine can be used to
|
||||
/// parse several texts concurrently (e.g. by several threads).
|
||||
# define ACM_reset(machine) (machine)->vtable->reset ((machine))
|
||||
|
||||
/// void ACM_print (ACMachine(T) * machine, FILE * stream, int (*printer) (FILE *, T))
/// Forwards to the `print` entry of the machine's vtable.
/// @param [in] machine A pointer to a Aho-Corasick machine.
/// @param [in] stream Stream handed to the print routine.
/// @param [in] printer Function of type int (*printer) (FILE *, T) handed to the print routine
/// (presumably used to write one symbol to the stream; the implementation is not visible here).
# define ACM_print(machine, stream, printer) (machine)->vtable->print ((machine), (stream), (printer))
|
||||
|
||||
/// size_t ACM_match (const ACState(T) *& state, T letter)
|
||||
/// This is the main function used to parse a text, one symbol after the other, and search for pattern matching.
|
||||
/// Get the next state matching a symbol injected in the finite state machine.
|
||||
/// @param [in, out] state A pointer to a valid Aho-Corasick machine state. Argument passed by reference.
|
||||
/// @param [in] letter A symbol.
|
||||
/// @return The number of registered keywords that match a sequence of last letters sent to the last calls to `ACM_match`.
|
||||
/// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
|
||||
/// Note: The returned count bounds the index accepted by `ACM_get_match`; no separate call is needed to count the matches.
|
||||
/// Note: `state` is passed by reference. It is modified by the function.
|
||||
/// Usage: size_t nb = ACM_match(state, letter);
|
||||
# define ACM_match(state, letter) (state)->vtable->match(&(state), (letter))
|
||||
|
||||
/// void ACM_MATCH_INIT (MatchHolder(T) match)
|
||||
/// Initializes a match before its first use by ACM_get_match.
|
||||
/// @param [in] match A match
|
||||
/// Example: ACM_MATCH_INIT (match);
|
||||
/// Note: this function should only be applied to a matching keyword which reference is passed to ACM_get_match.
|
||||
/// Clears the three fields of a match holder: the symbols array pointer, the length and the rank.
# define ACM_MATCH_INIT(match) do { ACM_MATCH_SYMBOLS (match) = 0; ACM_MATCH_LENGTH (match) = 0; ACM_MATCH_UID (match) = 0; } while (0)
|
||||
|
||||
/// size_t ACM_get_match (const ACState(T) * state, size_t index, [MatchHolder(T) * match], [void **value_ptr])
|
||||
/// Gets the ith keyword matching with the last symbols.
|
||||
/// @param [in] state A pointer to a valid Aho-Corasick machine state.
|
||||
/// @param [in] index Index (ith) of the ith matching keyword.
|
||||
/// @param [out, optional] match *match is set to the ith matching keyword.
|
||||
/// @param [out, optional] value_ptr *value_ptr is set to the pointer of the value associated to the keyword after the call.
|
||||
/// @return The rank (unique id) of the ith matching keyword.
|
||||
/// Note: index must be lower than value returned by the last call to ACM_match.
|
||||
/// Note: *match should have been initialized by ACM_MATCH_INIT before use.
|
||||
/// Example: size_t rank = ACM_get_match (state, j, &match, 0);
|
||||
# define ACM_get_match(...) VFUNC(ACM_get_match, __VA_ARGS__)
|
||||
|
||||
/// void ACM_MATCH_RELEASE (MatchHolder(T) match)
|
||||
/// Releases a match after its last use by ACM_get_match.
|
||||
/// @param [in] match A match
|
||||
/// Example: ACM_MATCH_RELEASE (match);
|
||||
/// Note: This function should only be applied to a matching keyword which reference is passed to `ACM_get_match`.
|
||||
/// It should not be applied to a keyword of type Keyword(T).
|
||||
# define ACM_MATCH_RELEASE(match) do { free (ACM_MATCH_SYMBOLS (match)); ACM_MATCH_INIT (match); } while (0)
|
||||
|
||||
/// Internal declarations ********************************************************************
|
||||
|
||||
// BEGIN VFUNC
|
||||
// Credits: VFUNC is a macro for overloading on number (but not types) of arguments.
|
||||
// See https://stackoverflow.com/questions/11761703/overloading-macro-on-number-of-arguments
|
||||
// __NARG__ (...) expands to the number of arguments it receives.
// The arguments are followed by the descending sequence 63..0 supplied by __RSEQ_N,
// and __ARG_N selects the 64th element of the combined list (its parameter N),
// which is the argument count as long as there are no more than 63 arguments.
# define __NARG__(...) __NARG_I_(__VA_ARGS__,__RSEQ_N())
|
||||
# define __NARG_I_(...) __ARG_N(__VA_ARGS__)
|
||||
# define __ARG_N( \
|
||||
_1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \
|
||||
_11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \
|
||||
_21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \
|
||||
_31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \
|
||||
_41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \
|
||||
_51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \
|
||||
_61,_62,_63,N,...) N
|
||||
# define __RSEQ_N() \
|
||||
63,62,61,60, \
|
||||
59,58,57,56,55,54,53,52,51,50, \
|
||||
49,48,47,46,45,44,43,42,41,40, \
|
||||
39,38,37,36,35,34,33,32,31,30, \
|
||||
29,28,27,26,25,24,23,22,21,20, \
|
||||
19,18,17,16,15,14,13,12,11,10, \
|
||||
9,8,7,6,5,4,3,2,1,0
|
||||
|
||||
// _VFUNC_ pastes the count n onto the name; _VFUNC adds one level of expansion
// so that n is fully computed before it is pasted.
# define _VFUNC_(name, n) name##n
|
||||
# define _VFUNC(name, n) _VFUNC_(name, n)
|
||||
// VFUNC (func, args...) expands to func<N> (args...), where <N> is the number of arguments.
// Example: VFUNC (ACM_create, char) expands to ACM_create1 (char).
# define VFUNC(func, ...) _VFUNC(func, __NARG__(__VA_ARGS__)) (__VA_ARGS__)
|
||||
// END VFUNC
|
||||
|
||||
// BEGIN DECLARE_ACM
|
||||
// ACM_DECLARE (T) declares, for the symbol type T:
// - the operator types COPY_T_TYPE, DESTROY_T_TYPE, EQ_T_TYPE and PRINT_T_TYPE,
// - the keyword and match holders Keyword_T and MatchHolder_T,
// - the state and machine structures (ACState_T, ACMachine_T) with their vtables,
// - the prototype of the constructor ACM_create_T.
// The expansion ends with an incomplete struct declaration so that the macro call can be followed by a semicolon.
// It uses size_t, FILE and pthread_mutex_t, which must be declared before the macro is expanded.
# define ACM_DECLARE(T) \
|
||||
\
|
||||
typedef T (*COPY_##T##_TYPE) (const T); \
|
||||
typedef void (*DESTROY_##T##_TYPE) (const T); \
|
||||
typedef int (*EQ_##T##_TYPE) (const T, const T); \
|
||||
\
|
||||
typedef struct \
|
||||
{ \
|
||||
T *letter; /* An array of symbols */ \
|
||||
size_t length; /* Length of the array */ \
|
||||
} Keyword_##T; \
|
||||
\
|
||||
typedef struct \
|
||||
{ \
|
||||
T *letter; /* An array of symbols */ \
|
||||
size_t length; /* Length of the array */ \
|
||||
size_t rank; /* Rank of the registered keyword */\
|
||||
} MatchHolder_##T; \
|
||||
\
|
||||
struct _ac_state_##T; \
|
||||
typedef struct _ac_state_##T ACState_##T; \
|
||||
struct _ac_machine_##T; \
|
||||
typedef struct _ac_machine_##T ACMachine_##T; \
|
||||
typedef int (*PRINT_##T##_TYPE) (FILE *, T); \
|
||||
struct _acs_vtable_##T \
|
||||
{ \
|
||||
size_t (*match) (const ACState_##T ** state, T letter); \
|
||||
size_t (*get_match) (const ACState_##T * state, size_t index, MatchHolder_##T * match, void **value); \
|
||||
}; \
|
||||
/* A state of the state machine. */ \
|
||||
struct _ac_state_##T /* [state s] */ \
|
||||
{ \
|
||||
/* A link to the next states */ \
|
||||
struct _ac_next_##T \
|
||||
{ \
|
||||
T letter; /* [a symbol] */ \
|
||||
struct _ac_state_##T *state; /* [g(s, letter)] */\
|
||||
} *goto_array; /* next states in the tree of the goto function */\
|
||||
size_t nb_goto; \
|
||||
/* A link to the previous states */ \
|
||||
struct \
|
||||
{ \
|
||||
size_t i_letter; /* Index of the letter in the goto_array */ \
|
||||
/* letter = previous.state->goto_array[previous.i_letter].letter */ \
|
||||
struct _ac_state_##T *state; \
|
||||
} previous; /* Previous state */\
|
||||
const struct _ac_state_##T *fail_state; /* [f(s)] */\
|
||||
int is_matching; /* true if the state matches a keyword. */\
|
||||
size_t nb_sequence; /* Number of matching keywords (Aho-Corasick : size (output (s)) */\
|
||||
size_t rank; /* Rank (0-based) of insertion of a keyword in the machine. */\
|
||||
size_t id; /* state UID */ \
|
||||
void *value; /* An optional value associated to a state. */\
|
||||
void (*value_dtor) (void *); /* Destructor of the associated value, called at state machine release. */\
|
||||
ACMachine_##T * machine; \
|
||||
const struct _acs_vtable_##T *vtable; \
|
||||
}; \
|
||||
\
|
||||
struct _acm_vtable_##T \
|
||||
{ \
|
||||
int (*register_keyword) (ACMachine_##T * machine, Keyword_##T keyword, void *value, void (*dtor) (void *)); \
|
||||
int (*is_registered_keyword) (const ACMachine_##T * machine, Keyword_##T keyword, void **value); \
|
||||
int (*unregister_keyword) (ACMachine_##T * machine, Keyword_##T keyword); \
|
||||
size_t (*nb_keywords) (const ACMachine_##T * machine); \
|
||||
void (*foreach_keyword) (const ACMachine_##T * machine, void (*operator) (MatchHolder_##T, void *)); \
|
||||
void (*release) (const ACMachine_##T * machine); \
|
||||
const ACState_##T * (*reset) (const ACMachine_##T * machine); \
|
||||
void (*print) (ACMachine_##T * machine, FILE * stream, PRINT_##T##_TYPE printer); \
|
||||
}; \
|
||||
\
|
||||
struct _ac_machine_##T \
|
||||
{ \
|
||||
struct _ac_state_##T *state_0; /* state 0 */ \
|
||||
size_t rank; /* Number of keywords registered in the machine. */\
|
||||
size_t nb_sequence; /* Number of keywords in the machine. */\
|
||||
size_t state_counter; \
|
||||
int reconstruct; /* Non-zero when the failure function must be rebuilt before matching; 2 also resets the outputs first. */ \
|
||||
size_t size; /* Sizes the work queue used to build the failure function (size - 1 entries); presumably the number of states - TODO confirm. */ \
|
||||
pthread_mutex_t lock; /* Held while the failure function is rebuilt from the match function. */ \
|
||||
const struct _acm_vtable_##T *vtable; \
|
||||
T (*copy) (const T); \
|
||||
void (*destroy) (const T); \
|
||||
int (*eq) (const T, const T); \
|
||||
}; \
|
||||
\
|
||||
__attribute__ ((unused)) ACMachine_##T *ACM_create_##T (EQ_##T##_TYPE eq, \
|
||||
COPY_##T##_TYPE copier, \
|
||||
DESTROY_##T##_TYPE dtor); \
|
||||
struct __useless_struct_to_allow_trailing_semicolon__
|
||||
// END DECLARE_ACM
|
||||
|
||||
// BEGIN MACROS
|
||||
// Overloads of ACM_create, selected by VFUNC from the number of arguments.
// ACM_create4 calls the constructor ACM_create_T declared for type T by ACM_DECLARE;
// the shorter forms pass 0 for every operator that is left out.
# define ACM_create4(T, eq, copy, dtor) ACM_create_##T((eq), (copy), (dtor))
// Three-argument form, ACM_create (T, equality_operator, copy constructor):
// without it, a call with three arguments expands to an undefined ACM_create3.
# define ACM_create3(T, eq, copy) ACM_create4(T, (eq), (copy), 0)
# define ACM_create2(T, eq) ACM_create4(T, (eq), 0, 0)
# define ACM_create1(T) ACM_create4(T, 0, 0, 0)
|
||||
|
||||
// Overloads of ACM_register_keyword, selected by VFUNC from the number of arguments.
// The 4-argument form calls the `register_keyword` entry of the machine's vtable.
// The 3-argument form passes `free` as the destructor of the value;
// the 2-argument form passes 0 for both the value and its destructor.
# define ACM_register_keyword4(machine, keyword, value, dtor) (machine)->vtable->register_keyword ((machine), (keyword), (value), (dtor))
|
||||
# define ACM_register_keyword3(machine, keyword, value) ACM_register_keyword4((machine), (keyword), (value), free)
|
||||
# define ACM_register_keyword2(machine, keyword) ACM_register_keyword4((machine), (keyword), 0, 0)
|
||||
|
||||
// Overloads of ACM_is_registered_keyword: the 2-argument form passes 0 for the value pointer.
# define ACM_is_registered_keyword3(machine, keyword, value) (machine)->vtable->is_registered_keyword ((machine), (keyword), (value))
|
||||
# define ACM_is_registered_keyword2(machine, keyword) ACM_is_registered_keyword3((machine), (keyword), 0)
|
||||
|
||||
// Overloads of ACM_get_match: the 4-argument form calls the `get_match` entry of the state's vtable;
// the shorter forms pass 0 for the match holder and/or the value pointer that are left out.
# define ACM_get_match4(state, index, matchholder, value) (state)->vtable->get_match ((state), (index), (matchholder), (value))
|
||||
# define ACM_get_match3(state, index, matchholder) ACM_get_match4((state), (index), (matchholder), 0)
|
||||
# define ACM_get_match2(state, index) ACM_get_match4((state), (index), 0, 0)
|
||||
|
||||
// ACM_DECL (var, T, [eq], [copy, dtor]) is only defined for GCC and clang.
// It declares a machine `var` of type ACMachine_T carrying the `cleanup` attribute with ACM_cleanup_T,
// then initializes it by calling machine_init_T with a state obtained from state_create_T and the given operators.
// The expansion is a declaration followed by a function call, not a single statement;
// the caller supplies the final semicolon.
// ACM_DECL3 and ACM_DECL2 pass 0 for the operators that are left out; ACM_DECL dispatches on the number of arguments through VFUNC.
#if defined(__GNUC__) || defined (__clang__)
|
||||
#define ACM_DECL5(var, T, eq, copy, dtor) \
|
||||
__attribute__ ((cleanup (ACM_cleanup_##T))) ACMachine_##T var; machine_init_##T (&(var), state_create_##T (), (eq), (copy), (dtor))
|
||||
#define ACM_DECL3(var, T, eq) ACM_DECL5(var, T, (eq), 0, 0)
|
||||
#define ACM_DECL2(var, T) ACM_DECL3(var, T, 0)
|
||||
#define ACM_DECL(...) VFUNC(ACM_DECL, __VA_ARGS__)
|
||||
#endif
|
||||
// END MACROS
|
||||
|
||||
#endif
|
@ -0,0 +1,678 @@
|
||||
/*
|
||||
* Copyright 2017 Laurent Farhi
|
||||
* Contact: lfarhi@sfr.fr
|
||||
*
|
||||
* This file is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This file is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this file. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is modified from the original to suppress ISO C and c99 warnings
|
||||
* issued by both gcc and clang, such as those for _Generic selection and %n$
|
||||
* operand number formats. So, this version is compatible with ISO C and c99,
|
||||
* but does not support generic programming. The keywords can only be of
|
||||
* (char *) type.
|
||||
*/
|
||||
|
||||
// Credits: This implementation of "templates" makes use of a nice idea of Randy Gaul for Generic Programming in C.
|
||||
// See http://www.randygaul.net/2012/08/10/generic-programming-in-c
|
||||
//
|
||||
// Initialized by gcc -fpreprocessed -dD -E -P aho_corasick.c | grep -v '^$' | indent
|
||||
|
||||
#ifndef __ACM_TEMPLATE_IMPL__
|
||||
|
||||
# define __ACM_TEMPLATE_IMPL__
|
||||
# include <stddef.h>
|
||||
# include <inttypes.h>
|
||||
# include <stdlib.h>
|
||||
# include <stdio.h>
|
||||
# include <pthread.h>
|
||||
# include <string.h>
|
||||
# include <signal.h>
|
||||
|
||||
# define ACM_KEEP_VALUE 0 // Configures the behavior of ACM_register_keyword_##ACM_SYMBOL if a keyword was already previously registered.
|
||||
# include "aho_corasick_template.h"
|
||||
|
||||
/* ACM_ASSERT (cond) evaluates `cond` exactly once. If it is false, the failed condition
 * and its location (function, file, line) are written to stderr and the calling thread
 * is terminated with pthread_exit (0).
 * `cond` is always evaluated, so it may carry a required side effect,
 * as in ACM_ASSERT (queue = malloc (...)).
 * NOTE(review): pthread_exit ends only the calling thread and does not release a mutex
 * it holds - confirm this is intended where the assertion runs with the machine lock held. */
# define ACM_ASSERT(cond) do { if (!(cond)) { \
      fprintf(stderr, "FATAL ERROR: !(%s) in function %s at %s:%i\n", #cond, __func__, __FILE__, __LINE__);\
      pthread_exit(0) ;\
    } } while (0)
|
||||
|
||||
static int UNUSED
|
||||
__eqchar (const char a, const char b)
|
||||
{
|
||||
return a == b;
|
||||
}
|
||||
|
||||
/* Default equality operator for a symbol type. The ACM_SYMBOL argument is ignored:
 * the expansion is always __eqchar, which compares two char values. */
# define EQ_DEFAULT(ACM_SYMBOL) (__eqchar)
|
||||
|
||||
// BEGIN DEFINE_ACM
|
||||
# define ACM_DEFINE(ACM_SYMBOL) \
|
||||
\
|
||||
static int (*EQ_##ACM_SYMBOL) (const ACM_SYMBOL, const ACM_SYMBOL) = 0;\
|
||||
\
|
||||
static void \
|
||||
__DTOR_##ACM_SYMBOL(UNUSED const ACM_SYMBOL letter) \
|
||||
{ \
|
||||
((void)0); \
|
||||
} \
|
||||
\
|
||||
static ACM_SYMBOL \
|
||||
__COPY_##ACM_SYMBOL(const ACM_SYMBOL letter) \
|
||||
{ \
|
||||
return letter; \
|
||||
} \
|
||||
\
|
||||
static int __EQ_##ACM_SYMBOL(const ACM_SYMBOL a, const ACM_SYMBOL b) \
|
||||
{ \
|
||||
return EQ_##ACM_SYMBOL ? \
|
||||
EQ_##ACM_SYMBOL (a, b) : \
|
||||
(size_t)0 != (size_t)(EQ_DEFAULT (ACM_SYMBOL)) ? \
|
||||
EQ_DEFAULT (ACM_SYMBOL)(a, b) : \
|
||||
(fprintf (stderr, "%s", "ERROR: " "Missing equality operator for type '" #ACM_SYMBOL "'.\n" \
|
||||
" " "Use SET_EQ_OPERATOR(" #ACM_SYMBOL ", operator),\n" \
|
||||
" " "where operator is a function defined as:\n" \
|
||||
" " "int operator(" #ACM_SYMBOL " a, " #ACM_SYMBOL " b) { return a == b ; }.\n" \
|
||||
"ABORT " "\n"), fflush (0), raise (SIGABRT)); \
|
||||
} \
|
||||
\
|
||||
static const ACState_##ACM_SYMBOL *state_goto_##ACM_SYMBOL ( \
|
||||
const ACState_##ACM_SYMBOL * state, \
|
||||
ACM_SYMBOL letter, EQ_##ACM_SYMBOL##_TYPE eq); \
|
||||
\
|
||||
static void \
|
||||
state_reset_output_##ACM_SYMBOL (ACState_##ACM_SYMBOL * r) \
|
||||
{ \
|
||||
if (r->is_matching) \
|
||||
r->nb_sequence = 1; /* Reset to original output (as in state_goto_update) */\
|
||||
else \
|
||||
r->nb_sequence = 0; \
|
||||
struct _ac_next_##ACM_SYMBOL *p = r->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + r->nb_goto; \
|
||||
for (; p < end; p++) \
|
||||
state_reset_output_##ACM_SYMBOL (p->state); \
|
||||
} \
|
||||
/* Aho-Corasick Algorithm 3: construction of the failure function. */ \
|
||||
static void \
|
||||
state_fail_state_construct_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL * machine) \
|
||||
{ \
|
||||
ACState_##ACM_SYMBOL *state_0 = machine->state_0; /* [state 0] */ \
|
||||
if (machine->reconstruct == 2) \
|
||||
state_reset_output_##ACM_SYMBOL (state_0); \
|
||||
/* Aho-Corasick Algorithm: "(except state 0 for which the failure function is not defined)." */\
|
||||
state_0->fail_state = 0; \
|
||||
/* Aho-Corasick Algorithm 3: queue <- empty */ \
|
||||
/* The first element in the queue will not be processed, therefore it can be added harmlessly. */\
|
||||
size_t queue_length = 0; \
|
||||
ACState_##ACM_SYMBOL **queue = 0; \
|
||||
ACM_ASSERT (queue = malloc (sizeof (*queue) * (machine->size - 1))); \
|
||||
/* Aho-Corasick Algorithm 3: for each a such that s != 0 [fail], where s <- g(0, a) do [1] */\
|
||||
struct _ac_next_##ACM_SYMBOL *p = state_0->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + state_0->nb_goto; \
|
||||
for (; p < end; p++) /* loop on state_0->goto_array */ \
|
||||
{ \
|
||||
ACState_##ACM_SYMBOL *s = p->state; /* [for each a such that s != 0 [fail], where s <- g(0, a)] */\
|
||||
/* Aho-Corasick Algorithm 3: queue <- queue U {s} */ \
|
||||
queue_length++; \
|
||||
queue[queue_length - 1] = s; /* s */ \
|
||||
/* Aho-Corasick Algorithm 3: f(s) <- 0 */ \
|
||||
s->fail_state = state_0; \
|
||||
} /* loop on state_0->goto_array */ \
|
||||
size_t queue_read_pos = 0; \
|
||||
/* Aho-Corasick Algorithm 3: while queue != empty do */ \
|
||||
while (queue_read_pos < queue_length) \
|
||||
{ \
|
||||
/* Aho-Corasick Algorithm 3: let r be the next state in queue */ \
|
||||
ACState_##ACM_SYMBOL *r = queue[queue_read_pos]; \
|
||||
/* Aho-Corasick Algorithm 3: queue <- queue - {r} */ \
|
||||
queue_read_pos++; \
|
||||
/* Aho-Corasick Algorithm 3: for each a such that s != fail, where s <- g(r, a) */\
|
||||
struct _ac_next_##ACM_SYMBOL *p = r->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + r->nb_goto; \
|
||||
for (; p < end; p++) /* loop on r->goto_array */ \
|
||||
{ \
|
||||
ACState_##ACM_SYMBOL *s = p->state; /* [s <- g(r, a)] */ \
|
||||
ACM_SYMBOL a = p->letter; \
|
||||
/* Aho-Corasick Algorithm 3: queue <- queue U {s} */ \
|
||||
queue_length++; \
|
||||
queue[queue_length - 1] = s; \
|
||||
/* Aho-Corasick Algorithm 3: state <- f(r) */ \
|
||||
const ACState_##ACM_SYMBOL *state = r->fail_state; /* f(r) */ \
|
||||
/* Aho-Corasick Algorithm 3: while g(state, a) = fail [and state != 0] do state <- f(state) [2] */\
|
||||
/* [if g(state, a) != fail then] f(s) <- g(state, a) [else f(s) <- 0] [3] */\
|
||||
s->fail_state /* f(s) */ = state_goto_##ACM_SYMBOL (state, a, machine->eq); \
|
||||
/* Aho-Corasick Algorithm 3: output (s) <-output (s) U output (f(s)) */\
|
||||
s->nb_sequence += s->fail_state->nb_sequence; \
|
||||
} /* loop on r->goto_array */ \
|
||||
} /* while (queue_read_pos < queue_length) */ \
|
||||
free (queue); \
|
||||
machine->reconstruct = 0; \
|
||||
} \
|
||||
\
|
||||
static const ACState_##ACM_SYMBOL * \
|
||||
state_goto_##ACM_SYMBOL (const ACState_##ACM_SYMBOL * state, ACM_SYMBOL letter /* a[i] */,\
|
||||
EQ_##ACM_SYMBOL##_TYPE eq) \
|
||||
{ \
|
||||
/* Aho-Corasick Algorithm 1: while g(state, a[i]) = fail [and state != 0] do state <- f(state) [2] */\
|
||||
/* [if g(state, a[i]) != fail then] state <- g(state, a[i]) [else state <- 0] [3] */\
|
||||
/* [The function returns state] */ \
|
||||
while (1) \
|
||||
{ \
|
||||
/* [if g(state, a[i]) != fail then return g(state, a[i])] */ \
|
||||
struct _ac_next_##ACM_SYMBOL *p = state->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + state->nb_goto; \
|
||||
for (; p < end; p++) \
|
||||
if (eq (p->letter, letter)) \
|
||||
return p->state; \
|
||||
/* From here, [g(state, a[i]) = fail] */ \
|
||||
\
|
||||
/* Algorithms 1 cannot consider that g(0, a) never fails because propoerty LOOP_0 has not been implemented. */\
|
||||
/* Therefore, for state 0, we must simulate the property LOOP_0, i.e state 0 must be returned, */\
|
||||
/* as if g(0, a[i]) would have been set to state 0 if g(0, a[i]) = fail (property LOOP_0). */\
|
||||
/* After Algorithm 3 has been processed, the only state for which f(state) = 0 is state 0. */\
|
||||
/* [if g(state, a[i]) = fail and state = 0 then return state 0] */ \
|
||||
/* Aho-Corasick Algorithm: "(except state 0 for which the failure function is not defined)." */\
|
||||
if (state->fail_state == 0) \
|
||||
return state; \
|
||||
/* From here, [state != 0] */ \
|
||||
\
|
||||
/* [if g(state, a[i]) = fail and state != 0 then state <- f(state) */\
|
||||
state = state->fail_state; \
|
||||
} \
|
||||
} \
|
||||
/* Aho-Corasick Algorithm 1: Pattern matching machine - if output (state) != empty */\
|
||||
static size_t \
|
||||
ACM_match_##ACM_SYMBOL (const ACState_##ACM_SYMBOL ** pstate, ACM_SYMBOL letter) \
|
||||
{ \
|
||||
/* N.B.: In Aho-Corasick, algorithm 3 is executed after all keywords have been inserted */\
|
||||
/* in the goto graph one after the other by algorithm 2. */ \
|
||||
/* As a slight enhancement: the fail state chains are rebuilt from scratch when needed, */\
|
||||
/* i.e. if a keyword has been added since the last pattern maching search. */\
|
||||
/* Therefore, algorithms 2 and 3 can be processed alternately. */\
|
||||
/* (algorithm 3 will traverse the full goto graph after a keyword has been added.) */\
|
||||
/* Double-checked locking */ \
|
||||
ACMachine_##ACM_SYMBOL * machine = (*pstate)->machine; \
|
||||
if (machine->reconstruct) \
|
||||
{ \
|
||||
pthread_mutex_lock (&machine->lock); \
|
||||
if (machine->reconstruct) \
|
||||
state_fail_state_construct_##ACM_SYMBOL (machine); \
|
||||
pthread_mutex_unlock (&machine->lock); \
|
||||
} \
|
||||
return \
|
||||
(*pstate = state_goto_##ACM_SYMBOL (*pstate, letter, machine->eq)) \
|
||||
->nb_sequence; \
|
||||
} \
|
||||
/* Aho-Corasick Algorithm 1: Pattern matching machine - print output (state) [ith element] */\
|
||||
static size_t \
|
||||
ACM_get_match_##ACM_SYMBOL (const ACState_##ACM_SYMBOL * state, size_t index, \
|
||||
MatchHolder_##ACM_SYMBOL * match, void **value) \
|
||||
{ \
|
||||
/* Aho-Corasick Algorithm 1: if output(state) [ith element] */ \
|
||||
ACM_ASSERT (index < state->nb_sequence); \
|
||||
size_t i = 0; \
|
||||
for (; state; state = state->fail_state, i++ /* skip to the next failing state */ )\
|
||||
{ \
|
||||
/* Look for the first state in the "failing states" chain which matches a keyword. */\
|
||||
while (!state->is_matching && state->fail_state) \
|
||||
state = state->fail_state; \
|
||||
if (i == index) \
|
||||
break; \
|
||||
} \
|
||||
/* Argument match could be passed to 0 if only value or rank is needed. */\
|
||||
if (match) \
|
||||
{ \
|
||||
/* Aho-Corasick Algorithm 1: [print i] */ \
|
||||
/* Aho-Corasick Algorithm 1: print output(state) [ith element] */ \
|
||||
/* Reconstruct the matching keyword moving backward from the matching state to the state 0. */\
|
||||
match->length = 0; \
|
||||
for (const ACState_##ACM_SYMBOL * s = state; s && s->previous.state; s = s->previous.state) \
|
||||
match->length++; \
|
||||
/* Reallocation of match->letter. match->letter should be freed by the user after the last call to ACM_get_match on match. */\
|
||||
ACM_ASSERT (match->letter = realloc (match->letter, sizeof (*match->letter) * match->length)); \
|
||||
i = 0; \
|
||||
for (const ACState_##ACM_SYMBOL * s = state; s && s->previous.state; s = s->previous.state) \
|
||||
{ \
|
||||
match->letter[match->length - i - 1] = s->previous.state->goto_array[s->previous.i_letter].letter; \
|
||||
i++; \
|
||||
} \
|
||||
match->rank = state->rank; \
|
||||
} \
|
||||
/* Argument value could passed to 0 if the associated value is not needed. */\
|
||||
if (value) \
|
||||
*value = state->value; \
|
||||
return state->rank; \
|
||||
} \
|
||||
\
|
||||
/* Table of the per-state operations; state_create stores its address in the vtable field of every state. */ \
/* The initializer is positional: the match function first, then the get_match function. */ \
static const struct _acs_vtable_##ACM_SYMBOL ACS_VTABLE_##ACM_SYMBOL = \
|
||||
{ \
|
||||
ACM_match_##ACM_SYMBOL, \
|
||||
ACM_get_match_##ACM_SYMBOL, \
|
||||
}; \
|
||||
\
|
||||
/* Allocates a detached state: no transition, no predecessor, no output, no owning machine. */ \
/* The id field is left for the caller to assign. */ \
ACState_##ACM_SYMBOL * \
state_create_##ACM_SYMBOL (void) \
{ \
  ACState_##ACM_SYMBOL *fresh = malloc (sizeof (*fresh)); /* [state s] */ \
  ACM_ASSERT (fresh); \
  /* Ownership and dispatch */ \
  fresh->machine = 0; \
  fresh->vtable = &(ACS_VTABLE_##ACM_SYMBOL); \
  /* [g(s, a) is undefined (= fail) for all input symbol a] */ \
  fresh->nb_goto = 0; \
  fresh->goto_array = 0; \
  /* No backward link and no failure link yet */ \
  fresh->previous.i_letter = 0; \
  fresh->previous.state = 0; \
  fresh->fail_state = 0; \
  /* Aho-Corasick Algorithm 2: "We assume output(s) is empty when state s is first created." */ \
  fresh->is_matching = 0;  /* becomes 1 when the state ends a registered keyword */ \
  fresh->nb_sequence = 0;  /* number of outputs in [output(s)] */ \
  fresh->rank = 0; \
  /* No associated user value */ \
  fresh->value_dtor = 0; \
  fresh->value = 0; \
  return fresh; \
} \
|
||||
/* Aho-Corasick Algorithm 2: construction of the goto function - procedure enter(a[1] a[2] ... a[n]). */\
|
||||
static int \
|
||||
machine_goto_update_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL * machine, \
|
||||
Keyword_##ACM_SYMBOL sequence /* a[1] a[2] ... a[n] */, \
|
||||
void *value, void (*dtor) (void *)) \
|
||||
{ \
|
||||
if (!sequence.length) \
|
||||
{ \
|
||||
if (dtor) \
|
||||
dtor (value); \
|
||||
return 0; \
|
||||
} \
|
||||
ACState_##ACM_SYMBOL *state_0 = machine->state_0; /* [state 0] */ \
|
||||
/* Iterators */ \
|
||||
/* Aho-Corasick Algorithm 2: state <- 0 */ \
|
||||
ACState_##ACM_SYMBOL *state = state_0; \
|
||||
/* Aho-Corasick Algorithm 2: j <- 1 */ \
|
||||
size_t j = 0; /* j is 0-based here (and not 1-based like in original text) */\
|
||||
/* Aho-Corasick Algorithm 2: while g(state, a[j]) != fail [and j <= m] do */\
|
||||
/* Iterations on i and s until a final state */ \
|
||||
for (; j < sequence.length /* [j <= m] */ ;) \
|
||||
{ \
|
||||
ACState_##ACM_SYMBOL *next = 0; \
|
||||
/* Aho-Corasick Algorithm 2: "g(s, l) = fail if l is undefined or if g(s, l) has not been defined." */\
|
||||
/* Loop on all symbols a for which g(state, a) is defined. */ \
|
||||
struct _ac_next_##ACM_SYMBOL *p = state->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + state->nb_goto; \
|
||||
for (; p < end; p++) \
|
||||
if (machine->eq (p->letter, sequence.letter[j])) \
|
||||
{ \
|
||||
/* [if g(state, a[j]) is defined] */ \
|
||||
next = p->state; \
|
||||
break; \
|
||||
} \
|
||||
/* [if g(state, a[j]) is defined (!= fail)] */ \
|
||||
if (next) \
|
||||
{ \
|
||||
/* Aho-Corasick Algorithm 2: state <- g(state, a[j]) */ \
|
||||
state = next; \
|
||||
/* Aho-Corasick Algorithm 2: j <- j + 1 */ \
|
||||
j++; \
|
||||
} \
|
||||
/* [g(state, a[j]) is not defined (= fail)] */ \
|
||||
else \
|
||||
break; /* exit while g(state, a[j]) != fail */ \
|
||||
} \
|
||||
/* Aho-Corasick Algorithm 2: for p <- j until m do */ \
|
||||
/* Appending states for the new sequence to the final state found */ \
|
||||
for (size_t p = j; p < sequence.length /* [p <= m] */ ; p++) \
|
||||
{ \
|
||||
state->nb_goto++; \
|
||||
ACM_ASSERT (state->goto_array = realloc (state->goto_array, \
|
||||
sizeof (*state->goto_array) * state->nb_goto)); \
|
||||
/* Creation of a new state */ \
|
||||
/* Aho-Corasick Algorithm 2: newstate <- newstate + 1 */ \
|
||||
ACState_##ACM_SYMBOL *newstate = state_create_##ACM_SYMBOL (); \
|
||||
newstate->machine = machine; \
|
||||
newstate->id = ++machine->state_counter; /* state UID */ \
|
||||
/* Aho-Corasick Algorithm 2: g(state, a[p]) <- newstate */ \
|
||||
state->goto_array[state->nb_goto - 1].state = newstate; \
|
||||
state->goto_array[state->nb_goto - 1].letter = machine->copy (sequence.letter[p]); \
|
||||
/* Backward link: previous(newstate, a[p]) <- state */ \
|
||||
newstate->previous.state = state; \
|
||||
/* state->goto_array[state->nb_goto - 1].state->previous.i_letter = state->nb_goto - 1; */\
|
||||
newstate->previous.i_letter = state->nb_goto - 1; \
|
||||
/* Aho-Corasick Algorithm 2: state <- newstate */ \
|
||||
state = newstate; \
|
||||
machine->size++; \
|
||||
} \
|
||||
if (!state->is_matching) \
|
||||
{ \
|
||||
/* Aho-Corasick Algorithm 2: output (state) <- { a[1] a[2] ... a[n] } */\
|
||||
/* Aho-Corasick Algorithm 2: "We assume output(s) is empty when state s is first created." */\
|
||||
/* Adding the sequence to the last found state (created or not) */ \
|
||||
state->is_matching = 1; \
|
||||
state->nb_sequence = 1; \
|
||||
state->rank = machine->rank++; /* rank is a 0-based index */ \
|
||||
machine->nb_sequence++; \
|
||||
if (!machine->reconstruct) \
|
||||
machine->reconstruct = 2; /* f(s) must be recomputed */ \
|
||||
} \
|
||||
/* If the keyword was already previously registered (state->is_matching != 0) */\
|
||||
else if (ACM_KEEP_VALUE) \
|
||||
/* if !ACM_KEEP_VALUE: the new value replaces the old one: the associated old value is forgotten. */\
|
||||
/* if ACM_KEEP_VALUE: the rank and associated value are left unchanged. */\
|
||||
{ \
|
||||
if (dtor) \
|
||||
dtor (value); \
|
||||
return 0; \
|
||||
} \
|
||||
/* if (!state->is_matching || !ACM_KEEP_VALUE) */ \
|
||||
if (state->value_dtor) \
|
||||
state->value_dtor (state->value); \
|
||||
state->value = value; \
|
||||
state->value_dtor = dtor; \
|
||||
return 1; \
|
||||
} \
|
||||
\
|
||||
/* Forward declaration: ACM_create below calls machine_init, whose definition comes after the machine vtable. */ \
static void \
|
||||
machine_init_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL *machine, \
|
||||
ACState_##ACM_SYMBOL * state_0, \
|
||||
EQ_##ACM_SYMBOL##_TYPE eq, \
|
||||
COPY_##ACM_SYMBOL##_TYPE copier, \
|
||||
DESTROY_##ACM_SYMBOL##_TYPE dtor); \
|
||||
\
|
||||
/* Allocates a machine on the heap and initializes it around a newly created state 0. */ \
/* eq, copier and dtor are forwarded unchanged to machine_init. */ \
__attribute__ ((unused)) ACMachine_##ACM_SYMBOL *ACM_create_##ACM_SYMBOL (EQ_##ACM_SYMBOL##_TYPE eq, \
                                                                          COPY_##ACM_SYMBOL##_TYPE copier, \
                                                                          DESTROY_##ACM_SYMBOL##_TYPE dtor) \
{ \
  ACMachine_##ACM_SYMBOL *created = malloc (sizeof (*created)); \
  ACM_ASSERT (created); \
  /* Aho-Corasick Algorithm 2: newstate <- 0 (creation of state 0) */ \
  ACState_##ACM_SYMBOL *root = state_create_##ACM_SYMBOL (); \
  machine_init_##ACM_SYMBOL (created, root, eq, copier, dtor); \
  return created; \
} \
|
||||
\
|
||||
static int \
|
||||
ACM_register_keyword_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL * machine, Keyword_##ACM_SYMBOL y,\
|
||||
void *value, void (*dtor) (void *)) \
|
||||
{ \
|
||||
return machine_goto_update_##ACM_SYMBOL (machine, y, value, dtor); \
|
||||
\
|
||||
/* Aho-Corasick Algorithm 2: for all a such that g(0, a) = fail do g(0, a) <- 0 */\
|
||||
/* This statement is aimed to set the following property (here called the Aho-Corasick LOOP_0 property): */\
|
||||
/* "All our pattern matching machines have the property that g(0, l) != fail for all input symbol l. */\
|
||||
/* [...] this property of the goto function [g] on state 0 [root] ensures that one input symbol will be processed */\
|
||||
/* by the machine in every machine cycle [state_goto]." */\
|
||||
/* "We add a loop from state 0 to state 0 on all input symbols other than [the symbols l for which g(0, l) is already defined]. */\
|
||||
\
|
||||
/* N.B.: This property is *NOT* implemented in this code after calls to enter(y[i]) because */\
|
||||
/* it requires that the alphabet of all possible symbols is known in advance. */\
|
||||
/* This would kill the genericity of the code. */\
|
||||
/* Therefore, Algorithms 1, 3 and 4 *CANNOT* consider that g(0, l) never fails for any symbol l. */\
|
||||
/* g(0, l) can fail like any other state transition. */\
|
||||
/* Thus, the implementation slightly differs from the one proposed by Aho-Corasick. */\
|
||||
} \
|
||||
\
|
||||
static size_t \
|
||||
ACM_nb_keywords_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine) \
|
||||
{ \
|
||||
return machine->nb_sequence; \
|
||||
} \
|
||||
\
|
||||
/* Walks the goto graph from state 0 along `sequence`. */ \
/* Returns the state reached if it ends a registered keyword; returns 0 if the sequence is empty, */ \
/* if some transition is missing, or if the state reached is not a matching one. */ \
static ACState_##ACM_SYMBOL * \
get_last_state_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine, Keyword_##ACM_SYMBOL sequence) \
{ \
  if (!sequence.length) \
    return 0; \
  ACState_##ACM_SYMBOL *cursor = machine->state_0; /* [state 0] */ \
  size_t pos = 0; \
  while (pos < sequence.length) \
  { \
    /* Index of the first transition of cursor labelled with sequence.letter[pos], if any. */ \
    size_t k = 0; \
    while (k < cursor->nb_goto && !machine->eq (cursor->goto_array[k].letter, sequence.letter[pos])) \
      k++; \
    /* No such transition, or a transition without a target: the sequence is not in the graph. */ \
    if (k == cursor->nb_goto || !cursor->goto_array[k].state) \
      return 0; \
    cursor = cursor->goto_array[k].state; \
    pos++; \
  } \
  if (!cursor->is_matching) \
    return 0; \
  return cursor; \
} \
|
||||
\
|
||||
static int \
|
||||
ACM_is_registered_keyword_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine, \
|
||||
Keyword_##ACM_SYMBOL sequence, \
|
||||
void **value) \
|
||||
{ \
|
||||
ACState_##ACM_SYMBOL *last = get_last_state_##ACM_SYMBOL (machine, sequence); \
|
||||
if (last && value) \
|
||||
*value = last->value; \
|
||||
return last ? 1 : 0; \
|
||||
} \
|
||||
\
|
||||
static int \
|
||||
ACM_unregister_keyword_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL * machine, Keyword_##ACM_SYMBOL y) \
|
||||
{ \
|
||||
ACState_##ACM_SYMBOL *last = get_last_state_##ACM_SYMBOL (machine, y); \
|
||||
if (!last) /* The keyword y is not a registered keyword */ \
|
||||
return 0; \
|
||||
ACState_##ACM_SYMBOL *state_0 = machine->state_0; /* [state 0] */ \
|
||||
/* machine->rank is not decreased, so as to ensure unicity. */ \
|
||||
machine->nb_sequence--; \
|
||||
if (last->nb_goto) /* The keyword y is the prefix of another registered keyword */ \
|
||||
{ \
|
||||
last->is_matching = 0; /* not matching nymore */ \
|
||||
last->nb_sequence = 0; \
|
||||
last->rank = 0; \
|
||||
return 1; \
|
||||
} \
|
||||
/* From here, last->nb_goto == 0 */ \
|
||||
ACState_##ACM_SYMBOL *prev = 0; \
|
||||
do /* backward processing the keyword y */ \
|
||||
{ \
|
||||
prev = last->previous.state; \
|
||||
/* Remove last from prev->goto_array */ \
|
||||
prev->nb_goto--; \
|
||||
for (size_t k = last->previous.i_letter; k < prev->nb_goto; k++) \
|
||||
{ \
|
||||
machine->destroy (prev->goto_array[k].letter); \
|
||||
prev->goto_array[k] = prev->goto_array[k + 1]; \
|
||||
prev->goto_array[k].state->previous.i_letter = k; \
|
||||
} \
|
||||
prev->goto_array = realloc (prev->goto_array, sizeof (*prev->goto_array) * prev->nb_goto); \
|
||||
ACM_ASSERT (!prev->nb_goto || prev->goto_array); \
|
||||
/* Release associated value; */ \
|
||||
if (last->value_dtor) \
|
||||
last->value_dtor (last->value); \
|
||||
/* Release last */ \
|
||||
free (last); \
|
||||
machine->size--; \
|
||||
last = prev; \
|
||||
} \
|
||||
while (prev && prev != state_0 && !prev->is_matching && !prev->nb_goto); \
|
||||
\
|
||||
if (!machine->reconstruct) \
|
||||
machine->reconstruct = 2; /* f(s) must be recomputed */ \
|
||||
\
|
||||
return 1; \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
foreach_keyword_##ACM_SYMBOL (const ACState_##ACM_SYMBOL * state, ACM_SYMBOL ** letters, size_t * length, size_t depth, \
|
||||
void (*operator) (MatchHolder_##ACM_SYMBOL, void *)) \
|
||||
{ \
|
||||
if (state->is_matching && depth) \
|
||||
{ \
|
||||
MatchHolder_##ACM_SYMBOL k = {.letter = *letters,.length = depth, .rank = state->rank }; \
|
||||
(*operator) (k, state->value); \
|
||||
} \
|
||||
if (state->nb_goto && depth >= *length) \
|
||||
{ \
|
||||
(*length)++; \
|
||||
*letters = realloc (*letters, sizeof (**letters) * (*length)); \
|
||||
ACM_ASSERT (letters); \
|
||||
} \
|
||||
struct _ac_next_##ACM_SYMBOL *p = state->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + state->nb_goto; \
|
||||
for (; p < end; p++) \
|
||||
{ \
|
||||
(*letters)[depth] = p->letter; \
|
||||
foreach_keyword_##ACM_SYMBOL (p->state, letters, length, depth + 1, operator); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
ACM_foreach_keyword_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine, void (*operator) (MatchHolder_##ACM_SYMBOL, void *)) \
|
||||
{ \
|
||||
if (!operator) \
|
||||
return; \
|
||||
ACState_##ACM_SYMBOL *state_0 = machine->state_0; /* [state 0] */ \
|
||||
ACM_SYMBOL *letters = 0; \
|
||||
size_t depth = 0; \
|
||||
foreach_keyword_##ACM_SYMBOL (state_0, &letters, &depth, 0, operator);\
|
||||
free (letters); \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
state_release_##ACM_SYMBOL (const ACState_##ACM_SYMBOL * state, \
|
||||
DESTROY_##ACM_SYMBOL##_TYPE dtor) \
|
||||
{ \
|
||||
/* Release goto_array */ \
|
||||
struct _ac_next_##ACM_SYMBOL *p = state->goto_array; \
|
||||
struct _ac_next_##ACM_SYMBOL *end = p + state->nb_goto; \
|
||||
for (; p < end; p++) \
|
||||
{ \
|
||||
state_release_##ACM_SYMBOL (p->state, dtor); \
|
||||
if (dtor) \
|
||||
dtor (p->letter); \
|
||||
} \
|
||||
free (state->goto_array); \
|
||||
/* Release associated value */ \
|
||||
if (state->value_dtor) \
|
||||
state->value_dtor (state->value); \
|
||||
/* Release state */ \
|
||||
free ((ACState_##ACM_SYMBOL *) state); \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
ACM_cleanup_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine) \
|
||||
{ \
|
||||
state_release_##ACM_SYMBOL (machine->state_0, machine->destroy); \
|
||||
pthread_mutex_destroy (&((ACMachine_##ACM_SYMBOL *) machine)->lock); \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
ACM_release_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine) \
|
||||
{ \
|
||||
ACM_cleanup_##ACM_SYMBOL (machine); \
|
||||
free ((ACMachine_##ACM_SYMBOL *) machine); \
|
||||
} \
|
||||
\
|
||||
/* Returns state 0 of the machine, i.e. the state from which a new search starts. */ \
static const ACState_##ACM_SYMBOL * \
ACM_reset_##ACM_SYMBOL (const ACMachine_##ACM_SYMBOL * machine) \
{ \
  const ACState_##ACM_SYMBOL *root = machine->state_0; \
  return root; \
} \
|
||||
\
|
||||
static void \
|
||||
state_print_##ACM_SYMBOL (ACState_##ACM_SYMBOL *state, \
|
||||
FILE* stream, size_t indent, size_t id_state,\
|
||||
PRINT_##ACM_SYMBOL##_TYPE printer) \
|
||||
{ \
|
||||
static size_t nb_states, cur_pos; \
|
||||
for (size_t i = 0 ; i < state->nb_goto ; i++) \
|
||||
{ \
|
||||
if (indent < cur_pos) \
|
||||
{ \
|
||||
cur_pos = 0; \
|
||||
fprintf (stream, "\n"); \
|
||||
if (indent) \
|
||||
{ \
|
||||
for (size_t t = 0 ; t < indent - 1 ; t++) \
|
||||
cur_pos += fprintf (stream, " "); \
|
||||
cur_pos += fprintf (stream, "L"); \
|
||||
} \
|
||||
} \
|
||||
else if (indent > cur_pos) \
|
||||
for (size_t t = 0 ; t < indent - cur_pos ; t++) \
|
||||
cur_pos += fprintf (stream, " "); \
|
||||
if (state == state->machine->state_0) \
|
||||
cur_pos += fprintf (stream, "(%03zu)", id_state); \
|
||||
cur_pos += fprintf (stream, "---"); \
|
||||
if (printer) \
|
||||
cur_pos += printer (stream, state->goto_array[i].letter); \
|
||||
cur_pos += fprintf (stream, "-->"); \
|
||||
/* cur_pos += fprintf (stream, "%03zu", ++nb_states); */ \
|
||||
cur_pos += fprintf (stream, "(%03zu)", state->goto_array[i].state->id);\
|
||||
if (state->goto_array[i].state->is_matching) \
|
||||
cur_pos += fprintf (stream, "[%zu]", state->goto_array[i].state->rank);\
|
||||
if (state->goto_array[i].state->fail_state && \
|
||||
state->goto_array[i].state->fail_state != state->machine->state_0)\
|
||||
cur_pos += fprintf (stream, "(-->%03zu)", state->goto_array[i].state->fail_state->id);\
|
||||
state_print_##ACM_SYMBOL (state->goto_array[i].state, stream, \
|
||||
cur_pos, nb_states, printer); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
ACM_print_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL *machine, \
|
||||
FILE* stream, \
|
||||
PRINT_##ACM_SYMBOL##_TYPE printer) \
|
||||
{ \
|
||||
if (machine->reconstruct) \
|
||||
{ \
|
||||
pthread_mutex_lock (&machine->lock); \
|
||||
if (machine->reconstruct) \
|
||||
state_fail_state_construct_##ACM_SYMBOL (machine); \
|
||||
pthread_mutex_unlock (&machine->lock); \
|
||||
} \
|
||||
fprintf (stream, "\n"); \
|
||||
state_print_##ACM_SYMBOL (machine->state_0, stream, 0, 0, printer); \
|
||||
fprintf (stream, "\n"); \
|
||||
} \
|
||||
\
|
||||
/* Table of the machine-level operations; machine_init stores its address in machine->vtable. */ \
/* The initializer is positional: register, is_registered, unregister, nb_keywords, foreach, release, reset, print. */ \
static const struct _acm_vtable_##ACM_SYMBOL ACM_VTABLE_##ACM_SYMBOL = \
|
||||
{ \
|
||||
ACM_register_keyword_##ACM_SYMBOL, \
|
||||
ACM_is_registered_keyword_##ACM_SYMBOL, \
|
||||
ACM_unregister_keyword_##ACM_SYMBOL, \
|
||||
ACM_nb_keywords_##ACM_SYMBOL, \
|
||||
ACM_foreach_keyword_##ACM_SYMBOL, \
|
||||
ACM_release_##ACM_SYMBOL, \
|
||||
ACM_reset_##ACM_SYMBOL, \
|
||||
ACM_print_##ACM_SYMBOL, \
|
||||
}; \
|
||||
\
|
||||
static void \
|
||||
machine_init_##ACM_SYMBOL (ACMachine_##ACM_SYMBOL *machine, \
|
||||
ACState_##ACM_SYMBOL * state_0, \
|
||||
EQ_##ACM_SYMBOL##_TYPE eq, \
|
||||
COPY_##ACM_SYMBOL##_TYPE copier, \
|
||||
DESTROY_##ACM_SYMBOL##_TYPE dtor) \
|
||||
{ \
|
||||
machine->reconstruct = 1; /* f(s) is undefined and has not been computed yet */\
|
||||
machine->size = 1; \
|
||||
machine->state_0 = state_0; \
|
||||
state_0->machine = machine; \
|
||||
machine->rank = machine->nb_sequence = machine->state_counter = 0; \
|
||||
pthread_mutex_init (&machine->lock, 0); \
|
||||
machine->vtable = &(ACM_VTABLE_##ACM_SYMBOL); \
|
||||
machine->copy = copier ? copier : __COPY_##ACM_SYMBOL; \
|
||||
machine->destroy = dtor ? dtor : __DTOR_##ACM_SYMBOL; \
|
||||
machine->eq = eq ? eq : __EQ_##ACM_SYMBOL; \
|
||||
} \
|
||||
struct __useless_struct_to_allow_trailing_semicolon__
|
||||
// END DEFINE_ACM
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue