From 686a72a6973decf993bd59018fb46d052b6e9209 Mon Sep 17 00:00:00 2001 From: nick black Date: Sat, 2 Oct 2021 06:49:23 -0400 Subject: [PATCH] [automaton] phase I and II of new numeric insert algorithm #2222 --- src/lib/automaton.c | 130 +++++++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 50 deletions(-) diff --git a/src/lib/automaton.c b/src/lib/automaton.c index 4563fde5f..5dcc97063 100644 --- a/src/lib/automaton.c +++ b/src/lib/automaton.c @@ -88,28 +88,6 @@ void input_free_esctrie(automaton* a){ a->nodepool = NULL; } -static int -esctrie_make_numeric(automaton* a, esctrie* e){ - if(e->ntype == NODE_NUMERIC){ - return 0; - } - if(e->ntype != NODE_SPECIAL){ - logerror("can't make node type %d numeric\n", e->ntype); - return -1; - } - for(int i = '0' ; i < '9' ; ++i){ - if(e->trie[i]){ - logerror("can't make %c-followed numeric\n", i); - return -1; - } - } - e->ntype = NODE_NUMERIC; - for(int i = '0' ; i < '9' ; ++i){ - e->trie[i] = esctrie_idx(a, e); - } - return 0; -} - static int esctrie_make_kleene(automaton* a, esctrie* e, unsigned follow, esctrie* term){ if(e->ntype != NODE_SPECIAL){ @@ -225,7 +203,7 @@ link_kleene(automaton* a, esctrie* e, unsigned follow){ return esctrie_from_idx(a, e->trie[follow]); } -static void +/*static void fill_in_numerics(automaton* a, esctrie* e, esctrie* targ, unsigned follow, esctrie* efollow){ // fill in all NULL numeric links with the new target for(int i = '0' ; i <= '9' ; ++i){ @@ -236,48 +214,99 @@ fill_in_numerics(automaton* a, esctrie* e, esctrie* targ, unsigned follow, esctr } } e->trie[follow] = esctrie_idx(a, efollow); -} +}*/ -// accept any digit and transition to a numeric node. -static esctrie* -link_numeric(automaton* a, esctrie* e, unsigned follow){ - esctrie* targ = NULL; +// phase 1 of the numeric algorithm; find a φ node on e. not sure what +// to do if we have non-φ links at every digit...punt for now FIXME. +static inline esctrie* +get_phi_node(automaton* a, esctrie* e){ // find a linked NODE_NUMERIC, if one exists. we'll want to reuse it. + int nonphis = 0; + esctrie* targ; for(int i = '0' ; i <= '9' ; ++i){ - targ = esctrie_from_idx(a, e->trie[i]); - if(targ && targ->ntype == NODE_NUMERIC){ - break; + if( (targ = esctrie_from_idx(a, e->trie[i])) ){ + if(targ->ntype == NODE_NUMERIC){ + break; + }else{ + ++nonphis; + targ = NULL; + } } - targ = NULL; } - // we either have a numeric target, or will make one now + // we either have a numeric target, or will make one now. if we create a new + // one, be sure to mark it numeric, and add all digit links back to itself. if(targ == NULL){ - if((targ = esctrie_from_idx(a, create_esctrie_node(a, 0))) == 0){ + if(nonphis == 10){ + logerror("ten non-phi links from %u\n", esctrie_idx(a, e)); return NULL; } - if(esctrie_make_numeric(a, targ)){ + if((targ = esctrie_from_idx(a, create_esctrie_node(a, 0))) == 0){ return NULL; } + targ->ntype = NODE_NUMERIC; + for(int i = '0' ; i < '9' ; ++i){ + targ->trie[i] = esctrie_idx(a, targ); + } } - // targ is the numeric node we're either creating or coopting - esctrie* efollow = esctrie_from_idx(a, targ->trie[follow]); - if(efollow == NULL){ - if((efollow = esctrie_from_idx(a, create_esctrie_node(a, 0))) == NULL){ + return targ; +} + +// phase 2 of the numeric algorithm; find a ή node for |successor| on |phi|. +static inline esctrie* +get_eta_node(automaton* a, esctrie* phi, unsigned successor){ + esctrie* eta = esctrie_from_idx(a, phi->trie[successor]); + if(eta == NULL){ + if((eta = esctrie_from_idx(a, create_esctrie_node(a, 0))) == NULL){ return NULL; } - logdebug("EFOLLOW: %u\n", esctrie_idx(a, efollow)); + phi->trie[successor] = esctrie_idx(a, eta); } - for(int i = '0' ; i <= '9' ; ++i){ - if(e->trie[i] == 0){ - e->trie[i] = esctrie_idx(a, targ); - }else{ - logdebug("SKIPPING[%d] %u\n", i, e->trie[i]); - } + return eta; +} + +// phase 3 of the numeric algorithm: walk the automaton, finding all nodes +// which are prefixes of phi (all nodes matching the prefix, and all numeric +// non-phi chains from those nodes) and linking them to phi, and finding all +// nodes which are prefixes of eta (all numeric non-phi chains from the +// previous set) and linking them to eta. +static void +add_phi_and_eta(automaton* a, const char* prefix, size_t pfxlen, + esctrie* phi, unsigned follow, esctrie* eta){ +} + +// accept any digit and transition to a numeric node. |e| is the culmination of +// the prefix before the numeric. |follow| is the successor of the numeric. +// here's our approach: +// - find a link to a numeric from e. there can only be one node (thought it +// might have many links), so we can use the first one we find. +// - if there is no such numeric node linked from e, create one. +// (FIXME if all ten digits are occupied, what would we do?) +// - chosen numeric node is φ. +// - if an appropriate follow node exists linked from φ, choose it as ή. +// - otherwise, create a new ή and link it from φ. +// - walk from the top, finding all possible prefixes of φ. +// - at each, link all unused digits to φ. +// - from each that is also a possible prefix of ή, link ή. +static esctrie* +link_numeric(automaton* a, const char* prefix, int pfxlen, + esctrie* e, unsigned char follow){ + logdebug("adding numeric with follow %c following %*.*s\n", follow, pfxlen, pfxlen, prefix); + esctrie* phi = get_phi_node(a, e); + if(phi == NULL){ + return NULL; } - for(int i = '0' ; i <= '9' ; ++i){ - fill_in_numerics(a, esctrie_from_idx(a, e->trie[i]), targ, follow, efollow); + logtrace("phi node: %u\n", esctrie_idx(a, phi)); + esctrie* eta = get_eta_node(a, phi, follow); + if(eta == NULL){ + return NULL; } - return efollow; + logtrace("eta node: %u\n", esctrie_idx(a, eta)); + // eta is now bound to phi, and phi links something at all digits, but no + // other links are guaranteed. walk the automaton, finding all possible + // prefixes of φ (and linking to φ) and all possible prefixes of ή (and + // linking them to ή). + add_phi_and_eta(a, prefix, pfxlen, phi, follow, eta); + return eta; } static esctrie* @@ -289,6 +318,7 @@ insert_path(automaton* a, const char* seq){ } esctrie* eptr = esctrie_from_idx(a, a->escapes); bool inescape = false; + const char* seqstart = seq; unsigned char c; while( (c = *seq++) ){ if(c == '\\'){ @@ -305,7 +335,7 @@ insert_path(automaton* a, const char* seq){ return NULL; } c = *seq++; - eptr = link_numeric(a, eptr, c); + eptr = link_numeric(a, seqstart, seq - 3 - seqstart, eptr, c); if(eptr == NULL){ return NULL; }