diff --git a/src/lib/automaton.c b/src/lib/automaton.c index 5dcc97063..1eae01a52 100644 --- a/src/lib/automaton.c +++ b/src/lib/automaton.c @@ -226,6 +226,7 @@ get_phi_node(automaton* a, esctrie* e){ for(int i = '0' ; i <= '9' ; ++i){ if( (targ = esctrie_from_idx(a, e->trie[i])) ){ if(targ->ntype == NODE_NUMERIC){ + logtrace("found existing phi node %u[%c]->%u\n", esctrie_idx(a, e), i, esctrie_idx(a, targ)); break; }else{ ++nonphis; @@ -248,6 +249,7 @@ get_phi_node(automaton* a, esctrie* e){ targ->trie[i] = esctrie_idx(a, targ); } } + assert(NODE_NUMERIC == targ->ntype); return targ; } @@ -264,14 +266,76 @@ get_eta_node(automaton* a, esctrie* phi, unsigned successor){ return eta; } +// |e| is a known-standard node reached by our prefix; go ahead and prep both +// phi and eta links from it. every digit link that resolves to no node is pointed at |phi|; digit links that resolve to a NODE_SPECIAL node are followed recursively; finally, a zero |follow| link is pointed at |eta|. +static inline void +add_phi_and_eta_chain(const automaton *a, esctrie* e, unsigned phi, + unsigned follow, unsigned eta){ + for(int i = '0' ; i <= '9' ; ++i){ + esctrie* chain = esctrie_from_idx(a, e->trie[i]); + if(chain == NULL){ + e->trie[i] = phi; + }else if(chain->ntype == NODE_SPECIAL){ +//logdebug("propagating along %u[%c]\n", e->trie[i], i); + add_phi_and_eta_chain(a, esctrie_from_idx(a, e->trie[i]), phi, follow, eta); + } + } + if(e->trie[follow] == 0){ + e->trie[follow] = eta; + } +} + // phase 3 of the numeric algorithm: walk the automaton, finding all nodes // which are prefixes of phi (all nodes matching the prefix, and all numeric // non-phi chains from those nodes) and linking them to phi, and finding all // nodes which are prefixes of eta (all numeric non-phi chains from the -// previous set) and linking them to eta. -static void +// previous set) and linking them to eta. |e| is the path thus far. 
+static inline void +add_phi_and_eta_recurse(automaton* a, esctrie* e, const char* prefix, + int pfxlen, esctrie* phi, unsigned follow, + esctrie* eta){ + //logtrace("working with %u %d prefix [%*.*s]\n", esctrie_idx(a, e), pfxlen, pfxlen, pfxlen, prefix); + // if pfxlen == 0, we found a match for our fixed prefix. start adding phi + // links wherever we can. where we find chained numerics, add an eta link. + if(pfxlen == 0){ + add_phi_and_eta_chain(a, e, esctrie_idx(a, phi), follow, esctrie_idx(a, eta)); + return; + } + --pfxlen; + unsigned char p = *prefix++; + // when we hit a \N in the prefix, we must recurse along all digit links + if(p == '\\'){ + if(*prefix != 'N'){ + logerror("illegal wildcard in prefix %c\n", *prefix); + return; + } + ++prefix; + --pfxlen; + for(int i = '0' ; i <= '9' ; ++i){ + if(e->trie[i] == 0){ + e->trie[i] = esctrie_idx(a, phi); + }else{ + add_phi_and_eta_recurse(a, esctrie_from_idx(a, e->trie[i]), + prefix, pfxlen, phi, follow, eta); + } + } + }else{ + if(e->trie[p]){ + add_phi_and_eta_recurse(a, esctrie_from_idx(a, e->trie[p]), + prefix, pfxlen, phi, follow, eta); + } + } +} + +// |prefix| does *not* lead with an escape, and does not include the numeric. the walk starts at the node indexed by a->escapes, and nothing is done if that index resolves to no node. +static inline void add_phi_and_eta(automaton* a, const char* prefix, size_t pfxlen, esctrie* phi, unsigned follow, esctrie* eta){ + esctrie* esc = esctrie_from_idx(a, a->escapes); + if(esc == NULL){ + return; + } + add_phi_and_eta_recurse(a, esc, prefix, pfxlen, phi, follow, eta); } // accept any digit and transition to a numeric node. 
|e| is the culmination of @@ -295,12 +359,12 @@ link_numeric(automaton* a, const char* prefix, int pfxlen, if(phi == NULL){ return NULL; } - logtrace("phi node: %u\n", esctrie_idx(a, phi)); + logtrace("phi node: %u->%u\n", esctrie_idx(a, e), esctrie_idx(a, phi)); esctrie* eta = get_eta_node(a, phi, follow); if(eta == NULL){ return NULL; } - logtrace("eta node: %u\n", esctrie_idx(a, eta)); + logtrace("eta node: %u philink[%c]: %u\n", esctrie_idx(a, eta), follow, phi->trie[follow]); // eta is now bound to phi, and phi links something at all digits, but no // other links are guaranteed. walk the automaton, finding all possible // prefixes of φ (and linking to φ) and all possible prefixes of ή (and