[automaton] numeric insertion algorithm perfected #2222

pull/2236/head
nick black 3 years ago committed by nick black
parent 686a72a697
commit 5d8fbc6137

@ -226,6 +226,7 @@ get_phi_node(automaton* a, esctrie* e){
for(int i = '0' ; i <= '9' ; ++i){
if( (targ = esctrie_from_idx(a, e->trie[i])) ){
if(targ->ntype == NODE_NUMERIC){
logtrace("found existing phi node %u[%c]->%u\n", esctrie_idx(a, e), i, esctrie_idx(a, targ));
break;
}else{
++nonphis;
@ -248,6 +249,7 @@ get_phi_node(automaton* a, esctrie* e){
targ->trie[i] = esctrie_idx(a, targ);
}
}
assert(NODE_NUMERIC == targ->ntype);
return targ;
}
@ -264,14 +266,76 @@ get_eta_node(automaton* a, esctrie* phi, unsigned successor){
return eta;
}
// |e| is a known-standard node reached by our prefix; prep both phi and eta
// links from it. Each digit slot of |e| that resolves to no node is pointed at
// |phi|. A digit slot leading to a NODE_SPECIAL node is descended into, and
// the same treatment is applied there. Any other existing digit link is left
// untouched. Finally, the |follow| slot of |e| is pointed at |eta|, unless it
// already holds a link.
static inline void
add_phi_and_eta_chain(const automaton *a, esctrie* e, unsigned phi,
                      unsigned follow, unsigned eta){
  for(int digit = '0' ; digit <= '9' ; ++digit){
    esctrie* next = esctrie_from_idx(a, e->trie[digit]);
    if(next == NULL){
      // nothing here yet: this digit begins the numeric
      e->trie[digit] = phi;
      continue;
    }
    if(next->ntype != NODE_SPECIAL){
      continue;
    }
    //logdebug("propagating along %u[%c]\n", e->trie[digit], digit);
    add_phi_and_eta_chain(a, next, phi, follow, eta);
  }
  // never overwrite an existing link on the follow character
  if(!e->trie[follow]){
    e->trie[follow] = eta;
  }
}
// phase 3 of the numeric algorithm: walk the automaton, finding all nodes
// which are prefixes of phi (all nodes matching the prefix, and all numeric
// non-phi chains from those nodes) and linking them to phi, and finding all
// nodes which are prefixes of eta (all numeric non-phi chains from the
// previous set) and linking them to eta.
static void
// previous set) and linking them to eta. |e| is the path thus far.
// |prefix| holds the |pfxlen| bytes of the fixed prefix not yet matched. |phi|
// and |eta| are the nodes being linked in, and |follow| is the trie slot used
// for the eta link once the prefix has been fully matched.
static inline void
add_phi_and_eta_recurse(automaton* a, esctrie* e, const char* prefix,
                        int pfxlen, esctrie* phi, unsigned follow,
                        esctrie* eta){
  //logtrace("working with %u %d prefix [%*.*s]\n", esctrie_idx(a, e), pfxlen, pfxlen, pfxlen, prefix);
  // if pfxlen == 0, we found a match for our fixed prefix. start adding phi
  // links wherever we can. where we find chained numerics, add an eta link.
  if(pfxlen == 0){
    add_phi_and_eta_chain(a, e, esctrie_idx(a, phi), follow, esctrie_idx(a, eta));
    return;
  }
  --pfxlen;
  unsigned char p = *prefix++;
  if(p != '\\'){
    // literal byte: keep walking only if the automaton already has this link.
    // NOTE(review): |p| indexes e->trie directly; confirm the trie has a slot
    // for every value an unsigned char can take.
    if(e->trie[p]){
      add_phi_and_eta_recurse(a, esctrie_from_idx(a, e->trie[p]),
                              prefix, pfxlen, phi, follow, eta);
    }
    return;
  }
  // when we hit a \N in the prefix, we must recurse along all digit links.
  // the 'N' must lie within the counted prefix: without this check we would
  // read past the prefix, and pfxlen would go negative, so the pfxlen == 0
  // base case above could never be reached by the recursion below.
  if(pfxlen <= 0){
    logerror("truncated wildcard in prefix\n");
    return;
  }
  if(*prefix != 'N'){
    logerror("illegal wildcard in prefix %c\n", *prefix);
    return;
  }
  ++prefix;
  --pfxlen;
  for(int i = '0' ; i <= '9' ; ++i){
    if(e->trie[i] == 0){
      // no existing path on this digit; it leads straight to phi
      e->trie[i] = esctrie_idx(a, phi);
    }else{
      add_phi_and_eta_recurse(a, esctrie_from_idx(a, e->trie[i]),
                              prefix, pfxlen, phi, follow, eta);
    }
  }
}
// Entry point for the phi/eta linking walk. |prefix| does *not* lead with an
// escape, and does not include the numeric. The walk begins at the node
// referenced by a->escapes; if that lookup yields no node, nothing is done.
static inline void
add_phi_and_eta(automaton* a, const char* prefix, size_t pfxlen,
                esctrie* phi, unsigned follow, esctrie* eta){
  esctrie* root = esctrie_from_idx(a, a->escapes);
  if(root != NULL){
    add_phi_and_eta_recurse(a, root, prefix, pfxlen, phi, follow, eta);
  }
}
// accept any digit and transition to a numeric node. |e| is the culmination of
@ -295,12 +359,12 @@ link_numeric(automaton* a, const char* prefix, int pfxlen,
if(phi == NULL){
return NULL;
}
logtrace("phi node: %u\n", esctrie_idx(a, phi));
logtrace("phi node: %u->%u\n", esctrie_idx(a, e), esctrie_idx(a, phi));
esctrie* eta = get_eta_node(a, phi, follow);
if(eta == NULL){
return NULL;
}
logtrace("eta node: %u\n", esctrie_idx(a, eta));
logtrace("eta node: %u philink[%c]: %u\n", esctrie_idx(a, eta), follow, phi->trie[follow]);
// eta is now bound to phi, and phi links something at all digits, but no
// other links are guaranteed. walk the automaton, finding all possible
// prefixes of φ (and linking to φ) and all possible prefixes of ή (and

Loading…
Cancel
Save