Refactor FTS5 query sanitizing

pull/6/head
Mickaël Menu 3 years ago
parent 1176216eb8
commit be25da2b80
No known key found for this signature in database
GPG Key ID: 53D73664CD359895

@ -64,7 +64,7 @@ func (db *DB) Migrate() error {
path, title, body,
content = notes,
content_rowid = id,
tokenize = 'porter unicode61 remove_diacritics 1'
tokenize = "porter unicode61 remove_diacritics 1 tokenchars '''&/'"
)`,
// Triggers to keep the FTS index up to date.
`CREATE TRIGGER IF NOT EXISTS trigger_notes_ai AFTER INSERT ON notes BEGIN

@ -2,12 +2,12 @@ package sqlite
import (
"database/sql"
"strings"
"time"
"github.com/mickael-menu/zk/core/note"
"github.com/mickael-menu/zk/util"
"github.com/mickael-menu/zk/util/errors"
"github.com/mickael-menu/zk/util/fts5"
"github.com/mickael-menu/zk/util/paths"
)
@ -164,7 +164,7 @@ func (d *NoteDAO) Find(callback func(note.Match) error, filters ...note.Filter)
WHERE notes_fts MATCH ?
ORDER BY bm25(notes_fts, 1000.0, 500.0, 1.0)
--- ORDER BY rank
`, escapeForFTS5(string(filter)))
`, fts5.ConvertQuery(string(filter)))
}
}()
@ -203,81 +203,3 @@ func (d *NoteDAO) Find(callback func(note.Match) error, filters ...note.Filter)
return nil
}
// escapeForFTS5 rewrites a user-typed search query so that it can be handed
// to an FTS5 MATCH clause: every free-standing word is wrapped in double
// quotes, while the operators AND, OR and NOT, column filters (col:), the ^
// and * markers, and parentheses are copied to the output unquoted.
func escapeForFTS5(query string) string {
	var (
		sb      strings.Builder // converted query
		pending strings.Builder // word currently being accumulated
		quoted  bool            // true between an opening and a closing "
	)

	// flush emits the accumulated word (if any) and clears it.
	flush := func() {
		if pending.Len() == 0 {
			return
		}
		word := pending.String()
		pending.Reset()

		// Boolean operators are copied as-is.
		if word == "AND" || word == "OR" || word == "NOT" {
			sb.WriteString(word)
			return
		}

		// A trailing * marks a prefix token; it has to stay outside of the
		// quotes.
		body := strings.TrimSuffix(word, "*")
		sb.WriteString(`"` + body + `"`)
		if len(body) != len(word) {
			sb.WriteString("*")
		}
	}

	for _, r := range query {
		empty := pending.Len() == 0

		switch r {
		case '"':
			// A closing quote terminates the phrase being read.
			if quoted {
				flush()
			}
			quoted = !quoted

		case '^', '*':
			// Only a marker when it starts a word, otherwise a plain character.
			if empty {
				sb.WriteRune(r)
			} else {
				pending.WriteRune(r)
			}

		case '-':
			// A leading - is a shorthand for NOT; inside a word it is kept.
			if empty {
				sb.WriteString(" NOT ")
			} else {
				pending.WriteRune(r)
			}

		case ':':
			// Outside of quotes, "word:" is a column filter and is copied
			// unquoted.
			if empty || quoted {
				pending.WriteRune(r)
			} else {
				sb.WriteString(pending.String())
				sb.WriteRune(r)
				pending.Reset()
			}

		case '+':
			// A free-standing + is dropped.
			if !empty || quoted {
				pending.WriteRune(r)
			}

		case ' ', '\t', '\n', '(', ')':
			// Separators end the current word, unless we are inside quotes.
			if quoted {
				pending.WriteRune(r)
			} else {
				flush()
				sb.WriteRune(r)
			}

		default:
			pending.WriteRune(r)
		}
	}

	flush()
	return sb.String()
}

@ -278,61 +278,3 @@ func queryNoteRow(tx Transaction, where string) (noteRow, error) {
`, where)).Scan(&row.Path, &row.Title, &row.Body, &row.WordCount, &row.Checksum, &row.Created, &row.Modified)
return row, err
}
// TestEscapeForFTS5 checks the conversion of user queries into FTS5 queries.
//
// Separators (spaces, parentheses) are copied verbatim to the output and the
// - alias expands to " NOT " with its own padding, so several expectations
// below intentionally contain two consecutive spaces.
func TestEscapeForFTS5(t *testing.T) {
	test := func(text, expected string) {
		assert.Equal(t, escapeForFTS5(text), expected)
	}

	test(`foo`, `"foo"`)
	test(`foo bar`, `"foo" "bar"`)
	test(`"foo"`, `"foo"`)
	test(`"foo bar"`, `"foo bar"`)
	test(`"foo bar" qux`, `"foo bar" "qux"`)

	test(`foo AND bar`, `"foo" AND "bar"`)
	test(`foo AN bar`, `"foo" "AN" "bar"`)
	test(`foo ANT bar`, `"foo" "ANT" "bar"`)
	test(`"foo AND bar"`, `"foo AND bar"`)
	test(`foo OR bar`, `"foo" OR "bar"`)
	test(`foo NOT bar`, `"foo" NOT "bar"`)
	test(`(foo AND bar) OR qux`, `("foo" AND "bar") OR "qux"`)

	// The space preceding - is kept, and " NOT " adds its own leading space.
	test(`foo -bar`, `"foo"  NOT "bar"`)
	test(`"foo -bar"`, `"foo -bar"`)
	test(`foo-bar`, `"foo-bar"`)

	test(`foo/bar`, `"foo/bar"`)
	test(`foo;bar`, `"foo;bar"`)
	test(`foo,bar`, `"foo,bar"`)
	test(`foo&bar`, `"foo&bar"`)
	test(`foo's bar`, `"foo's" "bar"`)

	test(`foo ba*`, `"foo" "ba"*`)
	test(`foo ba* qux`, `"foo" "ba"* "qux"`)
	test(`"foo ba"*`, `"foo ba"*`)
	test(`(foo ba*)`, `("foo" "ba"*)`)
	test(`foo*bar`, `"foo*bar"`)
	test(`"foo*bar"`, `"foo*bar"`)

	test(`col:foo bar`, `col:"foo" "bar"`)
	test(`foo col:bar`, `"foo" col:"bar"`)
	test(`foo "col:bar"`, `"foo" "col:bar"`)
	test(`":foo"`, `":foo"`)
	test(`-col:foo bar`, ` NOT col:"foo" "bar"`)
	test(`col:(foo bar)`, `col:("foo" "bar")`)

	test(`^foo`, `^"foo"`)
	test(`^foo bar`, `^"foo" "bar"`)
	test(`foo ^bar`, `"foo" ^"bar"`)
	test(`^"foo bar"`, `^"foo bar"`)
	test(`"foo ^bar"`, `"foo ^bar"`)
	test(`col:^foo`, `col:^"foo"`)

	// The free-standing + is dropped but both surrounding spaces are kept.
	test(`foo + bar`, `"foo"  "bar"`)
	test(`"foo + bar"`, `"foo + bar"`)
	test(`"+foo"`, `"+foo"`)

	// NEAR is not supported
	test(`NEAR(foo, bar, 4)`, `"NEAR"("foo," "bar," "4")`)
}

@ -0,0 +1,103 @@
package fts5
import "strings"
// ConvertQuery transforms a Google-like query into a SQLite FTS5 one.
//
// Outside of explicit double quotes:
//   - every term is wrapped in double quotes, except the operators AND, OR
//     and NOT which are copied as-is;
//   - whitespaces and parentheses separate terms and are copied verbatim;
//   - a leading - is an alias to NOT and | is an alias to OR;
//   - a leading ^ or * and a column filter prefix (col:) are copied unquoted;
//   - a trailing * (prefix token) is moved outside of the quotes;
//   - a free-standing + is dropped.
//
// Everything between explicit double quotes is kept literally as a single
// quoted phrase, including any of the characters listed above. An unterminated
// quote extends to the end of the query.
func ConvertQuery(query string) string {
	var out strings.Builder
	out.Grow(len(query) + 8)

	// Current term being read.
	var term strings.Builder
	// Indicates whether the current term was explicitly quoted in the query.
	inQuote := false

	// Finishes the current term and writes it to the output after quoting it.
	closeTerm := func() {
		if term.Len() == 0 {
			return
		}
		t := term.String()
		term.Reset()

		if !inQuote {
			// Operators are not quoted, unless the user quoted them explicitly.
			switch t {
			case "AND", "OR", "NOT":
				out.WriteString(t)
				return
			}
		}

		// If the term has a wildcard suffix, it is a prefix token. We make
		// sure that the * is not quoted or it will be ignored by the FTS5
		// tokenizer.
		isPrefixToken := !inQuote && strings.HasSuffix(t, "*")
		if isPrefixToken {
			t = strings.TrimSuffix(t, "*")
		}

		out.WriteByte('"')
		out.WriteString(t)
		out.WriteByte('"')
		if isPrefixToken {
			out.WriteByte('*')
		}
	}

	for _, c := range query {
		switch {
		// Explicit quotes.
		case c == '"':
			if inQuote { // We are already in a quoted term? Then it's a closing quote.
				closeTerm()
			}
			inQuote = !inQuote

		// Inside explicit quotes every character is literal. This must come
		// before the operator cases below, otherwise a phrase such as "-foo"
		// would be turned into a negation.
		case inQuote:
			term.WriteRune(c)

		// Passthrough for ^ and * when they are at the start of a term, to allow:
		//   ^foo    -> ^"foo"
		//   "foo"*  -> "foo"*
		case term.Len() == 0 && (c == '^' || c == '*'):
			out.WriteRune(c)

		// Passthrough for FTS5's column filters, e.g.
		//   col:foo  -> col:"foo"
		case c == ':':
			out.WriteString(term.String())
			out.WriteByte(':')
			term.Reset()

		// - is an alias to NOT, but only at the start of a term, to allow
		// compound words such as "well-known".
		case c == '-' && term.Len() == 0:
			out.WriteString(" NOT ")

		// | is an alias to OR.
		case c == '|':
			closeTerm()
			out.WriteString(" OR ")

		// FTS5's + is ignored because it doesn't bring much to the syntax,
		// compared to explicit quotes.
		case c == '+' && term.Len() == 0:

		// Whitespaces and parentheses terminate the current term and are
		// copied verbatim to the output.
		case c == ' ' || c == '\t' || c == '\n' || c == '(' || c == ')':
			closeTerm()
			out.WriteRune(c)

		default:
			term.WriteRune(c)
		}
	}

	closeTerm()
	return out.String()
}

@ -0,0 +1,84 @@
package fts5
import (
"testing"
"github.com/mickael-menu/zk/util/test/assert"
)
// TestConvertQuery checks the conversion of Google-like queries into FTS5
// queries.
//
// Separators (spaces, parentheses) are copied verbatim to the output and the
// - and | aliases expand to " NOT " and " OR " with their own padding, so
// several expectations below intentionally contain two consecutive spaces.
func TestConvertQuery(t *testing.T) {
	test := func(query, expected string) {
		assert.Equal(t, ConvertQuery(query), expected)
	}

	// Quotes
	test(`foo`, `"foo"`)
	test(`foo bar`, `"foo" "bar"`)
	// Consecutive separators are preserved.
	test(`foo  bar`, `"foo"  "bar"`)
	test(`"foo"`, `"foo"`)
	test(`"foo bar"`, `"foo bar"`)
	test(`"foo bar" qux`, `"foo bar" "qux"`)
	// An unterminated quote extends to the end of the query.
	test(`foo "bar qux`, `"foo" "bar qux"`)

	// Conjunction
	test(`foo AND bar`, `"foo" AND "bar"`)
	test(`foo AN bar`, `"foo" "AN" "bar"`)
	test(`foo ANT bar`, `"foo" "ANT" "bar"`)
	test(`foo "AND" bar`, `"foo" "AND" "bar"`)
	test(`"foo AND bar"`, `"foo AND bar"`)

	// Disjunction
	test(`foo OR bar`, `"foo" OR "bar"`)
	// The spaces around | are kept in addition to the padding of " OR ".
	test(`foo | bar`, `"foo"  OR  "bar"`)
	test(`foo|bar`, `"foo" OR "bar"`)
	test(`"foo | bar"`, `"foo | bar"`)

	// Negation
	test(`foo NOT bar`, `"foo" NOT "bar"`)
	// The space preceding - is kept, and " NOT " adds its own leading space.
	test(`foo -bar`, `"foo"  NOT "bar"`)
	test(`"foo -bar"`, `"foo -bar"`)
	test(`foo-bar`, `"foo-bar"`)

	// Grouping
	test(`(foo AND bar) OR qux`, `("foo" AND "bar") OR "qux"`)

	// Special characters
	test(`foo/bar`, `"foo/bar"`)
	test(`foo;bar`, `"foo;bar"`)
	test(`foo,bar`, `"foo,bar"`)
	test(`foo&bar`, `"foo&bar"`)
	test(`foo's bar`, `"foo's" "bar"`)

	// Prefix queries
	test(`foo ba*`, `"foo" "ba"*`)
	test(`foo ba* qux`, `"foo" "ba"* "qux"`)
	test(`"foo ba"*`, `"foo ba"*`)
	test(`"foo ba*"`, `"foo ba*"`)
	test(`(foo ba*)`, `("foo" "ba"*)`)
	test(`foo*bar`, `"foo*bar"`)
	test(`"foo*bar"`, `"foo*bar"`)

	// Column filters
	test(`col:foo bar`, `col:"foo" "bar"`)
	test(`foo col:bar`, `"foo" col:"bar"`)
	test(`foo "col:bar"`, `"foo" "col:bar"`)
	test(`":foo"`, `":foo"`)
	test(`-col:foo bar`, ` NOT col:"foo" "bar"`)
	test(`col:(foo bar)`, `col:("foo" "bar")`)

	// First token
	test(`^foo`, `^"foo"`)
	test(`^foo bar`, `^"foo" "bar"`)
	test(`foo ^bar`, `"foo" ^"bar"`)
	test(`^"foo bar"`, `^"foo bar"`)
	test(`"foo ^bar"`, `"foo ^bar"`)
	test(`col:^foo`, `col:^"foo"`)

	// FTS5's + is ignored, but both surrounding spaces are kept.
	test(`foo + bar`, `"foo"  "bar"`)
	test(`"foo + bar"`, `"foo + bar"`)
	test(`"+foo"`, `"+foo"`)

	// NEAR is not supported
	test(`NEAR(foo, bar, 4)`, `"NEAR"("foo," "bar," "4")`)
}
Loading…
Cancel
Save