Text search: various Kopt search fixes

- Properly parse input text for words (the previous
  code wasn't working with Greek letters)
- With multiple words search, don't allow "substring
  matching" for words in the middle
- Remove support for Lua pattens, so to get proper
  substring matching (as we have with cre text search)
reviewable/pr10648/r1
poire-z 10 months ago
parent dffc2404ca
commit a4720b44cd

@ -13,6 +13,7 @@ local Geom = require("ui/geometry")
local KOPTContext = require("ffi/koptcontext")
local Persist = require("persist")
local TileCacheItem = require("document/tilecacheitem")
local Utf8Proc = require("ffi/utf8proc")
local logger = require("logger")
local util = require("util")
@ -1337,18 +1338,22 @@ end
local function all_matches(boxes, pattern, caseInsensitive)
-- pattern list of single words
local plist = {}
-- split utf-8 characters
for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
-- split space seperated words
for word in words:gmatch("[^%s]+") do
table.insert(plist, caseInsensitive and word:lower() or word)
-- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations)
for word in util.gsplit(pattern, "%s+") do
if util.hasCJKChar(word) then
for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char)
end
else
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word)
end
end
local pnb = #plist
-- return mached word indices from index i, j
local function match(i, j)
local pindex = 1
local matched_indices = {}
if #plist == 0 then return end
if pnb == 0 then return end
while true do
if #boxes[i] < j then
j = j - #boxes[i]
@ -1356,10 +1361,27 @@ local function all_matches(boxes, pattern, caseInsensitive)
end
if i > #boxes then break end
local box = boxes[i][j]
local word = caseInsensitive and box.word:lower() or box.word
if word:match(plist[pindex]) then
local word = caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word
local pword = plist[pindex]
local matched
if pnb == 1 then -- single word in plist
matched = word:find(pword, 1, true)
else -- multiple words in plist
if pindex == 1 then
-- first word of query should match at end of a word from the document
matched = word:sub(-#pword) == pword
elseif pindex == pnb then
-- last word of query should match at start of the word from the document
matched = word:sub(1, #pword) == pword
else
-- middle words in query should match exactly the word from the document
matched = word == pword
end
end
if matched then
table.insert(matched_indices, {i, j})
if pindex == #plist then
if pindex == pnb then
-- all words in plist iterated, all matched
return matched_indices
else
j = j + 1
@ -1370,6 +1392,8 @@ local function all_matches(boxes, pattern, caseInsensitive)
end
end
end
-- Note that this returns a full word box, even if what matches
-- is only a substring of a word box.
return coroutine.wrap(function()
for i, line in ipairs(boxes) do
for j, box in ipairs(line) do

@ -151,12 +151,6 @@ describe("Readersearch module", function()
it("should match whole phrase in one page", function()
assert.are.equal(1*3, #doc.koptinterface:findAllMatches(doc, "mean that the", true, 20))
end)
it("should match with lua pattern", function()
assert.are.equal(7*1, #doc.koptinterface:findAllMatches(doc, "chapter", true, 30))
assert.are.equal(3*2, #doc.koptinterface:findAllMatches(doc, "chapter %d", true, 30))
assert.are.equal(2*2, #doc.koptinterface:findAllMatches(doc, "chapter %d%d", true, 30))
assert.are.equal(0*2, #doc.koptinterface:findAllMatches(doc, "chapter %d%d%d", true, 30))
end)
it("should not match empty string", function()
assert.are.equal(0, #doc.koptinterface:findAllMatches(doc, "", true, 1))
end)

Loading…
Cancel
Save