mirror of
https://github.com/koreader/koreader
synced 2024-11-18 03:25:46 +00:00
43d36b2ea9
Allow for embedding "tags" (invalid Unicode codepoints) in the text string to trigger some text formatting: for now only bolding some parts of text is possible. Use it with fulltext search "all results" to highlight the matched word (instead of the previously used brackets).
1554 lines
54 KiB
Lua
1554 lines
54 KiB
Lua
--[[--
|
|
Interface to k2pdfoptlib backend.
|
|
--]]
|
|
|
|
local CacheItem = require("cacheitem")
|
|
local CanvasContext = require("document/canvascontext")
|
|
local DataStorage = require("datastorage")
|
|
local DEBUG = require("dbg")
|
|
local DocCache = require("document/doccache")
|
|
local Document = require("document/document")
|
|
local FFIUtil = require("ffi/util")
|
|
local Geom = require("ui/geometry")
|
|
local KOPTContext = require("ffi/koptcontext")
|
|
local Persist = require("persist")
|
|
local TextBoxWidget = require("ui/widget/textboxwidget")
|
|
local TileCacheItem = require("document/tilecacheitem")
|
|
local Utf8Proc = require("ffi/utf8proc")
|
|
local logger = require("logger")
|
|
local util = require("util")
|
|
|
|
-- Module table for the k2pdfopt interface: OCR defaults plus the bookkeeping
-- used to estimate how much cache a reflowed context occupies.
local KoptInterface = {
    -- Key under which the shared OCR engine item is stored in DocCache.
    ocrengine = "ocrengine",
    -- Data directory handed to the OCR call (see getReflewOCRWord / getNativeOCRWord).
    tessocr_data = DataStorage:getDataDir() .. "/data",
    -- OCR language; overwritten with the document language before each OCR call.
    ocr_lang = "eng",
    ocr_type = 3, -- default 0, for more accuracy use 3
    -- Size estimate of the most recently reflowed context (unset until a page was reflowed).
    last_context_size = nil,
    -- Fallback size estimate used while no reflowed context has been measured yet.
    default_context_size = 1024*1024,
}
|
|
|
|
-- Cache item that carries a KOPTContext in its `kctx` field.
local ContextCacheItem = CacheItem:new{}

--[[--
Release the wrapped KOPTContext.

Waits for any background work still using the context before freeing it.
NOTE(review): presumably invoked by the cache when the item is dropped — confirm against CacheItem.
--]]
function ContextCacheItem:onFree()
    local kctx = self.kctx
    KoptInterface:waitForContext(kctx)
    logger.dbg("ContextCacheItem: free KOPTContext", kctx)
    kctx:free()
end
|
|
|
|
--[[--
Serialize the wrapped KOPTContext to `filename` (zstd-compressed).

Nothing is written (and nothing is returned) unless `isPreCache()` reports 0.

@string filename destination path
@return the size reported by the persist layer on success, nil on failure
--]]
function ContextCacheItem:dump(filename)
    if self.kctx:isPreCache() ~= 0 then
        return
    end

    logger.dbg("Dumping KOPTContext to", filename)

    local persist = Persist:new{
        path = filename,
        codec = "zstd",
    }

    -- Store the cache size estimate alongside the context so load() can restore it.
    local snapshot = KOPTContext.totable(self.kctx)
    snapshot.cache_size = self.size

    local ok, size = persist:save(snapshot)
    if not ok then
        logger.warn("Failed to dump KOPTContext")
        return nil
    end
    return size
end
|
|
|
|
--[[--
Restore a KOPTContext previously written by `dump`.

On success, `self.size` and `self.kctx` are populated; on failure a warning
is logged and the item is left untouched.

@string filename path of the dumped context
--]]
function ContextCacheItem:load(filename)
    logger.dbg("Loading KOPTContext from", filename)

    local persist = Persist:new{
        path = filename,
        codec = "zstd",
    }

    local snapshot = persist:load(filename)
    if not snapshot then
        logger.warn("Failed to load KOPTContext")
        return
    end

    self.size = snapshot.cache_size
    self.kctx = KOPTContext.fromtable(snapshot)
end
|
|
|
|
-- Cache item that carries the OCR engine context in its `ocrengine` field.
local OCREngine = CacheItem:new{}

--[[--
Release the OCR engine, if the wrapped object exposes a `freeOCR` method.
--]]
function OCREngine:onFree()
    local engine = self.ocrengine
    if not engine.freeOCR then
        return
    end
    logger.dbg("free OCREngine", engine)
    engine:freeOCR()
end
|
|
|
|
--[[--
Fill `configurable` with the k2pdfopt-related defaults read from `G_defaults`.

`writing_direction` is not read from the defaults; it is always reset to 0.

@tparam table configurable table receiving the default values
--]]
function KoptInterface:setDefaultConfigurable(configurable)
    -- { field name, G_defaults key } pairs, applied in order.
    local mapping = {
        { "doc_language",    "DKOPTREADER_CONFIG_DOC_DEFAULT_LANG_CODE" },
        { "trim_page",       "DKOPTREADER_CONFIG_TRIM_PAGE" },
        { "text_wrap",       "DKOPTREADER_CONFIG_TEXT_WRAP" },
        { "detect_indent",   "DKOPTREADER_CONFIG_DETECT_INDENT" },
        { "max_columns",     "DKOPTREADER_CONFIG_MAX_COLUMNS" },
        { "auto_straighten", "DKOPTREADER_CONFIG_AUTO_STRAIGHTEN" },
        { "justification",   "DKOPTREADER_CONFIG_JUSTIFICATION" },
        { "font_size",       "DKOPTREADER_CONFIG_FONT_SIZE" },
        { "page_margin",     "DKOPTREADER_CONFIG_PAGE_MARGIN" },
        { "quality",         "DKOPTREADER_CONFIG_RENDER_QUALITY" },
        { "contrast",        "DKOPTREADER_CONFIG_CONTRAST" },
        { "defect_size",     "DKOPTREADER_CONFIG_DEFECT_SIZE" },
        { "line_spacing",    "DKOPTREADER_CONFIG_LINE_SPACING" },
        { "word_spacing",    "DKOPTREADER_CONFIG_DEFAULT_WORD_SPACING" },
    }
    for _, entry in ipairs(mapping) do
        configurable[entry[1]] = G_defaults:readSetting(entry[2])
    end
    configurable.writing_direction = 0
end
|
|
|
|
--[[--
Block until `kc` is no longer busy in a background thread.

Polls `kc:isPreCache()` every 100 ms while it returns 1. Once the wait is over
(or whenever `self.bg_thread` was set), drops back to a single CPU core and
clears `self.bg_thread`.

@param kc KOPTContext to wait for (may be nil, in which case no polling happens)
@return kc, unchanged
--]]
function KoptInterface:waitForContext(kc)
    local had_to_wait = false
    if kc then
        while kc:isPreCache() == 1 do
            had_to_wait = true
            logger.dbg("waiting for background rendering")
            FFIUtil.usleep(100000)
        end
    end

    if had_to_wait or self.bg_thread then
        -- Background thread is done, go back to a single CPU core.
        CanvasContext:enableCPUCores(1)
        self.bg_thread = nil
    end

    return kc
end
|
|
|
|
--[[--
Create a fresh KOPTContext configured from the document's settings.

The context is set up from `doc.configurable` and the current canvas size/DPI.
When `bbox` is given it is applied to the context; a degenerate bbox
(`x0 >= x1` or `y0 >= y1`) is first reset IN PLACE to the full native page.

@param doc document object (its `configurable` table is read)
@int pageno page number, only used to size a degenerate bbox
@tparam[opt] table bbox table with `x0`, `y0`, `x1`, `y1`
@return the new KOPTContext; callers that do not cache it free it with `kc:free()`
--]]
function KoptInterface:createContext(doc, pageno, bbox)
    -- The context tracks its own reflowed dst bitmap, so there is no need to
    -- look at any background context when creating a new one.
    local kc = KOPTContext.new()
    local canvas_size = CanvasContext:getSize()
    local cfg = doc.configurable

    local cjk_languages = { chi_sim = true, chi_tra = true, jpn = true, kor = true }
    local lang = cfg.doc_language
    if cjk_languages[lang] then
        kc:setCJKChar()
    end

    kc:setLanguage(lang)
    kc:setTrim(cfg.trim_page)
    kc:setWrap(cfg.text_wrap)
    kc:setIndent(cfg.detect_indent)
    kc:setColumns(cfg.max_columns)
    kc:setDeviceDim(canvas_size.w, canvas_size.h)
    kc:setDeviceDPI(CanvasContext:getDPI())
    kc:setStraighten(cfg.auto_straighten)
    kc:setJustification(cfg.justification)
    kc:setWritingDirection(cfg.writing_direction)
    kc:setZoom(cfg.font_size)
    kc:setMargin(cfg.page_margin)
    kc:setQuality(cfg.quality)
    -- k2pdfopt (for reflowing) and mupdf use different algorithms to apply gamma when rendering
    kc:setContrast(1 / cfg.contrast)
    kc:setDefectSize(cfg.defect_size)
    kc:setLineSpacing(cfg.line_spacing)
    kc:setWordSpacing(cfg.word_spacing)

    if bbox then
        local degenerate = bbox.x0 >= bbox.x1 or bbox.y0 >= bbox.y1
        if degenerate then
            -- Fall back to the whole page (mutates the caller's table).
            local page_size = Document.getNativePageDimensions(doc, pageno)
            bbox.x0, bbox.y0 = 0, 0
            bbox.x1, bbox.y1 = page_size.w, page_size.h
        end
        kc:setBBox(bbox.x0, bbox.y0, bbox.x1, bbox.y1)
    end

    if DEBUG.is_on then
        kc:setDebug()
    end
    return kc
end
|
|
|
|
--[[--
Append everything that identifies a page's context to `hash_list`.

Appended, in order: document path, modification time, page number, whatever
`doc.configurable:hash` contributes, the bbox corners and the canvas size.
Callers join the list with "|" to obtain a cache key.

@param doc document object
@int pageno page number
@tparam table bbox table with `x0`, `y0`, `x1`, `y1`
@tparam table hash_list array that is extended in place
--]]
function KoptInterface:getContextHash(doc, pageno, bbox, hash_list)
    local canvas_size = CanvasContext:getSize()
    local function add(value)
        table.insert(hash_list, value)
    end

    add(doc.file)
    add(doc.mod_time)
    add(pageno)
    doc.configurable:hash(hash_list)
    add(bbox.x0)
    add(bbox.y0)
    add(bbox.x1)
    add(bbox.y1)
    add(canvas_size.w)
    add(canvas_size.h)
end
|
|
|
|
--[[--
Pick the bounding box to use for a page.

Outside reflow mode (`text_wrap ~= 1`), `trim_page == 1` selects automatic
detection and `trim_page == 2` selects semi-automatic detection. In every
other case the saved manual bbox is returned.

@param doc document object
@int pageno page number
@return bbox table
--]]
function KoptInterface:getPageBBox(doc, pageno)
    local cfg = doc.configurable
    if cfg.text_wrap ~= 1 then
        if cfg.trim_page == 1 then
            -- auto bbox finding
            return self:getAutoBBox(doc, pageno)
        elseif cfg.trim_page == 2 then
            -- semi-auto bbox finding
            return self:getSemiAutoBBox(doc, pageno)
        end
    end
    -- get saved manual bbox
    return Document.getPageBBox(doc, pageno)
end
|
|
|
|
--[[--
Auto-detect the content bbox of a page, with caching.

The detected box is used when it spans more than 10% of the native page in
width or in height; otherwise the saved manual bbox is used instead.

@param doc document object
@int pageno page number
@return bbox table with `x0`, `y0`, `x1`, `y1`
--]]
function KoptInterface:getAutoBBox(doc, pageno)
    local native_size = Document.getNativePageDimensions(doc, pageno)
    local bbox = {
        x0 = 0,
        y0 = 0,
        x1 = native_size.w,
        y1 = native_size.h,
    }
    local hash_list = { "autobbox" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash)
    if cached then
        return cached.autobbox
    end

    local page = doc._document:openPage(pageno)
    local kc = self:createContext(doc, pageno, bbox)
    page:getPagePix(kc)

    local x0, y0, x1, y1 = kc:getAutoBBox()
    local big_enough = (x1 - x0)/native_size.w > 0.1 or (y1 - y0)/native_size.h > 0.1
    if big_enough then
        bbox.x0, bbox.y0, bbox.x1, bbox.y1 = x0, y0, x1, y1
    else
        bbox = Document.getPageBBox(doc, pageno)
    end

    DocCache:insert(hash, CacheItem:new{ autobbox = bbox, size = 160 })
    page:close()
    kc:free()
    return bbox
end
|
|
|
|
--[[--
Detect the content bbox inside the user-defined (manual) bbox, with caching.

The detected box is shifted by the manual bbox origin. When it covers less
than 10% of the native page in both width and height, the manual bbox is
used instead.

@param doc document object
@int pageno page number
@return bbox table with `x0`, `y0`, `x1`, `y1`
--]]
function KoptInterface:getSemiAutoBBox(doc, pageno)
    -- use manual bbox
    local bbox = Document.getPageBBox(doc, pageno)
    local hash_list = { "semiautobbox" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash)
    if cached then
        return cached.semiautobbox
    end

    local page = doc._document:openPage(pageno)
    local kc = self:createContext(doc, pageno, bbox)
    page:getPagePix(kc)

    local x0, y0, x1, y1 = kc:getAutoBBox()
    -- Offset the detected corners by the origin of the manual bbox.
    local auto_bbox = {
        x0 = x0 + bbox.x0,
        y0 = y0 + bbox.y0,
        x1 = x1 + bbox.x0,
        y1 = y1 + bbox.y0,
    }
    logger.dbg("Semi-auto detected bbox", auto_bbox)

    local native_size = Document.getNativePageDimensions(doc, pageno)
    local width_ratio = (auto_bbox.x1 - auto_bbox.x0)/native_size.w
    local height_ratio = (auto_bbox.y1 - auto_bbox.y0)/native_size.h
    if width_ratio < 0.1 and height_ratio < 0.1 then
        logger.dbg("Semi-auto detected bbox too small, using manual bbox")
        auto_bbox = bbox
    end

    page:close()
    DocCache:insert(hash, CacheItem:new{ semiautobbox = auto_bbox, size = 160 })
    kc:free()
    return auto_bbox
end
|
|
|
|
--[[--
Get the reflowed KOPTContext of a page, from cache when possible.

On a cache hit, waits for a possible background reflow of that context. On a
miss, reflows the page in the foreground and caches the new context. Either
way `self.last_context_size` is refreshed with an estimate based on the
reflowed page dimensions.

@param doc document object
@int pageno page number
@return KOPTContext (owned by the cache; do not free it)
--]]
function KoptInterface:getCachedContext(doc, pageno)
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "kctx" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash, ContextCacheItem)
    if cached then
        -- wait for background thread
        local cached_kc = self:waitForContext(cached.kctx)
        local cached_w, cached_h = cached_kc:getPageDim()
        self.last_context_size = cached_w * cached_h + 3072 -- estimation
        return cached_kc
    end

    -- Not cached: create a context and reflow the page in the foreground.
    local kc = self:createContext(doc, pageno, bbox)
    local page = doc._document:openPage(pageno)
    logger.dbg("reflowing page", pageno, "in foreground")
    -- Falls back to the DRENDER_MODE default when the document has no render mode set.
    local render_mode = doc.render_mode or G_defaults:readSetting("DRENDER_MODE")
    page:reflow(kc, render_mode)
    page:close()

    local fullwidth, fullheight = kc:getPageDim()
    logger.dbg("reflowed page", pageno, "fullwidth:", fullwidth, "fullheight:", fullheight)
    self.last_context_size = fullwidth * fullheight + 3072 -- estimation
    DocCache:insert(hash, ContextCacheItem:new{
        persistent = true,
        doc_path = doc.file,
        size = self.last_context_size,
        kctx = kc
    })
    return kc
end
|
|
|
|
--[[--
Get page dimensions, taking reflow mode into account.

@param doc document object
@int pageno page number
@number zoom zoom factor
@int rotation rotation
@return dimensions of the reflowed page when `text_wrap == 1`, otherwise
 whatever `Document.getPageDimensions` returns
--]]
function KoptInterface:getPageDimensions(doc, pageno, zoom, rotation)
    if doc.configurable.text_wrap ~= 1 then
        return Document.getPageDimensions(doc, pageno, zoom, rotation)
    end
    return self:getRFPageDimensions(doc, pageno, zoom, rotation)
end
|
|
|
|
--[[--
Get the dimensions of a reflowed page.

`zoom` and `rotation` are accepted for interface compatibility but are not used.

@param doc document object
@int pageno page number
@return Geom with `w` and `h` set to the reflowed page size
--]]
function KoptInterface:getRFPageDimensions(doc, pageno, zoom, rotation)
    local width, height = self:getCachedContext(doc, pageno):getPageDim()
    return Geom:new{ w = width, h = height }
end
|
|
|
|
--[[--
Render the first page, scaled to fit the canvas, as a cover image.

@param doc document object
@return a copy of the rendered tile's blitbuffer, or nothing when rendering yields no tile
--]]
function KoptInterface:getCoverPageImage(doc)
    local native_size = Document.getNativePageDimensions(doc, 1)
    local canvas_size = CanvasContext:getSize()
    -- Largest zoom at which the whole page still fits on the canvas.
    local fit_w = canvas_size.w / native_size.w
    local fit_h = canvas_size.h / native_size.h
    local zoom = math.min(fit_w, fit_h)

    local tile = Document.renderPage(doc, 1, nil, zoom, 0, 1, 0)
    if not tile then
        return
    end
    return tile.bb:copy()
end
|
|
|
|
--[[--
Render a page, dispatching on the document's k2pdfopt settings.

Reflow mode (`text_wrap == 1`) wins over page optimization (`page_opt == 1`
or `auto_straighten > 0`); anything else goes through the generic
`Document.renderPage`. `gamma` is only forwarded to the generic renderer.

@return the rendered tile
--]]
function KoptInterface:renderPage(doc, pageno, rect, zoom, rotation, gamma, render_mode, hinting)
    local cfg = doc.configurable
    if cfg.text_wrap == 1 then
        return self:renderReflowedPage(doc, pageno, rect, zoom, rotation, render_mode, hinting)
    end
    if cfg.page_opt == 1 or cfg.auto_straighten > 0 then
        return self:renderOptimizedPage(doc, pageno, rect, zoom, rotation, render_mode, hinting)
    end
    return Document.renderPage(doc, pageno, rect, zoom, rotation, gamma, render_mode, hinting)
end
|
|
|
|
--[[--
Render a reflowed page into the tile cache.

Inherited from the common document interface. Stores `render_mode` on the
document, then returns the cached tile or builds one from the page's
reflowed context. Raises an error when the cache cannot accept a tile of
the reflowed page's size.

@return tile cache item holding the reflowed page's blitbuffer
--]]
function KoptInterface:renderReflowedPage(doc, pageno, rect, zoom, rotation, render_mode)
    doc.render_mode = render_mode
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "renderpg" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash)
    if cached then
        return cached
    end

    -- do the real reflowing if kctx has not been cached yet
    local kc = self:getCachedContext(doc, pageno)
    local fullwidth, fullheight = kc:getPageDim()
    if not DocCache:willAccept(fullwidth * fullheight) then
        -- whole page won't fit into cache
        error("aborting, since we don't have enough cache for this page")
    end

    -- prepare cache item with contained blitbuffer
    local tile = TileCacheItem:new{
        excerpt = Geom:new{ w = fullwidth, h = fullheight },
        pageno = pageno,
    }
    local bb = kc:dstToBlitBuffer()
    tile.bb = bb
    tile.size = tonumber(bb.stride) * bb.h + 512 -- estimation
    DocCache:insert(hash, tile)
    return tile
end
|
|
|
|
--[[--
Render an optimized (non-reflowed) page into the tile cache.

Inherited from the common document interface. Stores `render_mode` on the
document. On a cache miss, the full native page is rendered at `zoom`,
optimized by k2pdfopt and cached as a persistent tile. When `hinting` is
truthy, a second CPU core is enabled for the duration of the work.

@return tile cache item holding the optimized page's blitbuffer
--]]
function KoptInterface:renderOptimizedPage(doc, pageno, rect, zoom, rotation, render_mode, hinting)
    doc.render_mode = render_mode
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "renderoptpg" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash, TileCacheItem)
    if cached then
        return cached
    end

    if hinting then
        CanvasContext:enableCPUCores(2)
    end

    -- The context always covers the whole native page, regardless of `bbox`.
    local page_size = Document.getNativePageDimensions(doc, pageno)
    local full_page_bbox = {
        x0 = 0, y0 = 0,
        x1 = page_size.w,
        y1 = page_size.h,
    }
    local kc = self:createContext(doc, pageno, full_page_bbox)
    local page = doc._document:openPage(pageno)
    kc:setZoom(zoom)
    page:getPagePix(kc)
    page:close()

    logger.dbg("optimizing page", pageno)
    kc:optimizePage()
    local fullwidth, fullheight = kc:getPageDim()

    -- prepare cache item with contained blitbuffer
    local tile = TileCacheItem:new{
        persistent = true,
        doc_path = doc.file,
        excerpt = Geom:new{
            x = 0, y = 0,
            w = fullwidth,
            h = fullheight
        },
        pageno = pageno,
    }
    local bb = kc:dstToBlitBuffer()
    tile.bb = bb
    tile.size = tonumber(bb.stride) * bb.h + 512 -- estimation
    kc:free()
    DocCache:insert(hash, tile)

    if hinting then
        CanvasContext:enableCPUCores(1)
    end

    return tile
end
|
|
|
|
--[[--
Pre-render a page so that a later draw can be served from cache.

Dispatches like `renderPage`: reflow mode hints the reflowed context in the
background, page optimization renders the optimized tile right away, and
everything else is delegated to `Document.hintPage`.

@param doc document object
@int pageno page number
@number zoom zoom factor
@int rotation rotation
@number gamma gamma (only used by the generic `Document.hintPage` path)
@param render_mode render mode
--]]
function KoptInterface:hintPage(doc, pageno, zoom, rotation, gamma, render_mode)
    --- @note: Crappy safeguard around memory issues like in #7627: if we're eating too much RAM, drop half the cache...
    DocCache:memoryPressureCheck()

    local cfg = doc.configurable
    if cfg.text_wrap == 1 then
        self:hintReflowedPage(doc, pageno, zoom, rotation, gamma, render_mode, true)
    elseif cfg.page_opt == 1 or cfg.auto_straighten > 0 then
        -- renderOptimizedPage(doc, pageno, rect, zoom, rotation, render_mode, hinting)
        -- takes no gamma: passing it would shift render_mode into the hinting
        -- slot and store gamma as the document's render mode.
        self:renderOptimizedPage(doc, pageno, nil, zoom, rotation, render_mode, true)
    else
        Document.hintPage(doc, pageno, zoom, rotation, gamma, render_mode)
    end
end
|
|
|
|
--[[--
Start reflowing a page into the cache in a background thread.

Inherited from the common document interface. Returns immediately and
leaves the precache flag set on the new context; any later user of that
context must first call `self:waitForContext(kctx)`. Does nothing when a
context for this page is already cached.

When `hinting` is truthy, a second CPU core is enabled; it is switched back
off by `waitForContext` once the background work is over.
--]]
function KoptInterface:hintReflowedPage(doc, pageno, zoom, rotation, gamma, render_mode, hinting)
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "kctx" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    if DocCache:check(hash) then
        return
    end

    if hinting then
        CanvasContext:enableCPUCores(2)
    end

    local kc = self:createContext(doc, pageno, bbox)
    local page = doc._document:openPage(pageno)
    logger.dbg("hinting page", pageno, "in background")
    -- With the precache flag set, reflow returns immediately and keeps running in a background thread.
    kc:setPreCache()
    self.bg_thread = true
    page:reflow(kc, render_mode)
    page:close()

    -- The real size is unknown until the reflow completes, so use the last known estimate.
    DocCache:insert(hash, ContextCacheItem:new{
        size = self.last_context_size or self.default_context_size,
        kctx = kc,
    })
    -- We stay on two cores until the background thread is done, c.f., :waitForContext
end
|
|
|
|
--[[--
Draw a page into `target`.

Reflowed and optimized pages are drawn from their k2pdfopt tile; everything
else is delegated to `Document.drawPage`.
--]]
function KoptInterface:drawPage(doc, target, x, y, rect, pageno, zoom, rotation, gamma, render_mode)
    local cfg = doc.configurable
    local uses_kopt = cfg.text_wrap == 1 or cfg.page_opt == 1 or cfg.auto_straighten > 0
    if uses_kopt then
        self:drawContextPage(doc, target, x, y, rect, pageno, zoom, rotation, render_mode)
    else
        Document.drawPage(doc, target, x, y, rect, pageno, zoom, rotation, gamma, render_mode)
    end
end
|
|
|
|
--[[--
Draw cached tile pixels into the target blitbuffer.

Inherited from the common document interface. Renders (or fetches from
cache) the tile for `pageno` and blits the part described by `rect` to
position (`x`, `y`) of `target`.

@param doc document object
@param target destination blitbuffer
@int x destination x
@int y destination y
@param rect area of the page to draw (`x`, `y`, `w`, `h`)
@int pageno page number
@number zoom zoom factor
@int rotation rotation
@param render_mode render mode
--]]
function KoptInterface:drawContextPage(doc, target, x, y, rect, pageno, zoom, rotation, render_mode)
    -- renderPage's signature is (..., rotation, gamma, render_mode, hinting).
    -- This function has no gamma, so pass nil for it explicitly; otherwise
    -- render_mode would land in the gamma slot and be lost.
    local tile = self:renderPage(doc, pageno, rect, zoom, rotation, nil, render_mode)
    target:blitFrom(tile.bb,
        x, y,
        rect.x - tile.excerpt.x,
        rect.y - tile.excerpt.y,
        rect.w, rect.h)
end
|
|
|
|
--[[--
Extract text boxes in a MuPDF/Djvu page.

Returned boxes are in native page coordinates zoomed at `1.0`.

The document's own text layer is used when it holds more than one entry and
OCR is not forced (`forced_ocr ~= 1`). Otherwise word boxes are derived by
k2pdfopt: via the cached context in reflow mode, from scratch in non-reflow mode.

@param doc document object
@int pageno page number
@return text boxes
--]]
function KoptInterface:getTextBoxes(doc, pageno)
    local text = doc:getPageTextBoxes(pageno)
    local has_text_layer = text and #text > 1 and doc.configurable.forced_ocr ~= 1
    if has_text_layer then
        return text
    end

    -- No usable text in the original page.
    if doc.configurable.text_wrap == 1 then
        return self:getNativeTextBoxes(doc, pageno)
    end
    return self:getNativeTextBoxesFromScratch(doc, pageno)
end
|
|
|
|
--[[--
Get text boxes in reflowed page via rectmaps in koptcontext.

Results are cached; nothing is cached or returned when the context yields no boxes.

@param doc document object
@int pageno page number
@return word boxes of the reflowed page, or nothing
--]]
function KoptInterface:getReflowedTextBoxes(doc, pageno)
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "rfpgboxes" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached_boxes = DocCache:check(hash)
    if cached_boxes then
        return cached_boxes.rfpgboxes
    end

    -- The context of the same page lives under the same key with a "kctx" prefix.
    local kctx_hash = hash:gsub("^rfpgboxes|", "kctx|")
    local cached_ctx = DocCache:check(kctx_hash)
    local kc
    if cached_ctx then
        kc = self:waitForContext(cached_ctx.kctx)
    else
        kc = self:getCachedContext(doc, pageno)
    end

    local fullwidth, fullheight = kc:getPageDim()
    local boxes, nr_word = kc:getReflowedWordBoxes("dst", 0, 0, fullwidth, fullheight)
    if not boxes then
        return
    end
    DocCache:insert(hash, CacheItem:new{ rfpgboxes = boxes, size = 192 * nr_word }) -- estimation
    return boxes
end
|
|
|
|
--[[--
Get text boxes in native page via rectmaps in koptcontext.

Uses the page's cached context when there is one; otherwise a new context is
created (without a bbox) and inserted into the cache under the context key.
Results are cached; nothing is cached or returned when no boxes are found.

@param doc document object
@int pageno page number
@return word boxes in native page coordinates, or nothing
--]]
function KoptInterface:getNativeTextBoxes(doc, pageno)
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "nativepgboxes" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached_boxes = DocCache:check(hash)
    if cached_boxes then
        return cached_boxes.nativepgboxes
    end

    local kctx_hash = hash:gsub("^nativepgboxes|", "kctx|")
    local cached_ctx = DocCache:check(kctx_hash)
    local kc
    if cached_ctx then
        kc = self:waitForContext(cached_ctx.kctx)
    else
        kc = self:createContext(doc, pageno)
        DocCache:insert(kctx_hash, ContextCacheItem:new{
            persistent = true,
            doc_path = doc.file,
            size = self.last_context_size or self.default_context_size,
            kctx = kc,
        })
    end

    local fullwidth, fullheight = kc:getPageDim()
    local boxes, nr_word = kc:getNativeWordBoxes("dst", 0, 0, fullwidth, fullheight)
    if not boxes then
        return
    end
    DocCache:insert(hash, CacheItem:new{ nativepgboxes = boxes, size = 192 * nr_word }) -- estimation
    return boxes
end
|
|
|
|
--[[--
Get text boxes in reflowed page via optical method.

Done by OCR pre-processing in Tesseract and Leptonica: the reflowed bitmap
is copied into a temporary context on which word boxes are detected.
Results are cached; nothing is cached or returned when no boxes are found.

@param doc document object
@int pageno page number
@return word boxes of the reflowed page, or nothing
--]]
function KoptInterface:getReflowedTextBoxesFromScratch(doc, pageno)
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "scratchrfpgboxes" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached_boxes = DocCache:check(hash)
    if cached_boxes then
        return cached_boxes.scratchrfpgboxes
    end

    local kctx_hash = hash:gsub("^scratchrfpgboxes|", "kctx|")
    local cached_ctx = DocCache:check(kctx_hash)
    local reflowed_kc
    if cached_ctx then
        reflowed_kc = self:waitForContext(cached_ctx.kctx)
    else
        reflowed_kc = self:getCachedContext(doc, pageno)
    end

    local fullwidth, fullheight = reflowed_kc:getPageDim()
    -- Work on a throwaway context so the cached one is left untouched.
    local scratch_kc = self:createContext(doc, pageno)
    scratch_kc:copyDestBMP(reflowed_kc)
    local boxes, nr_word = scratch_kc:getNativeWordBoxes("dst", 0, 0, fullwidth, fullheight)
    scratch_kc:free()
    if not boxes then
        return
    end
    DocCache:insert(hash, CacheItem:new{ scratchrfpgboxes = boxes, size = 192 * nr_word }) -- estimation
    return boxes
end
|
|
|
|
--[[--
Find the panel of a page that corresponds to a gesture position.

Renders the full native page at zoom 1.0 into a temporary context and asks
it for the panel at `ges`. The result is not cached.

@param doc document object
@int pageno page number
@param ges gesture/position passed through to the context
@return whatever `kc:getPanelFromPage` returns
--]]
function KoptInterface:getPanelFromPage(doc, pageno, ges)
    local page_size = Document.getNativePageDimensions(doc, pageno)
    local full_page = {
        x0 = 0, y0 = 0,
        x1 = page_size.w,
        y1 = page_size.h,
    }
    local kc = self:createContext(doc, pageno, full_page)
    kc:setZoom(1.0)

    local page = doc._document:openPage(pageno)
    page:getPagePix(kc)
    local panel = kc:getPanelFromPage(ges)
    page:close()
    kc:free()

    return panel
end
|
|
|
|
--[[--
Get text boxes in native page via optical method.

Done by OCR pre-processing in Tesseract and Leptonica, on the full native
page rendered at zoom 1.0. Found boxes are cached per document path and
page number.

@param doc document object
@int pageno page number
@return word boxes in native page coordinates, or nil when none were found
--]]
function KoptInterface:getNativeTextBoxesFromScratch(doc, pageno)
    local hash = "scratchnativepgboxes|"..doc.file.."|"..pageno
    local cached = DocCache:check(hash)
    if cached then
        return cached.scratchnativepgboxes
    end

    local page_size = Document.getNativePageDimensions(doc, pageno)
    local full_page = {
        x0 = 0, y0 = 0,
        x1 = page_size.w,
        y1 = page_size.h,
    }
    local kc = self:createContext(doc, pageno, full_page)
    kc:setZoom(1.0)

    local page = doc._document:openPage(pageno)
    page:getPagePix(kc)
    local boxes, nr_word = kc:getNativeWordBoxes("src", 0, 0, page_size.w, page_size.h)
    if boxes then
        DocCache:insert(hash, CacheItem:new{ scratchnativepgboxes = boxes, size = 192 * nr_word }) -- estimation
    end
    page:close()
    kc:free()

    return boxes
end
|
|
|
|
--[[--
Get page regions in native page via optical method.

The context holding the detected page blocks is cached, so block detection
runs once per page/configuration; every call then queries it for (`x`, `y`).

@param doc document object
@int pageno page number
@number x x position passed through to the context
@number y y position passed through to the context
@return whatever `kctx:getPageBlock(x, y)` returns
--]]
function KoptInterface:getPageBlock(doc, pageno, x, y)
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "pageblocks" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash)
    if cached then
        return cached.kctx:getPageBlock(x, y)
    end

    local page_size = Document.getNativePageDimensions(doc, pageno)
    local full_page_bbox = {
        x0 = 0, y0 = 0,
        x1 = page_size.w,
        y1 = page_size.h,
    }
    local kc = self:createContext(doc, pageno, full_page_bbox)
    -- leptonica needs a source image of at least 300dpi
    kc:setZoom(CanvasContext:getWidth() / page_size.w * 300 / CanvasContext:getDPI())

    local page = doc._document:openPage(pageno)
    page:getPagePix(kc)
    kc:findPageBlocks()
    -- The context itself is kept in the cache, so it is not freed here.
    DocCache:insert(hash, CacheItem:new{ kctx = kc, size = 3072 }) -- estimation
    page:close()

    return kc:getPageBlock(x, y)
end
|
|
|
|
--[[--
Get word from OCR providing selected word box.

Makes sure an OCR engine item exists in the cache, then recognizes the word
inside `wbox.sbox` on the reflowed or on the native page, depending on
`text_wrap`.

@param doc document object
@int pageno page number
@tparam table wbox word box; its `sbox` field is the rectangle to recognize
@return the recognized word
--]]
function KoptInterface:getOCRWord(doc, pageno, wbox)
    local engine_cached = DocCache:check(self.ocrengine)
    if not engine_cached then
        DocCache:insert(self.ocrengine, OCREngine:new{ ocrengine = KOPTContext.new(), size = 3072 }) -- estimation
    end

    if doc.configurable.text_wrap ~= 1 then
        return self:getNativeOCRWord(doc, pageno, wbox.sbox)
    end
    return self:getReflewOCRWord(doc, pageno, wbox.sbox)
end
|
|
|
|
--[[--
Get word from OCR in reflowed page.

Runs OCR on `rect` of the reflowed page bitmap and caches the result per
page/configuration/rectangle. A failed or empty OCR run is logged and
yields nil; it is never cached, so a later call can retry.

@param doc document object
@int pageno page number
@tparam table rect rectangle (`x`, `y`, `w`, `h`) in reflowed page coordinates
@treturn ?string the recognized word, or nil when OCR failed or found nothing
--]]
function KoptInterface:getReflewOCRWord(doc, pageno, rect)
    self.ocr_lang = doc.configurable.doc_language
    local bbox = doc:getPageBBox(pageno)
    local hash_list = { "rfocrword" }
    self:getContextHash(doc, pageno, bbox, hash_list)
    table.insert(hash_list, rect.x)
    table.insert(hash_list, rect.y)
    table.insert(hash_list, rect.w)
    table.insert(hash_list, rect.h)
    local hash = table.concat(hash_list, "|")

    local cached = DocCache:check(hash)
    if cached then
        return cached.rfocrword
    end

    -- The kctx lookup inside getCachedContext uses its own key, so the extra
    -- rect components appended above do not matter here.
    local kc
    local kctx_hash = hash:gsub("^rfocrword|", "kctx|")
    local cached_ctx = DocCache:check(kctx_hash)
    if cached_ctx then
        kc = self:waitForContext(cached_ctx.kctx)
    else
        kc = self:getCachedContext(doc, pageno)
    end

    local ok, word = pcall(
        kc.getTOCRWord, kc, "dst",
        rect.x, rect.y, rect.w, rect.h,
        self.tessocr_data, self.ocr_lang, self.ocr_type, 0, 1)
    if not ok then
        -- On failure `word` holds the error message: report it instead of
        -- caching and returning it as if it were recognized text.
        logger.warn("OCR failed on reflowed page", pageno, word)
        return nil
    end
    if word == nil then
        return nil
    end

    DocCache:insert(hash, CacheItem:new{ rfocrword = word, size = #word + 64 }) -- estimation
    return word
end
|
|
|
|
--[[--
Get word from OCR in native page.

Renders the area around `rect` (padded by 30% of its height on every side)
at a zoom that makes the word about 30 pixels high, runs OCR on it and
caches the result per document path, page and rectangle. A failed or empty
OCR run is logged and yields nil; it is never cached, so a later call can retry.

@param doc document object
@int pageno page number
@tparam table rect rectangle (`x`, `y`, `w`, `h`) in native page coordinates
@treturn ?string the recognized word, or nil when OCR failed or found nothing
--]]
function KoptInterface:getNativeOCRWord(doc, pageno, rect)
    self.ocr_lang = doc.configurable.doc_language
    -- Every component is separated, so that e.g. page 1 / x 23 and
    -- page 12 / x 3 cannot end up sharing a cache key.
    local hash = "ocrword|"..doc.file.."|"..pageno
        .."|"..rect.x.."|"..rect.y.."|"..rect.w.."|"..rect.h
    logger.dbg("hash", hash)

    local cached = DocCache:check(hash)
    if cached then
        return cached.ocrword
    end

    local padding = math.floor(rect.h * 0.3)
    local bbox = {
        x0 = rect.x - padding,
        y0 = rect.y - padding,
        x1 = rect.x + rect.w + padding,
        y1 = rect.y + rect.h + padding,
    }
    local kc = self:createContext(doc, pageno, bbox)
    kc:setZoom(30/rect.h)
    local page = doc._document:openPage(pageno)
    page:getPagePix(kc)
    --kc:exportSrcPNGFile({rect}, nil, "ocr-word.png")
    local word_w, word_h = kc:getPageDim()
    local ok, word = pcall(
        kc.getTOCRWord, kc, "src",
        0, 0, word_w, word_h,
        self.tessocr_data, self.ocr_lang, self.ocr_type, 0, 1)
    -- Release the page and the temporary context whatever the OCR outcome.
    page:close()
    kc:free()

    if not ok then
        -- On failure `word` holds the error message: report it instead of
        -- caching and returning it as if it were recognized text.
        logger.warn("OCR failed on native page", pageno, word)
        return nil
    end
    if word == nil then
        return nil
    end

    DocCache:insert(hash, CacheItem:new{ ocrword = word, size = #word + 64 }) -- estimation
    logger.dbg("word", word)
    return word
end
|
|
|
|
--[[--
Get text from OCR providing selected text boxes.

Not implemented: only makes sure an OCR engine item exists in the cache,
logs a notice and returns nothing.
--]]
function KoptInterface:getOCRText(doc, pageno, tboxes)
    local engine_cached = DocCache:check(self.ocrengine)
    if not engine_cached then
        DocCache:insert(self.ocrengine, OCREngine:new{ ocrengine = KOPTContext.new(), size = 3072 }) -- estimation
    end
    logger.info("Not implemented yet")
end
|
|
|
|
--[[--
Build a context holding the pixels of a clipped page area.

The clipped area is the bounding box of `pboxes` when any are given,
otherwise the rectangle spanned by `pos0` and `pos1` divided by their zoom
(1 when unset). Both positions must be on the same page and share the same
zoom. `drawer` is accepted but not used here.

@return kc the new context (the caller must free it)
@return rect the clipped rectangle
--]]
function KoptInterface:getClipPageContext(doc, pos0, pos1, pboxes, drawer)
    assert(pos0.page == pos1.page)
    assert(pos0.zoom == pos1.zoom)

    local rect
    local has_boxes = pboxes and #pboxes > 0
    if has_boxes then
        rect = Geom.boundingBox(pboxes)
    else
        local zoom = pos0.zoom or 1
        rect = {
            x = math.min(pos0.x, pos1.x)/zoom,
            y = math.min(pos0.y, pos1.y)/zoom,
            w = math.abs(pos0.x - pos1.x)/zoom,
            h = math.abs(pos0.y - pos1.y)/zoom
        }
    end

    local clip_bbox = {
        x0 = rect.x, y0 = rect.y,
        x1 = rect.x + rect.w,
        y1 = rect.y + rect.h
    }
    local pageno = pos0.page
    local kc = self:createContext(doc, pageno, clip_bbox)
    local page = doc._document:openPage(pageno)
    page:getPagePix(kc)
    page:close()
    return kc, rect
end
|
|
|
|
--[[--
Export a clipped page area as a PNG file.

@param doc document object
@param pos0 first position
@param pos1 second position
@param pboxes boxes passed through to the exporter (may be nil)
@param drawer drawer passed through to the exporter
@string filename destination path
--]]
function KoptInterface:clipPagePNGFile(doc, pos0, pos1, pboxes, drawer, filename)
    -- Only the context is needed; the clipped rect (second result) is dropped.
    local clip_kc = self:getClipPageContext(doc, pos0, pos1, pboxes, drawer)
    clip_kc:exportSrcPNGFile(pboxes, drawer, filename)
    clip_kc:free()
end
|
|
|
|
--[[--
Export a clipped page area as PNG data held in a string.

@return the PNG data, or nil (on Android, when the temporary file cannot be read back)
--]]
function KoptInterface:clipPagePNGString(doc, pos0, pos1, pboxes, drawer)
    local clip_kc = self:getClipPageContext(doc, pos0, pos1, pboxes, drawer)
    local png = nil
    if not FFIUtil.isAndroid() then
        png = clip_kc:exportSrcPNGString(pboxes, drawer)
    else
        -- there is no fmemopen in Android so leptonica.pixWriteMemPng will
        -- fail silently, workaround is creating a PNG file and read back the string
        local tmp = "cache/tmpclippng.png"
        clip_kc:exportSrcPNGFile(pboxes, drawer, tmp)
        local pngfile = io.open(tmp, "rb")
        if pngfile then
            png = pngfile:read("*all")
            pngfile:close()
        end
    end
    clip_kc:free()
    return png
end
|
|
|
|
--[[--
Tell whether `pos` lies inside `box` (edges included).

Helper for finding the word box nearest to a position.

@tparam table box table with `x0`, `y0`, `x1`, `y1`
@tparam table pos table with `x`, `y`
@treturn bool
--]]
local function inside_box(box, pos)
    local px, py = pos.x, pos.y
    local within_x = box.x0 <= px and box.x1 >= px
    local within_y = box.y0 <= py and box.y1 >= py
    return within_x and within_y
end
|
|
|
|
--[[--
Squared distance between `pos` and the center of `box`.

Positions inside the box are at distance 0.

@tparam table box table with `x0`, `y0`, `x1`, `y1`
@tparam table pos table with `x`, `y`
@treturn number
--]]
local function box_distance(box, pos)
    if inside_box(box, pos) then
        return 0
    end
    local dx = pos.x - (box.x0 + box.x1) / 2
    local dy = pos.y - (box.y0 + box.y1) / 2
    return dx*dx + dy*dy
end
|
|
|
|
--[[--
Get the indices of the word box nearest to `pos`.

`boxes` is an array of lines, each line being an array of word boxes. On
ties, the first box encountered (in line, then word order) wins. Lines
without any word box are skipped, so an empty first line does not break the
search. When there is no word box at all, `1, 1` is returned.

@tparam table boxes array of arrays of word boxes
@tparam table pos table with `x`, `y`
@treturn int line index
@treturn int word index within that line
--]]
local function getWordBoxIndices(boxes, pos)
    local m, n = 1, 1
    -- Keep track of the best distance so far instead of recomputing it (and
    -- instead of dereferencing boxes[1][1], which may not exist).
    local best = math.huge
    for i = 1, #boxes do
        local line = boxes[i]
        for j = 1, #line do
            local dist = box_distance(line[j], pos)
            if dist < best then
                m, n, best = i, j, dist
            end
        end
    end
    return m, n
end
|
|
|
|
--[[--
Get word and word box around `pos`.

The returned box takes its horizontal extent from the word box and its
vertical extent from the line the word belongs to.

@tparam table boxes array of line boxes, each holding its word boxes
@tparam table pos table with `x`, `y`
@return `{ word = ..., box = Geom }`; an empty table when `pos` or `boxes`
 is missing or `boxes` is empty; nothing when no word box could be found
--]]
function KoptInterface:getWordFromBoxes(boxes, pos)
    if not pos or not boxes or #boxes == 0 then return {} end

    local line_idx, word_idx = getWordBoxIndices(boxes, pos)
    local line_box = boxes[line_idx]
    local word_box = line_box[word_idx]
    if not (line_box and word_box) then
        return
    end

    return {
        word = word_box.word,
        box = Geom:new{
            x = word_box.x0, y = line_box.y0,
            w = word_box.x1 - word_box.x0,
            h = line_box.y1 - line_box.y0,
        },
    }
end
|
|
|
|
--[[--
Get text and text boxes between `pos0` and `pos1`.

The word boxes nearest to each position delimit the selection (they are swapped
if given in reverse order). Words are concatenated, with spaces added between
words and lines according to a few heuristics, and one box is built per line.

@param boxes list of lines (with x0, y0, x1, y1), each a list of word boxes (with x0, y0, x1, y1 and word)
@param pos0 position (with x and y) of one end of the selection
@param pos1 position (with x and y) of the other end of the selection
@treturn table with fields `text` (string) and `boxes` (list of Geom, one per line),
or an empty table when a position or the boxes are missing
--]]
function KoptInterface:getTextFromBoxes(boxes, pos0, pos1)
    if not pos0 or not pos1 or not boxes or #boxes == 0 then return {} end
    local line_text = ""
    local line_boxes = {}
    local i_start, j_start = getWordBoxIndices(boxes, pos0)
    local i_stop, j_stop = getWordBoxIndices(boxes, pos1)
    if i_start == i_stop and j_start > j_stop or i_start > i_stop then
        -- positions given in reverse order: swap them
        i_start, i_stop = i_stop, i_start
        j_start, j_stop = j_stop, j_start
    end
    for i = i_start, i_stop do
        if i_start == i_stop and #boxes[i] == 0 then break end
        -- insert line words
        local j0 = i > i_start and 1 or j_start
        local j1 = i < i_stop and #boxes[i] or j_stop
        local line_first_word_seen = false
        local prev_word
        local prev_word_end_x
        for j = j0, j1 do
            local word = boxes[i][j].word
            if word then
                if not line_first_word_seen then
                    line_first_word_seen = true
                    if #line_text > 0 then
                        if line_text:sub(-1) == "-" then
                            -- Previous line ended with a minus.
                            -- Assume it's some hyphenation and discard it.
                            line_text = line_text:sub(1, -2)
                        elseif line_text:sub(-2, -1) == "\u{00AD}" then
                            -- Previous line ended with a soft hyphen (2 bytes in UTF-8).
                            -- Assume it's some hyphenation and discard it.
                            line_text = line_text:sub(1, -3)
                        else
                            -- No hyphenation, add a space (might be not welcome
                            -- with CJK text, but well...)
                            line_text = line_text .. " "
                        end
                    end
                end
                local box = boxes[i][j]
                if prev_word then
                    -- A box should have been made for each word, so assume
                    -- we want a space between them, with some exceptions
                    local add_space = true
                    local box_height = box.y1 - box.y0
                    local dist_from_prev_word = box.x0 - prev_word_end_x
                    if prev_word:sub(-1, -1) == " " or word:sub(1, 1) == " " then
                        -- Already a space between these words
                        add_space = false
                    elseif dist_from_prev_word < box_height * 0.05 then
                        -- If the space between previous word box and this word box
                        -- is smaller than 5% of box height, assume these boxes
                        -- should be stuck
                        add_space = false
                    elseif dist_from_prev_word < box_height * 0.8 then
                        local prev_word_end = prev_word:match(util.UTF8_CHAR_PATTERN.."$")
                        local word_start = word:match(util.UTF8_CHAR_PATTERN)
                        if util.isCJKChar(prev_word_end) and util.isCJKChar(word_start) then
                            -- Two CJK chars whose spacing is not large enough,
                            -- but even so they must not have a space added.
                            add_space = false
                        end
                    end
                    if add_space then
                        word = " " .. word
                    end
                end
                line_text = line_text .. word
                prev_word = word
                prev_word_end_x = box.x1
            end
        end
        -- insert line box: it spans the full line, except on the first line
        -- where it starts at the first selected word, and on the last line
        -- where it stops at the last selected word
        local lb = boxes[i]
        local x_left, x_right = lb.x0, lb.x1
        if i == i_start then
            x_left = lb[j_start].x0
        end
        if i == i_stop then
            x_right = lb[j_stop].x1
        end
        table.insert(line_boxes, Geom:new{
            x = x_left, y = lb.y0,
            w = x_right - x_left,
            h = lb.y1 - lb.y0,
        })
    end
    return {
        text = line_text,
        boxes = line_boxes,
    }
end
|
|
|
|
--[[--
Get word and word box from `doc` position.

The text boxes of the page at `pos.page` are fetched and remembered in
`self.last_text_boxes`, then the lookup is delegated to the reflow or native
variant according to `doc.configurable.text_wrap`.
Nothing is returned when the page has no text boxes.
--]]
function KoptInterface:getWordFromPosition(doc, pos)
    local page_text_boxes = self:getTextBoxes(doc, pos.page)
    if not page_text_boxes then
        return
    end
    -- kept around for getSelectedWordContext()
    self.last_text_boxes = page_text_boxes
    if doc.configurable.text_wrap == 1 then
        return self:getWordFromReflowPosition(doc, page_text_boxes, pos)
    end
    return self:getWordFromNativePosition(doc, page_text_boxes, pos)
end
|
|
|
|
-- Express the center of `s_box` relative to `l_box`: offsets from the
-- x/y of `l_box`, divided by its width and height.
local function getBoxRelativePosition(s_box, l_box)
    local center = s_box:center()
    return {
        x = (center.x - l_box.x)/l_box.w,
        y = (center.y - l_box.y)/l_box.h,
    }
end
|
|
|
|
--[[--
Get word and word box from position in reflowed page.

The word under `pos` is looked up in both sets of reflowed boxes, the resulting
position is transformed to the native page, and the word is then picked from
the native `boxes`.

@treturn table with `word`, `pbox` (box on page), `sbox` (box on screen) and `pos` (native position)
--]]
function KoptInterface:getWordFromReflowPosition(doc, boxes, pos)
    local page = pos.page

    local scratch_boxes = self:getReflowedTextBoxesFromScratch(doc, page)
    local scratch_word = self:getWordFromBoxes(scratch_boxes, pos)

    local reflowed_boxes = self:getReflowedTextBoxes(doc, page)
    local reflowed_word = self:getWordFromBoxes(reflowed_boxes, pos)

    -- absolute and relative positions fed to the reflow -> native transform
    local abs_pos = scratch_word.box:center()
    local rel_pos = getBoxRelativePosition(scratch_word.box, reflowed_word.box)

    local native_pos = self:reflowToNativePosTransform(doc, page, abs_pos, rel_pos)
    local native_word = self:getWordFromBoxes(boxes, native_pos)

    return {
        word = native_word.word,
        pbox = native_word.box, -- box on page
        sbox = scratch_word.box, -- box on screen
        pos = native_pos,
    }
end
|
|
|
|
--[[--
Get word and word box from position in native page.

@treturn table with `word`, `pbox` (box on page), `sbox` (box on screen, the
same box as `pbox` here) and `pos` (the given position)
--]]
function KoptInterface:getWordFromNativePosition(doc, boxes, pos)
    local found = self:getWordFromBoxes(boxes, pos)
    local page_box = found.box
    return {
        word = found.word,
        pbox = page_box, -- box on page
        sbox = page_box, -- box on screen
        pos = pos,
    }
end
|
|
|
|
-- Collect up to `nb_words` non-empty words preceding the word box at line `i`,
-- index `j`, walking backward across lines.
-- Returns them joined with spaces, in reading order, or nil if there are none.
local function get_prev_text(boxes, i, j, nb_words)
    local prev_count = 0
    local prev_text = {}
    while prev_count < nb_words do
        if j > 1 then
            j = j - 1
        else
            -- At start of line: move to the end of the nearest previous
            -- line that has words (empty lines have no box to look at)
            repeat
                i = i - 1
            until i < 1 or #boxes[i] > 0
            if i < 1 then break end -- start of page reached
            j = #boxes[i]
        end
        local current_word = boxes[i][j].word
        -- Boxes without a word are skipped, like empty words
        if current_word and #current_word > 0 then
            table.insert(prev_text, 1, current_word)
            prev_count = prev_count + 1
        end
    end
    if #prev_text > 0 then
        return table.concat(prev_text, " ")
    end
end
|
|
|
|
-- Collect up to `nb_words` non-empty words following the word box at line `i`,
-- index `j`, walking forward across lines.
-- Returns them joined with spaces, or nil if there are none.
local function get_next_text(boxes, i, j, nb_words)
    local next_count = 0
    local next_text = {}
    while next_count < nb_words do
        if j < #boxes[i] then
            j = j + 1
        else
            -- At end of line: move to the start of the nearest next
            -- line that has words (empty lines have no box to look at)
            repeat
                i = i + 1
            until i > #boxes or #boxes[i] > 0
            if i > #boxes then break end -- end of page reached
            j = 1
        end
        local current_word = boxes[i][j].word
        -- Boxes without a word are skipped, like empty words
        if current_word and #current_word > 0 then
            table.insert(next_text, current_word)
            next_count = next_count + 1
        end
    end
    if #next_text > 0 then
        return table.concat(next_text, " ")
    end
end
|
|
|
|
-- Get the text surrounding a selected word, using the text boxes remembered
-- in `self.last_text_boxes`.
-- Returns prev_text, next_text (each made of up to `nb_words` words, or nil),
-- or nothing when the word box nearest to `pos` does not hold `word`.
function KoptInterface:getSelectedWordContext(word, nb_words, pos)
    local boxes = self.last_text_boxes
    if not (pos and boxes) or #boxes == 0 then return end
    local line_idx, word_idx = getWordBoxIndices(boxes, pos)
    if boxes[line_idx][word_idx].word == word then
        local before = get_prev_text(boxes, line_idx, word_idx, nb_words)
        local after = get_next_text(boxes, line_idx, word_idx, nb_words)
        return before, after
    end
end
|
|
|
|
--[[--
Get link from position in screen page.

Each link box of the page is enlarged a bit to make it easier to tap, and the
first one containing `pos` wins. With text wrap enabled, `pos` is first
transformed to a native page position.

@return the link and its enlarged box (a Geom), or nothing if no link is hit
--]]
function KoptInterface:getLinkFromPosition(doc, pageno, pos)
    local page_links = doc:getPageLinks(pageno)
    if not page_links then
        return
    end
    if doc.configurable.text_wrap == 1 then
        pos = self:reflowToNativePosTransform(doc, pageno, pos, {x=0.5, y=0.5})
    end

    local margin = CanvasContext:scaleBySize(5)
    local growth = CanvasContext:scaleBySize(10)
    for idx = 1, #page_links do
        local link = page_links[idx]
        -- enlarge tappable link box
        local lbox = Geom:new{
            x = link.x0 - margin,
            y = link.y0 - margin,
            w = link.x1 - link.x0 + growth,
            h = link.y1 - link.y0 + growth,
        }
        -- Allow external links, with link.uri instead of link.page
        -- (so, no check on link.page here)
        if pos
                and lbox.x <= pos.x and lbox.y <= pos.y
                and lbox.x + lbox.w >= pos.x
                and lbox.y + lbox.h >= pos.y then
            return link, lbox
        end
    end
end
|
|
|
|
--[[--
Transform position in native page to reflowed page.

@treturn table with `page` (the given `pageno`), `x` and `y` as computed by the cached context
--]]
function KoptInterface:nativeToReflowPosTransform(doc, pageno, pos)
    local kc = self:getCachedContext(doc, pageno)
    local rx, ry = kc:nativeToReflowPosTransform(pos.x, pos.y)
    return { page = pageno, x = rx, y = ry }
end
|
|
|
|
--[[--
Transform position in reflowed page to native page.

@param abs_pos absolute position (x, y) in the reflowed page
@param rel_pos relative position (x, y), passed along to the cached context
@treturn table with `page` (the given `pageno`), `x` and `y` as computed by the cached context
--]]
function KoptInterface:reflowToNativePosTransform(doc, pageno, abs_pos, rel_pos)
    local kc = self:getCachedContext(doc, pageno)
    local nx, ny = kc:reflowToNativePosTransform(abs_pos.x, abs_pos.y, rel_pos.x, rel_pos.y)
    return { page = pageno, x = nx, y = ny }
end
|
|
|
|
--[[--
Get text and text boxes from screen positions.

The text boxes of the page at `pos0.page` are fetched, then the work is
delegated to the reflow or native variant according to
`doc.configurable.text_wrap`.
Nothing is returned when the page has no text boxes.
--]]
function KoptInterface:getTextFromPositions(doc, pos0, pos1)
    local page_text_boxes = self:getTextBoxes(doc, pos0.page)
    if not page_text_boxes then
        return
    end
    if doc.configurable.text_wrap == 1 then
        return self:getTextFromReflowPositions(doc, page_text_boxes, pos0, pos1)
    end
    return self:getTextFromNativePositions(doc, page_text_boxes, pos0, pos1)
end
|
|
|
|
--[[--
Get text and text boxes from screen positions for reflowed page.

Both screen positions are mapped to native page positions (via the words found
under them in the reflowed boxes); the text comes from the native boxes, the
screen boxes from the reflowed ones.

@treturn table with `text`, `pboxes` (boxes on page), `sboxes` (boxes on screen),
`pos0` and `pos1` (native positions)
--]]
function KoptInterface:getTextFromReflowPositions(doc, native_boxes, pos0, pos1)
    local page = pos0.page

    local scratch_boxes = self:getReflowedTextBoxesFromScratch(doc, page)
    local reflowed_boxes = self:getReflowedTextBoxes(doc, page)

    -- words under each screen position, in both sets of reflowed boxes
    local scratch_word0 = self:getWordFromBoxes(scratch_boxes, pos0)
    local reflowed_word0 = self:getWordFromBoxes(reflowed_boxes, pos0)
    local scratch_word1 = self:getWordFromBoxes(scratch_boxes, pos1)
    local reflowed_word1 = self:getWordFromBoxes(reflowed_boxes, pos1)

    -- absolute and relative positions fed to the reflow -> native transform
    local abs0 = scratch_word0.box:center()
    local rel0 = getBoxRelativePosition(scratch_word0.box, reflowed_word0.box)
    local abs1 = scratch_word1.box:center()
    local rel1 = getBoxRelativePosition(scratch_word1.box, reflowed_word1.box)

    local native_pos0 = self:reflowToNativePosTransform(doc, page, abs0, rel0)
    local native_pos1 = self:reflowToNativePosTransform(doc, page, abs1, rel1)

    local on_screen = self:getTextFromBoxes(reflowed_boxes, pos0, pos1)
    local on_page = self:getTextFromBoxes(native_boxes, native_pos0, native_pos1)
    return {
        text = on_page.text,
        pboxes = on_page.boxes, -- boxes on page
        sboxes = on_screen.boxes, -- boxes on screen
        pos0 = native_pos0,
        pos1 = native_pos1,
    }
end
|
|
|
|
--[[--
Get text and text boxes from screen positions for native page.

@treturn table with `text`, `pboxes` (boxes on page), `sboxes` (boxes on screen,
the same list as `pboxes` here), `pos0` and `pos1` (the given positions)
--]]
function KoptInterface:getTextFromNativePositions(doc, native_boxes, pos0, pos1)
    local selection = self:getTextFromBoxes(native_boxes, pos0, pos1)
    local page_boxes = selection.boxes
    return {
        text = selection.text,
        pboxes = page_boxes, -- boxes on page
        sboxes = page_boxes, -- boxes on screen
        pos0 = pos0,
        pos1 = pos1,
    }
end
|
|
|
|
--[[--
Get text boxes from page positions.

With text wrap enabled, the native positions are transformed to reflowed ones
and the reflowed text boxes are used; otherwise the page text boxes are used
with the positions as given.

@return the list of line boxes between the two positions; nothing when a
position or the page boxes are missing
--]]
function KoptInterface:getPageBoxesFromPositions(doc, pageno, ppos0, ppos1)
    if not ppos0 or not ppos1 then return end
    local start_pos, end_pos, page_boxes
    if doc.configurable.text_wrap == 1 then
        start_pos = self:nativeToReflowPosTransform(doc, pageno, ppos0)
        end_pos = self:nativeToReflowPosTransform(doc, pageno, ppos1)
        page_boxes = self:getReflowedTextBoxes(doc, pageno)
    else
        start_pos, end_pos = ppos0, ppos1
        page_boxes = self:getTextBoxes(doc, pageno)
    end
    if not page_boxes then
        logger.warn("KoptInterface: missing page_boxes")
        return
    end
    return self:getTextFromBoxes(page_boxes, start_pos, end_pos).boxes
end
|
|
|
|
--[[--
Compare positions within one page.
Returns 1 if positions are ordered (if ppos2 is after ppos1), -1 if not, 0 if same.
Positions of the word boxes containing ppos1 and ppos2 are compared.
(Positions on different pages are ordered by their page.)
--]]
function KoptInterface:comparePositions(doc, ppos1, ppos2)
    local page1, page2 = ppos1.page, ppos2.page
    if page1 < page2 then return 1 end
    if page1 > page2 then return -1 end

    local box1 = self:getWordFromPosition(doc, ppos1).pbox
    local box2 = self:getWordFromPosition(doc, ppos2).pbox
    if box1.y > box2.y then
        -- first word is on a lower line
        return -1
    end
    if box1.y == box2.y then
        -- same line: compare horizontally
        if box1.x == box2.x then return 0 end
        if box1.x > box2.x then return -1 end
    end
    return 1
end
|
|
|
|
--[[--
Get page rect from native rect.

Without text wrap, `rect` is returned as is. With text wrap, the page boxes
between two points taken slightly inside opposite corners of `rect` are
fetched, and their bounding box is returned (nothing if there are no boxes).
--]]
function KoptInterface:nativeToPageRectTransform(doc, pageno, rect)
    if doc.configurable.text_wrap ~= 1 then
        return rect
    end
    local inset = 5
    local top_left = {
        x = rect.x + inset,
        y = rect.y + inset,
    }
    local bottom_right = {
        x = rect.x + rect.w - inset,
        y = rect.y + rect.h - inset,
    }
    local boxes = self:getPageBoxesFromPositions(doc, pageno, top_left, bottom_right)
    if boxes then
        return Geom.boundingBox(boxes)
    end
end
|
|
|
|
-- Build the list of single words to search for from `pattern`.
-- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations)
-- Words containing CJK chars are split further with util.gsplit() on
-- multi-byte sequences. When `case_insensitive`, each item is lowercased.
local function get_pattern_list(pattern, case_insensitive)
    local function normalized(str)
        return case_insensitive and Utf8Proc.lowercase(util.fixUtf8(str, "?")) or str
    end
    local plist = {}
    for word in util.gsplit(pattern, "%s+") do
        if not util.hasCJKChar(word) then
            plist[#plist + 1] = normalized(word)
        else
            for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
                plist[#plist + 1] = normalized(char)
            end
        end
    end
    return plist
end
|
|
|
|
-- Iterate over all occurrences of the words sequence `plist` in `boxes`
-- (a list of lines, each a list of word boxes).
-- Returns an iterator (a wrapped coroutine) yielding, for each occurrence,
-- the list of {line index, word index} pairs of the matched word boxes.
-- When `case_insensitive`, words from the document are lowercased before
-- comparison (`plist` is expected to have been lowercased by the caller).
local function all_matches(boxes, plist, case_insensitive)
    local pnb = #plist
    -- return matched word indices from index i, j
    local function match(i, j)
        if pnb == 0 then return end
        local pindex = 1
        local matched_indices = {}
        while true do
            -- Past the end of a line: continue on the next one. This is a loop
            -- so that empty lines (which have no word box) are stepped over.
            while i <= #boxes and j > #boxes[i] do
                j = j - #boxes[i]
                i = i + 1
            end
            if i > #boxes then break end
            -- A box without a word is handled as an empty word
            local box_word = boxes[i][j].word or ""
            local word = case_insensitive and Utf8Proc.lowercase(util.fixUtf8(box_word, "?")) or box_word
            local pword = plist[pindex]
            local matched
            if pnb == 1 then -- single word in plist
                matched = word:find(pword, 1, true)
            elseif pindex == 1 then
                -- first word of query should match at end of a word from the document
                matched = word:sub(-#pword) == pword
            elseif pindex == pnb then
                -- last word of query should match at start of the word from the document
                matched = word:sub(1, #pword) == pword
            else
                -- middle words in query should match exactly the word from the document
                matched = word == pword
            end
            if not matched then break end
            table.insert(matched_indices, {i, j})
            if pindex == pnb then
                -- all words in plist iterated, all matched
                return matched_indices
            end
            j = j + 1
            pindex = pindex + 1
        end
    end
    -- Note that this returns a full word box, even if what matches
    -- is only a substring of a word box.
    return coroutine.wrap(function()
        for i, line in ipairs(boxes) do
            for j in ipairs(line) do
                local matches = match(i, j)
                if matches then
                    coroutine.yield(matches)
                end
            end
        end
    end)
end
|
|
|
|
-- Find all occurrences of `pattern` in the text boxes of `page`.
-- Returns a list with one rect (x, y, w, h) per matched word box, each passed
-- through nativeToPageRectTransform(); nothing when the page has no text boxes.
function KoptInterface:findAllMatches(doc, pattern, case_insensitive, page)
    local text_boxes = doc:getPageTextBoxes(page)
    if not text_boxes then return end
    local plist = get_pattern_list(pattern, case_insensitive)
    local matches = {}
    for indices in all_matches(text_boxes, plist, case_insensitive) do
        for _, index in ipairs(indices) do
            local word = text_boxes[index[1]][index[2]]
            local word_box = {
                x = word.x0, y = word.y0,
                w = word.x1 - word.x0,
                h = word.y1 - word.y0,
            }
            -- rects will be transformed to reflowed page rects if needed
            local rect = self:nativeToPageRectTransform(doc, page, word_box)
            -- The transform may return no value (no box found in the reflowed
            -- page): passing that straight to table.insert() would raise
            -- a "wrong number of arguments" error, so skip such rects.
            if rect then
                table.insert(matches, rect)
            end
        end
    end
    return matches
end
|
|
|
|
-- Find the first page, in the requested range and direction, having matches
-- for `pattern`.
-- `reverse` == 1 searches backward, anything else forward. `origin` (0, -1
-- or 1) selects the range of pages relative to `pageno`, as detailed below.
-- Returns the list of matching rects of that page, with its page number in
-- the `page` field; nothing when no page matches.
function KoptInterface:findText(doc, pattern, origin, reverse, case_insensitive, pageno)
    logger.dbg("Koptinterface: find text", pattern, origin, reverse, case_insensitive, pageno)
    local last_pageno = doc:getPageCount()
    local start_page, end_page
    if reverse == 1 then
        -- backward
        if origin == 0 then
            -- from end of current page to first page
            start_page, end_page = pageno, 1
        elseif origin == -1 then
            -- from the last page to end of current page
            start_page, end_page = last_pageno, pageno + 1
        elseif origin == 1 then
            -- from previous page to first page
            start_page, end_page = pageno - 1, 1
        end
    else
        -- forward
        if origin == 0 then
            -- from current page to the last page
            start_page, end_page = pageno, last_pageno
        elseif origin == -1 then
            -- from the first page to current page
            start_page, end_page = 1, pageno - 1
        elseif origin == 1 then
            -- from next page to the last page
            start_page, end_page = pageno + 1, last_pageno
        end
    end
    if not start_page then
        -- unsupported origin: no range to search (the loop below would raise)
        return
    end
    for i = start_page, end_page, (reverse == 1) and -1 or 1 do
        local matches = self:findAllMatches(doc, pattern, case_insensitive, i)
        -- findAllMatches() returns nothing for a page without text boxes
        if matches and #matches > 0 then
            matches.page = i
            return matches
        end
    end
end
|
|
|
|
-- Find all occurrences of `pattern` in the whole document.
-- Each result item holds: `text` (the matched words in bold, surrounded by up
-- to `nb_context_words` words of context on each side), `mandatory` (the page
-- number) and `boxes` (rects of the matched word boxes).
-- The search stops as soon as `max_hits` items have been collected.
-- Returns the list of items, or nothing when there is none.
function KoptInterface:findAllText(doc, pattern, case_insensitive, nb_context_words, max_hits)
    local plist = get_pattern_list(pattern, case_insensitive)
    local res = {}
    for page = 1, doc:getPageCount() do
        local text_boxes = doc:getPageTextBoxes(page)
        if text_boxes then
            for indices in all_matches(text_boxes, plist, case_insensitive) do -- each found pattern in the page
                local hit_words = {}
                local hit_boxes = {} -- to draw temp highlight in onMenuSelect
                for ind, index in ipairs(indices) do -- each word in the pattern
                    local wb = text_boxes[index[1]][index[2]]
                    hit_boxes[ind] = {
                        x = wb.x0, y = wb.y0,
                        w = wb.x1 - wb.x0,
                        h = wb.y1 - wb.y0,
                    }
                    hit_words[ind] = wb.word
                end
                -- context is taken before the first and after the last matched word
                local first, last = indices[1], indices[#indices]
                -- Make this word bolder, using Poor Text Formatting provided by TextBoxWidget
                -- (we know this text ends up in a TextBoxWidget).
                local text = TextBoxWidget.PTF_BOLD_START .. table.concat(hit_words, " ") .. TextBoxWidget.PTF_BOLD_END
                local prev_text = get_prev_text(text_boxes, first[1], first[2], nb_context_words)
                if prev_text then
                    text = prev_text .. " " .. text
                end
                local next_text = get_next_text(text_boxes, last[1], last[2], nb_context_words)
                if next_text then
                    text = text .. " " .. next_text
                end
                table.insert(res, { -- item of the Menu item_table
                    text = TextBoxWidget.PTF_HEADER .. text, -- enable handling of our bold tags
                    mandatory = page,
                    boxes = hit_boxes,
                })
                if #res == max_hits then
                    return res
                end
            end
        end
    end
    if #res > 0 then
        return res
    end
end
|
|
|
|
--[[--
Log reflow duration.

Appends a "page<TAB>duration" line to `reflow_dur_log.txt`, preceded by a
header line when the file is still empty. Does nothing if the file can not
be opened.
--]]
function KoptInterface:logReflowDuration(pageno, dur)
    local log = io.open("reflow_dur_log.txt", "a+")
    if not log then
        return
    end
    local is_empty = log:seek("end") == 0
    if is_empty then -- write the header only once
        log:write("PAGE\tDUR\n")
    end
    log:write(string.format("%s\t%s\n", pageno, dur))
    log:close()
end
|
|
|
|
-- Module result: the KoptInterface table, with all the methods defined above.
return KoptInterface
|