|
|
@ -338,9 +338,8 @@ function KoptInterface:getTextBoxes(doc, pageno)
|
|
|
|
else
|
|
|
|
else
|
|
|
|
if doc.configurable.text_wrap == 1 then
|
|
|
|
if doc.configurable.text_wrap == 1 then
|
|
|
|
return self:getNativeTextBoxes(doc, pageno)
|
|
|
|
return self:getNativeTextBoxes(doc, pageno)
|
|
|
|
--return self:getTextBoxesFromScratch(doc, pageno)
|
|
|
|
|
|
|
|
else
|
|
|
|
else
|
|
|
|
return self:getTextBoxesFromScratch(doc, pageno)
|
|
|
|
return self:getNativeTextBoxesFromScratch(doc, pageno)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
@ -393,12 +392,39 @@ function KoptInterface:getNativeTextBoxes(doc, pageno)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
--[[
get text boxes in reflowed page via optical method,
i.e. OCR pre-processing in Tesseract and Leptonica.

Results are cached under "scratchrfpgboxes|"..context_hash, so a page is
only analysed once per reflow context.

The boxes are computed from an already cached reflow context (cache key
"kctx|"..context_hash). When no such context is cached, nothing is
computed and the function returns no value, so callers get nil.

@param doc    document object; must provide getPageBBox(pageno)
@param pageno page number to look up
@return word boxes as returned by kc:getNativeWordBoxes(), or nothing
        when no reflow context is cached for this page
--]]
function KoptInterface:getReflowedTextBoxesFromScratch(doc, pageno)
    local bbox = doc:getPageBBox(pageno)
    local context_hash = self:getContextHash(doc, pageno, bbox)
    local hash = "scratchrfpgboxes|"..context_hash

    -- fast path: boxes for this context were already computed
    local cached_boxes = Cache:check(hash)
    if cached_boxes then
        return cached_boxes.scratchrfpgboxes
    end

    -- boxes can only be derived from a reflow context that is in the cache
    local cached_context = Cache:check("kctx|"..context_hash)
    if not cached_context then
        return
    end

    local reflowed_kc = self:waitForContext(cached_context.kctx)
    local fullwidth, fullheight = reflowed_kc:getPageDim()

    -- work on a scratch context so the cached one is left alone
    -- NOTE(review): presumably copyDestBMP copies the reflowed bitmap
    -- into kc -- confirm against the context implementation
    local kc = self:createContext(doc, pageno)
    kc:copyDestBMP(reflowed_kc)
    local boxes = kc:getNativeWordBoxes(0, 0, fullwidth, fullheight)
    Cache:insert(hash, CacheItem:new{ scratchrfpgboxes = boxes })
    kc:free()
    return boxes
end
|
|
|
|
|
|
|
|
|
|
|
|
--[[
|
|
|
|
--[[
|
|
|
|
get text boxes in native page via optical method,
|
|
|
|
get text boxes in native page via optical method,
|
|
|
|
i.e. OCR pre-processing in Tesseract and Leptonica.
|
|
|
|
i.e. OCR pre-processing in Tesseract and Leptonica.
|
|
|
|
--]]
|
|
|
|
--]]
|
|
|
|
function KoptInterface:getTextBoxesFromScratch(doc, pageno)
|
|
|
|
function KoptInterface:getNativeTextBoxesFromScratch(doc, pageno)
|
|
|
|
local hash = "pgboxes|"..doc.file.."|"..pageno
|
|
|
|
local hash = "scratchnativepgboxes|"..doc.file.."|"..pageno
|
|
|
|
local cached = Cache:check(hash)
|
|
|
|
local cached = Cache:check(hash)
|
|
|
|
if not cached then
|
|
|
|
if not cached then
|
|
|
|
local page_size = Document.getNativePageDimensions(doc, pageno)
|
|
|
|
local page_size = Document.getNativePageDimensions(doc, pageno)
|
|
|
@ -412,12 +438,12 @@ function KoptInterface:getTextBoxesFromScratch(doc, pageno)
|
|
|
|
local page = doc._document:openPage(pageno)
|
|
|
|
local page = doc._document:openPage(pageno)
|
|
|
|
page:getPagePix(kc)
|
|
|
|
page:getPagePix(kc)
|
|
|
|
local boxes = kc:getNativeWordBoxes(0, 0, page_size.w, page_size.h)
|
|
|
|
local boxes = kc:getNativeWordBoxes(0, 0, page_size.w, page_size.h)
|
|
|
|
Cache:insert(hash, CacheItem:new{ pgboxes = boxes })
|
|
|
|
Cache:insert(hash, CacheItem:new{ scratchnativepgboxes = boxes })
|
|
|
|
page:close()
|
|
|
|
page:close()
|
|
|
|
kc:free()
|
|
|
|
kc:free()
|
|
|
|
return boxes
|
|
|
|
return boxes
|
|
|
|
else
|
|
|
|
else
|
|
|
|
return cached.pgboxes
|
|
|
|
return cached.scratchnativepgboxes
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
@ -643,7 +669,7 @@ get word and word box from position in reflowed page
|
|
|
|
]]--
|
|
|
|
]]--
|
|
|
|
function KoptInterface:getWordFromReflowPosition(doc, boxes, pos)
|
|
|
|
function KoptInterface:getWordFromReflowPosition(doc, boxes, pos)
|
|
|
|
local pageno = pos.page
|
|
|
|
local pageno = pos.page
|
|
|
|
local reflowed_page_boxes = self:getReflowedTextBoxes(doc, pageno)
|
|
|
|
local reflowed_page_boxes = self:getReflowedTextBoxesFromScratch(doc, pageno)
|
|
|
|
local reflowed_word_box = self:getWordFromBoxes(reflowed_page_boxes, pos)
|
|
|
|
local reflowed_word_box = self:getWordFromBoxes(reflowed_page_boxes, pos)
|
|
|
|
local reflowed_pos = reflowed_word_box.box:center()
|
|
|
|
local reflowed_pos = reflowed_word_box.box:center()
|
|
|
|
local native_pos = self:reflowToNativePosTransform(doc, pageno, reflowed_pos)
|
|
|
|
local native_pos = self:reflowToNativePosTransform(doc, pageno, reflowed_pos)
|
|
|
|