kopt: correctly handle CJK character detection for space insertion (#8438)

Previously getTextFromBoxes would just pass the first and last three
bytes of the current and previous words when trying to detect CJK
characters (which shouldn't have spaces inserted).

However, this handling was not correct because CJK characters can be
longer than 3 bytes, and internally BaseUtil.utf8charcode doesn't ensure
that it was only given a single utf8 character (it blindly does the bit
operations on whatever length code you give it).

As a result, before this patch selections in PDF documents would have
lots of spaces stripped because getTextFromBoxes would think that almost
all characters were CJK characters.

Fixes: 6f1b70e5eb ("util.utf8: improve CJK character detection")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
pull/8441/head^2
Aleksa Sarai 2 years ago committed by GitHub
parent 15778ac939
commit 5709b4c2f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -9,12 +9,13 @@ local DEBUG = require("dbg")
local Device = require("device")
local DocCache = require("document/doccache")
local Document = require("document/document")
local FFIUtil = require("ffi/util")
local Geom = require("ui/geometry")
local KOPTContext = require("ffi/koptcontext")
local Persist = require("persist")
local TileCacheItem = require("document/tilecacheitem")
local logger = require("logger")
local util = require("ffi/util")
local util = require("util")
local KoptInterface = {
ocrengine = "ocrengine",
@ -105,7 +106,7 @@ function KoptInterface:waitForContext(kc)
while kc and kc:isPreCache() == 1 do
waited = true
logger.dbg("waiting for background rendering")
util.usleep(100000)
FFIUtil.usleep(100000)
end
if waited or self.bg_thread then
@ -274,10 +275,10 @@ function KoptInterface:getCachedContext(doc, pageno)
local page = doc._document:openPage(pageno)
logger.dbg("reflowing page", pageno, "in foreground")
-- reflow page
--local secs, usecs = util.gettime()
--local secs, usecs = FFIUtil.gettime()
page:reflow(kc, doc.render_mode or DRENDER_MODE) -- Fall backs to a default set to DDJVU_RENDER_COLOR
page:close()
--local nsecs, nusecs = util.gettime()
--local nsecs, nusecs = FFIUtil.gettime()
--local dur = nsecs - secs + (nusecs - usecs) / 1000000
--self:logReflowDuration(pageno, dur)
local fullwidth, fullheight = kc:getPageDim()
@ -840,7 +841,7 @@ function KoptInterface:clipPagePNGString(doc, pos0, pos1, pboxes, drawer)
-- there is no fmemopen in Android so leptonica.pixWriteMemPng will
-- fail silently, workaround is creating a PNG file and read back the string
local png = nil
if util.isAndroid() then
if FFIUtil.isAndroid() then
local tmp = "cache/tmpclippng.png"
kc:exportSrcPNGFile(pboxes, drawer, tmp)
local pngfile = io.open(tmp, "rb")
@ -914,7 +915,6 @@ Get text and text boxes between `pos0` and `pos1`.
--]]
function KoptInterface:getTextFromBoxes(boxes, pos0, pos1)
if not pos0 or not pos1 or #boxes == 0 then return {} end
local isCJKChar = require("util").isCJKChar
local line_text = ""
local line_boxes = {}
local i_start, j_start = getWordBoxIndices(boxes, pos0)
@ -968,10 +968,11 @@ function KoptInterface:getTextFromBoxes(boxes, pos0, pos1)
-- should be stuck
add_space = false
elseif dist_from_prev_word < box_height * 0.8 then
if isCJKChar(prev_word:sub(-3, -1)) and isCJKChar(word:sub(1, 3)) then
-- Two CJK chars whose spacing is not large enough
-- (we checked the 3 UTF8 bytes that CJK chars must be,
-- no need to split into unicode codepoints)
local prev_word_end = prev_word:match(util.UTF8_CHAR_PATTERN.."$")
local word_start = word:match(util.UTF8_CHAR_PATTERN)
if util.isCJKChar(prev_word_end) and util.isCJKChar(word_start) then
-- Two CJK chars whose spacing is not large enough,
-- but even so they must not have a space added.
add_space = false
end
end

Loading…
Cancel
Save