2
0
mirror of https://github.com/koreader/koreader synced 2024-11-10 01:10:34 +00:00

Merge pull request #2381 from Hzj-jie/master3

PR #2356 breaks CJK character splitting
This commit is contained in:
Huang Xin 2016-11-26 10:22:04 +08:00 committed by GitHub
commit 2859d8ee07
2 changed files with 32 additions and 2 deletions

View File

@ -117,6 +117,16 @@ function util.splitToChars(text)
return tab return tab
end end
--- Test whether c is a single CJK character.
-- The check is byte-based: c must be exactly one three-byte sequence whose
-- first byte lies in \228-\234 and whose second byte lies in \128-\191
-- (the third byte is not constrained by the pattern).
-- @string c character to test
-- @treturn boolean true when the whole of c is matched by the pattern
function util.isCJKChar(c)
    local matched = string.match(c, "[\228-\234][\128-\191].")
    -- Comparing against c rejects inputs that merely contain such a sequence.
    return matched == c
end
--- Test whether str contains at least one CJK character.
-- Uses a byte-based pattern: a byte in \228-\234, followed by a byte in
-- \128-\191, followed by any one byte, anywhere in str.
-- @string str text to inspect
-- @treturn boolean true when such a sequence occurs in str
function util.hasCJKChar(str)
    local found = string.match(str, "[\228-\234][\128-\191].")
    return found ~= nil
end
--- Split text into a list of words, spaces and punctuations. --- Split text into a list of words, spaces and punctuations.
---- @string text text to split ---- @string text text to split
---- @treturn table list of words, spaces and punctuations ---- @treturn table list of words, spaces and punctuations
@ -124,7 +134,7 @@ function util.splitToWords(text)
local wlist = {} local wlist = {}
for word in util.gsplit(text, "[%s%p]+", true) do for word in util.gsplit(text, "[%s%p]+", true) do
-- if space splitted word contains CJK characters -- if space splitted word contains CJK characters
if word:match("[\228-\234][\128-\191]+") then if util.hasCJKChar(word) then
-- split with CJK characters -- split with CJK characters
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
table.insert(wlist, char) table.insert(wlist, char)
@ -138,7 +148,7 @@ end
-- Test whether a string could be separated by a char for multi-line rendering -- Test whether a string could be separated by a char for multi-line rendering
function util.isSplitable(c) function util.isSplitable(c)
return c == " " or string.match(c, "%p") ~= nil return util.isCJKChar(c) or c == " " or string.match(c, "%p") ~= nil
end end
return util return util

View File

@ -106,4 +106,24 @@ describe("util module", function()
}) })
end) end)
it("should split text to line - CJK", function()
    -- Each CJK character is splitable on its own, so accumulating characters
    -- until a splitable one is seen must yield one "word" per character.
    local text = "彩虹是通过太阳光的折射引起的。"
    local word = ""
    local table_of_words = {}
    local c
    local table_chars = util.splitToChars(text)
    for i = 1, #table_chars do
        c = table_chars[i]
        word = word .. c
        if util.isSplitable(c) then
            table.insert(table_of_words, word)
            word = ""
        end
        -- Flush whatever is still pending once the input is exhausted.
        if i == #table_chars then table.insert(table_of_words, word) end
    end
    -- One entry per character of `text`. The trailing "。" is expected to
    -- come from the end-of-input flush above: its lead byte (\227) is
    -- outside the range accepted by util.isCJKChar.
    assert.are_same(table_of_words, {
        "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
    })
end)
end) end)