mirror of
https://github.com/koreader/koreader
synced 2024-11-10 01:10:34 +00:00
Merge pull request #2381 from Hzj-jie/master3
PR #2356 breaks CJK character splitting
This commit is contained in:
commit
2859d8ee07
@ -117,6 +117,16 @@ function util.splitToChars(text)
|
|||||||
return tab
|
return tab
|
||||||
end
|
end
|
||||||
|
|
||||||
|
--- Test whether c is a single CJK character.
-- "CJK" is approximated as one 3-byte UTF-8 sequence whose lead byte is in
-- 0xE4-0xEA (decimal 228-234), i.e. a code point in U+4000..U+AFFF.
-- @string c string to test, expected to hold one UTF-8 character
-- @treturn boolean true if c is exactly one such 3-byte sequence
function util.isCJKChar(c)
    -- Anchored on both ends so c must be exactly three bytes, and both
    -- trailing bytes must be UTF-8 continuation bytes (0x80-0xBF). A bare "."
    -- in the last position would also accept malformed input like "\228\184A".
    return string.match(c, "^[\228-\234][\128-\191][\128-\191]$") ~= nil
end
|
||||||
|
|
||||||
|
--- Test whether str contains at least one CJK character.
-- "CJK" is approximated as a 3-byte UTF-8 sequence whose lead byte is in
-- 0xE4-0xEA (decimal 228-234), i.e. a code point in U+4000..U+AFFF.
-- @string str text to scan
-- @treturn boolean true if such a sequence occurs anywhere in str
function util.hasCJKChar(str)
    -- Require two real continuation bytes (0x80-0xBF) after the lead byte;
    -- matching "." for the last byte would let a truncated sequence followed
    -- by any other byte (e.g. a space) count as a CJK character.
    return string.find(str, "[\228-\234][\128-\191][\128-\191]") ~= nil
end
|
||||||
|
|
||||||
--- Split text into a list of words, spaces and punctuation marks.
|
--- Split text into a list of words, spaces and punctuation marks.
|
||||||
---- @string text text to split
|
---- @string text text to split
|
||||||
---- @treturn table list of words, spaces and punctuation marks
|
---- @treturn table list of words, spaces and punctuation marks
|
||||||
@ -124,7 +134,7 @@ function util.splitToWords(text)
|
|||||||
local wlist = {}
|
local wlist = {}
|
||||||
for word in util.gsplit(text, "[%s%p]+", true) do
|
for word in util.gsplit(text, "[%s%p]+", true) do
|
||||||
-- if space splitted word contains CJK characters
|
-- if space splitted word contains CJK characters
|
||||||
if word:match("[\228-\234][\128-\191]+") then
|
if util.hasCJKChar(word) then
|
||||||
-- split with CJK characters
|
-- split with CJK characters
|
||||||
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
|
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
|
||||||
table.insert(wlist, char)
|
table.insert(wlist, char)
|
||||||
@ -138,7 +148,7 @@ end
|
|||||||
|
|
||||||
-- Test whether character c is a point at which a string may be split for multi-line rendering
|
-- Test whether character c is a point at which a string may be split for multi-line rendering
|
||||||
function util.isSplitable(c)
|
function util.isSplitable(c)
|
||||||
return c == " " or string.match(c, "%p") ~= nil
|
return util.isCJKChar(c) or c == " " or string.match(c, "%p") ~= nil
|
||||||
end
|
end
|
||||||
|
|
||||||
return util
|
return util
|
||||||
|
@ -106,4 +106,24 @@ describe("util module", function()
|
|||||||
})
|
})
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it("should split text to line - CJK", function()
    -- Break the sentence into characters, then regroup them into segments,
    -- closing a segment after every character that util.isSplitable accepts.
    local text = "彩虹是通过太阳光的折射引起的。"
    local chars = util.splitToChars(text)
    local segments = {}
    local pending = ""
    for idx = 1, #chars do
        local ch = chars[idx]
        pending = pending .. ch
        if util.isSplitable(ch) then
            segments[#segments + 1] = pending
            pending = ""
        end
    end
    -- Whatever is left after the final character is always emitted (even if
    -- it is empty), provided there was at least one character to process.
    if #chars > 0 then
        segments[#segments + 1] = pending
    end
    assert.are_same(segments, {
        "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
    })
end)
|
||||||
|
|
||||||
end)
|
end)
|
||||||
|
Loading…
Reference in New Issue
Block a user