2015-04-22 06:17:06 +00:00
|
|
|
--- Spec for the `util` module, written against the describe/it/setup
-- test API (busted-style). Covers punctuation stripping, pattern-based
-- splitting, and word/character segmentation of multilingual text.
--
-- Every assertion passes the expected value first and the observed value
-- second, so that failure reports label the two values correctly.
describe("util module", function()
    local util

    setup(function()
        -- commonrequire is loaded before the module under test; presumably it
        -- prepares the test environment (its contents are not visible here).
        require("commonrequire")
        util = require("util")
    end)

    it("should strip punctuations around word", function()
        -- Surrounding quotes and a trailing "?" are removed; the comma inside
        -- the phrase is kept.
        assert.is_equal("hello world", util.stripePunctuations("\"hello world\""))
        assert.is_equal("hello world", util.stripePunctuations("\"hello world?\""))
        assert.is_equal("hello, world", util.stripePunctuations("\"hello, world?\""))
        assert.is_equal("你好", util.stripePunctuations("“你好“"))
        assert.is_equal("你好", util.stripePunctuations("“你好?“"))
    end)

    it("should split string with patterns", function()
        local sentence = "Hello world, welcome to KOReader!"
        local words = {}
        -- Third argument false: the whitespace separators do not appear in
        -- the collected output.
        for word in util.gsplit(sentence, "%s+", false) do
            table.insert(words, word)
        end
        assert.are_same({"Hello", "world,", "welcome", "to", "KOReader!"}, words)
    end)

    it("should split command line arguments with quotation", function()
        local command = "./sdcv -nj \"words\" \"a lot\" 'more or less' --data-dir=dict"
        local argv = {}
        -- Three nested passes:
        --   1. isolate the quoted segments (third argument true; the quoted
        --      text must survive into the output, so the matched delimiter is
        --      evidently yielded too -- confirm against util.gsplit),
        --   2. break the unquoted remainder on whitespace,
        --   3. drop the quote characters themselves.
        for arg1 in util.gsplit(command, "[\"'].-[\"']", true) do
            for arg2 in util.gsplit(arg1, "^[^\"'].-%s+", true) do
                for arg3 in util.gsplit(arg2, "[\"']", false) do
                    -- gsub returns (string, count); only the string is kept.
                    local trimmed = arg3:gsub("^%s*(.-)%s*$", "%1")
                    if trimmed ~= "" then
                        table.insert(argv, trimmed)
                    end
                end
            end
        end
        assert.are_same(
            {"./sdcv", "-nj", "words", "a lot", "more or less", "--data-dir=dict"},
            argv
        )
    end)

    it("should split line into words", function()
        local words = util.splitToWords("one two,three four . five")
        -- Separators are returned as their own entries; adjacent separator
        -- characters (" . ") stay together in one entry.
        assert.are_same({
            "one",
            " ",
            "two",
            ",",
            "three",
            " ",
            "four",
            " . ",
            "five",
        }, words)
    end)

    it("should split ancient greek words", function()
        local words = util.splitToWords("Λαρισαῖος Λευκοθέα Λιγυαστάδης.")
        assert.are_same({
            "Λαρισαῖος",
            " ",
            "Λευκοθέα",
            " ",
            "Λιγυαστάδης",
            ".",
        }, words)
    end)

    it("should split Chinese words", function()
        local words = util.splitToWords("彩虹是通过太阳光的折射引起的。")
        -- Each CJK character (and the closing full stop) is its own entry.
        assert.are_same({
            "彩", "虹", "是", "通", "过", "太", "阳", "光", "的", "折", "射", "引", "起", "的", "。",
        }, words)
    end)

    it("should split words of multilingual text", function()
        local words = util.splitToWords("BBC纪录片")
        assert.are_same({"BBC", "纪", "录", "片"}, words)
    end)

    it("should split text to line - unicode", function()
        local text = "Pójdźże, chmurność glück schließen Štěstí neštěstí. Uñas gavilán"
        local word = ""
        local table_of_words = {}
        local table_chars = util.splitToChars(text)
        -- Accumulate characters into a chunk and close the chunk right after
        -- a splittable character, so each chunk ends with the character that
        -- terminated it.
        for i = 1, #table_chars do
            local c = table_chars[i]
            word = word .. c
            if util.isSplitable(c) then
                table.insert(table_of_words, word)
                word = ""
            end
        end
        -- Flush whatever is left once the last character has been consumed.
        table.insert(table_of_words, word)
        assert.are_same({
            "Pójdźże,",
            " ",
            "chmurność ",
            "glück ",
            "schließen ",
            "Štěstí ",
            "neštěstí.",
            " ",
            "Uñas ",
            "gavilán",
        }, table_of_words)
    end)
end)
|