Other minor frontend.util cleanups (#5629)

* Resync fixUtf8 w/ upstream
* Fix lastIndexOf desc
* Drop unichar usage, it's a crappier unicodeCodepointToUtf8 ;).
reviewable/pr5636/r1
NiLuJe 5 years ago committed by GitHub
parent 4740ab1fdc
commit d8e0b1759b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -9,7 +9,8 @@ local Screen = require("device").screen
local UIManager = require("ui/uimanager") local UIManager = require("ui/uimanager")
local lfs = require("libs/libkoreader-lfs") local lfs = require("libs/libkoreader-lfs")
local logger = require("logger") local logger = require("logger")
local util = require("ffi/util") local FFIUtil = require("ffi/util")
local util = require("util")
local _ = require("gettext") local _ = require("gettext")
local T = require("ffi/util").template local T = require("ffi/util").template
@ -255,7 +256,7 @@ function Search:find(option)
s=string.sub(s, n, string.len(s)-j) s=string.sub(s, n, string.len(s)-j)
end end
s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unichar(tonumber(w, 16)) end) s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unicodeCodepointToUtf8(tonumber(w, 16)) end)
return s return s
end end
@ -606,7 +607,7 @@ function Search:browse(option, run, chosen)
if run == 1 then if run == 1 then
self.results = {} self.results = {}
if option == "series" then if option == "series" then
for v,n in util.orderedPairs(self.browse_series) do for v,n in FFIUtil.orderedPairs(self.browse_series) do
dummy = v dummy = v
if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end
if string.find(dummy, upsearch, nil, true) then if string.find(dummy, upsearch, nil, true) then
@ -619,7 +620,7 @@ function Search:browse(option, run, chosen)
end end
end end
else else
for v,n in util.orderedPairs(self.browse_tags) do for v,n in FFIUtil.orderedPairs(self.browse_tags) do
dummy = v dummy = v
if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end
if string.find(dummy, upsearch, nil, true) then if string.find(dummy, upsearch, nil, true) then

@ -567,7 +567,7 @@ function ReaderDictionary:cleanSelection(text)
-- with plain ascii quote (for french words like "aujourd’hui") -- with plain ascii quote (for french words like "aujourd’hui")
text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark) text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark)
-- Strip punctuation characters around selection -- Strip punctuation characters around selection
text = util.stripePunctuations(text) text = util.stripPunctuation(text)
-- Strip some common english grammatical construct -- Strip some common english grammatical construct
text = text:gsub("'s$", '') -- english possessive text = text:gsub("'s$", '') -- english possessive
-- Strip some common french grammatical constructs -- Strip some common french grammatical constructs

@ -1116,7 +1116,7 @@ function ReaderHighlight:onHighlightSearch()
logger.dbg("search highlight") logger.dbg("search highlight")
self:highlightFromHoldPos() self:highlightFromHoldPos()
if self.selected_text then if self.selected_text then
local text = require("util").stripePunctuations(self.selected_text.text) local text = require("util").stripPunctuation(self.selected_text.text)
self.ui:handleEvent(Event:new("ShowSearchDialog", text)) self.ui:handleEvent(Event:new("ShowSearchDialog", text))
end end
end end

@ -22,6 +22,7 @@
--]] --]]
local BaseUtil = require("ffi/util") local BaseUtil = require("ffi/util")
local util = require("util")
local logger = require("logger") local logger = require("logger")
-- Hangul Syllables -- Hangul Syllables
@ -82,8 +83,8 @@ end
function HgSylbls:get_combined_char(initial, medial, final) function HgSylbls:get_combined_char(initial, medial, final)
-- utf8.char() -- utf8.char() (i.e., encode)
return BaseUtil.unichar(HgSylbls:_get_combined_charcode(initial, medial, final)) return util.unicodeCodepointToUtf8(HgSylbls:_get_combined_charcode(initial, medial, final))
end end
function HgSylbls:_get_combined_charcode(initial, medial, final) function HgSylbls:_get_combined_charcode(initial, medial, final)
local len_medial = #HgSylbls.CHARS_MEDIAL local len_medial = #HgSylbls.CHARS_MEDIAL
@ -145,7 +146,7 @@ function HgSylbls:in_vowel_char(char)
HgSylbls.UNI_HG_COMPAT_VOWEL_BASE, HgSylbls.UNI_HG_COMPAT_VOWEL_UPPER) HgSylbls.UNI_HG_COMPAT_VOWEL_BASE, HgSylbls.UNI_HG_COMPAT_VOWEL_UPPER)
end end
function HgSylbls:_in_target_char_group(char, base, upper, compat_base, compat_upper) function HgSylbls:_in_target_char_group(char, base, upper, compat_base, compat_upper)
local code = BaseUtil.utf8charcode(char) -- utf8.codepoint() local code = BaseUtil.utf8charcode(char) -- utf8.codepoint() (i.e., decode)
if code == nil then if code == nil then
return false return false

@ -3,7 +3,7 @@
https://github.com/Wiladams/LAPHLibs https://github.com/Wiladams/LAPHLibs
--]] --]]
local util = require("ffi/util") local util = require("util")
local luxl = require("luxl") local luxl = require("luxl")
local ffi = require("ffi") local ffi = require("ffi")
@ -23,7 +23,7 @@ local function unescape(str)
if unescape_map[s] then if unescape_map[s] then
return unescape_map[s] return unescape_map[s]
elseif n == "#" then -- unescape unicode elseif n == "#" then -- unescape unicode
return util.unichar(tonumber(s)) return util.unicodeCodepointToUtf8(tonumber(s))
else else
return orig return orig
end end

@ -7,19 +7,20 @@ local dbg = require("dbg")
local _ = require("gettext") local _ = require("gettext")
local T = BaseUtil.template local T = BaseUtil.template
local lshift = bit.lshift
local rshift = bit.rshift local rshift = bit.rshift
local band = bit.band local band = bit.band
local bor = bit.bor local bor = bit.bor
local util = {} local util = {}
--- Strips all punctuation and spaces from a string. --- Strips all punctuation marks and spaces from a string.
---- @string text the string to be stripped ---- @string text the string to be stripped
---- @treturn string stripped text ---- @treturn string stripped text
function util.stripePunctuations(text) function util.stripPunctuation(text)
if not text then return end if not text then return end
-- strip ASCII punctuation characters around text -- strip ASCII punctuation marks around text
-- and strip any generic punctuation (U+2000 - U+206F) in the text -- and strip any generic punctuation marks (U+2000 - U+206F) in the text
return text:gsub("\226[\128-\131][\128-\191]", ''):gsub("^%p+", ''):gsub("%p+$", '') return text:gsub("\226[\128-\131][\128-\191]", ''):gsub("^%p+", ''):gsub("%p+$", '')
end end
@ -286,7 +287,7 @@ function util.tableMerge(t1, t2)
end end
--[[-- --[[--
Gets last index of string in character Gets last index of character in string (i.e., strrchr)
Returns the index within this string of the last occurrence of the specified character Returns the index within this string of the last occurrence of the specified character
or -1 if the character does not occur. or -1 if the character does not occur.
@ -348,7 +349,7 @@ function util.splitToChars(text)
hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
-- low surrogate following a high surrogate, good, let's make them a single char -- low surrogate following a high surrogate, good, let's make them a single char
charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000 charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000
table.insert(tab, util.unicodeCodepointToUtf8(charcode)) table.insert(tab, util.unicodeCodepointToUtf8(charcode))
hi_surrogate = nil hi_surrogate = nil
else else
@ -379,13 +380,13 @@ function util.hasCJKChar(str)
return string.match(str, "[\228-\234][\128-\191].") ~= nil return string.match(str, "[\228-\234][\128-\191].") ~= nil
end end
--- Split texts into a list of words, spaces and punctuation. --- Split texts into a list of words, spaces and punctuation marks.
---- @string text text to split ---- @string text text to split
---- @treturn table list of words, spaces and punctuation ---- @treturn table list of words, spaces and punctuation marks
function util.splitToWords(text) function util.splitToWords(text)
local wlist = {} local wlist = {}
for word in util.gsplit(text, "[%s%p]+", true) do for word in util.gsplit(text, "[%s%p]+", true) do
-- if space splitted word contains CJK characters -- if space split word contains CJK characters
if util.hasCJKChar(word) then if util.hasCJKChar(word) then
-- split with CJK characters -- split with CJK characters
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
@ -399,11 +400,11 @@ function util.splitToWords(text)
end end
-- We don't want to split on a space if it is followed by some -- We don't want to split on a space if it is followed by some
-- specific punctuation : e.g. "word :" or "word )" -- specific punctuation marks : e.g. "word :" or "word )"
-- (In french, there is a space before a colon, and it better -- (In French, there is a non-breaking space before a colon, and it better
-- not be wrapped there.) -- not be wrapped there.)
local non_splittable_space_tailers = ":;,.!?)]}$%=-+*/|<>»”" local non_splittable_space_tailers = ":;,.!?)]}$%=-+*/|<>»”"
-- Same if a space has some specific other punctuation before it -- Same if a space has some specific other punctuation mark before it
local non_splittable_space_leaders = "([{$=-+*/|<>«“" local non_splittable_space_leaders = "([{$=-+*/|<>«“"
@ -460,20 +461,20 @@ function util.isSplittable(c, next_c, prev_c)
return true return true
end end
elseif c == " " then elseif c == " " then
-- we only split on a space (so punctuation sticks to prev word) -- we only split on a space (so a punctuation mark sticks to prev word)
-- if next_c or prev_c is provided, we can make a better decision -- if next_c or prev_c is provided, we can make a better decision
if next_c and non_splittable_space_tailers:find(next_c, 1, true) then if next_c and non_splittable_space_tailers:find(next_c, 1, true) then
-- this space is followed by some punctuation that is better kept with us -- this space is followed by some punctuation mark that is better kept with us
return false return false
elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then
-- this space is lead by some punctuation that is better kept with us -- this space is lead by some punctuation mark that is better kept with us
return false return false
else else
-- we can split on this space -- we can split on this space
return true return true
end end
end end
-- otherwise, non splittable -- otherwise, not splittable
return false return false
end end
@ -570,7 +571,7 @@ local function replaceSlashChar(str)
end end
--[[-- --[[--
Replaces characters that are invalid filenames. Replaces characters that are invalid in filenames.
Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem. Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem.
@ -683,7 +684,7 @@ function util.getMenuText(item)
text = item.text text = item.text
end end
if item.sub_item_table ~= nil or item.sub_item_table_func then if item.sub_item_table ~= nil or item.sub_item_table_func then
text = text .. " \226\150\184" text = text .. " ▸"
end end
return text return text
end end
@ -692,6 +693,8 @@ end
Replaces invalid UTF-8 characters with a replacement string. Replaces invalid UTF-8 characters with a replacement string.
Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>. Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>.
c.f., FixUTF8 @ <https://github.com/pkulchenko/ZeroBraneStudio/blob/master/src/util.lua>.
@string str the string to be checked for invalid characters @string str the string to be checked for invalid characters
@string replacement the string to replace invalid characters with @string replacement the string to replace invalid characters with
@treturn string valid UTF-8 @treturn string valid UTF-8
@ -700,15 +703,15 @@ function util.fixUtf8(str, replacement)
local pos = 1 local pos = 1
local len = #str local len = #str
while pos <= len do while pos <= len do
if pos == str:find("[%z\1-\127]", pos) then pos = pos + 1 if str:find("^[%z\1-\127]", pos) then pos = pos + 1
elseif pos == str:find("[\194-\223][\128-\191]", pos) then pos = pos + 2 elseif str:find("^[\194-\223][\128-\191]", pos) then pos = pos + 2
elseif pos == str:find( "\224[\160-\191][\128-\191]", pos) elseif str:find( "^\224[\160-\191][\128-\191]", pos)
or pos == str:find("[\225-\236][\128-\191][\128-\191]", pos) or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
or pos == str:find( "\237[\128-\159][\128-\191]", pos) or str:find( "^\237[\128-\159][\128-\191]", pos)
or pos == str:find("[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3 or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
elseif pos == str:find( "\240[\144-\191][\128-\191][\128-\191]", pos) elseif str:find( "^\240[\144-\191][\128-\191][\128-\191]", pos)
or pos == str:find("[\241-\243][\128-\191][\128-\191][\128-\191]", pos) or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
or pos == str:find( "\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4 or str:find( "^\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4
else else
str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1) str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1)
pos = pos + #replacement pos = pos + #replacement
@ -735,6 +738,7 @@ end
--- Convert a Unicode codepoint (number) to UTF-8 char --- Convert a Unicode codepoint (number) to UTF-8 char
--- c.f., <https://stackoverflow.com/a/4609989> --- c.f., <https://stackoverflow.com/a/4609989>
--- & <https://stackoverflow.com/a/38492214> --- & <https://stackoverflow.com/a/38492214>
--- See utf8charcode in ffi/util for a decoder.
-- --
--- @int c Unicode codepoint --- @int c Unicode codepoint
--- @treturn string UTF-8 char --- @treturn string UTF-8 char
@ -779,12 +783,12 @@ local HTML_ENTITIES_TO_UTF8 = {
{"&amp;", "&"}, -- must be last {"&amp;", "&"}, -- must be last
} }
--[[-- --[[--
Replace HTML entities with their UTF8 equivalent in text. Replace HTML entities with their UTF-8 encoded equivalent in text.
Supports only basic ones and those with numbers (no support for named entities like `&eacute;`). Supports only basic ones and those with numbers (no support for named entities like `&eacute;`).
@int string text with HTML entities @int string text with HTML entities
@treturn string UTF8 text @treturn string UTF-8 text
]] ]]
function util.htmlEntitiesToUtf8(text) function util.htmlEntitiesToUtf8(text)
for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
@ -834,7 +838,7 @@ function util.htmlToPlainTextIfHtml(text)
is_html = true is_html = true
else else
-- no <tag> found -- no <tag> found
-- but we may meet some text badly twicely encoded html containing "&lt;br&gt;" -- but we may meet some text badly/twice encoded html containing "&lt;br&gt;"
local nb_encoded_tags local nb_encoded_tags
_, nb_encoded_tags = text:gsub("&lt;%a+&gt;", "") _, nb_encoded_tags = text:gsub("&lt;%a+&gt;", "")
if nb_encoded_tags > 0 then if nb_encoded_tags > 0 then

@ -6,14 +6,14 @@ describe("util module", function()
util = require("util") util = require("util")
end) end)
it("should strip punctuations around word", function() it("should strip punctuation marks around word", function()
assert.is_equal("hello world", util.stripePunctuations("\"hello world\"")) assert.is_equal("hello world", util.stripPunctuation("\"hello world\""))
assert.is_equal("hello world", util.stripePunctuations("\"hello world?\"")) assert.is_equal("hello world", util.stripPunctuation("\"hello world?\""))
assert.is_equal("hello, world", util.stripePunctuations("\"hello, world?\"")) assert.is_equal("hello, world", util.stripPunctuation("\"hello, world?\""))
assert.is_equal("你好", util.stripePunctuations("“你好“")) assert.is_equal("你好", util.stripPunctuation("“你好“"))
assert.is_equal("你好", util.stripePunctuations("“你好?“")) assert.is_equal("你好", util.stripPunctuation("“你好?“"))
assert.is_equal("", util.stripePunctuations("")) assert.is_equal("", util.stripPunctuation(""))
assert.is_nil(util.stripePunctuations(nil)) assert.is_nil(util.stripPunctuation(nil))
end) end)
describe("gsplit()", function() describe("gsplit()", function()

Loading…
Cancel
Save