Other minor frontend.util cleanups (#5629)

* Resync fixUtf8 w/ upstream
* Fix lastIndexOf desc
* Drop unichar usage, it's a crappier unicodeCodepointToUtf8 ;).
pull/5636/head
NiLuJe 5 years ago committed by GitHub
parent 4740ab1fdc
commit d8e0b1759b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -9,7 +9,8 @@ local Screen = require("device").screen
local UIManager = require("ui/uimanager")
local lfs = require("libs/libkoreader-lfs")
local logger = require("logger")
local util = require("ffi/util")
local FFIUtil = require("ffi/util")
local util = require("util")
local _ = require("gettext")
local T = require("ffi/util").template
@ -255,7 +256,7 @@ function Search:find(option)
s=string.sub(s, n, string.len(s)-j)
end
s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unichar(tonumber(w, 16)) end)
s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unicodeCodepointToUtf8(tonumber(w, 16)) end)
return s
end
@ -606,7 +607,7 @@ function Search:browse(option, run, chosen)
if run == 1 then
self.results = {}
if option == "series" then
for v,n in util.orderedPairs(self.browse_series) do
for v,n in FFIUtil.orderedPairs(self.browse_series) do
dummy = v
if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end
if string.find(dummy, upsearch, nil, true) then
@ -619,7 +620,7 @@ function Search:browse(option, run, chosen)
end
end
else
for v,n in util.orderedPairs(self.browse_tags) do
for v,n in FFIUtil.orderedPairs(self.browse_tags) do
dummy = v
if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end
if string.find(dummy, upsearch, nil, true) then

@ -567,7 +567,7 @@ function ReaderDictionary:cleanSelection(text)
-- with plain ascii quote (for french words like "aujourdhui")
text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark)
-- Strip punctuation characters around selection
text = util.stripePunctuations(text)
text = util.stripPunctuation(text)
-- Strip some common english grammatical construct
text = text:gsub("'s$", '') -- english possessive
-- Strip some common french grammatical constructs

@ -1116,7 +1116,7 @@ function ReaderHighlight:onHighlightSearch()
logger.dbg("search highlight")
self:highlightFromHoldPos()
if self.selected_text then
local text = require("util").stripePunctuations(self.selected_text.text)
local text = require("util").stripPunctuation(self.selected_text.text)
self.ui:handleEvent(Event:new("ShowSearchDialog", text))
end
end

@ -22,6 +22,7 @@
--]]
local BaseUtil = require("ffi/util")
local util = require("util")
local logger = require("logger")
-- Hangul Syllables
@ -82,8 +83,8 @@ end
function HgSylbls:get_combined_char(initial, medial, final)
-- utf8.char()
return BaseUtil.unichar(HgSylbls:_get_combined_charcode(initial, medial, final))
-- utf8.char() (i.e., encode)
return util.unicodeCodepointToUtf8(HgSylbls:_get_combined_charcode(initial, medial, final))
end
function HgSylbls:_get_combined_charcode(initial, medial, final)
local len_medial = #HgSylbls.CHARS_MEDIAL
@ -145,7 +146,7 @@ function HgSylbls:in_vowel_char(char)
HgSylbls.UNI_HG_COMPAT_VOWEL_BASE, HgSylbls.UNI_HG_COMPAT_VOWEL_UPPER)
end
function HgSylbls:_in_target_char_group(char, base, upper, compat_base, compat_upper)
local code = BaseUtil.utf8charcode(char) -- utf8.codepoint()
local code = BaseUtil.utf8charcode(char) -- utf8.codepoint() (i.e., decode)
if code == nil then
return false

@ -3,7 +3,7 @@
https://github.com/Wiladams/LAPHLibs
--]]
local util = require("ffi/util")
local util = require("util")
local luxl = require("luxl")
local ffi = require("ffi")
@ -23,7 +23,7 @@ local function unescape(str)
if unescape_map[s] then
return unescape_map[s]
elseif n == "#" then -- unescape unicode
return util.unichar(tonumber(s))
return util.unicodeCodepointToUtf8(tonumber(s))
else
return orig
end

@ -7,19 +7,20 @@ local dbg = require("dbg")
local _ = require("gettext")
local T = BaseUtil.template
local lshift = bit.lshift
local rshift = bit.rshift
local band = bit.band
local bor = bit.bor
local util = {}
--- Strips all punctuation and spaces from a string.
--- Strips all punctuation marks and spaces from a string.
---- @string text the string to be stripped
---- @treturn string stripped text
function util.stripePunctuations(text)
function util.stripPunctuation(text)
if not text then return end
-- strip ASCII punctuation characters around text
-- and strip any generic punctuation (U+2000 - U+206F) in the text
-- strip ASCII punctuation marks around text
-- and strip any generic punctuation marks (U+2000 - U+206F) in the text
return text:gsub("\226[\128-\131][\128-\191]", ''):gsub("^%p+", ''):gsub("%p+$", '')
end
@ -286,7 +287,7 @@ function util.tableMerge(t1, t2)
end
--[[--
Gets last index of string in character
Gets last index of character in string (i.e., strrchr)
Returns the index within this string of the last occurrence of the specified character
or -1 if the character does not occur.
@ -348,7 +349,7 @@ function util.splitToChars(text)
hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
-- low surrogate following a high surrogate, good, let's make them a single char
charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000
charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000
table.insert(tab, util.unicodeCodepointToUtf8(charcode))
hi_surrogate = nil
else
@ -379,13 +380,13 @@ function util.hasCJKChar(str)
return string.match(str, "[\228-\234][\128-\191].") ~= nil
end
--- Split texts into a list of words, spaces and punctuation.
--- Split texts into a list of words, spaces and punctuation marks.
---- @string text text to split
---- @treturn table list of words, spaces and punctuation
---- @treturn table list of words, spaces and punctuation marks
function util.splitToWords(text)
local wlist = {}
for word in util.gsplit(text, "[%s%p]+", true) do
-- if space splitted word contains CJK characters
-- if space split word contains CJK characters
if util.hasCJKChar(word) then
-- split with CJK characters
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
@ -399,11 +400,11 @@ function util.splitToWords(text)
end
-- We don't want to split on a space if it is followed by some
-- specific punctuation : e.g. "word :" or "word )"
-- (In french, there is a space before a colon, and it better
-- specific punctuation marks : e.g. "word :" or "word )"
-- (In French, there is a non-breaking space before a colon, and it had better
-- not be wrapped there.)
local non_splittable_space_tailers = ":;,.!?)]}$%=-+*/|<>»”"
-- Same if a space has some specific other punctuation before it
-- Same if a space has some specific other punctuation mark before it
local non_splittable_space_leaders = "([{$=-+*/|<>«“"
@ -460,20 +461,20 @@ function util.isSplittable(c, next_c, prev_c)
return true
end
elseif c == " " then
-- we only split on a space (so punctuation sticks to prev word)
-- we only split on a space (so a punctuation mark sticks to prev word)
-- if next_c or prev_c is provided, we can make a better decision
if next_c and non_splittable_space_tailers:find(next_c, 1, true) then
-- this space is followed by some punctuation that is better kept with us
-- this space is followed by some punctuation mark that is better kept with us
return false
elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then
-- this space is lead by some punctuation that is better kept with us
-- this space is preceded by some punctuation mark that is better kept with us
return false
else
-- we can split on this space
return true
end
end
-- otherwise, non splittable
-- otherwise, not splittable
return false
end
@ -570,7 +571,7 @@ local function replaceSlashChar(str)
end
--[[--
Replaces characters that are invalid filenames.
Replaces characters that are invalid in filenames.
Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem.
@ -683,7 +684,7 @@ function util.getMenuText(item)
text = item.text
end
if item.sub_item_table ~= nil or item.sub_item_table_func then
text = text .. " \226\150\184"
text = text .. " "
end
return text
end
@ -692,6 +693,8 @@ end
Replaces invalid UTF-8 characters with a replacement string.
Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>.
c.f., FixUTF8 @ <https://github.com/pkulchenko/ZeroBraneStudio/blob/master/src/util.lua>.
@string str the string to be checked for invalid characters
@string replacement the string to replace invalid characters with
@treturn string valid UTF-8
@ -700,15 +703,15 @@ function util.fixUtf8(str, replacement)
local pos = 1
local len = #str
while pos <= len do
if pos == str:find("[%z\1-\127]", pos) then pos = pos + 1
elseif pos == str:find("[\194-\223][\128-\191]", pos) then pos = pos + 2
elseif pos == str:find( "\224[\160-\191][\128-\191]", pos)
or pos == str:find("[\225-\236][\128-\191][\128-\191]", pos)
or pos == str:find( "\237[\128-\159][\128-\191]", pos)
or pos == str:find("[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
elseif pos == str:find( "\240[\144-\191][\128-\191][\128-\191]", pos)
or pos == str:find("[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
or pos == str:find( "\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4
if str:find("^[%z\1-\127]", pos) then pos = pos + 1
elseif str:find("^[\194-\223][\128-\191]", pos) then pos = pos + 2
elseif str:find( "^\224[\160-\191][\128-\191]", pos)
or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
or str:find( "^\237[\128-\159][\128-\191]", pos)
or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
elseif str:find( "^\240[\144-\191][\128-\191][\128-\191]", pos)
or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
or str:find( "^\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4
else
str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1)
pos = pos + #replacement
@ -735,6 +738,7 @@ end
--- Convert a Unicode codepoint (number) to UTF-8 char
--- c.f., <https://stackoverflow.com/a/4609989>
--- & <https://stackoverflow.com/a/38492214>
--- See utf8charcode in ffi/util for a decoder.
--
--- @int c Unicode codepoint
--- @treturn string UTF-8 char
@ -779,12 +783,12 @@ local HTML_ENTITIES_TO_UTF8 = {
{"&amp;", "&"}, -- must be last
}
--[[--
Replace HTML entities with their UTF8 equivalent in text.
Replace HTML entities with their UTF-8 encoded equivalent in text.
Supports only basic ones and those with numbers (no support for named entities like `&eacute;`).
@string text text with HTML entities
@treturn string UTF8 text
@treturn string UTF-8 text
]]
function util.htmlEntitiesToUtf8(text)
for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
@ -834,7 +838,7 @@ function util.htmlToPlainTextIfHtml(text)
is_html = true
else
-- no <tag> found
-- but we may meet some text badly twicely encoded html containing "&lt;br&gt;"
-- but we may encounter text with badly encoded (or twice-encoded) HTML containing "&lt;br&gt;"
local nb_encoded_tags
_, nb_encoded_tags = text:gsub("&lt;%a+&gt;", "")
if nb_encoded_tags > 0 then

@ -6,14 +6,14 @@ describe("util module", function()
util = require("util")
end)
it("should strip punctuations around word", function()
assert.is_equal("hello world", util.stripePunctuations("\"hello world\""))
assert.is_equal("hello world", util.stripePunctuations("\"hello world?\""))
assert.is_equal("hello, world", util.stripePunctuations("\"hello, world?\""))
assert.is_equal("你好", util.stripePunctuations("“你好“"))
assert.is_equal("你好", util.stripePunctuations("“你好?“"))
assert.is_equal("", util.stripePunctuations(""))
assert.is_nil(util.stripePunctuations(nil))
it("should strip punctuation marks around word", function()
assert.is_equal("hello world", util.stripPunctuation("\"hello world\""))
assert.is_equal("hello world", util.stripPunctuation("\"hello world?\""))
assert.is_equal("hello, world", util.stripPunctuation("\"hello, world?\""))
assert.is_equal("你好", util.stripPunctuation("“你好“"))
assert.is_equal("你好", util.stripPunctuation("“你好?“"))
assert.is_equal("", util.stripPunctuation(""))
assert.is_nil(util.stripPunctuation(nil))
end)
describe("gsplit()", function()

Loading…
Cancel
Save