diff --git a/frontend/apps/filemanager/filemanagersearch.lua b/frontend/apps/filemanager/filemanagersearch.lua index 24ed19cd1..2e8ee28c1 100644 --- a/frontend/apps/filemanager/filemanagersearch.lua +++ b/frontend/apps/filemanager/filemanagersearch.lua @@ -9,7 +9,8 @@ local Screen = require("device").screen local UIManager = require("ui/uimanager") local lfs = require("libs/libkoreader-lfs") local logger = require("logger") -local util = require("ffi/util") +local FFIUtil = require("ffi/util") +local util = require("util") local _ = require("gettext") local T = require("ffi/util").template @@ -255,7 +256,7 @@ function Search:find(option) s=string.sub(s, n, string.len(s)-j) end - s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unichar(tonumber(w, 16)) end) + s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unicodeCodepointToUtf8(tonumber(w, 16)) end) return s end @@ -606,7 +607,7 @@ function Search:browse(option, run, chosen) if run == 1 then self.results = {} if option == "series" then - for v,n in util.orderedPairs(self.browse_series) do + for v,n in FFIUtil.orderedPairs(self.browse_series) do dummy = v if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end if string.find(dummy, upsearch, nil, true) then @@ -619,7 +620,7 @@ function Search:browse(option, run, chosen) end end else - for v,n in util.orderedPairs(self.browse_tags) do + for v,n in FFIUtil.orderedPairs(self.browse_tags) do dummy = v if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end if string.find(dummy, upsearch, nil, true) then diff --git a/frontend/apps/reader/modules/readerdictionary.lua b/frontend/apps/reader/modules/readerdictionary.lua index dc775837c..869ac3ea6 100644 --- a/frontend/apps/reader/modules/readerdictionary.lua +++ b/frontend/apps/reader/modules/readerdictionary.lua @@ -567,7 +567,7 @@ function ReaderDictionary:cleanSelection(text) -- with plain ascii quote (for french words like "aujourd’hui") text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark) -- Strip punctuation characters around selection - text = util.stripePunctuations(text) + text = util.stripPunctuation(text) -- Strip some common english grammatical construct text = text:gsub("'s$", '') -- english possessive -- Strip some common french grammatical constructs diff --git a/frontend/apps/reader/modules/readerhighlight.lua b/frontend/apps/reader/modules/readerhighlight.lua index ebd98d139..407ceb369 100644 --- a/frontend/apps/reader/modules/readerhighlight.lua +++ b/frontend/apps/reader/modules/readerhighlight.lua @@ -1116,7 +1116,7 @@ function ReaderHighlight:onHighlightSearch() logger.dbg("search highlight") self:highlightFromHoldPos() if self.selected_text then - local text = require("util").stripePunctuations(self.selected_text.text) + local text = require("util").stripPunctuation(self.selected_text.text) self.ui:handleEvent(Event:new("ShowSearchDialog", text)) end end diff --git a/frontend/ui/data/keyboardlayouts/ko_KR_helper.lua b/frontend/ui/data/keyboardlayouts/ko_KR_helper.lua index fa827e399..6a568ef0a 100644 --- a/frontend/ui/data/keyboardlayouts/ko_KR_helper.lua +++ b/frontend/ui/data/keyboardlayouts/ko_KR_helper.lua @@ -22,6 +22,7 @@ --]] local BaseUtil = require("ffi/util") +local util = require("util") local logger = require("logger") -- Hangul Syllables @@ -82,8 +83,8 @@ end function HgSylbls:get_combined_char(initial, medial, final) - -- utf8.char() - return BaseUtil.unichar(HgSylbls:_get_combined_charcode(initial, medial, final)) + -- utf8.char() (i.e., encode) + return util.unicodeCodepointToUtf8(HgSylbls:_get_combined_charcode(initial, medial, final)) end function HgSylbls:_get_combined_charcode(initial, medial, final) local len_medial = #HgSylbls.CHARS_MEDIAL @@ -145,7 +146,7 @@ function HgSylbls:in_vowel_char(char) HgSylbls.UNI_HG_COMPAT_VOWEL_BASE, HgSylbls.UNI_HG_COMPAT_VOWEL_UPPER) end function HgSylbls:_in_target_char_group(char, base, upper, compat_base, compat_upper) - local code = BaseUtil.utf8charcode(char) -- utf8.codepoint() + local code = BaseUtil.utf8charcode(char) -- utf8.codepoint() (i.e., decode) if code == nil then return false diff --git a/frontend/ui/opdsparser.lua b/frontend/ui/opdsparser.lua index e5f681bf2..f415c294a 100644 --- a/frontend/ui/opdsparser.lua +++ b/frontend/ui/opdsparser.lua @@ -3,7 +3,7 @@ https://github.com/Wiladams/LAPHLibs --]] -local util = require("ffi/util") +local util = require("util") local luxl = require("luxl") local ffi = require("ffi") @@ -23,7 +23,7 @@ local function unescape(str) if unescape_map[s] then return unescape_map[s] elseif n == "#" then -- unescape unicode - return util.unichar(tonumber(s)) + return util.unicodeCodepointToUtf8(tonumber(s)) else return orig end diff --git a/frontend/util.lua b/frontend/util.lua index 57f026079..ef39bb76a 100644 --- a/frontend/util.lua +++ b/frontend/util.lua @@ -7,19 +7,20 @@ local dbg = require("dbg") local _ = require("gettext") local T = BaseUtil.template +local lshift = bit.lshift local rshift = bit.rshift local band = bit.band local bor = bit.bor local util = {} ---- Strips all punctuation and spaces from a string. +--- Strips all punctuation marks and spaces from a string. ---- @string text the string to be stripped ---- @treturn string stripped text -function util.stripePunctuations(text) +function util.stripPunctuation(text) if not text then return end - -- strip ASCII punctuation characters around text - -- and strip any generic punctuation (U+2000 - U+206F) in the text + -- strip ASCII punctuation marks around text + -- and strip any generic punctuation marks (U+2000 - U+206F) in the text return text:gsub("\226[\128-\131][\128-\191]", ''):gsub("^%p+", ''):gsub("%p+$", '') end @@ -286,7 +287,7 @@ function util.tableMerge(t1, t2) end --[[-- -Gets last index of string in character +Gets last index of character in string (i.e., strrchr) Returns the index within this string of the last occurrence of the specified character or -1 if the character does not occur. @@ -348,7 +349,7 @@ function util.splitToChars(text) hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then -- low surrogate following a high surrogate, good, let's make them a single char - charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000 + charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000 table.insert(tab, util.unicodeCodepointToUtf8(charcode)) hi_surrogate = nil else @@ -379,13 +380,13 @@ function util.hasCJKChar(str) return string.match(str, "[\228-\234][\128-\191].") ~= nil end ---- Split texts into a list of words, spaces and punctuation. +--- Split texts into a list of words, spaces and punctuation marks. ---- @string text text to split ----- @treturn table list of words, spaces and punctuation +---- @treturn table list of words, spaces and punctuation marks function util.splitToWords(text) local wlist = {} for word in util.gsplit(text, "[%s%p]+", true) do - -- if space splitted word contains CJK characters + -- if space split word contains CJK characters if util.hasCJKChar(word) then -- split with CJK characters for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do @@ -399,11 +400,11 @@ function util.splitToWords(text) end -- We don't want to split on a space if it is followed by some --- specific punctuation : e.g. "word :" or "word )" --- (In french, there is a space before a colon, and it better +-- specific punctuation marks : e.g. "word :" or "word )" +-- (In French, there is a non-breaking space before a colon, and it better -- not be wrapped there.) local non_splittable_space_tailers = ":;,.!?)]}$%=-+*/|<>»”" --- Same if a space has some specific other punctuation before it +-- Same if a space has some specific other punctuation mark before it local non_splittable_space_leaders = "([{$=-+*/|<>«“" @@ -460,20 +461,20 @@ function util.isSplittable(c, next_c, prev_c) return true end elseif c == " " then - -- we only split on a space (so punctuation sticks to prev word) + -- we only split on a space (so a punctuation mark sticks to prev word) -- if next_c or prev_c is provided, we can make a better decision if next_c and non_splittable_space_tailers:find(next_c, 1, true) then - -- this space is followed by some punctuation that is better kept with us + -- this space is followed by some punctuation mark that is better kept with us return false elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then - -- this space is lead by some punctuation that is better kept with us + -- this space is lead by some punctuation mark that is better kept with us return false else -- we can split on this space return true end end - -- otherwise, non splittable + -- otherwise, not splittable return false end @@ -570,7 +571,7 @@ local function replaceSlashChar(str) end --[[-- -Replaces characters that are invalid filenames. +Replaces characters that are invalid in filenames. Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem. @@ -683,7 +684,7 @@ function util.getMenuText(item) text = item.text end if item.sub_item_table ~= nil or item.sub_item_table_func then - text = text .. " \226\150\184" + text = text .. " ▸" end return text end @@ -692,6 +693,8 @@ end Replaces invalid UTF-8 characters with a replacement string. Based on . +c.f., FixUTF8 @ . + @string str the string to be checked for invalid characters @string replacement the string to replace invalid characters with @treturn string valid UTF-8 @@ -700,15 +703,15 @@ function util.fixUtf8(str, replacement) local pos = 1 local len = #str while pos <= len do - if pos == str:find("[%z\1-\127]", pos) then pos = pos + 1 - elseif pos == str:find("[\194-\223][\128-\191]", pos) then pos = pos + 2 - elseif pos == str:find( "\224[\160-\191][\128-\191]", pos) - or pos == str:find("[\225-\236][\128-\191][\128-\191]", pos) - or pos == str:find( "\237[\128-\159][\128-\191]", pos) - or pos == str:find("[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3 - elseif pos == str:find( "\240[\144-\191][\128-\191][\128-\191]", pos) - or pos == str:find("[\241-\243][\128-\191][\128-\191][\128-\191]", pos) - or pos == str:find( "\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4 + if str:find("^[%z\1-\127]", pos) then pos = pos + 1 + elseif str:find("^[\194-\223][\128-\191]", pos) then pos = pos + 2 + elseif str:find( "^\224[\160-\191][\128-\191]", pos) + or str:find("^[\225-\236][\128-\191][\128-\191]", pos) + or str:find( "^\237[\128-\159][\128-\191]", pos) + or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3 + elseif str:find( "^\240[\144-\191][\128-\191][\128-\191]", pos) + or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos) + or str:find( "^\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4 else str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1) pos = pos + #replacement @@ -735,6 +738,7 @@ end --- Convert a Unicode codepoint (number) to UTF-8 char --- c.f., --- & +--- See utf8charcode in ffi/util for a decoder. -- --- @int c Unicode codepoint --- @treturn string UTF-8 char @@ -779,12 +783,12 @@ local HTML_ENTITIES_TO_UTF8 = { {"&", "&"}, -- must be last } --[[-- -Replace HTML entities with their UTF8 equivalent in text. +Replace HTML entities with their UTF-8 encoded equivalent in text. Supports only basic ones and those with numbers (no support for named entities like `é`). @int string text with HTML entities -@treturn string UTF8 text +@treturn string UTF-8 text ]] function util.htmlEntitiesToUtf8(text) for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do @@ -834,7 +838,7 @@ function util.htmlToPlainTextIfHtml(text) is_html = true else -- no found - -- but we may meet some text badly twicely encoded html containing "<br>" + -- but we may meet some text badly/twice encoded html containing "<br>" local nb_encoded_tags _, nb_encoded_tags = text:gsub("<%a+>", "") if nb_encoded_tags > 0 then diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua index 3d9706da1..eb5119011 100644 --- a/spec/unit/util_spec.lua +++ b/spec/unit/util_spec.lua @@ -6,14 +6,14 @@ describe("util module", function() util = require("util") end) - it("should strip punctuations around word", function() - assert.is_equal("hello world", util.stripePunctuations("\"hello world\"")) - assert.is_equal("hello world", util.stripePunctuations("\"hello world?\"")) - assert.is_equal("hello, world", util.stripePunctuations("\"hello, world?\"")) - assert.is_equal("你好", util.stripePunctuations("“你好“")) - assert.is_equal("你好", util.stripePunctuations("“你好?“")) - assert.is_equal("", util.stripePunctuations("")) - assert.is_nil(util.stripePunctuations(nil)) + it("should strip punctuation marks around word", function() + assert.is_equal("hello world", util.stripPunctuation("\"hello world\"")) + assert.is_equal("hello world", util.stripPunctuation("\"hello world?\"")) + assert.is_equal("hello, world", util.stripPunctuation("\"hello, world?\"")) + assert.is_equal("你好", util.stripPunctuation("“你好“")) + assert.is_equal("你好", util.stripPunctuation("“你好?“")) + assert.is_equal("", util.stripPunctuation("")) + assert.is_nil(util.stripPunctuation(nil)) end) describe("gsplit()", function()