2016-02-04 18:24:39 +00:00
--[[--
2016-12-13 16:06:02 +00:00
This module contains miscellaneous helper functions for the KOReader frontend .
2016-06-05 07:33:31 +00:00
] ]
2016-02-04 18:24:39 +00:00
2016-06-05 07:33:31 +00:00
local BaseUtil = require ( " ffi/util " )
2022-09-13 21:09:49 +00:00
local Utf8Proc = require ( " ffi/utf8proc " )
2022-09-27 23:10:50 +00:00
local lfs = require ( " libs/libkoreader-lfs " )
2018-12-13 06:27:49 +00:00
local _ = require ( " gettext " )
2022-05-23 22:25:50 +00:00
local C_ = _.pgettext
2018-12-13 06:27:49 +00:00
local T = BaseUtil.template
2019-11-23 23:27:27 +00:00
local lshift = bit.lshift
2019-11-22 18:50:58 +00:00
local rshift = bit.rshift
local band = bit.band
local bor = bit.bor
2015-02-01 09:40:34 +00:00
local util = { }
2020-09-30 17:56:56 +00:00
---- Strips all punctuation marks and spaces from a string.
2016-06-05 07:33:31 +00:00
---- @string text the string to be stripped
---- @treturn string stripped text
2019-11-23 23:27:27 +00:00
function util . stripPunctuation ( text )
2016-06-05 07:33:31 +00:00
-- Nothing to strip: bail out (returning no value) when text is nil or false.
if not text then return end
2019-11-23 23:27:27 +00:00
-- strip ASCII punctuation marks around text
-- and strip any generic punctuation marks (U+2000 - U+206F) in the text
2016-06-05 07:33:31 +00:00
-- NOTE: the last gsub in the chain is returned unwrapped, so callers receive
-- both of its results (the stripped string and a substitution count).
return text : gsub ( " \226 [ \128 - \131 ][ \128 - \191 ] " , ' ' ) : gsub ( " ^%p+ " , ' ' ) : gsub ( " %p+$ " , ' ' )
2015-02-01 09:40:34 +00:00
end
2020-09-30 17:56:56 +00:00
-- Various whitespace trimming helpers, from http://lua-users.org/wiki/CommonFunctions & http://lua-users.org/wiki/StringTrim
---- Remove leading whitespace from string.
---- @string s the string to be trimmed
---- @treturn string trimmed text
function util . ltrim ( s )
-- The outer parentheses truncate gsub's (string, count) results to the string alone.
return ( s : gsub ( " ^%s* " , " " ) )
end
---- Remove trailing whitespace from string.
---- @string s the string to be trimmed
---- @treturn string trimmed text
function util . rtrim ( s )
-- Walk backwards from the last byte; n ends up as the length of the part to keep.
local n = # s
-- find() is started at position n, so each iteration tests a single position.
while n > 0 and s : find ( " ^%s " , n ) do
n = n - 1
end
-- n == 0 means nothing was kept, and sub(1, 0) yields the empty string.
return s : sub ( 1 , n )
end
---- Remove leading & trailing whitespace from string.
---- @string s the string to be trimmed
---- @treturn string trimmed text
function util . trim ( s )
-- The empty capture `()` yields the position just past the leading whitespace run.
local from = s : match " ^%s*() "
-- If that position is past the end, there is nothing left to keep;
-- otherwise match greedily from there up to the last non-whitespace character.
return from > # s and " " or s : match ( " .*%S " , from )
end
2017-02-25 17:52:34 +00:00
--[[--
2017-04-04 07:57:14 +00:00
Splits a string by a pattern
2015-04-22 06:17:06 +00:00
Lua doesn ' t have a string.split() function and most of the time
you don ' t really need it because string.gmatch() is enough.
However string.gmatch ( ) has one significant disadvantage for me :
You can ' t split a string while matching both the delimited
strings and the delimiters themselves without tracking positions
and substrings . The gsplit function below takes care of
this problem .
2017-04-04 07:57:14 +00:00
2015-04-22 06:17:06 +00:00
Author : Peter Odding
2017-04-04 07:57:14 +00:00
2015-04-22 06:17:06 +00:00
License : MIT / X11
2017-04-04 07:57:14 +00:00
Source : < a href = " http://snippets.luacode.org/snippets/String_splitting_130 " > http : // snippets.luacode . org / snippets / String_splitting_130 </ a >
2017-02-25 17:52:34 +00:00
] ]
----@string str string to split
----@param pattern the pattern to split against
----@bool capture
2017-04-14 19:12:28 +00:00
----@bool capture_empty_entity
-- Returns an iterator (a wrapped coroutine) over the pieces of `str` delimited by `pattern`.
-- When `capture` is truthy the delimiters themselves are yielded too, in order.
-- When `capture_empty_entity` is truthy, empty pieces between adjacent delimiters are yielded as well.
-- Raises an error (reported at the caller's level) if `pattern` can match the empty string.
function util . gsplit ( str , pattern , capture , capture_empty_entity )
2015-04-22 06:17:06 +00:00
-- A nil/false pattern falls back to a default; anything else is stringified.
pattern = pattern and tostring ( pattern ) or ' %s+ '
-- A pattern that matches the empty string would never advance `index` below.
if ( ' ' ) : find ( pattern ) then
error ( ' pattern matches empty string! ' , 2 )
end
return coroutine.wrap ( function ( )
-- `index` is the start of the not-yet-consumed remainder of str.
local index = 1
repeat
local first , last = str : find ( pattern , index )
if first and last then
2017-04-14 19:12:28 +00:00
-- Yield the text before the delimiter (possibly empty, if requested).
if index < first or ( index == first and capture_empty_entity ) then
2015-04-22 06:17:06 +00:00
coroutine.yield ( str : sub ( index , first - 1 ) )
end
if capture then
coroutine.yield ( str : sub ( first , last ) )
end
index = last + 1
else
-- No more delimiters: yield whatever is left, if anything, and stop.
if index <= # str then
coroutine.yield ( str : sub ( index ) )
end
break
end
until index > # str
end )
end
2018-03-31 19:19:31 +00:00
--[[--
Compares values in two different tables .
2019-08-23 17:53:53 +00:00
Source : < https : // stackoverflow.com / a / 32660766 / 2470572 >
2018-03-31 19:19:31 +00:00
] ]
---- @param o1 Lua table
---- @param o2 Lua table
---- @bool ignore_mt
---- @treturn boolean
-- Recursively compares o1 and o2 key by key; values that are themselves tables are compared the same way.
-- Unless ignore_mt is truthy, an `__eq` metamethod on o1's metatable takes precedence over the structural walk.
function util . tableEquals ( o1 , o2 , ignore_mt )
-- Identical references (or equal primitives) are trivially equal.
if o1 == o2 then return true end
local o1Type = type ( o1 )
local o2Type = type ( o2 )
if o1Type ~= o2Type then return false end
-- Same type and not ==: only tables can still turn out to be equal.
if o1Type ~= ' table ' then return false end
if not ignore_mt then
local mt1 = getmetatable ( o1 )
if mt1 and mt1.__eq then
2022-09-27 23:10:50 +00:00
-- Compare using built in method
2018-03-31 19:19:31 +00:00
return o1 == o2
end
end
-- Remember every key seen in o1 so that extra keys in o2 can be detected afterwards.
local keySet = { }
for key1 , value1 in pairs ( o1 ) do
local value2 = o2 [ key1 ]
if value2 == nil or util.tableEquals ( value1 , value2 , ignore_mt ) == false then
return false
end
keySet [ key1 ] = true
end
-- Any key present in o2 but not in o1 makes the tables differ.
for key2 , _ in pairs ( o2 ) do
if not keySet [ key2 ] then return false end
end
return true
end
2019-03-04 18:01:01 +00:00
--[[--
Makes a deep copy of a table .
2019-08-23 17:53:53 +00:00
Source : < https : // stackoverflow.com / a / 16077650 / 2470572 >
2019-03-04 18:01:01 +00:00
] ]
---- @param o Lua table
---- @treturn Lua table
-- `seen` is an internal memo (original table -> its copy) threaded through the recursion;
-- callers normally omit it.
function util . tableDeepCopy ( o , seen )
seen = seen or { }
if o == nil then return nil end
-- Already copied (shared reference or cycle): reuse the existing copy.
if seen [ o ] then return seen [ o ] end
local no
if type ( o ) == " table " then
no = { }
-- Register the copy *before* recursing so that self-references resolve to it.
seen [ o ] = no
-- Keys are deep-copied too, not only values.
for k , v in next , o , nil do
no [ util.tableDeepCopy ( k , seen ) ] = util.tableDeepCopy ( v , seen )
end
-- The metatable is deep-copied as well, rather than shared with the original.
setmetatable ( no , util.tableDeepCopy ( getmetatable ( o ) , seen ) )
else -- number, string, boolean, etc
no = o
end
return no
end
2016-02-04 18:24:39 +00:00
--- Returns number of keys in a table.
2018-12-13 06:27:49 +00:00
---- @param t Lua table
---- @treturn int number of keys in table t
function util.tableSize(t)
    -- The length operator only covers the sequence part of a table,
    -- so every key (array and hash alike) is visited and tallied instead.
    local n = 0
    for _key in pairs(t) do
        n = n + 1
    end
    return n
end
2017-04-04 07:57:14 +00:00
--- Append all elements from t2 into t1.
---- @param t1 Lua table
---- @param t2 Lua table
2016-01-31 22:23:44 +00:00
-- Appends the array part of t2 (as seen by ipairs, i.e. up to the first nil)
-- to the end of t1, preserving order. t1 is modified in place; nothing is returned.
function util.arrayAppend(t1, t2)
    -- Count the elements to copy *before* touching t1. When t1 and t2 are the
    -- same table, iterating with ipairs while inserting would keep finding the
    -- freshly appended elements and never terminate.
    local count = 0
    for _ in ipairs(t2) do
        count = count + 1
    end
    for i = 1, count do
        table.insert(t1, t2[i])
    end
end
2021-03-06 21:44:18 +00:00
--[[--
Remove elements from an array , fast .
Swap & pop , like < http : // lua - users.org / lists / lua - l / 2013 - 11 / msg00027.html > / < https : // stackoverflow.com / a / 28942022 > , but preserving order .
c.f . , < https : // stackoverflow.com / a / 53038524 >
@ table t Lua array to filter
@ func keep_cb Filtering callback . Takes three arguments : table , index , new index . Returns true to * keep * the item . See link above for potential uses of the third argument .
@ usage
local foo = { " a " , " b " , " c " , " b " , " d " , " e " }
local function drop_b ( t , i , j )
-- Discard any item with value "b"
return t [ i ] ~= " b "
end
util.arrayRemove ( foo , drop_b )
] ]
function util.arrayRemove(t, keep_cb)
    -- `write` is the slot the next kept element will land in;
    -- `read` walks the original indices (the length is sampled once, up front).
    local write = 1
    for read = 1, #t do
        local keep = keep_cb(t, read, write)
        if not keep then
            -- Dropped: clear the slot.
            t[read] = nil
        elseif read == write then
            -- Kept and already in its final position.
            write = write + 1
        else
            -- Kept, but earlier removals opened a gap: compact it downwards.
            t[write] = t[read]
            t[read] = nil
            write = write + 1
        end
    end
    return t
end
2020-11-28 16:18:57 +00:00
--- Reverse array elements in-place in table t
2019-12-06 21:55:37 +00:00
---- @param t Lua table
function util.arrayReverse(t)
    -- Swap mirrored pairs from the outside in; with an odd length the
    -- middle element stays where it is.
    local n = #t
    for left = 1, math.floor(n / 2) do
        local right = n - left + 1
        t[left], t[right] = t[right], t[left]
    end
end
2020-11-28 16:18:57 +00:00
--- Test whether t contains a value equal to v
--- (or such a value that callback returns true),
--- and if so, return the index.
---- @param t Lua table
---- @param v
2021-03-06 21:44:18 +00:00
---- @func callback(v1, v2)
2020-11-28 16:18:57 +00:00
function util.arrayContains(t, v, cb)
    -- Linear scan over the array part. Without a callback, plain == decides;
    -- with one, it is called as cb(element, v) and any truthy result is a hit.
    for idx, item in ipairs(t) do
        local hit
        if cb then
            hit = cb(item, v)
        else
            hit = item == v
        end
        if hit then
            return idx
        end
    end
    -- Not found: `false` rather than nil.
    return false
end
Revamp "flash_ui" handling (#7118)
* Wherever possible, do an actual dumb invert on the Screen BB instead of repainting the widget, *then* inverting it (which is what the "invert" flag does).
* Instead of playing with nextTick/tickAfterNext delays, explicitly fence stuff with forceRePaint
* And, in the few cases where said Mk. 7 quirk kicks in, make the fences more marked by using a well-placed WAIT_FOR_UPDATE_COMPLETE
* Fix an issue in Button where show/hide & enable/disable were actually all toggles, which meant that duplicate calls or timing issues would do the wrong thing. (This broke dimming some icons, and mistakenly dropped the background from FM chevrons, for example).
* Speaking of, fix Button's hide/show to actually restore the background properly (there was a stupid typo in the variable name)
* Still in Button, fix the insanity of the double repaint on rounded buttons. Turns out it made sense, after all (and was related to said missing background, and bad interaction with invert & text with no background).
* KeyValuePage suffered from a similar issue with broken highlights (all black) because of missing backgrounds.
* In ConfigDialog, only instantiate IconButtons once (because every tab switch causes a full instantiation; and the initial display implies a full instantiation and an initial tab switch). Otherwise, both instances linger, and catch taps, and as such, double highlights.
* ConfigDialog: Restore the "don't repaint ReaderUI" when switching between similarly sized tabs (re #6131). I never could reproduce that on eInk, and I can't now on the emulator, so I'm assuming @poire-z fixed it during the swap to SVG icons.
* KeyValuePage: Only instanciate Buttons once (again, this is a widget that goes through a full init every page). Again, caused highlight/dimming issues because buttons were stacked.
* Menu: Ditto.
* TouchMenu: Now home of the gnarliest unhilight heuristics, because of the sheer amount of different things that can happen (and/or thanks to stuff not flagged covers_fullscreen properly ;p).
* Bump base
https://github.com/koreader/koreader-base/pull/1280
https://github.com/koreader/koreader-base/pull/1282
https://github.com/koreader/koreader-base/pull/1283
https://github.com/koreader/koreader-base/pull/1284
* Bump android-luajit-launcher
https://github.com/koreader/android-luajit-launcher/pull/284
https://github.com/koreader/android-luajit-launcher/pull/285
https://github.com/koreader/android-luajit-launcher/pull/286
https://github.com/koreader/android-luajit-launcher/pull/287
2021-01-10 00:51:09 +00:00
--- Test whether array t contains a reference to array n (at any depth at or below m)
---- @param t Lua table (array only)
---- @param n Lua table (array only)
---- @int m Max nesting level
-- `l` is the current nesting level, used internally by the recursion (defaults to 0).
-- On a match, returns true plus the nesting level at which `n` was found; otherwise false.
function util . arrayReferences ( t , n , m , l )
-- Default maximum nesting level.
if not m then m = 15 end
if not l then l = 0 end
-- Give up once the nesting limit is exceeded.
if l > m then
return false
end
if type ( t ) == " table " then
-- Reference (identity) comparison, not a structural one.
if t == n then
return true , l
end
-- Only the array part is searched (ipairs), one level deeper each time.
for _ , v in ipairs ( t ) do
local matched , depth = util.arrayReferences ( v , n , m , l + 1 )
if matched then
return matched , depth
end
end
end
return false
end
2022-10-10 13:16:16 +00:00
-- A set of binary search implementations for plain arrays.
-- Should be easy to tweak for arrays of hashes (c.f., UIManager:schedule),
-- or arrays sorted in descending order (c.f., ReadHistory).
-- refs: https://en.wikipedia.org/wiki/Binary_search_algorithm
-- https://rosettacode.org/wiki/Binary_search
--- Perform a binary search for `value` in a *sorted* (ascending) `array`.
---- @param array Lua table (array only, sorted, ascending, every value must match the type of `value` and support comparison operators)
---- @param value
---- @return int index of value in array, or a (nil, insertion index) tuple if value was not found.
function util.bsearch(array, value)
    -- Invariants: everything left of `first` is < value,
    --             everything right of `last` is > value.
    local first, last = 1, #array
    while first <= last do
        local middle = bit.rshift(first + last, 1)
        local candidate = array[middle]
        if candidate > value then
            last = middle - 1
        elseif candidate < value then
            first = middle + 1
        else
            -- Neither greater nor smaller: found it.
            return middle
        end
    end
    -- Not found: `first` is where value would have to be inserted.
    return nil, first
end
--- Perform a leftmost insertion binary search for `value` in a *sorted* (ascending) `array`.
---- @param array Lua table (array only, sorted, ascending, every value must match the type of `value` and support comparison operators)
---- @param value
---- @return int leftmost insertion index of value in array.
function util.bsearch_left(array, value)
    -- Invariants: everything left of `first` is < value,
    --             everything right of `last` is >= value.
    local first, last = 1, #array
    while first <= last do
        local middle = bit.rshift(first + last, 1)
        if array[middle] >= value then
            -- Equal elements are pushed to the right-hand side, so the
            -- insertion point ends up *before* any run of equal values.
            last = middle - 1
        else
            first = middle + 1
        end
    end
    return first
end
--- Perform a rightmost insertion binary search for `value` in a *sorted* (ascending) `array`.
---- @param array Lua table (array only, sorted, ascending, every value must match the type of `value` and support comparison operators)
---- @param value
---- @return int rightmost insertion index of value in array.
function util.bsearch_right(array, value)
    -- Invariants: everything left of `first` is <= value,
    --             everything right of `last` is > value.
    local first, last = 1, #array
    while first <= last do
        local middle = bit.rshift(first + last, 1)
        if array[middle] > value then
            last = middle - 1
        else
            -- Equal elements are kept on the left-hand side, so the
            -- insertion point ends up *after* any run of equal values.
            first = middle + 1
        end
    end
    return first
end
2019-06-28 02:46:16 +00:00
-- Merge t2 into t1, overwriting existing elements if they already exist
-- Probably not safe with nested tables (c.f., https://stackoverflow.com/q/1283388)
---- @param t1 Lua table
---- @param t2 Lua table
function util.tableMerge(t1, t2)
    -- Shallow merge: values are assigned as-is, so nested tables end up
    -- shared between t1 and t2 rather than merged recursively.
    for key, value in pairs(t2) do
        t1[key] = value
    end
end
2017-04-04 07:57:14 +00:00
--[[--
2019-11-23 23:27:27 +00:00
Gets last index of character in string ( i.e . , strrchr )
2017-04-04 07:57:14 +00:00
Returns the index within this string of the last occurrence of the specified character
or - 1 if the character does not occur .
To find . you need to escape it .
] ]
---- @string string
---- @string ch
---- @treturn int last occurrence or -1 if not found
2016-02-12 14:55:02 +00:00
function util . lastIndexOf ( string , ch )
-- `ch` is spliced into a Lua pattern unescaped, hence the need to escape magic characters.
-- The greedy prefix makes the match land on the last occurrence; the empty capture `()`
-- yields the position right after it.
local i = string : match ( " .* " .. ch .. " () " )
-- Step back by one to turn "position after the match" into the index of its last byte.
if i == nil then return - 1 else return i - 1 end
end
2021-10-23 10:12:38 +00:00
--- Pattern which matches a single well-formed UTF-8 character, including
--- theoretical >4-byte extensions.
-- Taken from <https://www.lua.org/manual/5.4/manual.html#pdf-utf8.charpattern>
util.UTF8_CHAR_PATTERN = ' [%z \1 - \127 \194 - \253 ][ \128 - \191 ]* '
2018-01-13 23:05:05 +00:00
--- Reverse the individual greater-than-single-byte characters
-- @string string to reverse
2019-08-23 17:53:53 +00:00
-- Taken from <https://github.com/blitmap/lua-utf8-simple#utf8reverses>
2018-01-13 23:05:05 +00:00
function util.utf8Reverse(text)
    -- Step 1: flip the bytes of every multi-byte character in place.
    -- Single-byte characters are left alone (a false return keeps the match).
    local function flip_multibyte(ch)
        if #ch > 1 then
            return ch:reverse()
        end
        return false
    end
    local prepared = text:gsub(util.UTF8_CHAR_PATTERN, flip_multibyte)
    -- Step 2: reversing the whole byte string reverses the character order and
    -- flips each multi-byte sequence back into its original byte order.
    return prepared:reverse()
end
2016-04-21 14:13:10 +00:00
2016-12-13 16:06:02 +00:00
--- Splits string into a list of UTF-8 characters.
---- @string text the string to be split.
2016-06-05 07:33:31 +00:00
---- @treturn table list of UTF-8 chars
2016-05-22 15:59:28 +00:00
-- Splits `text` into a list of UTF-8 characters (one string per character).
-- A nil `text` yields an empty list.
--
-- Supports WTF-8 (https://en.wikipedia.org/wiki/UTF-8#WTF-8), a superset of
-- UTF-8 that includes UTF-16 surrogates encoded as UTF-8 bytes (forbidden in
-- well-formed UTF-8). We may get that from bad producers or converters
-- (e.g. luajson decoding "\ud800\udf45" as "\xed\xa0\x80\xed\xbd\x85" instead
-- of the correct "\xf0\x90\x8d\x85").
-- Surrogates only represent a character as a pair: high (leading) ones are
-- D800..DBFF, low (trailing) ones are DC00..DFFF. A high surrogate directly
-- followed by a low one is merged into the single character they encode;
-- an unpaired high surrogate is emitted as-is, even though it is invalid.
function util.splitToChars(text)
    local tab = {}
    if text == nil then
        return tab
    end
    local prevcharcode, charcode = 0
    -- Pending high surrogate (codepoint and raw bytes) awaiting its low half.
    local hi_surrogate
    local hi_surrogate_uchar
    for uchar in text:gmatch(util.UTF8_CHAR_PATTERN) do
        charcode = BaseUtil.utf8charcode(uchar)
        -- (not sure why we need this prevcharcode check; we could get
        -- charcode=nil with invalid UTF-8, but should we then really
        -- ignore the following charcode ?)
        if prevcharcode then -- utf8
            if charcode and charcode >= 0xD800 and charcode <= 0xDBFF then
                if hi_surrogate then -- previous unconsumed one, add it even if invalid
                    table.insert(tab, hi_surrogate_uchar)
                end
                hi_surrogate = charcode
                hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
            elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
                -- low surrogate following a high surrogate, good, let's make them a single char
                charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000
                table.insert(tab, util.unicodeCodepointToUtf8(charcode))
                hi_surrogate = nil
            else
                if hi_surrogate then -- previous unconsumed one, add it even if invalid
                    table.insert(tab, hi_surrogate_uchar)
                end
                hi_surrogate = nil
                table.insert(tab, uchar)
            end
        end
        prevcharcode = charcode
    end
    -- A high surrogate that was still pending when the input ran out would
    -- otherwise be lost: flush it, like every other unpaired one.
    if hi_surrogate then
        table.insert(tab, hi_surrogate_uchar)
    end
    return tab
end
2017-04-04 07:57:14 +00:00
--- Tests whether c is a CJK character
---- @string c
---- @treturn boolean true if CJK
2016-11-26 00:46:56 +00:00
function util.isCJKChar(c)
    -- The smallest CJK codepoint is U+1100, which needs at least 3 bytes in
    -- UTF-8 (2 bytes only reach U+07FF), so anything shorter cannot be CJK
    -- and does not need decoding.
    if #c < 3 then
        return false
    end
    local cp = BaseUtil.utf8charcode(c)
    -- Cheapest rejection first: below the lowest CJK block.
    if cp < 0x1100 then
        return false
    end
    if cp <= 0x11FF then -- Hangul Jamo
        return true
    end
    if cp < 0x20000 then
        -- BMP (Plane 0)
        return (cp >= 0x2E80 and cp <= 0x9FFF) -- Numerous CJK Blocks (NB: has some gaps)
            or (cp >= 0xA960 and cp <= 0xA97F) -- Hangul Jamo Extended-A
            or (cp >= 0xAC00 and cp <= 0xD7AF) -- Hangul Syllables
            or (cp >= 0xD7B0 and cp <= 0xD7FF) -- Hangul Jamo Extended-B
            or (cp >= 0xF900 and cp <= 0xFAFF) -- CJK Compatibility Ideographs
            or (cp >= 0xFE30 and cp <= 0xFE4F) -- CJK Compatibility Forms
            or (cp >= 0xFF00 and cp <= 0xFFEF) -- Halfwidth and Fullwidth Forms
    end
    -- SIP (Plane 2)
    return (cp >= 0x20000 and cp <= 0x2A6DF) -- CJK Unified Ideographs Extension B
        or (cp >= 0x2A700 and cp <= 0x2B73F) -- CJK Unified Ideographs Extension C
        or (cp >= 0x2B740 and cp <= 0x2B81F) -- CJK Unified Ideographs Extension D
        or (cp >= 0x2B820 and cp <= 0x2CEAF) -- CJK Unified Ideographs Extension E
        or (cp >= 0x2CEB0 and cp <= 0x2EBEF) -- CJK Unified Ideographs Extension F
        or (cp >= 0x2F800 and cp <= 0x2FA1F) -- CJK Compatibility Ideographs Supplement
        -- TIP (Plane 3)
        or (cp >= 0x30000 and cp <= 0x3134F) -- CJK Unified Ideographs Extension G
end
2017-04-04 07:57:14 +00:00
--- Tests whether str contains CJK characters
---- @string str
---- @treturn boolean true if CJK
2016-11-26 00:46:56 +00:00
function util.hasCJKChar(str)
    -- Pull characters one at a time and stop at the first CJK one.
    local next_char = str:gmatch(util.UTF8_CHAR_PATTERN)
    local ch = next_char()
    while ch do
        if util.isCJKChar(ch) then
            return true
        end
        ch = next_char()
    end
    return false
end
2019-11-23 23:27:27 +00:00
--- Split texts into a list of words, spaces and punctuation marks.
2016-06-05 07:33:31 +00:00
---- @string text text to split
2019-11-23 23:27:27 +00:00
---- @treturn table list of words, spaces and punctuation marks
2016-06-05 07:33:31 +00:00
function util . splitToWords ( text )
local wlist = { }
2016-06-28 15:50:21 +00:00
-- gsplit is called with capture enabled, so the separators themselves
-- (runs of spaces/punctuation) are returned as list entries too.
for word in util.gsplit ( text , " [%s%p]+ " , true ) do
2019-11-23 23:27:27 +00:00
-- if space split word contains CJK characters
2016-11-26 00:46:56 +00:00
if util.hasCJKChar ( word ) then
2021-10-23 10:12:38 +00:00
-- split all non-ASCII characters separately (FIXME ideally we
-- would split only the CJK characters, but you cannot define CJK
-- characters trivially with a byte-only Lua pattern).
for char in util.gsplit ( word , " [ \192 - \255 ][ \128 - \191 ]+ " , true ) do
2016-06-28 15:50:21 +00:00
table.insert ( wlist , char )
end
else
2016-06-05 07:33:31 +00:00
-- No CJK content: the piece is kept whole.
table.insert ( wlist , word )
end
end
return wlist
end
2016-12-06 21:10:25 +00:00
-- We don't want to split on a space if it is followed by some
2019-11-23 23:27:27 +00:00
-- specific punctuation marks : e.g. "word :" or "word )"
-- (In French, there is a non-breaking space before a colon, and it better
2016-12-06 21:10:25 +00:00
-- not be wrapped there.)
2017-04-04 07:57:14 +00:00
local non_splittable_space_tailers = " :;,.!?)]}$%=-+*/|<>»” "
2019-11-23 23:27:27 +00:00
-- Same if a space has some specific other punctuation mark before it
2017-04-04 07:57:14 +00:00
local non_splittable_space_leaders = " ([{$=-+*/|<>«“ "
2016-12-06 21:10:25 +00:00
2016-12-15 07:58:58 +00:00
-- Similar rules exist for CJK text. Taken from :
-- https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages
2017-04-04 07:57:14 +00:00
local cjk_non_splittable_tailers = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Simplified Chinese
" !%),.:;?]}¢°·’ \" †‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?!]}~ " ,
-- Traditional Chinese
" !),.:;?]}¢·–—’ \" •、。〆〞〕〉》」︰︱︲︳﹐﹑﹒﹔﹕﹖﹘﹚﹜!),.:;?︶︸︺︼︾﹀﹂﹗]|}、 " ,
-- Japanese
" )]}〕〉》」』】〙〗〟’ \" ⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠–〜?!‼⁇⁈⁉・、:;,。. " ,
-- Korean
" !%),.:;?]}¢°’ \" †‡℃〆〈《「『〕!%),.:;?]} " ,
} )
2017-04-04 07:57:14 +00:00
local cjk_non_splittable_leaders = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Simplified Chinese
" $(£¥·‘ \" 〈《「『【〔〖〝﹙﹛$(.[{£¥ " ,
-- Traditional Chinese
" ([{£¥‘ \" ‵〈《「『〔〝︴﹙﹛({︵︷︹︻︽︿﹁﹃﹏ " ,
-- Japanese
" ([{〔〈《「『【〘〖〝‘ \" ⦅« " ,
-- Korean
" $([{£¥‘ \" 々〇〉》」〔$([{⦆¥₩# " ,
} )
2017-04-04 07:57:14 +00:00
local cjk_non_splittable = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Japanese
" —…‥〳〴〵 " ,
} )
2017-04-04 07:57:14 +00:00
--- Test whether a string can be separated by this char for multi-line rendering.
2016-12-12 22:41:16 +00:00
-- Optional next or prev chars may be provided to help make the decision
2017-04-04 07:57:14 +00:00
---- @string c
---- @string next_c
---- @string prev_c
---- @treturn boolean true if splittable, false if not
-- Decision order: CJK characters first (splittable unless one of the CJK exception lists applies),
-- then the space character (splittable unless adjacent to listed punctuation); anything else is not splittable.
-- The neighbour checks use plain-text find (4th argument `true`), i.e. substring lookups in the
-- module-level exception strings, not pattern matches.
function util . isSplittable ( c , next_c , prev_c )
2016-12-06 21:10:25 +00:00
if util.isCJKChar ( c ) then
2017-04-04 07:57:14 +00:00
-- a CJKChar is a word in itself, and so is splittable
if cjk_non_splittable : find ( c , 1 , true ) then
2016-12-15 07:58:58 +00:00
-- except a few of them
return false
2017-04-04 07:57:14 +00:00
elseif next_c and cjk_non_splittable_tailers : find ( next_c , 1 , true ) then
2016-12-15 07:58:58 +00:00
-- but followed by a char that is not permitted at start of line
return false
2017-04-04 07:57:14 +00:00
elseif prev_c and cjk_non_splittable_leaders : find ( prev_c , 1 , true ) then
2016-12-15 07:58:58 +00:00
-- but preceded by a char that is not permitted at end of line
return false
else
-- we can split on this CJKchar
return true
end
2016-12-06 21:10:25 +00:00
elseif c == " " then
2019-11-23 23:27:27 +00:00
-- we only split on a space (so a punctuation mark sticks to prev word)
2016-12-12 22:41:16 +00:00
-- if next_c or prev_c is provided, we can make a better decision
2017-04-04 07:57:14 +00:00
if next_c and non_splittable_space_tailers : find ( next_c , 1 , true ) then
2019-11-23 23:27:27 +00:00
-- this space is followed by some punctuation mark that is better kept with us
2016-12-12 22:41:16 +00:00
return false
2017-04-04 07:57:14 +00:00
elseif prev_c and non_splittable_space_leaders : find ( prev_c , 1 , true ) then
2019-11-23 23:27:27 +00:00
-- this space is lead by some punctuation mark that is better kept with us
2016-12-06 21:10:25 +00:00
return false
else
-- we can split on this space
return true
end
end
2019-11-23 23:27:27 +00:00
-- otherwise, not splittable
2016-12-06 21:10:25 +00:00
return false
2016-04-21 14:13:10 +00:00
end
2017-04-04 07:57:14 +00:00
--- Gets filesystem type of a path.
--
-- Checks if the path occurs in <code>/proc/mounts</code>
---- @string path an absolute path
---- @treturn string filesystem type
2017-01-10 00:05:15 +00:00
-- Returns nil when the mounts table cannot be opened or when no entry matches `path`.
function util . getFilesystemType ( path )
local mounts = io.open ( " /proc/mounts " , " r " )
if not mounts then return nil end
-- NB: this local shadows the builtin type() for the rest of the function.
local type
2021-01-16 03:41:46 +00:00
for line in mounts : lines ( ) do
2017-01-10 00:05:15 +00:00
-- Split the line into whitespace-separated fields; field 2 is compared
-- against the path and field 3 is what gets returned.
local mount = { }
for param in line : gmatch ( " %S+ " ) do table.insert ( mount , param ) end
-- NOTE(review): mount[2] is passed to string.match as a Lua *pattern* and is not anchored,
-- so magic characters in a mount point (e.g. `-`, `.`, `%`) are interpreted and a match
-- anywhere inside `path` counts; a line without a second field would raise here.
-- Confirm whether a plain prefix comparison was intended.
if string.match ( path , mount [ 2 ] ) then
type = mount [ 3 ]
-- A match on the root mount is only a fallback: keep scanning for a more specific one.
-- Any other match ends the search immediately.
if mount [ 2 ] ~= ' / ' then
break
end
end
end
mounts : close ( )
return type
2020-10-20 04:30:41 +00:00
end
2022-09-19 21:25:18 +00:00
-- For documentation purposes, here's a battle-tested shell version of calcFreeMem,
-- our simplified Lua version follows...
--[[
if grep - q ' MemAvailable ' / proc / meminfo ; then
# We ' ll settle for 85% of available memory to leave a bit of breathing room
tmpfs_size = " $(awk '/MemAvailable/ {printf " % d " , $2 * 0.85}' /proc/meminfo) "
elif grep - q ' Inactive(file) ' / proc / meminfo ; then
# Basically try to emulate the kernel ' s computation, c.f., https://unix.stackexchange.com/q/261247
# Again , 85 % of available memory
tmpfs_size = " $(awk -v low=$(grep low /proc/zoneinfo | awk '{k+=$2}END{printf " % d " , k}') \
' {a[$1]=$2}
END {
printf " %d " , ( a [ " MemFree: " ] + a [ " Active(file): " ] + a [ " Inactive(file): " ] + a [ " SReclaimable: " ] - ( 12 * low ) ) * 0.85 ;
} ' /proc/meminfo)"
else
# Ye olde crap workaround of Free + Buffers + Cache ...
# Take it with a grain of salt , and settle for 80 % of that ...
tmpfs_size = " $(awk \
' {a[$1]=$2}
END {
printf " %d " , ( a [ " MemFree: " ] + a [ " Buffers: " ] + a [ " Cached: " ] ) * 0.80 ;
} ' /proc/meminfo)"
fi
--]]
--- Computes the currently available memory
---- @treturn tuple of ints: memavailable, memtotal (or nil, nil on unsupported platforms).
-- NB: declared with `:` (method syntax), unlike the other helpers in this module,
-- so it takes an implicit `self` and is meant to be called as util:calcFreeMem().
-- Each field is captured at most once; a line that produced a capture is not tested
-- against the remaining fields. Values are captured as strings of digits and are
-- coerced to numbers by the arithmetic at the end.
function util : calcFreeMem ( )
local memtotal , memfree , memavailable , buffers , cached
local meminfo = io.open ( " /proc/meminfo " , " r " )
if meminfo then
for line in meminfo : lines ( ) do
if not memtotal then
memtotal = line : match ( " ^MemTotal:%s-(%d+) kB " )
if memtotal then
-- Next!
goto continue
end
end
if not memfree then
memfree = line : match ( " ^MemFree:%s-(%d+) kB " )
if memfree then
-- Next!
goto continue
end
end
if not memavailable then
memavailable = line : match ( " ^MemAvailable:%s-(%d+) kB " )
if memavailable then
-- Best case scenario, we're done :)
break
end
end
if not buffers then
buffers = line : match ( " ^Buffers:%s-(%d+) kB " )
if buffers then
-- Next!
goto continue
end
end
if not cached then
cached = line : match ( " ^Cached:%s-(%d+) kB " )
if cached then
-- Ought to be the last entry we care about, we're done
break
end
end
:: continue ::
end
meminfo : close ( )
else
-- Not on Linux?
return nil , nil
end
-- The captured figures are scaled by 1024 before being returned.
-- NOTE(review): if an expected field was never matched, the arithmetic below
-- operates on nil and raises; there is no guard for a partial meminfo.
if memavailable then
-- Leave a bit of margin, and report 85% of that...
return math.floor ( memavailable * 0.85 ) * 1024 , memtotal * 1024
else
-- Crappy Free + Buffers + Cache version, because the zoneinfo approach is a tad hairy...
-- So, leave an even larger margin, and only report 75% of that...
return math.floor ( ( memfree + buffers + cached ) * 0.75 ) * 1024 , memtotal * 1024
end
end
2020-10-20 04:30:41 +00:00
--- Recursively scan directory for files inside
-- @string path
2021-03-06 21:44:18 +00:00
-- @func callback(fullpath, name, attr)
2020-10-20 04:30:41 +00:00
-- Walks `dir` depth-first and invokes cb(path, name, attr) for every entry whose
-- mode is reported as a file or a link. Directories that cannot be opened are skipped silently.
-- Returns nothing.
function util . findFiles ( dir , cb )
local function scan ( current )
-- pcall guards against lfs.dir raising on an unreadable/missing directory.
local ok , iter , dir_obj = pcall ( lfs.dir , current )
if not ok then return end
for f in iter , dir_obj do
local path = current .. " / " .. f
2021-06-11 11:25:53 +00:00
-- lfs can return nil here, as it will follow symlinks!
-- (the empty-table fallback makes such entries match neither branch below)
local attr = lfs.attributes ( path ) or { }
2020-10-20 04:30:41 +00:00
if attr.mode == " directory " then
-- Skip the self/parent entries to avoid infinite recursion.
if f ~= " . " and f ~= " .. " then
scan ( path )
end
2020-10-24 11:23:05 +00:00
elseif attr.mode == " file " or attr.mode == " link " then
2020-10-20 04:30:41 +00:00
cb ( path , f , attr )
end
end
end
scan ( dir )
2017-01-10 00:05:15 +00:00
end
2017-04-26 06:12:25 +00:00
--- Checks if directory is empty.
---- @string path
---- @treturn bool
-- Returns true if the directory has no entries besides the self/parent ones, false otherwise.
-- Returns nothing (nil) when the directory cannot be opened.
function util . isEmptyDir ( path )
2017-08-12 13:01:59 +00:00
-- lfs.dir will crash rather than return nil if directory doesn't exist O_o
local ok , iter , dir_obj = pcall ( lfs.dir , path )
if not ok then return end
for filename in iter , dir_obj do
2017-04-26 06:12:25 +00:00
-- The first real entry is enough to decide.
if filename ~= ' . ' and filename ~= ' .. ' then
return false
end
end
return true
end
2020-06-19 10:22:38 +00:00
--- check if the given path is a file
---- @string path
---- @treturn bool
-- Probes the path by trying to open it for reading.
-- Returns true on success; falls off the end (returning nil, not false) otherwise.
function util . fileExists ( path )
local file = io.open ( path , " r " )
if file ~= nil then
-- Only the ability to open matters; release the handle right away.
file : close ( )
return true
end
end
2018-01-01 14:40:28 +00:00
--- Checks if the given path exists. Doesn't care if it's a file or directory.
---- @string path
---- @treturn bool
function util . pathExists ( path )
-- Requests a single attribute from lfs; any non-nil answer means the path exists.
return lfs.attributes ( path , " mode " ) ~= nil
end
--- As `mkdir -p`.
2019-08-23 17:53:53 +00:00
-- Unlike [lfs.mkdir](https://keplerproject.github.io/luafilesystem/manual.html#mkdir)(),
-- does not error if the directory already exists, and creates intermediate directories as needed.
-- @string path the directory to create
-- @treturn bool true on success; nil, err_message on error
2018-01-01 14:40:28 +00:00
function util . makePath ( path )
2023-02-12 22:48:33 +00:00
-- Fast path: nothing to do if the full path is already a directory.
if lfs.attributes ( path , " mode " ) == " directory " then
return true
end
-- `components` accumulates the path built so far, one component at a time.
local components
if path : sub ( 1 , 1 ) == " / " then
-- Leading slash, remember that it's an absolute path
components = " / "
else
-- Relative path
components = " "
end
2018-01-01 14:40:28 +00:00
2023-02-12 22:48:33 +00:00
local success , err
-- NOTE: mkdir -p handles umask shenanigans for intermediate components, we don't
for component in path : gmatch ( " ([^/]+) " ) do
-- The trailing slash ensures we properly fail via mkdir if the composite path already exists as a file/link
components = components .. component .. " / "
2023-02-17 22:24:42 +00:00
-- Only components that do not exist yet are created.
if lfs.attributes ( components , " mode " ) == nil then
2023-02-12 22:48:33 +00:00
success , err = lfs.mkdir ( components )
if not success then
return nil , err .. " (creating ` " .. components .. " ` for ` " .. path .. " `) "
end
end
2018-01-01 14:40:28 +00:00
end
2023-02-12 22:48:33 +00:00
-- These are the results of the last mkdir call; if the loop never called mkdir,
-- both are still nil at this point.
return success , err
2018-01-01 14:40:28 +00:00
end
2023-02-17 22:24:42 +00:00
--- Remove as many of the empty directories specified in path, children-first.
-- Does not fail if the directory is already gone.
-- Walks from `path` up through its parents (via `BaseUtil.dirname`) and stops
-- once the parent is "." or "/".
-- @string path the directory tree to prune
-- @treturn bool true on success; nil, err_message on error
function util.removePath(path)
    local component = path
    local parent
    repeat
        local mode = lfs.attributes(component, "mode")
        if mode == "directory" then
            local success, err = lfs.rmdir(component)
            if not success then
                -- Most likely because ENOTEMPTY ;)
                return nil, err .. " (removing `" .. component .. "` for `" .. path .. "`)"
            end
        elseif mode ~= nil then
            -- Exists, but is a file/link/...: refuse to touch it.
            return nil, "Encountered a component that isn't a directory" .. " (removing `" .. component .. "` for `" .. path .. "`)"
        end
        -- A missing component (mode == nil) is skipped silently.

        parent = BaseUtil.dirname(component)
        component = parent
    until parent == "." or parent == "/"
    return true, nil
end
2020-06-19 10:22:38 +00:00
--- As `rm`
-- Only regular files (lfs mode "file") are removed.
-- @string file path of the file to remove
-- @treturn bool true on success; nil, err_message on error
function util.removeFile(file)
    if not file then
        return nil, "file is nil"
    end
    if lfs.attributes(file, "mode") ~= "file" then
        return nil, file .. " is not a file"
    end
    -- os.remove already follows the `true` / `nil, err` convention.
    return os.remove(file)
end
--- Gets total, used and available bytes for the mountpoint that holds a given directory.
-- Shells out to `df -k` and `awk`; the table is empty when `dir` is not a
-- directory or when the command could not be run.
-- @string dir path of the directory
-- @treturn table with `total`, `used` and `available` (bytes); fields are absent on failure
function util.diskUsage(dir)
    -- safe way of testing df & awk
    local function doCommand(d)
        -- Single-quote the path (escaping embedded quotes) so that spaces or
        -- shell metacharacters in it cannot alter the command line.
        local quoted = "'" .. d:gsub("'", "'\\''") .. "'"
        local handle = io.popen("df -k " .. quoted .. " 2>/dev/null | awk '$3 ~ /[0-9]+/ { print $2,$3,$4 }' 2>/dev/null || echo ::ERROR::")
        if not handle then return end
        local output = handle:read("*all")
        handle:close()
        if output and not output:find("::ERROR::", 1, true) then
            return output
        end
    end

    if not dir or lfs.attributes(dir, "mode") ~= "directory" then
        return {}
    end
    local usage = doCommand(dir)
    if not usage then
        return {}
    end

    local stage, result = {}, {}
    for size in usage:gmatch("%w+") do
        stage[#stage + 1] = size
    end
    -- awk printed the columns in this exact order.
    for idx, field in ipairs({ "total", "used", "available" }) do
        local kib = tonumber(stage[idx])
        if kib ~= nil then
            -- sizes are in kb, return bytes here
            result[field] = kib * 1024
        end
    end
    return result
end
2017-04-04 07:57:14 +00:00
--- Replaces characters that are invalid filenames.
--
-- Replaces the characters <code>\/:*?"<>|</code> with an <code>_</code>.
-- These characters are problematic on Windows filesystems. On Linux only
-- <code>/</code> poses a problem.
-- NOTE: the character class has always listed `,` as well, so commas are
-- replaced too; that is kept to avoid changing generated filenames.
-- @string str filename
-- @treturn string sanitized filename (followed by gsub's replacement count); nothing if `str` is nil
local function replaceAllInvalidChars(str)
    if str then
        -- No padding inside the pattern: every byte in the class is significant.
        return str:gsub('[\\/:*?"<>|,]', '_')
    end
end
2017-04-04 07:57:14 +00:00
--- Replaces slash with an underscore.
-- @string str
-- @treturn string `str` with every `/` replaced (followed by gsub's replacement count); nothing if `str` is nil
local function replaceSlashChar(str)
    if str then
        -- Match the bare slash only; a padded pattern would never match it.
        return str:gsub('/', '_')
    end
end
2019-08-23 17:53:53 +00:00
--- Replaces characters that are invalid in filenames.
--
-- Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided.
-- These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem.
--
-- If an optional path is provided, @{util.getFilesystemType}() will be used to determine
-- whether stricter VFAT restrictions should be applied.
-- @string str the proposed file name
-- @string path[opt] directory the file will live in
-- @int limit[opt=240] maximum length (in bytes) of the name part
-- @int limit_ext[opt=10] longest suffix still treated as a file extension
-- @treturn string safe filename
function util.getSafeFilename(str, path, limit, limit_ext)
    local filename, suffix = util.splitFileNameSuffix(str)
    local replaceFunc = replaceAllInvalidChars
    local safe_filename
    -- VFAT supports a maximum of 255 UCS-2 characters, although it's probably treated as UTF-16 by Windows
    -- default to a slightly lower limit just in case
    limit = limit or 240
    limit_ext = limit_ext or 10

    -- Always assume the worst on Android (#7837)
    if path and not BaseUtil.isAndroid() then
        local file_system = util.getFilesystemType(path)
        if file_system ~= "vfat" and file_system ~= "fuse.fsp" then
            replaceFunc = replaceSlashChar
        end
    end

    if suffix:len() > limit_ext then
        -- probably not an actual file extension, or at least not one we'd be
        -- dealing with, so strip the whole string
        filename = str
        suffix = nil
    end

    filename = util.htmlToPlainTextIfHtml(filename)
    filename = filename:sub(1, limit)
    -- the limit might result in broken UTF-8, which we don't want in the result
    filename = util.fixUtf8(filename, "")

    if suffix and suffix ~= "" then
        safe_filename = replaceFunc(filename) .. "." .. replaceFunc(suffix)
    else
        safe_filename = replaceFunc(filename)
    end

    return safe_filename
end
2018-01-01 14:40:28 +00:00
--- Splits a file into its directory path and file name.
-- If the given path has a trailing /, returns the entire path as the directory
-- path and "" as the file name.
-- @string file
-- @treturn string directory (including its trailing slash, "" if there is none)
-- @treturn string filename
function util.splitFilePathName(file)
    if file == nil or file == "" then
        return "", ""
    end
    -- Plain (non-pattern) search for a bare slash.
    if file:find("/", 1, true) == nil then
        return "", file
    end
    -- Greedy first capture: split at the LAST slash.
    return file:match("(.*/)(.*)")
end
2017-04-04 07:57:14 +00:00
--- Splits a file name into its pure file name and suffix
-- @string file
-- @treturn string name without the suffix (the whole input if it has no dot)
-- @treturn string suffix without the dot ("" if there is none)
function util.splitFileNameSuffix(file)
    if file == nil or file == "" then
        return "", ""
    end
    -- Plain (non-pattern) search for a literal dot.
    if file:find(".", 1, true) == nil then
        return file, ""
    end
    -- Greedy first capture: split at the LAST dot.
    return file:match("(.*)%.(.*)")
end
2017-04-04 07:57:14 +00:00
--- Gets file extension
---- @string filename
---- @treturn string extension
2017-02-12 02:55:31 +00:00
function util.getFileNameSuffix(file)
    -- splitFileNameSuffix yields (name, suffix); keep the second value only.
    -- The outer parentheses truncate the result to exactly one value.
    return (select(2, util.splitFileNameSuffix(file)))
end
2020-02-03 19:08:18 +00:00
--- Companion helper function that returns the script's language,
-- based on the file extension.
-- The extension is compared case-insensitively.
-- @string file filename
-- @treturn string "shell" for `.sh`, "python" for `.py`, nil for anything else
function util.getScriptType(file)
    local file_ext = string.lower(util.getFileNameSuffix(file))
    if file_ext == "sh" then
        return "shell"
    elseif file_ext == "py" then
        return "python"
    end
end
2017-10-20 15:48:32 +00:00
--- Gets human friendly size as string
-- Uses decimal (power of 1000) units: B, kB, MB, GB.
-- @int size (bytes); anything `tonumber` cannot convert yields nil
-- @bool right_align (by padding with spaces on the left)
-- @treturn string translated size with its unit, or nil on invalid input
function util.getFriendlySize(size, right_align)
    local frac_format = right_align and "%6.1f" or "%.1f"
    local deci_format = right_align and "%6d" or "%d"
    size = tonumber(size)
    if not size then return end
    if size > 1000*1000*1000 then
        return T(C_("Data storage size", "%1 GB"), string.format(frac_format, size/1000/1000/1000))
    end
    if size > 1000*1000 then
        return T(C_("Data storage size", "%1 MB"), string.format(frac_format, size/1000/1000))
    end
    if size > 1000 then
        return T(C_("Data storage size", "%1 kB"), string.format(frac_format, size/1000))
    end
    return T(C_("Data storage size", "%1 B"), string.format(deci_format, size))
end
2017-10-20 17:29:52 +00:00
--- Gets formatted size as string (1273334 => "1,273,334")
-- Only the integer digits are grouped: a leading minus sign and anything
-- after the integer part (fraction, exponent) are passed through untouched.
-- @int size (bytes)
-- @treturn string
function util.getFormattedSize(size)
    local text = tostring(size)
    local sign, digits, rest = text:match("^(%-?)(%d+)(.*)$")
    if not digits then
        -- Nothing that looks like a number (nil, "inf", ...): return as-is.
        return text
    end
    -- Insert a comma after every 3 digits counting from the right, then drop
    -- the comma that ends up in front when the digit count is a multiple of 3.
    local grouped = digits:reverse():gsub("(%d%d%d)", "%1,"):reverse():gsub("^,", "")
    return sign .. grouped .. rest
end
2019-08-23 17:53:53 +00:00
--- Replaces invalid UTF-8 characters with a replacement string.
--
-- Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>.
-- c.f., FixUTF8 @ <https://github.com/pkulchenko/ZeroBraneStudio/blob/master/src/util.lua>.
--
-- Each byte that does not start a well-formed sequence (overlong forms,
-- surrogates and values above U+10FFFF included) is replaced individually.
-- @string str the string to be checked for invalid characters
-- @string replacement the string to replace invalid characters with
-- @treturn string valid UTF-8
function util.fixUtf8(str, replacement)
    local len = #str
    local pieces, count = {}, 0
    local pos, run_start = 1, 1
    while pos <= len do
        local step
        if str:byte(pos) < 128 then
            -- ASCII, NUL included (byte test avoids the non-portable %z class)
            step = 1
        elseif str:find("^[\194-\223][\128-\191]", pos) then
            step = 2
        elseif str:find("^\224[\160-\191][\128-\191]", pos)
            or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
            or str:find("^\237[\128-\159][\128-\191]", pos)
            or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then
            step = 3
        elseif str:find("^\240[\144-\191][\128-\191][\128-\191]", pos)
            or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
            or str:find("^\244[\128-\143][\128-\191][\128-\191]", pos) then
            step = 4
        end

        if step then
            pos = pos + step
        else
            -- Flush the valid run seen so far, then stand in for the bad byte.
            -- Pieces are joined once at the end instead of rebuilding the
            -- whole string for every invalid byte.
            pieces[count + 1] = str:sub(run_start, pos - 1)
            pieces[count + 2] = replacement
            count = count + 2
            pos = pos + 1
            run_start = pos
        end
    end

    if count == 0 then
        -- Already valid: hand back the very same string.
        return str
    end
    pieces[count + 1] = str:sub(run_start)
    return table.concat(pieces)
end
2017-04-14 19:12:28 +00:00
--- Collects the pieces produced by @{util.gsplit} into an array.
-- The splitter is handed to gsplit with its third argument set to false, and
-- `capture_empty_entity` is forwarded unchanged.
--
-- @string str the string to be split
-- @string splitter
-- @bool capture_empty_entity
-- @treturn table an array-like table holding the pieces, in order
function util.splitToArray(str, splitter, capture_empty_entity)
    local pieces = {}
    local count = 0
    for piece in util.gsplit(str, splitter, false, capture_empty_entity) do
        count = count + 1
        pieces[count] = piece
    end
    return pieces
end
2019-11-22 18:50:58 +00:00
--- Convert a Unicode codepoint (number) to UTF-8 char
-- c.f., <https://stackoverflow.com/a/4609989>
-- & <https://stackoverflow.com/a/38492214>
-- See utf8charcode in ffi/util for a decoder.
--
-- Surrogates (U+D800..U+DFFF) and values of 0x110000 and above are not
-- encodable and yield U+FFFD REPLACEMENT CHARACTER instead.
-- @int c Unicode codepoint
-- @treturn string UTF-8 char
function util.unicodeCodepointToUtf8(c)
    -- UTF-8 encoding of U+FFFD, spelled with decimal escapes so the source
    -- stays pure ASCII.
    local replacement_char = "\239\191\189"
    if c < 0x80 then
        return string.char(c)
    elseif c < 0x800 then
        return string.char(
            bor(0xC0, rshift(c, 6)),
            bor(0x80, band(c, 0x3F))
        )
    elseif c < 0x10000 then
        if c >= 0xD800 and c <= 0xDFFF then
            return replacement_char -- Surrogates -> U+FFFD REPLACEMENT CHARACTER
        end
        return string.char(
            bor(0xE0, rshift(c, 12)),
            bor(0x80, band(rshift(c, 6), 0x3F)),
            bor(0x80, band(c, 0x3F))
        )
    elseif c < 0x110000 then
        return string.char(
            bor(0xF0, rshift(c, 18)),
            bor(0x80, band(rshift(c, 12), 0x3F)),
            bor(0x80, band(rshift(c, 6), 0x3F)),
            bor(0x80, band(c, 0x3F))
        )
    else
        return replacement_char -- Invalid -> U+FFFD REPLACEMENT CHARACTER
    end
end
2018-04-10 16:30:27 +00:00
-- Ordered list of { Lua pattern, replacement } pairs, applied in sequence.
-- We need to use an array of arrays to keep them ordered as written.
-- The left-hand side has to be the *entity* text (e.g. "&lt;"), not the
-- character it stands for.
local HTML_ENTITIES_TO_UTF8 = {
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&quot;", '"'},
    {"&apos;", "'"},
    {"&nbsp;", "\194\160"}, -- U+00A0 NO-BREAK SPACE, as UTF-8 bytes
    {"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
    {"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x, 16)) end},
    {"&amp;", "&"}, -- must be last
}
2019-08-23 17:53:53 +00:00
--- Replace HTML entities with their UTF-8 encoded equivalent in text.
--
-- Supports only basic ones and those with numbers (no support for named entities like `&eacute;`).
-- The substitutions listed in `HTML_ENTITIES_TO_UTF8` are applied one after
-- the other, in list order.
-- @string text text with HTML entities
-- @treturn string UTF-8 text
function util.htmlEntitiesToUtf8(text)
    for _, entity in ipairs(HTML_ENTITIES_TO_UTF8) do
        -- entity[1] is the pattern, entity[2] its replacement (string or function)
        text = text:gsub(entity[1], entity[2])
    end
    return text
end
2019-08-23 17:53:53 +00:00
--- Convert simple HTML to plain text.
--
-- This may fail on complex HTML (with styles, scripts, comments), but should be fine enough
-- with simple HTML as found in EPUB's `<dc:description>`.
-- @string text HTML text
-- @treturn string plain text
function util.htmlToPlainText(text)
    -- Replace <br> with \n
    text = text:gsub("%s*<%s*br%s*/?>%s*", "\n") -- <br> and <br/>
    -- Replace <p> with \n\t (\t, unlike any combination of spaces,
    -- ensures a constant indentation when text is justified.)
    text = text:gsub("%s*</%s*p%s*>%s*", "\n") -- </p>
    text = text:gsub("%s*<%s*p%s*/>%s*", "\n") -- standalone <p/>
    text = text:gsub("%s*<%s*p%s*>%s*", "\n\t") -- <p>
    -- (this one last, so \t is not removed by the others' %s)
    -- Remove all HTML tags
    text = text:gsub("<[^>]*>", "")
    -- Convert HTML entities
    text = util.htmlEntitiesToUtf8(text)
    -- Trim spaces and new lines at start and end, including
    -- the \t we added (this looks fine enough with multiple
    -- paragraphs, but feels nicer with a single paragraph,
    -- whether it contains <br>s or not).
    text = text:gsub("^[\n%s]*", "")
    text = text:gsub("[\n%s]*$", "")
    return text
end
--- Convert HTML to plain text if text seems to be HTML
-- Detection of HTML is simple and may raise false positives
-- or negatives, but seems quite good at guessing content type
-- of text found in EPUB's <dc:description>.
--
-- @string text the string with possibly some HTML
-- @treturn string cleaned text
function util.htmlToPlainTextIfHtml(text)
    local is_html = false
    -- Quick way to check if text is some HTML: look for html tags
    if text:find("<%w+.->") then
        is_html = true
    elseif text:find("&lt;%a+&gt;") then
        -- no <tag> found, but we may meet some text badly/twice encoded
        -- html containing "&lt;br&gt;"
        is_html = true
        -- decode one of the two encodes
        text = util.htmlEntitiesToUtf8(text)
    end

    if is_html then
        return util.htmlToPlainText(text)
    end
    -- if text ends with ]]>, it probably comes from <![CDATA[ .. ]]> that
    -- crengine has extracted correctly, but let the ending tag in, so
    -- let's remove it
    text = text:gsub("]]>%s*$", "")
    return text
end
2018-01-07 19:24:15 +00:00
--- Encode the HTML entities in a string
-- Taken from https://github.com/kernelsauce/turbo/blob/e4a35c2e3fb63f07464f8f8e17252bea3a029685/turbo/escape.lua#L58-L70
-- `{` and `}` are matched by the pattern but have no table entry, so gsub
-- leaves them unchanged.
-- @string text the string to escape
-- @treturn string escaped text (followed by gsub's match count)
function util.htmlEscape(text)
    return text:gsub("[}{\">/<'&]", {
        ["&"] = "&amp;",
        ["<"] = "&lt;",
        [">"] = "&gt;",
        ['"'] = "&quot;",
        ["'"] = "&#39;",
        ["/"] = "&#47;",
    })
end
2020-06-08 18:47:31 +00:00
--- Prettify a CSS stylesheet
-- Not perfect, but enough to make some ugly CSS readable.
-- By default, each selector and each property is put on its own line.
-- With condensed=true, condense each full declaration on a single line.
--
-- The work is done by a fixed sequence of gsub passes over the whole text;
-- the order of these passes matters (later passes rely on marker characters
-- introduced by earlier ones), so do not reorder them.
-- NOTE(review): the string literals in this function look whitespace-padded
-- (and the indentation widths collapsed) by whatever produced this listing;
-- confirm every pattern/replacement against the upstream file before relying
-- on them.
--
--- @string CSS string
--- @boolean condensed[opt=false] true to condense each declaration on a line
--- @treturn string the CSS prettified
function util . prettifyCSS ( css_text , condensed )
if not condensed then
2023-08-01 08:09:27 +00:00
-- Get rid of \t
2020-06-08 18:47:31 +00:00
css_text = css_text : gsub ( " \t " , " " )
2023-08-01 08:09:27 +00:00
css_text = css_text : gsub ( " \r " , " " )
-- Protect ',:;' in comments by replacing them with rare control chars
-- (\v for ',', \f for ':', \b for ';'; they are restored at the end of this branch)
css_text = css_text : gsub ( " /%*.-%*/ " , function ( s )
s = s : gsub ( " , " , " \v " )
s = s : gsub ( " : " , " \f " )
s = s : gsub ( " ; " , " \b " )
2020-06-08 18:47:31 +00:00
return s
end )
2023-08-01 08:09:27 +00:00
-- Cleanup declarations (the most nested ones only, which may be
-- contained in "@supports (...) {...}" or "@media (...) {...}")
-- The callback receives what is between a '{' and its '}' (a span holding no other brace).
css_text = css_text : gsub ( " *{([^{}]*)} * " , function ( s )
-- Comments inside declaration may be mixed with properties, on a same line,
-- before or after them, and we don't know if they apply to what's before or
-- what's after, except when they are standalone and probably apply to the
-- next line. So, when not standalone, double indent them (so it looks like
-- they apply to what's above - but will still look fine if they are about
-- what's after.
s = " \n " .. s -- so the next one match on the first line
s = s : gsub ( " \n */%* " , " \a /* " ) -- '/*' with only blank before: mark them with '\a'
s = s : gsub ( " *([^ \a ])/%* " , " \n \t /* " ) -- unmarked '/*' (content before): marked, more indentation later
s = s : gsub ( " \a " , " " ) -- remove mark
s = s : gsub ( " \t " , " \a " ) -- replace mark by one that is not caught by '%s'
s = s : gsub ( " %*/%s* " , " */ \n " ) -- '*/' end of css comment: newline after
s = s : gsub ( " %s*;%s* " , " ; \n " ) -- newline after ';'
s = s : gsub ( " \n +%s* " , " \n " ) -- remove blank lines, 4 spaces indent on all lines
s = s : gsub ( " \a " , " " ) -- expand our \a marks to have these /* more indented
s = s : gsub ( " %s*:%s* " , " : " ) -- normalize spacing in "keyword: value"
s = s : gsub ( " ^%s*(.-)%s*$ " , " \n %1 " ) -- remove leading and trailing spaces, indent first line
s = s : gsub ( " ^%s*$ " , " " ) -- but have empty declaration really empty
-- less indent for these crengine specific tweaks to the followup properties
s = s : gsub ( " \n %-cr%-hint: late " , " \n -cr-hint: late " )
s = s : gsub ( " \n %-cr%-only%-if " , " \n -cr-only-if " )
-- Protect and normalize ',' in declarations (ie. in font-family list, rgb()...)
s = s : gsub ( " %s*,%s* " , " \v " )
return " { " .. s .. " \n } "
end )
2020-06-08 18:47:31 +00:00
-- Have each selector (separated by ',') on a new line
-- (commas inside comments and declarations were turned into \v above, so only selector commas are left)
css_text = css_text : gsub ( " %s*,%s* " , " , \n " )
2023-08-01 08:09:27 +00:00
css_text = css_text : gsub ( " \n *([^ \n ]+), " , " \n %1, " ) -- remove leading spaces on the first one
css_text = css_text : gsub ( " \n *([^ \n ]+){ " , " \n %1{ " ) -- remove leading spaces on a standalone one
-- Make sure { is on the same line with the selector it follows
css_text = css_text : gsub ( " %s* \n *{ " , " { " )
2023-08-10 14:46:51 +00:00
-- Make sure we have a newline after our }
css_text = css_text : gsub ( " \n } *([^ \n ]+) " , " \n } \n %1 " )
2023-08-01 08:09:27 +00:00
-- Restore all protected chars
css_text = css_text : gsub ( " \v " , " , " )
css_text = css_text : gsub ( " \f " , " : " )
css_text = css_text : gsub ( " \b " , " ; " )
2020-06-08 18:47:31 +00:00
else
-- Go thru previous method to have something standard to work on
-- (the recursive call passes no second argument, so it takes the branch above)
css_text = util.prettifyCSS ( css_text )
-- And condense that
css_text = css_text : gsub ( " { \n " , " { " )
css_text = css_text : gsub ( " ; \n " , " ; " )
css_text = css_text : gsub ( " \n } " , " } " )
css_text = css_text : gsub ( " , \n " , " , " )
end
return css_text
end
2018-01-15 22:51:43 +00:00
--- Escape list for shell usage
-- Each argument is wrapped in single quotes, with embedded single quotes
-- rewritten as `'\''`; the results are joined with a single space.
-- @table args the list of arguments to escape (non-strings go through `tostring`)
-- @treturn string the escaped and concatenated arguments
function util.shell_escape(args)
    local escaped_args = {}
    for idx, arg in ipairs(args) do
        -- tostring() lets numbers and the like through instead of raising
        -- on the method call.
        escaped_args[idx] = "'" .. tostring(arg):gsub("'", "'\\''") .. "'"
    end
    return table.concat(escaped_args, " ")
end
2022-09-27 23:10:50 +00:00
--- Clear all the elements from an array without reassignment.
--- @table t the array to be cleared
2018-01-17 08:17:53 +00:00
function util.clearTable(t)
    -- The length is sampled once, before anything is removed.
    local last = #t
    -- Slot 0 is wiped as well as 1..last; non-integer keys are left alone.
    local idx = 0
    while idx <= last do
        t[idx] = nil
        idx = idx + 1
    end
end
2018-05-04 15:06:58 +00:00
--- Encode URL also known as percent-encoding see https://en.wikipedia.org/wiki/Percent-encoding
-- Taken from https://gist.github.com/liukun/f9ce7d6d14fa45fe9b924a3eed5c3d99
-- Newlines are turned into CR LF first; every byte other than alphanumerics
-- and `-._~!*'()` is then written as `%XX` (uppercase hex).
-- @string url the string to encode
-- @treturn string encoded string (nil if `url` is nil)
function util.urlEncode(url)
    if url == nil then
        return
    end
    local function char_to_hex(c)
        return string.format("%%%02X", string.byte(c))
    end
    url = url:gsub("\n", "\r\n")
    url = url:gsub("([^%w%-%.%_%~%!%*%'%(%)])", char_to_hex)
    return url
end
--- Decode URL (reverse process to util.urlEncode())
-- Taken from https://gist.github.com/liukun/f9ce7d6d14fa45fe9b924a3eed5c3d99
-- Every `%XX` (two hex digits) becomes the byte it encodes; anything else is
-- left as it is.
-- @string url the string to decode
-- @treturn string decoded string (nil if `url` is nil)
function util.urlDecode(url)
    if url == nil then
        return
    end
    local function hex_to_char(x)
        return string.char(tonumber(x, 16))
    end
    url = url:gsub("%%(%x%x)", hex_to_char)
    return url
end
2018-08-06 19:16:30 +00:00
--- Check lua syntax of string
-- The code is only compiled, never run.
-- @string lua_text lua code text
-- @treturn string with parsing error, nil if syntax ok
function util.checkLuaSyntax(lua_text)
    -- loadstring on Lua 5.1/LuaJIT; plain load takes over where it is gone.
    local compile = loadstring or load
    local lua_code_ok, err = compile(lua_text)
    if lua_code_ok then
        return nil
    end
    -- Replace: [string "blah blah..."]:3: '=' expected near '123'
    -- with: Line 3: '=' expected near '123'
    err = err and err:gsub("%[string \".-%\"]:", "Line ")
    return err
end
2021-02-15 08:28:22 +00:00
--- Simple startsWith string helper.
--
-- C.f., <http://lua-users.org/wiki/StringRecipes>.
-- @string str source string
-- @string start string to match
-- @treturn bool true on success
2019-04-18 21:26:53 +00:00
function util.stringStartsWith(str, start)
    -- Take as many leading bytes of `str` as `start` is long, then compare.
    local head = str:sub(1, #start)
    return head == start
end
2021-02-15 08:28:22 +00:00
--- Simple endsWith string helper.
-- @string str source string
-- @string ending string to match
-- @treturn bool true on success
function util.stringEndsWith(str, ending)
    -- The empty ending needs its own test: str:sub(-0) is the whole string,
    -- not the empty one.
    return ending == "" or str:sub(-#ending) == ending
end
2022-09-13 21:09:49 +00:00
--- Search a string in a text.
-- Both sides are compared character by character (as split by
-- @{util.splitToChars}); unless `case_sensitive` is set, both are lowercased
-- through Utf8Proc first.
-- @tparam string|table txt Text (char list) to search in
-- @string str String to search for
-- @boolean case_sensitive
-- @number start_pos[opt=1] Position number in text to start search from
-- @treturn number Position number or 0 if not found
function util.stringSearch(txt, str, case_sensitive, start_pos)
    -- A missing start position means "from the beginning" rather than an
    -- arithmetic-on-nil error.
    start_pos = start_pos or 1
    if not case_sensitive then
        str = Utf8Proc.lowercase(util.fixUtf8(str, "?"))
    end
    local txt_charlist = type(txt) == "table" and txt or util.splitToChars(txt)
    local str_charlist = util.splitToChars(str)
    local str_len = #str_charlist

    -- i is the 0-based offset of the candidate match inside txt_charlist.
    for i = start_pos - 1, #txt_charlist - str_len do
        local matched = true
        for j = 1, str_len do
            local char_txt = txt_charlist[i + j]
            if not case_sensitive then
                char_txt = Utf8Proc.lowercase(util.fixUtf8(char_txt, "?"))
            end
            if char_txt ~= str_charlist[j] then
                matched = false
                break
            end
        end
        if matched then
            return i + 1
        end
    end
    return 0
end
2021-11-06 05:09:12 +00:00
-- Metatable making a wrapper table callable: calling it first runs its
-- `before_callback` (given `target_table` plus the call arguments), then
-- returns whatever its `func` returns for those same arguments.
-- Either field may be absent, in which case that step is skipped.
local WrappedFunction_mt = {}

function WrappedFunction_mt.__call(wrapper, ...)
    local hook = wrapper.before_callback
    if hook then
        hook(wrapper.target_table, ...)
    end
    -- Looked up after the hook has run, so a hook may still swap `func`.
    local impl = wrapper.func
    if impl then
        return impl(...)
    end
end
--- Wrap (or replace) a table method with a custom method, in a revertable way.
-- Lets a module's internal methods be extended or swapped out, and put back
-- to normal later on.
--
-- The most notable use-case for this is VirtualKeyboard's inputbox method
-- wrapping to allow keyboards to add more complicated state-machines to modify
-- how characters are input.
--
-- The returned table is the very table stored in
-- `target_table[target_field_name]`. Besides being callable, it offers:
--
-- * `:revert()` stores the method captured when util.wrapMethod was called
--   back into the target table. Only the first call has any effect. With
--   several layers of wrapping, each wrapper restores what *it* captured.
--
-- * `:raw_call(...)` calls the captured original with exactly the given
--   arguments and returns whatever it returns (nothing if there was no
--   original). Handy when the replacement needs the original behaviour.
--
-- * `:raw_method_call(...)` is short-hand for `:raw_call(target_table, ...)`,
--   `target_table` being the one given to util.wrapMethod.
--
-- This is loosely based on busted/luassert's spies implementation (MIT).
-- <https://github.com/Olivine-Labs/luassert/blob/v1.7.11/src/spy.lua>
--
-- @tparam table target_table The table whose method will be wrapped.
-- @tparam string target_field_name The name of the field to wrap.
-- @tparam nil|func new_func If non-nil, this function will be called instead of the original function after wrapping.
-- @tparam nil|func before_callback If non-nil, this function will be called (with the arguments (target_table, ...)) before the function is called.
-- @treturn table the callable wrapper now installed in the target table
function util.wrapMethod(target_table, target_field_name, new_func, before_callback)
    local original = target_table[target_field_name]

    local wrapper = {
        target_table = target_table,
        target_field_name = target_field_name,
        old_func = original,
        before_callback = before_callback,
        func = new_func or original,
    }

    function wrapper:revert()
        if self.reverted then
            return
        end
        self.target_table[self.target_field_name] = self.old_func
        self.reverted = true
    end

    function wrapper:raw_call(...)
        local fn = self.old_func
        if fn then
            return fn(...)
        end
    end

    function wrapper:raw_method_call(...)
        return self:raw_call(self.target_table, ...)
    end

    setmetatable(wrapper, WrappedFunction_mt)
    target_table[target_field_name] = wrapper
    return wrapper
end
2015-02-01 09:40:34 +00:00
-- Module export: hand the `util` table (and every helper attached to it) to the requirer.
return util