--[[--
This module contains miscellaneous helper functions for the KOReader frontend.
]]
-- Module dependencies. `_` is the gettext translation function and `T` the
-- template helper used to fill "%1", "%2", ... placeholders in translated strings.
local BaseUtil = require("ffi/util")
local dbg = require("dbg")
local _ = require("gettext")
local T = BaseUtil.template

-- `bit` is used as a pre-existing global (presumably LuaJIT's built-in
-- bit-operations library -- TODO confirm); cache the functions we use as locals.
local lshift = bit.lshift
local rshift = bit.rshift
local band = bit.band
local bor = bit.bor

-- The module table: every public helper below is attached to it.
local util = {}
--- Strips punctuation marks from a string.
-- Removes every 3-byte UTF-8 sequence in the range U+2000..U+20FF (which
-- contains the General Punctuation block) anywhere in the text, then any run
-- of `%p` punctuation at the very start and at the very end of the text.
-- Punctuation in the middle of the text and spaces are left untouched.
---- @string text the string to be stripped
---- @treturn string stripped text (nothing is returned when text is nil/false)
function util.stripPunctuation(text)
    if not text then return end
    -- \226[\128-\131][\128-\191] matches the UTF-8 encoding of U+2000..U+20FF
    local stripped = text:gsub("\226[\128-\131][\128-\191]", "")
    stripped = stripped:gsub("^%p+", "")
    -- Parenthesized so that gsub's substitution count is not returned as a
    -- stray second value (which would break e.g. table.insert(t, f(x))).
    return (stripped:gsub("%p+$", ""))
end
--[[--
Splits a string by a pattern.

Lua doesn't have a string.split() function and most of the time
you don't really need it because string.gmatch() is enough.
However string.gmatch() has one significant disadvantage:
you can't split a string while matching both the delimited
strings and the delimiters themselves without tracking positions
and substrings. The gsplit function below takes care of
this problem.

Author: Peter Odding

License: MIT/X11

Source: <a href="http://snippets.luacode.org/snippets/String_splitting_130">http://snippets.luacode.org/snippets/String_splitting_130</a>
]]
----@string str string to split
----@param pattern the Lua pattern to split against (defaults to "%s+"); must not match the empty string
----@bool capture if true, the delimiters themselves are yielded too
----@bool capture_empty_entity if true, empty pieces between adjacent delimiters are yielded too
----@treturn function iterator yielding the pieces one by one
function util.gsplit(str, pattern, capture, capture_empty_entity)
    pattern = pattern and tostring(pattern) or '%s+'
    -- A pattern matching "" would never advance through the string.
    if (''):find(pattern) then
        error('pattern matches empty string!', 2)
    end
    return coroutine.wrap(function()
        local index = 1
        repeat
            local first, last = str:find(pattern, index)
            if first and last then
                -- Text between the previous delimiter and this one; an empty
                -- piece (index == first) is only reported on request.
                if index < first or (index == first and capture_empty_entity) then
                    coroutine.yield(str:sub(index, first - 1))
                end
                if capture then
                    coroutine.yield(str:sub(first, last))
                end
                index = last + 1
            else
                -- No more delimiters: yield the remainder, if any.
                if index <= #str then
                    coroutine.yield(str:sub(index))
                end
                break
            end
        until index > #str
    end)
end
--[[--
Converts seconds to a clock string.

Source: <a href="https://gist.github.com/jesseadams/791673">https://gist.github.com/jesseadams/791673</a>
]]
---- @int seconds number of seconds (a numeric string is accepted too)
---- @bool withoutSeconds if true 00:00, if false 00:00:00
---- @treturn string clock string in the form of 00:00 or 00:00:00
function util.secondsToClock(seconds, withoutSeconds)
    seconds = tonumber(seconds)
    -- Unparsable input (nil), zero and NaN (the only value that differs from
    -- itself) are all rendered as a zero clock instead of raising an error.
    if not seconds or seconds == 0 or seconds ~= seconds then
        return withoutSeconds and "00:00" or "00:00:00"
    end
    -- Without a seconds field the minutes are rounded to the nearest one;
    -- otherwise they are truncated and the remainder goes to the seconds.
    local round = withoutSeconds and require("optmath").round or math.floor
    local hours = math.floor(seconds / 3600)
    local mins = round(seconds / 60 - hours * 60)
    if mins == 60 then
        -- Rounding pushed us onto the next hour.
        mins = 0
        hours = hours + 1
    end
    if withoutSeconds then
        return string.format("%02.f:%02.f", hours, mins)
    end
    local secs = math.floor(seconds - hours * 3600 - mins * 60)
    return string.format("%02.f:%02.f:%02.f", hours, mins, secs)
end
--- Converts seconds to a period of time string.
---- @int seconds number of seconds
---- @bool withoutSeconds if true 1h30', if false 1h30'10''
---- @bool hmsFormat if true, format as 1h30m10s
---- @treturn string clock string in the form of 1h30' or 1h30'10''
function util.secondsToHClock(seconds, withoutSeconds, hmsFormat)
    seconds = tonumber(seconds)
    if seconds == 0 then
        if withoutSeconds then
            if hmsFormat then
                return T(_("%1m"), "0")
            else
                return "0'"
            end
        else
            if hmsFormat then
                return T(_("%1s"), "0")
            else
                return "0''"
            end
        end
    elseif seconds < 60 then
        -- Less than a minute: without a seconds field, round to 0 or 1 minute.
        if withoutSeconds and seconds < 30 then
            if hmsFormat then
                return T(_("%1m"), "0")
            else
                return "0'"
            end
        elseif withoutSeconds and seconds >= 30 then
            if hmsFormat then
                return T(_("%1m"), "1")
            else
                return "1'"
            end
        else
            if hmsFormat then
                return T(_("%1m%2s"), "0", string.format("%02.f", seconds))
            else
                return "0'" .. string.format("%02.f", seconds) .. "''"
            end
        end
    else
        -- hours/mins/secs are kept as formatted strings; arithmetic on them
        -- below relies on Lua's string-to-number coercion.
        local round = withoutSeconds and require("optmath").round or math.floor
        local hours = string.format("%.f", math.floor(seconds / 3600))
        local mins = string.format("%02.f", round(seconds / 60 - (hours * 60)))
        if mins == "60" then
            -- Rounding pushed us onto the next hour.
            mins = string.format("%02.f", 0)
            hours = string.format("%.f", hours + 1)
        end
        if withoutSeconds then
            if hours == "0" then
                -- No hour field: minutes are shown without zero padding.
                mins = string.format("%.f", round(seconds / 60))
                if hmsFormat then
                    return T(_("%1m"), mins)
                else
                    return mins .. "'"
                end
            end
            -- @translators This is the 'h' for hour, like in 1h30. This is a duration.
            return T(_("%1h%2"), hours, mins)
        end
        local secs = string.format("%02.f", math.floor(seconds - hours * 3600 - mins * 60))
        if hours == "0" then
            mins = string.format("%.f", round(seconds / 60))
            if hmsFormat then
                -- @translators This is the 'm' for minute and the 's' for second, like in 1m30s. This is a duration.
                return T(_("%1m%2s"), mins, secs)
            else
                return mins .. "'" .. secs .. "''"
            end
        end
        if hmsFormat then
            if secs == "00" then
                -- @translators This is the 'h' for hour and the 'm' for minute, like in 1h30m. This is a duration.
                return T(_("%1h%2m"), hours, mins)
            else
                -- @translators This is the 'h' for hour, the 'm' for minute and the 's' for second, like in 1h30m30s. This is a duration.
                return T(_("%1h%2m%3s"), hours, mins, secs)
            end
        else
            if secs == "00" then
                return T(_("%1h%2'"), hours, mins)
            else
                return T(_("%1h%2'%3''"), hours, mins, secs)
            end
        end
    end
end
--[[--
Compares values in two different tables.

Source: <https://stackoverflow.com/a/32660766/2470572>
]]
---- @param o1 Lua table
---- @param o2 Lua table
---- @bool ignore_mt if true, an `__eq` metamethod on o1 is not used for the comparison
---- @treturn boolean true if both values are equal, or are tables with equal keys and (recursively) equal values
function util.tableEquals(o1, o2, ignore_mt)
    if o1 == o2 then return true end
    local o1Type = type(o1)
    local o2Type = type(o2)
    if o1Type ~= o2Type then return false end
    -- Non-table values that failed the == test above are simply different.
    if o1Type ~= 'table' then return false end

    if not ignore_mt then
        local mt1 = getmetatable(o1)
        if mt1 and mt1.__eq then
            --compare using built in method
            return o1 == o2
        end
    end

    -- Every key of o1 must be present in o2 with an equal value...
    local keySet = {}
    for key1, value1 in pairs(o1) do
        local value2 = o2[key1]
        if value2 == nil or util.tableEquals(value1, value2, ignore_mt) == false then
            return false
        end
        keySet[key1] = true
    end

    -- ...and o2 must not have any key that o1 lacks.
    for key2 in pairs(o2) do
        if not keySet[key2] then return false end
    end
    return true
end
--[[--
Makes a deep copy of a table.

Keys, values and the metatable are all copied recursively.

Source: <https://stackoverflow.com/a/16077650/2470572>
]]
---- @param o Lua table (any other value is returned as is)
---- @param seen (internal) map from already-copied tables to their copies; callers normally omit it
---- @treturn Lua table
function util.tableDeepCopy(o, seen)
    seen = seen or {}
    if o == nil then return nil end
    -- Reuse the copy of a table we have already visited, so that shared
    -- and self-referencing tables are copied once and recursion terminates.
    if seen[o] then return seen[o] end

    local no
    if type(o) == "table" then
        no = {}
        -- Register the copy before descending, for the cycle check above.
        seen[o] = no

        for k, v in next, o, nil do
            no[util.tableDeepCopy(k, seen)] = util.tableDeepCopy(v, seen)
        end
        setmetatable(no, util.tableDeepCopy(getmetatable(o), seen))
    else -- number, string, boolean, etc
        no = o
    end
    return no
end
--- Returns number of keys in a table.
-- Counts every key visited by pairs(), not just the array part that `#` measures.
---- @param t Lua table
---- @treturn int number of keys in table t
function util.tableSize(t)
    local count = 0
    for _ in pairs(t) do count = count + 1 end
    return count
end
--- Append all elements from t2 into t1.
-- Only the sequence part of t2 (as traversed by ipairs) is appended; t1 is
-- modified in place and nothing is returned.
---- @param t1 Lua table
---- @param t2 Lua table
function util.arrayAppend(t1, t2)
    for _, v in ipairs(t2) do
        table.insert(t1, v)
    end
end
-- Reverses, in place, the order of the array elements of table t.
---- @param t Lua table
function util.arrayReverse(t)
    local n = #t
    -- Swap mirrored pairs, walking in from both ends; with an odd length the
    -- middle element stays where it is.
    for left = 1, math.floor(n / 2) do
        local right = n - left + 1
        t[left], t[right] = t[right], t[left]
    end
end
-- Copies every key/value pair of t2 into t1; a key already present in t1 gets
-- t2's value. The copy is shallow: nested tables end up shared, not merged
-- (c.f., https://stackoverflow.com/q/1283388).
---- @param t1 Lua table (modified in place)
---- @param t2 Lua table
function util.tableMerge(t1, t2)
    for key, value in pairs(t2) do
        t1[key] = value
    end
end
--[[--
Gets last index of character in string (i.e., strrchr).

Returns the index within this string of the last occurrence of the specified character
or -1 if the character does not occur.

`ch` is inserted into a Lua pattern as is, so magic characters must be escaped
by the caller: to find `.` pass `"%."`.
]]
---- @string string
---- @string ch
---- @treturn int last occurrence or -1 if not found
function util.lastIndexOf(string, ch)
    -- The greedy ".*" pushes the match of ch as far right as possible; the
    -- empty capture "()" then yields the position just after that match.
    local i = string:match(".*" .. ch .. "()")
    if i == nil then return -1 else return i - 1 end
end
--- Reverses a UTF-8 string character by character (not byte by byte).
-- Taken from <https://github.com/blitmap/lua-utf8-simple#utf8reverses>
---- @string text the string to reverse
---- @treturn string reversed text
function util.utf8Reverse(text)
    -- First reverse the bytes of each multi-byte character in place (returning
    -- false from the callback keeps single-byte characters as they are)...
    text = text:gsub('[%z\1-\127\194-\244][\128-\191]*', function(c) return #c > 1 and c:reverse() end)
    -- ...so that reversing the whole byte string puts them back in order.
    return text:reverse()
end
--- Splits string into a list of UTF-8 characters.
-- A high surrogate immediately followed by a low surrogate (WTF-8 input) is
-- merged into the single character the pair stands for.
---- @string text the string to be split.
---- @treturn table list of UTF-8 chars (empty when text is nil)
function util.splitToChars(text)
    local tab = {}
    if text ~= nil then
        local prevcharcode, charcode = 0
        -- Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8
        -- a superset of UTF-8, that includes UTF-16 surrogates
        -- in UTF-8 bytes (forbidden in well-formed UTF-8).
        -- We may get that from bad producers or converters.
        -- (luajson, used to decode Wikipedia API json, will not correctly decode
        -- this sample: <span lang=\"got\">\ud800\udf45</span> : single Unicode
        -- char https://www.compart.com/en/unicode/U+10345 and will give us
        -- "\xed\xa0\x80\xed\xbd\x85" as UTF8, instead of the correct "\xf0\x90\x8d\x85")
        -- From http://www.unicode.org/faq/utf_bom.html#utf16-1
        --   Surrogates are code points from two special ranges of
        --   Unicode values, reserved for use as the leading, and
        --   trailing values of paired code units in UTF-16. Leading,
        --   also called high, surrogates are from D800 to DBFF, and
        --   trailing, or low, surrogates are from DC00 to DFFF. They
        --   are called surrogates, since they do not represent
        --   characters directly, but only as a pair.
        local hi_surrogate
        local hi_surrogate_uchar
        for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do
            charcode = BaseUtil.utf8charcode(uchar)
            -- (not sure why we need this prevcharcode check; we could get
            -- charcode=nil with invalid UTF-8, but should we then really
            -- ignore the following charcode ?)
            if prevcharcode then -- utf8
                if charcode and charcode >= 0xD800 and charcode <= 0xDBFF then
                    if hi_surrogate then -- previous unconsumed one, add it even if invalid
                        table.insert(tab, hi_surrogate_uchar)
                    end
                    hi_surrogate = charcode
                    hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
                elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
                    -- low surrogate following a high surrogate, good, let's make them a single char
                    charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000
                    table.insert(tab, util.unicodeCodepointToUtf8(charcode))
                    hi_surrogate = nil
                else
                    if hi_surrogate then -- previous unconsumed one, add it even if invalid
                        table.insert(tab, hi_surrogate_uchar)
                    end
                    hi_surrogate = nil
                    table.insert(tab, uchar)
                end
            end
            prevcharcode = charcode
        end
        -- A high surrogate that ends the text has no follower to trigger the
        -- flushes above: add it now so that it is not silently dropped.
        if hi_surrogate then
            table.insert(tab, hi_surrogate_uchar)
        end
    end
    return tab
end
--- Tests whether c is a CJK character.
-- Heuristic on the UTF-8 bytes: c must be exactly one 3-byte sequence whose
-- lead byte is 0xE4..0xEA (i.e. code points U+4000..U+AFFF).
---- @string c
---- @treturn boolean true if CJK
function util.isCJKChar(c)
    return string.match(c, "[\228-\234][\128-\191].") == c
end
--- Tests whether str contains CJK characters.
-- Heuristic on the UTF-8 bytes: looks anywhere in str for a 3-byte sequence
-- whose lead byte is 0xE4..0xEA (i.e. code points U+4000..U+AFFF).
---- @string str
---- @treturn boolean true if CJK
function util.hasCJKChar(str)
    return string.match(str, "[\228-\234][\128-\191].") ~= nil
end
--- Split texts into a list of words, spaces and punctuation marks.
-- Runs of spaces/punctuation are kept as list items of their own, and words
-- containing CJK characters are split further.
---- @string text text to split
---- @treturn table list of words, spaces and punctuation marks
function util.splitToWords(text)
    local wlist = {}
    -- capture=true: the separators (spaces, punctuation) are returned as well
    for word in util.gsplit(text, "[%s%p]+", true) do
        -- if space split word contains CJK characters
        if util.hasCJKChar(word) then
            -- split with CJK characters
            for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
                table.insert(wlist, char)
            end
        else
            table.insert(wlist, word)
        end
    end
    return wlist
end
-- We don't want to split on a space if it is followed by some
-- specific punctuation marks: e.g. "word :" or "word )"
-- (In French, there is a non-breaking space before a colon, and it had
-- better not be wrapped there.)
-- Characters that, when they follow a space, make that space non-splittable.
-- NOTE(review): the string literals in this span look altered (padding spaces
-- just inside the quotes, and possibly full-width CJK punctuation folded to
-- ASCII) -- TODO confirm them against the upstream file before relying on them.
local non_splittable_space_tailers = " :;,.!?)]}$%=-+*/|<>»” "
2019-11-23 23:27:27 +00:00
-- Same if a space has some specific other punctuation mark before it
2017-04-04 07:57:14 +00:00
local non_splittable_space_leaders = " ([{$=-+*/|<>«“ "
2016-12-06 21:10:25 +00:00
2016-12-15 07:58:58 +00:00
-- Similar rules exist for CJK text. Taken from:
-- https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages
2017-04-04 07:57:14 +00:00
-- Characters not permitted at the start of a line (no break before them).
local cjk_non_splittable_tailers = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Simplified Chinese
" !%),.:;?]}¢°·’ \" †‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?!]}~ " ,
-- Traditional Chinese
" !),.:;?]}¢·–—’ \" •、。〆〞〕〉》」︰︱︲︳﹐﹑﹒﹔﹕﹖﹘﹚﹜!),.:;?︶︸︺︼︾﹀﹂﹗]|}、 " ,
-- Japanese
" )]}〕〉》」』】〙〗〟’ \" ⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠–〜?!‼⁇⁈⁉・、:;,。. " ,
-- Korean
" !%),.:;?]}¢°’ \" †‡℃〆〈《「『〕!%),.:;?]} " ,
} )
2017-04-04 07:57:14 +00:00
-- Characters not permitted at the end of a line (no break after them).
local cjk_non_splittable_leaders = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Simplified Chinese
" $(£¥·‘ \" 〈《「『【〔〖〝﹙﹛$(.[{£¥ " ,
-- Traditional Chinese
" ([{£¥‘ \" ‵〈《「『〔〝︴﹙﹛({︵︷︹︻︽︿﹁﹃﹏ " ,
-- Japanese
" ([{〔〈《「『【〘〖〝‘ \" ⦅« " ,
-- Korean
" $([{£¥‘ \" 々〇〉》」〔$([{⦆¥₩# " ,
} )
2017-04-04 07:57:14 +00:00
-- Characters on which we never split at all.
local cjk_non_splittable = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Japanese
" —…‥〳〴〵 " ,
} )
--- Test whether a string can be separated by this char for multi-line rendering.
-- Optional next or prev chars may be provided to help make the decision
---- @string c
---- @string next_c
---- @string prev_c
---- @treturn boolean true if splittable, false if not
function util.isSplittable(c, next_c, prev_c)
    if util.isCJKChar(c) then
        -- a CJKChar is a word in itself, and so is splittable
        -- (the searches below are plain-text ones: no pattern matching)
        if cjk_non_splittable:find(c, 1, true) then
            -- except a few of them
            return false
        elseif next_c and cjk_non_splittable_tailers:find(next_c, 1, true) then
            -- but followed by a char that is not permitted at start of line
            return false
        elseif prev_c and cjk_non_splittable_leaders:find(prev_c, 1, true) then
            -- but preceded by a char that is not permitted at end of line
            return false
        else
            -- we can split on this CJKchar
            return true
        end
    elseif c == " " then
        -- we only split on a space (so a punctuation mark sticks to prev word)
        -- if next_c or prev_c is provided, we can make a better decision
        if next_c and non_splittable_space_tailers:find(next_c, 1, true) then
            -- this space is followed by some punctuation mark that is better kept with us
            return false
        elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then
            -- this space is preceded by some punctuation mark that is better kept with us
            return false
        else
            -- we can split on this space
            return true
        end
    end
    -- otherwise, not splittable
    return false
end
--- Gets filesystem type of a path.
--
-- Looks the path up in <code>/proc/mounts</code> and returns the filesystem
-- type of the deepest mount point that contains it. The root mount is used
-- as a fallback when nothing more specific matches.
---- @string path an absolute path
---- @treturn string filesystem type (nil if /proc/mounts is unreadable or nothing matches)
function util.getFilesystemType(path)
    local mounts = io.open("/proc/mounts", "r")
    if not mounts then return nil end
    local fs_type
    local best_len = 0
    for line in mounts:lines() do
        -- Each line reads: device, mount point, filesystem type, options, ...
        local mount_point, mount_fs = line:match("^%S+%s+(%S+)%s+(%S+)")
        if mount_point then
            -- Whitespace and backslashes in mount points are written as
            -- 3-digit octal escapes (e.g. "\040" for a space): decode them.
            mount_point = mount_point:gsub("\\(%d%d%d)", function(oct)
                local byte = tonumber(oct, 8)
                if byte and byte < 256 then return string.char(byte) end
            end)
            -- Compare as plain text (a mount point is not a Lua pattern) and
            -- only on a path-component boundary, so that "/mnt/us" does not
            -- claim "/mnt/usb/file".
            local covers
            if mount_point == "/" then
                covers = path:find("/", 1, true) ~= nil
            else
                covers = path == mount_point
                    or path:sub(1, #mount_point + 1) == mount_point .. "/"
            end
            -- ">=" lets a later entry for the same mount point win, as a later
            -- mount shadows an earlier one.
            if covers and #mount_point >= best_len then
                fs_type = mount_fs
                best_len = #mount_point
            end
        end
    end
    mounts:close()
    return fs_type
end
--- Checks if directory is empty.
---- @string path
---- @treturn bool true if the directory only holds "." and "..", false if it
---- holds anything else; nothing (nil) if the directory cannot be listed
function util.isEmptyDir(path)
    local lfs = require("libs/libkoreader-lfs")
    -- lfs.dir will crash rather than return nil if directory doesn't exist O_o
    local ok, iter, dir_obj = pcall(lfs.dir, path)
    if not ok then return end
    for filename in iter, dir_obj do
        if filename ~= '.' and filename ~= '..' then
            return false
        end
    end
    return true
end
--- Checks if the given path exists. Doesn't care if it's a file or directory.
---- @string path
---- @treturn bool
function util.pathExists(path)
    local lfs = require("libs/libkoreader-lfs")
    -- Any mode at all ("file", "directory", ...) means that the path exists.
    return lfs.attributes(path, "mode") ~= nil
end
--- As `mkdir -p`.
-- Unlike [lfs.mkdir](https://keplerproject.github.io/luafilesystem/manual.html#mkdir)(),
-- does not error if the directory already exists, and creates intermediate directories as needed.
-- @string path the directory to create
-- @treturn bool true on success; nil, err_message on error
function util.makePath(path)
    path = path:gsub("/+$", "")
    -- An empty path is what remains of "/" once trailing slashes are stripped,
    -- and it is also the parent of a bare relative name: there is nothing to
    -- create in either case. Without this check the recursion below would
    -- never terminate, as the parent of "" is "" again.
    if path == "" then return true end
    if util.pathExists(path) then return true end

    -- Make sure the parent directory exists first.
    local success, err = util.makePath((util.splitFilePathName(path)))
    if not success then
        return nil, err .. " (creating " .. path .. ")"
    end

    local lfs = require("libs/libkoreader-lfs")
    return lfs.mkdir(path)
end
--- Replaces characters that are invalid filenames.
--
-- Replaces the characters <code>\/:*?"<>|</code> with an <code>_</code>.
-- These characters are problematic on Windows filesystems. On Linux only
-- <code>/</code> poses a problem.
-- NOTE(review): the commas used as separators inside the character class are
-- part of the class, so <code>,</code> is replaced as well -- confirm whether
-- that is intended before changing it, as existing file names depend on it.
---- @string str filename
---- @treturn string sanitized filename (nothing is returned when str is nil/false)
local function replaceAllInvalidChars(str)
    if str then
        -- Parenthesized to return the string only, without gsub's count.
        return (str:gsub('[\\,%/,:,%*,%?,%",%<,%>,%|]', '_'))
    end
end
--- Replaces slash with an underscore.
---- @string str
---- @treturn string str with every "/" turned into "_" (nothing is returned when str is nil/false)
local function replaceSlashChar(str)
    if str then
        -- Parenthesized to return the string only, without gsub's count.
        return (str:gsub('%/', '_'))
    end
end
--[[--
Replaces characters that are invalid in filenames.

Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem.

If an optional path is provided, @{util.getFilesystemType}() will be used to determine whether stricter VFAT restrictions should be applied.
]]
---- @string str the file name to sanitize
---- @string path optional path of the destination, used to detect its filesystem
---- @int limit maximum length in bytes of the name without its suffix (default 240)
---- @int limit_ext maximum length of a suffix for it to be considered a file extension (default 10)
---- @treturn string safe filename
function util.getSafeFilename(str, path, limit, limit_ext)
    local filename, suffix = util.splitFileNameSuffix(str)
    local replaceFunc = replaceAllInvalidChars
    local safe_filename
    -- VFAT supports a maximum of 255 UCS-2 characters, although it's probably treated as UTF-16 by Windows
    -- default to a slightly lower limit just in case
    limit = limit or 240
    limit_ext = limit_ext or 10

    if path then
        local file_system = util.getFilesystemType(path)
        -- Anything but these two filesystems only needs "/" replaced.
        if file_system ~= "vfat" and file_system ~= "fuse.fsp" then
            replaceFunc = replaceSlashChar
        end
    end

    if suffix:len() > limit_ext then
        -- probably not an actual file extension, or at least not one we'd be
        -- dealing with, so strip the whole string
        filename = str
        suffix = nil
    end

    filename = util.htmlToPlainTextIfHtml(filename)
    filename = filename:sub(1, limit)
    -- the limit might result in broken UTF-8, which we don't want in the result
    filename = util.fixUtf8(filename, "")

    if suffix and suffix ~= "" then
        safe_filename = replaceFunc(filename) .. "." .. replaceFunc(suffix)
    else
        safe_filename = replaceFunc(filename)
    end

    return safe_filename
end
--- Splits a file into its directory path and file name.
--- If the given path has a trailing /, returns the entire path as the directory
--- path and "" as the file name.
---- @string file
---- @treturn string directory path, up to and including the last "/" ("" if there is none)
---- @treturn string file name
function util.splitFilePathName(file)
    if file == nil or file == "" then return "", "" end
    -- The greedy first capture extends up to the last "/". Using captures,
    -- rather than returning the results of gsub, gives exactly two return
    -- values (gsub would leak its substitution count as a third one).
    local dir, name = string.match(file, "^(.*/)(.*)$")
    if dir == nil then return "", file end
    return dir, name
end
--- Splits a file name into its pure file name and suffix.
-- The split happens on the last "." found anywhere in the given string.
---- @string file
---- @treturn string file name without its suffix
---- @treturn string suffix, without the dot ("" if there is none)
function util.splitFileNameSuffix(file)
    if file == nil or file == "" then return "", "" end
    -- The greedy first capture extends up to the last ".". Using captures,
    -- rather than returning the results of gsub, gives exactly two return
    -- values (gsub would leak its substitution count as a third one).
    local name, suffix = string.match(file, "^(.*)%.(.*)$")
    if name == nil then return file, "" end
    return name, suffix
end
--- Gets file extension
---- @string file
---- @treturn string extension, without the dot ("" if there is none)
function util.getFileNameSuffix(file)
    -- select(2, ...) skips the base name; the parentheses keep the suffix only.
    -- (This also avoids a throwaway local named `_`, which would shadow the
    -- file-level gettext function.)
    return (select(2, util.splitFileNameSuffix(file)))
end
--- Returns true if the file is a script we allow running
--- Basically a helper method to check a specific list of file extensions.
-- The check is case-insensitive: "sh" and "py" are the allowed extensions.
---- @string file
---- @treturn boolean
function util.isAllowedScript(file)
    local file_ext = string.lower(util.getFileNameSuffix(file))
    return file_ext == "sh" or file_ext == "py"
end
--- Companion helper function that returns the script's language,
--- based on the file extension.
---- @string file
---- @treturn string (lowercase) (or nil if !isAllowedScript)
function util.getScriptType(file)
    local file_ext = string.lower(util.getFileNameSuffix(file))
    if file_ext == "sh" then
        return "shell"
    elseif file_ext == "py" then
        return "python"
    end
end
--- Gets human friendly size as string
-- Uses binary multiples (1 KB = 1024 B). A unit is only used once the size
-- exceeds it, so exactly 1024 bytes is still displayed in bytes.
---- @int size (bytes)
---- @bool right_align (by padding with spaces on the left)
---- @treturn string (nothing is returned when size is not a number)
function util.getFriendlySize(size, right_align)
    local frac_format = right_align and "%6.1f" or "%.1f"
    local deci_format = right_align and "%6d" or "%d"
    size = tonumber(size)
    -- tonumber() yields either a number or nil: nothing else to check.
    if not size then return end
    if size > 1024*1024*1024 then
        -- @translators This is an abbreviation for the gigabyte, a unit of computer memory or data storage capacity.
        return T(_("%1 GB"), string.format(frac_format, size/1024/1024/1024))
    end
    if size > 1024*1024 then
        -- @translators This is an abbreviation for the megabyte, a unit of computer memory or data storage capacity.
        return T(_("%1 MB"), string.format(frac_format, size/1024/1024))
    end
    if size > 1024 then
        -- @translators This is an abbreviation for the kilobyte, a unit of computer memory or data storage capacity.
        return T(_("%1 KB"), string.format(frac_format, size/1024))
    else
        -- @translators This is an abbreviation for the byte, a unit of computer memory or data storage capacity.
        return T(_("%1 B"), string.format(deci_format, size))
    end
end
--- Gets formatted size as string (1273334 => "1,273,334")
---- @int size (bytes)
---- @treturn string
function util.getFormattedSize(size)
    local s = tostring(size)
    -- Group digits by three starting from the right: reverse the string, put
    -- a comma after each full group of three digits, then reverse it back...
    s = s:reverse():gsub("(%d%d%d)", "%1,")
    -- ...and drop the leading comma left when the digit count is a multiple of 3.
    s = s:reverse():gsub("^,", "")
    return s
end
--[[--
Replaces invalid UTF-8 characters with a replacement string.

Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>.
c.f., FixUTF8 @ <https://github.com/pkulchenko/ZeroBraneStudio/blob/master/src/util.lua>.

@string str the string to be checked for invalid characters
@string replacement the string to replace invalid characters with
@treturn string valid UTF-8
]]
function util.fixUtf8(str, replacement)
    local pos = 1
    local len = #str
    while pos <= len do
        -- Skip over one well-formed sequence of 1, 2, 3 or 4 bytes. The
        -- second-byte ranges exclude overlong encodings, surrogates
        -- (\237 followed by \160-\191) and code points above U+10FFFF.
        if str:find("^[%z\1-\127]", pos) then pos = pos + 1
        elseif str:find("^[\194-\223][\128-\191]", pos) then pos = pos + 2
        elseif str:find("^\224[\160-\191][\128-\191]", pos)
            or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
            or str:find("^\237[\128-\159][\128-\191]", pos)
            or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
        elseif str:find("^\240[\144-\191][\128-\191][\128-\191]", pos)
            or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
            or str:find("^\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4
        else
            -- Invalid byte: substitute the replacement for that single byte,
            -- then resume right after it with the length adjusted to match.
            str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1)
            pos = pos + #replacement
            len = len + #replacement - 1
        end
    end
    return str
end
--- Splits a string on a splitter pattern and collects the pieces into an array.
-- The splitters themselves are not part of the result.
--
--- @string str the string to be split
--- @string splitter
--- @bool capture_empty_entity passed through to util.gsplit
--- @treturn an array-like table
function util.splitToArray(str, splitter, capture_empty_entity)
    local pieces = {}
    for piece in util.gsplit(str, splitter, false, capture_empty_entity) do
        pieces[#pieces + 1] = piece
    end
    return pieces
end
--- Convert a Unicode codepoint (number) to UTF-8 char
--- c.f., <https://stackoverflow.com/a/4609989>
---     & <https://stackoverflow.com/a/38492214>
--- See utf8charcode in ffi/util for a decoder.
-- Surrogates (U+D800..U+DFFF) and values of 0x110000 and above are encoded
-- as U+FFFD REPLACEMENT CHARACTER.
--
--- @int c Unicode codepoint
--- @treturn string UTF-8 char
function util.unicodeCodepointToUtf8(c)
    if c < 0x80 then
        -- 1 byte: 0xxxxxxx
        return string.char(c)
    elseif c < 0x800 then
        -- 2 bytes: 110xxxxx 10xxxxxx
        return string.char(
            bor(0xC0, rshift(c, 6)),
            bor(0x80, band(c, 0x3F))
        )
    elseif c < 0x10000 then
        if c >= 0xD800 and c <= 0xDFFF then
            return "\239\191\189" -- Surrogates -> U+FFFD REPLACEMENT CHARACTER
        end
        -- 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return string.char(
            bor(0xE0, rshift(c, 12)),
            bor(0x80, band(rshift(c, 6), 0x3F)),
            bor(0x80, band(c, 0x3F))
        )
    elseif c < 0x110000 then
        -- 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return string.char(
            bor(0xF0, rshift(c, 18)),
            bor(0x80, band(rshift(c, 12), 0x3F)),
            bor(0x80, band(rshift(c, 6), 0x3F)),
            bor(0x80, band(c, 0x3F))
        )
    else
        return "\239\191\189" -- Invalid -> U+FFFD REPLACEMENT CHARACTER
    end
end
-- Replacement rules used by util.htmlEntitiesToUtf8: each entry is
-- { Lua pattern, replacement string or function }.
-- we need to use an array of arrays to keep them ordered as written
local HTML_ENTITIES_TO_UTF8 = {
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&quot;", '"'},
    {"&apos;", "'"},
    {"&nbsp;", "\194\160"}, -- U+00A0 NO-BREAK SPACE
    -- numeric character references, decimal then hexadecimal
    {"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
    {"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x, 16)) end},
    -- so that a double-encoded entity like "&amp;lt;" is decoded once only
    {"&amp;", "&"}, -- must be last
}
--[[--
Replace HTML entities with their UTF-8 encoded equivalent in text.

Supports only basic ones and those with numbers (no support for named entities like `&eacute;`).

@string text text with HTML entities
@treturn string UTF-8 text
]]
function util.htmlEntitiesToUtf8(text)
    -- The rules are applied one after the other, in the order they are listed.
    for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
        text = text:gsub(t[1], t[2])
    end
    return text
end
2019-08-23 17:53:53 +00:00
--[[--
Convert simple HTML to plain text.

This may fail on complex HTML (with styles, scripts, comments), but should be fine enough with simple HTML as found in EPUB's `<dc:description>`.

@string text HTML text
@treturn string plain text
]]
function util.htmlToPlainText(text)
    -- Tags that stand for a line break: each one, together with the
    -- whitespace around it, collapses to a single "\n".
    local line_break_tags = {
        "%s*<%s*br%s*/?>%s*", -- <br> and <br/>
        "%s*<%s*p%s*>%s*",    -- <p>
        "%s*</%s*p%s*>%s*",   -- </p>
        "%s*<%s*p%s*/>%s*",   -- standalone <p/>
    }
    for i = 1, #line_break_tags do
        text = text:gsub(line_break_tags[i], "\n")
    end
    -- Remove all remaining HTML tags
    text = text:gsub("<[^>]*>", "")
    -- Convert HTML entities (done after tag removal, so an encoded "&lt;b&gt;"
    -- ends up as the literal text "<b>" instead of being stripped)
    text = util.htmlEntitiesToUtf8(text)
    -- Trim spaces and new lines at start and end
    text = text:gsub("^[\n%s]*", "")
    text = text:gsub("[\n%s]*$", "")
    return text
end
--- Convert HTML to plain text if text seems to be HTML
-- Detection of HTML is simple and may raise false positives
-- or negatives, but seems quite good at guessing content type
-- of text found in EPUB's <dc:description>.
--
--- @string text the string with possibly some HTML
--- @treturn string cleaned text
function util.htmlToPlainTextIfHtml(text)
    local is_html = false
    -- Quick way to check if text is some HTML: look for html tags.
    -- find() is enough here; there is no need to build a stripped copy of
    -- the text just to count matches.
    if text:find("<%w+.->") then
        is_html = true
    elseif text:find("&lt;%a+&gt;") then
        -- No <tag> found, but we met some badly/twice encoded html
        -- containing "&lt;br&gt;": decode one of the two encodings here,
        -- htmlToPlainText() below takes care of the other one.
        is_html = true
        text = util.htmlEntitiesToUtf8(text)
    end
    if is_html then
        text = util.htmlToPlainText(text)
    else
        -- if text ends with ]]>, it probably comes from <![CDATA[ .. ]]> that
        -- crengine has extracted correctly, but let the ending tag in, so
        -- let's remove it
        text = text:gsub("]]>%s*$", "")
    end
    return text
end
2018-01-07 19:24:15 +00:00
--- Encode the HTML entities in a string
-- Replaces `&`, `<`, `>`, `"`, `'` and `/` with their entity. `{` and `}` are
-- matched by the pattern but have no entry in the table, so gsub leaves
-- them unchanged.
--- @string text the string to escape
--- @treturn string the escaped string (followed by gsub's replacement count)
-- Taken from https://github.com/kernelsauce/turbo/blob/e4a35c2e3fb63f07464f8f8e17252bea3a029685/turbo/escape.lua#L58-L70
function util.htmlEscape(text)
    return text:gsub("[}{\">/<'&]", {
        ["&"] = "&amp;",
        ["<"] = "&lt;",
        [">"] = "&gt;",
        ['"'] = "&quot;",
        ["'"] = "&#39;",
        ["/"] = "&#47;",
    })
end
2018-01-15 22:51:43 +00:00
--- Escape list for shell usage
-- Each argument is wrapped in single quotes, and every embedded single quote
-- is rewritten as `'\''` (close the quote, emit an escaped quote, reopen).
--- @table args the list of arguments to escape
--- @treturn string the escaped and concatenated arguments
function util.shell_escape(args)
    local escaped_args = {}
    for _, arg in ipairs(args) do
        -- tostring() lets numeric arguments through instead of raising.
        -- Inside a concatenation only gsub's first result is used.
        arg = "'" .. tostring(arg):gsub("'", "'\\''") .. "'"
        table.insert(escaped_args, arg)
    end
    return table.concat(escaped_args, " ")
end
2018-01-17 08:17:53 +00:00
--- Clear all the elements from a table without reassignment.
-- Only the integer keys from 0 up to `#t` are set to nil; any other key
-- (string keys, integers past the border) is left in place.
--- @table t the table to be cleared
function util.clearTable(t)
    local last = #t -- length is read once, before anything is removed
    local idx = 0
    while idx <= last do
        t[idx] = nil
        idx = idx + 1
    end
end
2018-05-04 15:06:58 +00:00
--- Encode URL also known as percent-encoding see https://en.wikipedia.org/wiki/Percent-encoding
-- Line feeds are first expanded to CR LF; then every byte that is not
-- alphanumeric or one of `-._~!*'()` is replaced by `%XX` (uppercase hex).
--- @string url the string to encode
--- @treturn string encoded string (nothing when url is nil)
--- Taken from https://gist.github.com/liukun/f9ce7d6d14fa45fe9b924a3eed5c3d99
function util.urlEncode(url)
    local char_to_hex = function(c)
        return string.format("%%%02X", string.byte(c))
    end
    if url == nil then
        return
    end
    url = url:gsub("\n", "\r\n")
    url = url:gsub("([^%w%-%.%_%~%!%*%'%(%)])", char_to_hex)
    return url
end
--- Decode URL (reverse process to util.urlEncode())
-- Every `%XX` sequence (two hex digits) is replaced by the byte it encodes;
-- anything else, including a lone `%`, is kept as is. CR LF pairs are not
-- folded back into a single line feed.
--- @string url the string to decode
--- @treturn string decoded string (nothing when url is nil)
--- Taken from https://gist.github.com/liukun/f9ce7d6d14fa45fe9b924a3eed5c3d99
function util.urlDecode(url)
    local hex_to_char = function(x)
        return string.char(tonumber(x, 16))
    end
    if url == nil then
        return
    end
    url = url:gsub("%%(%x%x)", hex_to_char)
    return url
end
2018-08-06 19:16:30 +00:00
--- Check lua syntax of string
-- The text is only compiled, never run.
--- @string lua_text lua code text
--- @treturn string with parsing error, nil if syntax ok
function util.checkLuaSyntax(lua_text)
    -- loadstring() is the Lua 5.1/LuaJIT name; later versions fold it into load().
    local compile = loadstring or load
    local lua_code_ok, err = compile(lua_text)
    if lua_code_ok then
        return nil
    end
    -- Replace: [string "blah blah..."]:3: '=' expected near '123'
    -- with: Line 3: '=' expected near '123'
    err = err:gsub("%[string \".-%\"]:", "Line ")
    return err
end
2018-12-13 06:27:49 +00:00
--- Unpack an archive.
-- Extract the contents of an archive, detecting its format by
-- filename extension. Inspired by luarocks archive_unpack()
-- Only tarballs (.tar.bz2, .tar.gz, .tar.lz, .tgz) are handled, through the
-- `./tar` executable relative to the current working directory.
-- @param archive string: Filename of archive.
-- @param extract_to string: Destination directory.
-- @return boolean or (boolean, string): true on success, false and an error message on failure.
function util.unpackArchive(archive, extract_to)
    dbg.dassert(type(archive) == "string")
    local BD = require("ui/bidi")

    -- Quote a value for the shell. %q only produces a Lua-style double-quoted
    -- string, inside which the shell still expands `$` and backticks; single
    -- quotes (with embedded ones rewritten as '\'') disable all expansion.
    local function shell_quote(s)
        return "'" .. s:gsub("'", "'\\''") .. "'"
    end

    local tar_suffixes = { "%.tar%.bz2$", "%.tar%.gz$", "%.tar%.lz$", "%.tgz$" }
    local is_tarball = false
    for i = 1, #tar_suffixes do
        if archive:match(tar_suffixes[i]) then
            is_tarball = true
            break
        end
    end
    if not is_tarball then
        return false, T(_("Couldn't extract archive:\n\n%1\n\nUnrecognized filename extension."), BD.filepath(archive))
    end

    local status = os.execute(("./tar xf %s -C %s"):format(shell_quote(archive), shell_quote(extract_to)))
    -- os.execute() reports success as 0 under Lua 5.1 semantics and as true
    -- under 5.2 semantics; a bare truthiness test would accept any numeric
    -- status, including failures.
    if status ~= true and status ~= 0 then
        -- The path is an argument of T(), not of _(): it fills the %1 placeholder.
        return false, T(_("Extracting archive failed:\n\n%1"), BD.filepath(archive))
    end
    return true
end
2019-04-18 21:26:53 +00:00
-- Simple startsWith / endsWith string helpers
-- c.f., http://lua-users.org/wiki/StringRecipes
-- Tells whether `str` begins with `start` (an empty `start` always matches).
-- @param str string: source string
-- @param start string: string to match
-- @return boolean: true on success
function util.stringStartsWith(str, start)
    -- Take as many leading bytes of str as start is long, then compare.
    local head = str:sub(1, #start)
    return head == start
end
-- Tells whether `str` ends with `ending` (an empty `ending` always matches).
-- @param str string: source string
-- @param ending string: string to match
-- @return boolean: true on success
function util.stringEndsWith(str, ending)
    -- The empty suffix needs its own test: str:sub(-0) is str:sub(0), i.e. the
    -- whole string, which would only equal "" when str itself is empty.
    return ending == "" or str:sub(-#ending) == ending
end
2015-02-01 09:40:34 +00:00
return util