2016-02-04 18:24:39 +00:00
--[[--
This module contains miscellaneous helper functions for the KOReader frontend.
]]
2016-02-04 18:24:39 +00:00
2016-06-05 07:33:31 +00:00
local BaseUtil = require ( " ffi/util " )
2018-12-13 06:27:49 +00:00
local _ = require ( " gettext " )
local T = BaseUtil.template
2019-11-23 23:27:27 +00:00
local lshift = bit.lshift
2019-11-22 18:50:58 +00:00
local rshift = bit.rshift
local band = bit.band
local bor = bit.bor
2015-02-01 09:40:34 +00:00
local util = { }
2020-09-30 17:56:56 +00:00
--- Strips punctuation marks from a string.
-- ASCII punctuation is removed from both ends of the text only; the multi-byte
-- sequences matched below are removed wherever they occur.
---- @string text the string to be stripped
---- @treturn string stripped text (nothing is returned when text is nil or false)
function util.stripPunctuation(text)
    if not text then return end
    -- Three-byte UTF-8 sequences \226 [\128-\131] [\128-\191] encode U+2000 - U+20FF,
    -- which includes the General Punctuation block (U+2000 - U+206F).
    local stripped = text:gsub("\226[\128-\131][\128-\191]", "")
    -- Assigning to a single local drops gsub's substitution count, so callers
    -- only ever receive the string.
    stripped = stripped:gsub("^%p+", ""):gsub("%p+$", "")
    return stripped
end
2020-09-30 17:56:56 +00:00
-- Various whitespace trimming helpers, from http://lua-users.org/wiki/CommonFunctions & http://lua-users.org/wiki/StringTrim
--- Remove leading whitespace from string.
---- @string s the string to be trimmed
---- @treturn string trimmed text
function util.ltrim(s)
    -- The outer parentheses keep only the string and drop gsub's match count.
    return (s:gsub("^%s*", ""))
end
--- Remove trailing whitespace from string.
---- @string s the string to be trimmed
---- @treturn string trimmed text
function util.rtrim(s)
    local n = #s
    -- Walk backwards from the end; "^%s" anchored at position n tests exactly
    -- the character at n.
    while n > 0 and s:find("^%s", n) do
        n = n - 1
    end
    return s:sub(1, n)
end
--- Remove leading & trailing whitespace from string.
---- @string s the string to be trimmed
---- @treturn string trimmed text
function util.trim(s)
    -- The empty capture () yields the position of the first non-whitespace character.
    local from = s:match("^%s*()")
    -- If that position is past the end, the string was empty or all whitespace.
    -- Otherwise take everything up to the last non-whitespace character.
    return from > #s and "" or s:match(".*%S", from)
end
2017-02-25 17:52:34 +00:00
--[[--
Splits a string by a pattern

Lua doesn't have a string.split() function and most of the time
you don't really need it because string.gmatch() is enough.
However string.gmatch() has one significant disadvantage for me:
You can't split a string while matching both the delimited
strings and the delimiters themselves without tracking positions
and substrings. The gsplit function below takes care of
this problem.

Author: Peter Odding

License: MIT/X11

Source: <a href="http://snippets.luacode.org/snippets/String_splitting_130">http://snippets.luacode.org/snippets/String_splitting_130</a>
]]
----@string str string to split
----@param pattern the pattern to split against (defaults to "%s+")
----@bool capture if true, the matched delimiters are yielded as well
----@bool capture_empty_entity if true, an empty string is yielded where a delimiter starts exactly at the current position
----@treturn function iterator yielding the pieces one by one
function util.gsplit(str, pattern, capture, capture_empty_entity)
    pattern = pattern and tostring(pattern) or "%s+"
    -- A pattern that can match the empty string would never advance the index.
    -- Level 2 reports the error at the caller's position.
    if (""):find(pattern) then
        error("pattern matches empty string!", 2)
    end
    return coroutine.wrap(function()
        local index = 1
        repeat
            local first, last = str:find(pattern, index)
            if first and last then
                -- Text between the previous delimiter and this one.
                if index < first or (index == first and capture_empty_entity) then
                    coroutine.yield(str:sub(index, first - 1))
                end
                if capture then
                    coroutine.yield(str:sub(first, last))
                end
                index = last + 1
            else
                -- No more delimiters: yield the remainder, if any.
                if index <= #str then
                    coroutine.yield(str:sub(index))
                end
                break
            end
        until index > #str
    end)
end
2017-04-04 07:57:14 +00:00
--[[--
Converts seconds to a clock string.

Source: <a href="https://gist.github.com/jesseadams/791673">https://gist.github.com/jesseadams/791673</a>
]]
2017-02-25 17:52:34 +00:00
---- @int seconds number of seconds
---- @bool withoutSeconds if true 00:00, if false 00:00:00
---- @treturn string clock string in the form of 00:00 or 00:00:00
function util.secondsToClock(seconds, withoutSeconds)
    seconds = tonumber(seconds)
    -- Non-numeric input (tonumber gave nil), zero and NaN all render as a zeroed clock.
    if not seconds or seconds == 0 or seconds ~= seconds then
        return withoutSeconds and "00:00" or "00:00:00"
    end
    -- With no seconds field the minutes are rounded; otherwise they are truncated
    -- and the remainder is shown as seconds.
    local round = withoutSeconds and require("optmath").round or math.floor
    local hours = math.floor(seconds / 3600)
    local mins = round(seconds / 60 - hours * 60)
    if mins == 60 then
        -- Rounding carried the minutes over into the next hour.
        mins = 0
        hours = hours + 1
    end
    local clock = string.format("%02.f", hours) .. ":" .. string.format("%02.f", mins)
    if withoutSeconds then
        return clock
    end
    local secs = math.floor(seconds - hours * 3600 - mins * 60)
    return clock .. ":" .. string.format("%02.f", secs)
end
2019-08-23 17:53:53 +00:00
--- Converts seconds to a period of time string.
---- @int seconds number of seconds (must be convertible with tonumber)
---- @bool withoutSeconds if true 1h30', if false 1h30'10''
---- @bool hmsFormat, if true format 1h30m10s
---- @treturn string clock string in the form of 1h30' or 1h30'10''
function util.secondsToHClock(seconds, withoutSeconds, hmsFormat)
    seconds = tonumber(seconds)
    if seconds == 0 then
        if withoutSeconds then
            if hmsFormat then
                return T(_("%1m"), "0")
            else
                return "0'"
            end
        else
            if hmsFormat then
                return T(_("%1s"), "0")
            else
                return "0''"
            end
        end
    elseif seconds < 60 then
        -- Less than a minute: with no seconds field, round to 0 or 1 minute.
        if withoutSeconds and seconds < 30 then
            if hmsFormat then
                return T(_("%1m"), "0")
            else
                return "0'"
            end
        elseif withoutSeconds and seconds >= 30 then
            if hmsFormat then
                return T(_("%1m"), "1")
            else
                return "1'"
            end
        else
            if hmsFormat then
                return T(_("%1m%2s"), "0", string.format("%02.f", seconds))
            else
                return "0'" .. string.format("%02.f", seconds) .. "''"
            end
        end
    else
        -- Minutes are rounded when seconds are hidden, truncated otherwise.
        local round = withoutSeconds and require("optmath").round or math.floor
        -- hours/mins/secs are kept as formatted strings; the arithmetic below
        -- relies on Lua's automatic string-to-number coercion.
        local hours = string.format("%.f", math.floor(seconds / 3600))
        local mins = string.format("%02.f", round(seconds / 60 - (hours * 60)))
        if mins == "60" then
            -- Rounding carried the minutes over into the next hour.
            mins = string.format("%02.f", 0)
            hours = string.format("%.f", hours + 1)
        end
        if withoutSeconds then
            if hours == "0" then
                -- No hour part: show the minutes without zero padding.
                mins = string.format("%.f", round(seconds / 60))
                if hmsFormat then
                    return T(_("%1m"), mins)
                else
                    return mins .. "'"
                end
            end
            -- @translators This is the 'h' for hour, like in 1h30. This is a duration.
            return T(_("%1h%2"), hours, mins)
        end
        local secs = string.format("%02.f", math.floor(seconds - hours * 3600 - mins * 60))
        if hours == "0" then
            mins = string.format("%.f", round(seconds / 60))
            if hmsFormat then
                -- @translators This is the 'm' for minute and the 's' for second, like in 1m30s. This is a duration.
                return T(_("%1m%2s"), mins, secs)
            else
                return mins .. "'" .. secs .. "''"
            end
        end
        if hmsFormat then
            if secs == "00" then
                -- @translators This is the 'h' for hour and the 'm' for minute, like in 1h30m. This is a duration.
                return T(_("%1h%2m"), hours, mins)
            else
                -- @translators This is the 'h' for hour, the 'm' for minute and the 's' for second, like in 1h30m30s. This is a duration.
                return T(_("%1h%2m%3s"), hours, mins, secs)
            end
        else
            if secs == "00" then
                return T(_("%1h%2'"), hours, mins)
            else
                return T(_("%1h%2'%3''"), hours, mins, secs)
            end
        end
    end
end
ReaderFooter: Don't duplicate a 12h clock time format option (#6973)
* ReaderFooter:
* Honor the global twelve_hour_clock setting, instead of
duplicating a local one.
(Re #6969)
* os.date is a thin wrapper around strftime, so we might be able to get
away with some not-quite-standard extensions...
These are *definitely* supported on Linux, but are *NOT* the glibc
extension (that'd be e.g., %-I), so, hopefully, they're somewhat
portable...
They are also supported on BSD/macOS.
They are *not* supported by the MS UCRT. That means MinGW-w64, too.
This *appears* to be supported on current Bionic (it might even support
said glibc format altering extensions).
* And of course, Windows is terrible, so, make this terribly ugly to not
break it there...
* Turns out BSD also supports the dash trim format extension, so, leave
the trimming to the libc, and handle the special-casing in a way that
doesn't create stupid locals.
* Random unrelated cleanup ^^.
(https://gitter.im/koreader/koreader?at=5fd24be492aa1c4ef5d11f31)
* Update the testsuite
(Because the default used to be 24h clock).
Changed the default to 24h clock ;p.
* Explain why we don't try to fix it in Lua
2020-12-12 09:44:35 +00:00
-- util.secondsToHour is defined once at load time, in one of two variants chosen by platform.
if jit.os == "Windows" then
    --- Converts timestamp to an hour string
    ---- @int seconds number of seconds
    ---- @bool twelve_hour_clock
    ---- @treturn string hour string
    ---- @note: The MS CRT doesn't support either %l & %k, or the - format modifier (as they're not technically C99 or POSIX).
    ---- They are otherwise supported on Linux, BSD & Bionic, so, just special-case Windows...
    ---- We *could* arguably feed the os.date output to gsub("^0(%d)(.*)$", "%1%2"), but, while unlikely,
    ---- it's conceivable that a translator would put something other than the hour at the front of the string ;).
    function util.secondsToHour(seconds, twelve_hour_clock)
        if twelve_hour_clock then
            if os.date("%p", seconds) == "AM" then
                -- @translators This is the time in the morning using a 12-hour clock (%I is the hour, %M the minute).
                return os.date(_("%I:%M AM"), seconds)
            else
                -- @translators This is the time in the afternoon using a 12-hour clock (%I is the hour, %M the minute).
                return os.date(_("%I:%M PM"), seconds)
            end
        else
            -- @translators This is the time using a 24-hour clock (%H is the hour, %M the minute).
            return os.date(_("%H:%M"), seconds)
        end
    end
else
    -- Same as the Windows variant, but the "-" modifier asks the libc to drop the hour's leading zero.
    function util.secondsToHour(seconds, twelve_hour_clock)
        if twelve_hour_clock then
            if os.date("%p", seconds) == "AM" then
                -- @translators This is the time in the morning using a 12-hour clock (%-I is the hour, %M the minute).
                return os.date(_("%-I:%M AM"), seconds)
            else
                -- @translators This is the time in the afternoon using a 12-hour clock (%-I is the hour, %M the minute).
                return os.date(_("%-I:%M PM"), seconds)
            end
        else
            -- @translators This is the time using a 24-hour clock (%-H is the hour, %M the minute).
            return os.date(_("%-H:%M"), seconds)
        end
    end
end
--- Converts timestamp to a date string
---- @int seconds number of seconds
---- @bool twelve_hour_clock
---- @treturn string date string (date, a space, then the time from util.secondsToHour)
function util.secondsToDate(seconds, twelve_hour_clock)
    local BD = require("ui/bidi")
    local time = util.secondsToHour(seconds, twelve_hour_clock)
    -- @translators This is the date (%Y is the year, %m the month, %d the day)
    local day = os.date(_("%Y-%m-%d"), seconds)
    -- Each part is passed through BD.wrap separately before being joined.
    return BD.wrap(day) .. " " .. BD.wrap(time)
end
2019-08-16 21:22:58 +00:00
2018-03-31 19:19:31 +00:00
--[[--
Compares values in two different tables.

Source: <https://stackoverflow.com/a/32660766/2470572>
]]
---- @param o1 Lua table
---- @param o2 Lua table
---- @bool ignore_mt if true, an __eq metamethod on o1 is not consulted
---- @treturn boolean
function util.tableEquals(o1, o2, ignore_mt)
    -- Identical references and equal primitives.
    if o1 == o2 then return true end
    local o1Type = type(o1)
    local o2Type = type(o2)
    if o1Type ~= o2Type then return false end
    -- Same type, not ==, and not tables: they differ.
    if o1Type ~= "table" then return false end

    if not ignore_mt then
        local mt1 = getmetatable(o1)
        if mt1 and mt1.__eq then
            -- compare using built in method
            return o1 == o2
        end
    end

    -- Every key of o1 must exist in o2 with a recursively equal value...
    local keySet = {}
    for key1, value1 in pairs(o1) do
        local value2 = o2[key1]
        if value2 == nil or util.tableEquals(value1, value2, ignore_mt) == false then
            return false
        end
        keySet[key1] = true
    end

    -- ...and o2 must not hold any key that o1 lacks.
    for key2 in pairs(o2) do
        if not keySet[key2] then return false end
    end
    return true
end
2019-03-04 18:01:01 +00:00
--[[--
Makes a deep copy of a table.

Source: <https://stackoverflow.com/a/16077650/2470572>
]]
---- @param o Lua table
---- @param seen (optional) map from already-copied tables to their copies; used by the recursion
---- @treturn Lua table
function util.tableDeepCopy(o, seen)
    seen = seen or {}
    if o == nil then return nil end
    -- Re-use the copy of a table met earlier, so shared and cyclic references
    -- keep their shape.
    if seen[o] then return seen[o] end

    local no
    if type(o) == "table" then
        no = {}
        -- Register before descending so that self-references resolve to `no`.
        seen[o] = no

        for k, v in next, o, nil do
            no[util.tableDeepCopy(k, seen)] = util.tableDeepCopy(v, seen)
        end
        -- The metatable is deep-copied as well (nil when there is none).
        setmetatable(no, util.tableDeepCopy(getmetatable(o), seen))
    else -- number, string, boolean, etc
        no = o
    end
    return no
end
2016-02-04 18:24:39 +00:00
--- Returns number of keys in a table.
---- @param t Lua table
---- @treturn int number of keys in table t
function util.tableSize(t)
    -- Counts every key pairs() visits, not just the array part that # reports.
    local count = 0
    for _ in pairs(t) do count = count + 1 end
    return count
end
2017-04-04 07:57:14 +00:00
--- Append all elements from t2 into t1.
-- t1 is modified in place. Only the sequence part of t2 is read, since the
-- loop uses ipairs.
---- @param t1 Lua table
---- @param t2 Lua table
function util.arrayAppend(t1, t2)
    for _, v in ipairs(t2) do
        table.insert(t1, v)
    end
end
2020-11-28 16:18:57 +00:00
--- Reverse array elements in-place in table t
2019-12-06 21:55:37 +00:00
---- @param t Lua table
function util.arrayReverse(t)
    -- Swap the element at each position in the first half with its mirror
    -- in the second half; a middle element (odd length) stays where it is.
    local len = #t
    for lo = 1, math.floor(len / 2) do
        local hi = len - lo + 1
        t[lo], t[hi] = t[hi], t[lo]
    end
end
2020-11-28 16:18:57 +00:00
--- Test whether t contains a value equal to v
--- (or such a value that callback returns true),
--- and if so, return the index.
---- @param t Lua table
---- @param v
---- @function callback(v1, v2)
function util.arrayContains(t, v, cb)
    -- Without a callback, fall back to plain equality.
    local matches = cb
    if not matches then
        matches = function(candidate, wanted) return candidate == wanted end
    end
    -- The callback receives (array element, searched value); the first hit wins.
    for index, item in ipairs(t) do
        if matches(item, v) then
            return index
        end
    end
    -- Not found: false rather than nil.
    return false
end
2019-06-28 02:46:16 +00:00
-- Merge t2 into t1, overwriting existing elements if they already exist
-- Probably not safe with nested tables (c.f., https://stackoverflow.com/q/1283388)
---- @param t1 Lua table
---- @param t2 Lua table
function util.tableMerge(t1, t2)
    -- Shallow copy: values are assigned as-is, so nested tables end up shared
    -- between t1 and t2.
    for key, value in pairs(t2) do
        t1[key] = value
    end
end
2017-04-04 07:57:14 +00:00
--[[--
Gets last index of character in string (i.e., strrchr)

Returns the index within this string of the last occurrence of the specified character
or -1 if the character does not occur.

To find . you need to escape it.
]]
---- @string string
---- @string ch character to look for; it is used as a Lua pattern, so magic characters must be escaped (e.g. "%.")
---- @treturn int last occurrence or -1 if not found
function util.lastIndexOf(string, ch)
    -- The greedy ".*" pushes the match to the last occurrence; the empty
    -- capture () returns the position right after it.
    local i = string:match(".*" .. ch .. "()")
    if i == nil then return -1 else return i - 1 end
end
2018-01-13 23:05:05 +00:00
--- Reverse the individual greater-than-single-byte characters
-- @string text string to reverse
-- @treturn string the reversed string
-- Taken from <https://github.com/blitmap/lua-utf8-simple#utf8reverses>
function util.utf8Reverse(text)
    -- First reverse the bytes of every multi-byte sequence in place (single bytes
    -- are left alone because the callback returns false for them)...
    text = text:gsub('[%z\1-\127\194-\244][\128-\191]*', function(c) return #c > 1 and c:reverse() end)
    -- ...so that reversing the whole byte string puts each sequence back in order.
    return text:reverse()
end
2016-04-21 14:13:10 +00:00
2016-12-13 16:06:02 +00:00
--- Splits string into a list of UTF-8 characters.
---- @string text the string to be split.
---- @treturn table list of UTF-8 chars
function util.splitToChars(text)
    local tab = {}
    if text ~= nil then
        local prevcharcode, charcode = 0
        -- Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8
        -- a superset of UTF-8, that includes UTF-16 surrogates
        -- in UTF-8 bytes (forbidden in well-formed UTF-8).
        -- We may get that from bad producers or converters.
        -- (luajson, used to decode Wikipedia API json, will not correctly decode
        -- this sample: <span lang=\"got\">\ud800\udf45</span> : single Unicode
        -- char https://www.compart.com/en/unicode/U+10345 and will give us
        -- "\xed\xa0\x80\xed\xbd\x85" as UTF8, instead of the correct "\xf0\x90\x8d\x85")
        -- From http://www.unicode.org/faq/utf_bom.html#utf16-1
        --   Surrogates are code points from two special ranges of
        --   Unicode values, reserved for use as the leading, and
        --   trailing values of paired code units in UTF-16. Leading,
        --   also called high, surrogates are from D800 to DBFF, and
        --   trailing, or low, surrogates are from DC00 to DFFF. They
        --   are called surrogates, since they do not represent
        --   characters directly, but only as a pair.
        local hi_surrogate
        local hi_surrogate_uchar
        for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do
            charcode = BaseUtil.utf8charcode(uchar)
            -- (not sure why we need this prevcharcode check; we could get
            -- charcode=nil with invalid UTF-8, but should we then really
            -- ignore the following charcode ?)
            if prevcharcode then -- utf8
                if charcode and charcode >= 0xD800 and charcode <= 0xDBFF then
                    if hi_surrogate then -- previous unconsumed one, add it even if invalid
                        table.insert(tab, hi_surrogate_uchar)
                    end
                    hi_surrogate = charcode
                    hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
                elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
                    -- low surrogate following a high surrogate, good, let's make them a single char
                    charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000
                    table.insert(tab, util.unicodeCodepointToUtf8(charcode))
                    hi_surrogate = nil
                else
                    if hi_surrogate then -- previous unconsumed one, add it even if invalid
                        table.insert(tab, hi_surrogate_uchar)
                    end
                    hi_surrogate = nil
                    table.insert(tab, uchar)
                end
            end
            prevcharcode = charcode
        end
        -- A high surrogate that ends the text has nothing after it to trigger
        -- the "add it even if invalid" branches above, so flush it here.
        if hi_surrogate then
            table.insert(tab, hi_surrogate_uchar)
        end
    end
    return tab
end
2017-04-04 07:57:14 +00:00
--- Tests whether c is a CJK character
-- True only when c is exactly one 3-byte sequence whose lead byte is in
-- \228-\234 and whose second byte is in \128-\191.
---- @string c
---- @treturn boolean true if CJK
function util.isCJKChar(c)
    return string.match(c, "[\228-\234][\128-\191].") == c
end
2017-04-04 07:57:14 +00:00
--- Tests whether str contains CJK characters
-- Uses the same byte pattern as util.isCJKChar, but matched anywhere in str.
---- @string str
---- @treturn boolean true if CJK
function util.hasCJKChar(str)
    return string.match(str, "[\228-\234][\128-\191].") ~= nil
end
2019-11-23 23:27:27 +00:00
--- Split texts into a list of words, spaces and punctuation marks.
---- @string text text to split
---- @treturn table list of words, spaces and punctuation marks
function util.splitToWords(text)
    local wlist = {}
    -- Split on runs of whitespace/punctuation, keeping those runs as entries too.
    for word in util.gsplit(text, "[%s%p]+", true) do
        -- if space split word contains CJK characters
        if util.hasCJKChar(word) then
            -- split with CJK characters
            for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
                table.insert(wlist, char)
            end
        else
            table.insert(wlist, word)
        end
    end
    return wlist
end
2016-12-06 21:10:25 +00:00
-- We don't want to split on a space if it is followed by some
-- specific punctuation marks : e.g. "word :" or "word )"
-- (In French, there is a non-breaking space before a colon, and it better
-- not be wrapped there.)
-- Both lists are searched with a plain string find, so they must hold only
-- the punctuation marks themselves (no space).
local non_splittable_space_tailers = ":;,.!?)]}$%=-+*/|<>»”"
-- Same if a space has some specific other punctuation mark before it
local non_splittable_space_leaders = "([{$=-+*/|<>«“"
2016-12-06 21:10:25 +00:00
2016-12-15 07:58:58 +00:00
-- Similar rules exist for CJK text. Taken from :
-- https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages
2017-04-04 07:57:14 +00:00
-- Characters that must not start a line in CJK text, concatenated from the
-- per-language lists below into one string.
-- NOTE(review): the literals below look altered by extraction: each one has a
-- leading and trailing space, and the Simplified Chinese entry contains an
-- unescaped double quote. Verify every literal against the upstream file
-- before relying on it.
local cjk_non_splittable_tailers = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Simplified Chinese
" !%),.:;?]}¢°·’ \" †‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?!]}~ " ,
-- Traditional Chinese
" !),.:;?]}¢·–—’ \" •、。〆〞〕〉》」︰︱︲︳﹐﹑﹒﹔﹕﹖﹘﹚﹜!),.:;?︶︸︺︼︾﹀﹂﹗]|}、 " ,
-- Japanese
" )]}〕〉》」』】〙〗〟’ \" ⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠–〜?!‼⁇⁈⁉・、:;,。. " ,
-- Korean
" !%),.:;?]}¢°’ \" †‡℃〆〈《「『〕!%),.:;?]} " ,
} )
2017-04-04 07:57:14 +00:00
-- Characters that must not end a line in CJK text, built the same way.
local cjk_non_splittable_leaders = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Simplified Chinese
" $(£¥·‘ \" 〈《「『【〔〖〝﹙﹛$(.[{£¥ " ,
-- Traditional Chinese
" ([{£¥‘ \" ‵〈《「『〔〝︴﹙﹛({︵︷︹︻︽︿﹁﹃﹏ " ,
-- Japanese
" ([{〔〈《「『【〘〖〝‘ \" ⦅« " ,
-- Korean
" $([{£¥‘ \" 々〇〉》」〔$([{⦆¥₩# " ,
} )
2017-04-04 07:57:14 +00:00
-- Characters on which a line must never be split, whatever their neighbours.
local cjk_non_splittable = table.concat ( {
2016-12-15 07:58:58 +00:00
-- Japanese
" —…‥〳〴〵 " ,
} )
2017-04-04 07:57:14 +00:00
--- Test whether a string can be separated by this char for multi-line rendering.
-- Optional next or prev chars may be provided to help make the decision
---- @string c
---- @string next_c
---- @string prev_c
---- @treturn boolean true if splittable, false if not
function util.isSplittable(c, next_c, prev_c)
    if util.isCJKChar(c) then
        -- a CJKChar is a word in itself, and so is splittable
        if cjk_non_splittable:find(c, 1, true) then
            -- except a few of them
            return false
        elseif next_c and cjk_non_splittable_tailers:find(next_c, 1, true) then
            -- but followed by a char that is not permitted at start of line
            return false
        elseif prev_c and cjk_non_splittable_leaders:find(prev_c, 1, true) then
            -- but preceded by a char that is not permitted at end of line
            return false
        else
            -- we can split on this CJKchar
            return true
        end
    elseif c == " " then
        -- we only split on a space (so a punctuation mark sticks to prev word)
        -- if next_c or prev_c is provided, we can make a better decision
        if next_c and non_splittable_space_tailers:find(next_c, 1, true) then
            -- this space is followed by some punctuation mark that is better kept with us
            return false
        elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then
            -- this space is led by some punctuation mark that is better kept with us
            return false
        else
            -- we can split on this space
            return true
        end
    end
    -- otherwise, not splittable
    return false
end
2017-04-04 07:57:14 +00:00
--- Gets filesystem type of a path.
--
-- Checks if the path occurs in <code>/proc/mounts</code>
---- @string path an absolute path
---- @treturn string filesystem type (nil if /proc/mounts cannot be opened or no mount point matches)
function util.getFilesystemType(path)
    local mounts = io.open("/proc/mounts", "r")
    if not mounts then return nil end
    local fs_type
    for line in mounts:lines() do
        -- Whitespace-separated fields; the code uses the 2nd as the mount point
        -- and the 3rd as the filesystem type.
        local mount = {}
        for param in line:gmatch("%S+") do table.insert(mount, param) end
        local mount_point = mount[2]
        -- Compare as plain text anchored at the start of the path: a mount point
        -- is not a Lua pattern (it may contain magic characters such as "-" or
        -- "."), and it should only match as a prefix of the path.
        if mount_point and path:sub(1, #mount_point) == mount_point then
            fs_type = mount[3]
            -- "/" is a prefix of every absolute path, so keep scanning for a more
            -- specific entry; any other match ends the search.
            if mount_point ~= "/" then
                break
            end
        end
    end
    mounts:close()
    return fs_type
end
--- Recursively scan directory for files inside
-- @string dir directory to scan
-- @func cb callback(fullpath, name, attr) called for every entry whose mode is "file" or "link"
function util.findFiles(dir, cb)
    -- Required locally, as the other filesystem helpers in this module do.
    local lfs = require("libs/libkoreader-lfs")
    local function scan(current)
        -- lfs.dir raises on an unreadable/missing directory; skip it silently.
        local ok, iter, dir_obj = pcall(lfs.dir, current)
        if not ok then return end
        for f in iter, dir_obj do
            local path = current .. "/" .. f
            local attr = lfs.attributes(path)
            -- Guard against a nil result (presumably returned when the entry
            -- cannot be stat'ed, e.g. a dangling symlink; confirm against the lfs build).
            local mode = attr and attr.mode
            if mode == "directory" then
                if f ~= "." and f ~= ".." then
                    scan(path)
                end
            elseif mode == "file" or mode == "link" then
                cb(path, f, attr)
            end
        end
    end

    scan(dir)
end
2017-04-26 06:12:25 +00:00
--- Checks if directory is empty.
---- @string path
---- @treturn bool true if empty, false if not; nothing is returned when the directory cannot be listed
function util.isEmptyDir(path)
    local lfs = require("libs/libkoreader-lfs")
    -- lfs.dir will crash rather than return nil if directory doesn't exist O_o
    local ok, iter, dir_obj = pcall(lfs.dir, path)
    if not ok then return end
    for filename in iter, dir_obj do
        -- Any entry other than the two implicit ones means the directory has content.
        if filename ~= "." and filename ~= ".." then
            return false
        end
    end
    return true
end
2020-06-19 10:22:38 +00:00
--- check if the given path is a file
-- The check consists of opening the path for reading.
---- @string path
---- @treturn bool true if the path could be opened, false otherwise
function util.fileExists(path)
    local file = io.open(path, "r")
    if file ~= nil then
        file:close()
        return true
    end
    -- Return an explicit boolean, as documented, instead of falling off the end.
    return false
end
2018-01-01 14:40:28 +00:00
--- Checks if the given path exists. Doesn't care if it's a file or directory.
---- @string path
---- @treturn bool
function util.pathExists(path)
    local lfs = require("libs/libkoreader-lfs")
    -- Any non-nil "mode" attribute means the path could be stat'ed.
    return lfs.attributes(path, "mode") ~= nil
end
--- As `mkdir -p`.
-- Unlike [lfs.mkdir](https://keplerproject.github.io/luafilesystem/manual.html#mkdir)(),
-- does not error if the directory already exists, and creates intermediate directories as needed.
-- @string path the directory to create
-- @treturn bool true on success; nil, err_message on error
function util.makePath(path)
    path = path:gsub("/+$", "")
    -- An empty string is what remains of "/" after stripping, and is also the
    -- parent that splitFilePathName reports for a bare relative name. There is
    -- nothing to create in either case, and returning here ends the recursion
    -- (it would otherwise call itself with "" forever).
    if path == "" then return true end
    if util.pathExists(path) then return true end

    -- Create the parent first; the extra parentheses keep only the directory part.
    local success, err = util.makePath((util.splitFilePathName(path)))
    if not success then
        return nil, err .. " (creating " .. path .. ")"
    end

    local lfs = require("libs/libkoreader-lfs")
    return lfs.mkdir(path)
end
2020-06-19 10:22:38 +00:00
--- As `rm`
-- Only removes paths whose mode is "file"; anything else is refused.
-- @string file path of the file to remove
-- @treturn bool true on success; nil, err_message on error
function util.removeFile(file)
    local lfs = require("libs/libkoreader-lfs")
    if file and lfs.attributes(file, "mode") == "file" then
        return os.remove(file)
    elseif file then
        return nil, file .. " is not a file"
    else
        return nil, "file is nil"
    end
end
--- Gets total, used and available bytes for the mountpoint that holds a given directory.
-- @string dir path of the directory
-- @treturn table with total, used and available bytes (empty when dir is not a directory or the command fails)
function util.diskUsage(dir)
    -- safe way of testing df & awk
    local function doCommand(d)
        -- Single-quote the path so that spaces and shell metacharacters in it are
        -- taken literally; an embedded single quote is written as '\''.
        local quoted = "'" .. (d:gsub("'", "'\\''")) .. "'"
        local handle = io.popen("df -k " .. quoted .. " 2>&1 | awk '$3 ~ /[0-9]+/ { print $2,$3,$4 }' 2>&1 || echo ::ERROR::")
        if not handle then return end
        local output = handle:read("*all")
        handle:close()
        if not output:find("::ERROR::") then
            return output
        end
    end
    -- Returned on failure: a table in which none of the three fields is set.
    local err = { total = nil, used = nil, available = nil }
    local lfs = require("libs/libkoreader-lfs")
    if not dir or lfs.attributes(dir, "mode") ~= "directory" then return err end
    local usage = doCommand(dir)
    if not usage then return err end
    local stage, result = {}, {}
    for size in usage:gmatch("%w+") do
        table.insert(stage, size)
    end
    -- The awk program prints columns 2, 3 and 4 of the df output, in this order.
    for k, v in ipairs({ "total", "used", "available" }) do
        local kilobytes = tonumber(stage[k])
        if kilobytes ~= nil then
            -- sizes are in kb, return bytes here
            result[v] = kilobytes * 1024
        end
    end
    return result
end
2017-04-04 07:57:14 +00:00
--- Replaces characters that are invalid filenames.
--
-- Replaces the characters <code>\/:*?"<>|</code> with an <code>_</code>.
-- These characters are problematic on Windows filesystems. On Linux only
-- <code>/</code> poses a problem.
-- NOTE: the character class below also lists <code>,</code> as a separator,
-- so commas are replaced as well. This is kept as-is so that generated
-- filenames do not change.
---- @string str filename
---- @treturn string sanitized filename (nothing is returned when str is nil or false)
local function replaceAllInvalidChars(str)
    if str then
        -- The parentheses drop gsub's substitution count.
        return (str:gsub('[\\,%/,:,%*,%?,%",%<,%>,%|]', '_'))
    end
end
2017-04-04 07:57:14 +00:00
--- Replaces slash with an underscore.
---- @string str
---- @treturn string str with every "/" replaced (nothing is returned when str is nil or false)
local function replaceSlashChar(str)
    if str then
        -- The parentheses drop gsub's substitution count.
        return (str:gsub('%/', '_'))
    end
end
2019-08-23 17:53:53 +00:00
--[[--
Replaces characters that are invalid in filenames.

Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem.

If an optional path is provided, @{util.getFilesystemType}() will be used to determine whether stricter VFAT restrictions should be applied.
]]
2019-05-14 17:10:41 +00:00
---- @string str
---- @string path
---- @int limit maximum length, in bytes, kept from the name (defaults to 240)
---- @int limit_ext maximum suffix length still treated as a file extension (defaults to 10)
---- @treturn string safe filename
function util.getSafeFilename(str, path, limit, limit_ext)
    local filename, suffix = util.splitFileNameSuffix(str)
    local replaceFunc = replaceAllInvalidChars
    local safe_filename
    -- VFAT supports a maximum of 255 UCS-2 characters, although it's probably treated as UTF-16 by Windows
    -- default to a slightly lower limit just in case
    limit = limit or 240
    limit_ext = limit_ext or 10

    if path then
        local file_system = util.getFilesystemType(path)
        -- Only these two filesystem types get the strict replacement set;
        -- for anything else just the slash is replaced.
        if file_system ~= "vfat" and file_system ~= "fuse.fsp" then
            replaceFunc = replaceSlashChar
        end
    end

    if suffix:len() > limit_ext then
        -- probably not an actual file extension, or at least not one we'd be
        -- dealing with, so strip the whole string
        filename = str
        suffix = nil
    end

    filename = util.htmlToPlainTextIfHtml(filename)
    filename = filename:sub(1, limit)
    -- the limit might result in broken UTF-8, which we don't want in the result
    filename = util.fixUtf8(filename, "")

    if suffix and suffix ~= "" then
        safe_filename = replaceFunc(filename) .. "." .. replaceFunc(suffix)
    else
        safe_filename = replaceFunc(filename)
    end

    return safe_filename
end
--- Splits a file into its directory path and file name.
--- If the given path has a trailing /, returns the entire path as the directory
--- path and "" as the file name.
---- @string file
---- @treturn string directory (up to and including the last /, or "" when there is no /)
---- @treturn string filename
function util.splitFilePathName(file)
    if file == nil or file == "" then return "", "" end
    -- Plain (non-pattern) search for a slash.
    if not file:find("/", 1, true) then return "", file end
    -- The first capture is greedy, so it ends at the last slash.
    return file:match("(.*/)(.*)")
end
--- Splits a file name into its pure file name and suffix
--- The split happens at the last dot; the dot itself is in neither part.
---- @string file
---- @treturn string file name without the suffix (the whole input when it has no dot)
---- @treturn string suffix ("" when the input has no dot)
function util.splitFileNameSuffix(file)
    if file == nil or file == "" then return "", "" end
    -- Plain (non-pattern) search for a dot.
    if not file:find(".", 1, true) then return file, "" end
    -- The first capture is greedy, so the split is at the last dot.
    return file:match("(.*)%.(.*)")
end
--- Gets file extension
--- This is the second value returned by util.splitFileNameSuffix.
---- @string file file name
---- @treturn string extension
function util.getFileNameSuffix(file)
    -- select(2, ...) skips the name part; the outer parentheses keep one value.
    return (select(2, util.splitFileNameSuffix(file)))
end
--- Companion helper function that returns the script's language,
--- based on the file extension.
--- The extension is compared case-insensitively.
---- @string file file name
---- @treturn string "shell" for .sh, "python" for .py (nothing is returned for any other extension)
function util.getScriptType(file)
    local file_ext = string.lower(util.getFileNameSuffix(file))
    if file_ext == "sh" then
        return "shell"
    elseif file_ext == "py" then
        return "python"
    end
end
--- Gets human friendly size as string
--- Units are binary: 1 KB is 1024 bytes, 1 MB is 1024 KB, 1 GB is 1024 MB.
--- A size that is exactly a unit boundary is shown in that unit (1024 => "1.0 KB").
---- @int size (bytes); a numeric string is accepted too
---- @bool right_align (by padding with spaces on the left)
---- @treturn string friendly size (nothing is returned when size is not a number)
function util.getFriendlySize(size, right_align)
    local frac_format = right_align and "%6.1f" or "%.1f"
    local deci_format = right_align and "%6d" or "%d"
    -- tonumber() yields nil for anything that is not a number or numeric string.
    size = tonumber(size)
    if not size then return end
    if size >= 1024*1024*1024 then
        -- @translators This is an abbreviation for the gigabyte, a unit of computer memory or data storage capacity.
        return T(_("%1 GB"), string.format(frac_format, size/1024/1024/1024))
    end
    if size >= 1024*1024 then
        -- @translators This is an abbreviation for the megabyte, a unit of computer memory or data storage capacity.
        return T(_("%1 MB"), string.format(frac_format, size/1024/1024))
    end
    if size >= 1024 then
        -- @translators This is an abbreviation for the kilobyte, a unit of computer memory or data storage capacity.
        return T(_("%1 KB"), string.format(frac_format, size/1024))
    else
        -- @translators This is an abbreviation for the byte, a unit of computer memory or data storage capacity.
        -- math.floor: "%d" must be given a whole number.
        return T(_("%1 B"), string.format(deci_format, math.floor(size)))
    end
end
--- Gets formatted size as string (1273334 => "1,273,334")
--- Only the leading run of integer digits is grouped; a minus sign and
--- anything after the digits (such as a fractional part) are kept as they are.
---- @int size (bytes)
---- @treturn string
function util.getFormattedSize(size)
    local s = tostring(size)
    local sign, digits, rest = s:match("^(%-?)(%d+)(.*)$")
    if not digits then
        -- Nothing that looks like a number: hand the text back unchanged.
        return s
    end
    -- Group from the right: reverse, add a comma after every 3 digits,
    -- reverse back, then drop the comma that ends up in front when the digit
    -- count is a multiple of 3.
    digits = digits:reverse():gsub("(%d%d%d)", "%1,"):reverse():gsub("^,", "")
    return sign .. digits .. rest
end
--[[--
Replaces invalid UTF-8 characters with a replacement string.

Every byte that does not start a well-formed UTF-8 sequence is replaced on its own, so a truncated multi-byte sequence yields one replacement per leftover byte.

Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>.
c.f., FixUTF8 @ <https://github.com/pkulchenko/ZeroBraneStudio/blob/master/src/util.lua>.

@string str the string to be checked for invalid characters
@string[opt=""] replacement the string to replace invalid characters with
@treturn string valid UTF-8
]]
function util.fixUtf8(str, replacement)
    replacement = replacement or ""
    local len = #str
    local pos = 1
    -- Output pieces; only allocated once an invalid byte is found, so a valid
    -- string is returned as is without any copying.
    local pieces
    local piece_start = 1
    while pos <= len do
        local step
        if str:byte(pos) < 0x80 then
            -- ASCII, including NUL
            step = 1
        elseif str:find("^[\194-\223][\128-\191]", pos) then
            step = 2
        elseif str:find("^\224[\160-\191][\128-\191]", pos)
            or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
            or str:find("^\237[\128-\159][\128-\191]", pos)
            or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then
            step = 3
        elseif str:find("^\240[\144-\191][\128-\191][\128-\191]", pos)
            or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
            or str:find("^\244[\128-\143][\128-\191][\128-\191]", pos) then
            step = 4
        end
        if step then
            pos = pos + step
        else
            -- Invalid byte: keep the valid run before it, then emit the
            -- replacement in its place.
            pieces = pieces or {}
            pieces[#pieces + 1] = str:sub(piece_start, pos - 1)
            pieces[#pieces + 1] = replacement
            pos = pos + 1
            piece_start = pos
        end
    end
    if not pieces then
        return str
    end
    pieces[#pieces + 1] = str:sub(piece_start)
    return table.concat(pieces)
end
--- Splits input string with the splitter into a table. This function ignores the last empty entity.
-- The pieces are those produced by util.gsplit(str, splitter, false, capture_empty_entity),
-- collected in the order they are produced.
--
--- @string str the string to be split
--- @string splitter
--- @bool capture_empty_entity
--- @treturn table an array-like table
function util.splitToArray(str, splitter, capture_empty_entity)
    local pieces, count = {}, 0
    for piece in util.gsplit(str, splitter, false, capture_empty_entity) do
        count = count + 1
        pieces[count] = piece
    end
    return pieces
end
--- Convert a Unicode codepoint (number) to UTF-8 char
--- c.f., <https://stackoverflow.com/a/4609989>
---     & <https://stackoverflow.com/a/38492214>
--- See utf8charcode in ffi/util for a decoder.
--- Surrogates (U+D800 - U+DFFF), negative values and values above U+10FFFF
--- are converted to U+FFFD REPLACEMENT CHARACTER.
--
--- @int c Unicode codepoint
--- @treturn string UTF-8 char
function util.unicodeCodepointToUtf8(c)
    -- U+FFFD REPLACEMENT CHARACTER, UTF-8 encoded
    local replacement_char = "\239\191\189"
    if c < 0 then
        return replacement_char -- Invalid
    elseif c < 0x80 then
        return string.char(c)
    elseif c < 0x800 then
        return string.char(
            bor(0xC0, rshift(c, 6)),
            bor(0x80, band(c, 0x3F))
        )
    elseif c < 0x10000 then
        if c >= 0xD800 and c <= 0xDFFF then
            return replacement_char -- Surrogates
        end
        return string.char(
            bor(0xE0, rshift(c, 12)),
            bor(0x80, band(rshift(c, 6), 0x3F)),
            bor(0x80, band(c, 0x3F))
        )
    elseif c < 0x110000 then
        return string.char(
            bor(0xF0, rshift(c, 18)),
            bor(0x80, band(rshift(c, 12), 0x3F)),
            bor(0x80, band(rshift(c, 6), 0x3F)),
            bor(0x80, band(c, 0x3F))
        )
    else
        return replacement_char -- Invalid
    end
end
-- we need to use an array of arrays to keep them ordered as written
-- Each entry is { Lua pattern, replacement (string or function) }.
local HTML_ENTITIES_TO_UTF8 = {
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&quot;", '"'},
    {"&apos;", "'"},
    {"&nbsp;", "\194\160"}, -- U+00A0 NO-BREAK SPACE
    {"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
    {"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x, 16)) end},
    -- must be last, so that the "&" it produces is not picked up as the start
    -- of another entity by the entries above
    {"&amp;", "&"},
}

--[[--
Replace HTML entities with their UTF-8 encoded equivalent in text.

Supports only basic ones and those with numbers (no support for named entities like `&eacute;`).

@string text text with HTML entities
@treturn string UTF-8 text
]]
function util.htmlEntitiesToUtf8(text)
    for i = 1, #HTML_ENTITIES_TO_UTF8 do
        local entity = HTML_ENTITIES_TO_UTF8[i]
        -- Assigning to one variable keeps the string and drops gsub's count.
        text = text:gsub(entity[1], entity[2])
    end
    return text
end
--[[--
Convert simple HTML to plain text.

This may fail on complex HTML (with styles, scripts, comments), but should be fine enough with simple HTML as found in EPUB's `<dc:description>`.

@string text HTML text
@treturn string plain text
]]
function util.htmlToPlainText(text)
    -- Replace <br> with \n
    text = text:gsub("%s*<%s*br%s*/?>%s*", "\n") -- <br> and <br/>
    -- Replace </p> and <p/> with \n, and <p> with \n\t (\t, unlike any
    -- combination of spaces, ensures a constant indentation when text is
    -- justified.)
    text = text:gsub("%s*</%s*p%s*>%s*", "\n") -- </p>
    text = text:gsub("%s*<%s*p%s*/>%s*", "\n") -- standalone <p/>
    text = text:gsub("%s*<%s*p%s*>%s*", "\n\t") -- <p>
    -- (this one last, so \t is not removed by the others' %s)
    -- Remove all HTML tags
    text = text:gsub("<[^>]*>", "")
    -- Convert HTML entities
    text = util.htmlEntitiesToUtf8(text)
    -- Trim spaces and new lines at start and end, including
    -- the \t we added (this looks fine enough with multiple
    -- paragraphs, but feels nicer with a single paragraph,
    -- whether it contains <br>s or not).
    -- (%s already covers \n and \t.)
    text = text:gsub("^%s+", "")
    text = text:gsub("%s+$", "")
    return text
end
--- Convert HTML to plain text if text seems to be HTML
-- Detection of HTML is simple and may raise false positives
-- or negatives, but seems quite good at guessing content type
-- of text found in EPUB's <dc:description>.
--
--- @string text the string with possibly some HTML
--- @treturn string cleaned text
function util.htmlToPlainTextIfHtml(text)
    local is_html = false
    -- Quick way to check if text is some HTML:
    -- look for html tags
    if text:find("<%w+.->") then
        is_html = true
    elseif text:find("&lt;%a+&gt;") then
        -- no <tag> found
        -- but we may meet some text badly/twice encoded html containing "&lt;br&gt;"
        is_html = true
        -- decode one of the two encodes
        text = util.htmlEntitiesToUtf8(text)
    end

    if is_html then
        text = util.htmlToPlainText(text)
    else
        -- if text ends with ]]>, it probably comes from <![CDATA[ .. ]]> that
        -- crengine has extracted correctly, but let the ending tag in, so
        -- let's remove it
        text = text:gsub("]]>%s*$", "")
    end
    return text
end
--- Encode the HTML entities in a string
-- The characters & < > " ' and / are replaced by an entity.
--- @string text the string to escape
--- @treturn string the escaped string
-- Taken from https://github.com/kernelsauce/turbo/blob/e4a35c2e3fb63f07464f8f8e17252bea3a029685/turbo/escape.lua#L58-L70
function util.htmlEscape(text)
    -- "{" and "}" are matched too, but have no entry in the table, so gsub
    -- leaves them unchanged. The outer parentheses drop gsub's second result
    -- (the match count), which would otherwise be passed along as an extra
    -- argument when the call is the last argument of another call.
    return (text:gsub("[}{\">/<'&]", {
        ["&"] = "&amp;",
        ["<"] = "&lt;",
        [">"] = "&gt;",
        ['"'] = "&quot;",
        ["'"] = "&#39;",
        ["/"] = "&#47;",
    }))
end
2020-06-08 18:47:31 +00:00
--- Prettify a CSS stylesheet
-- Not perfect, but enough to make some ugly CSS readable.
-- By default, each selector and each property is put on its own line.
-- With condensed=true, condense each full declaration on a single line.
-- The condensed form is obtained by first running the default (non-condensed)
-- pass through a recursive call, and then joining its lines back together.
-- NOTE(review): the whitespace inside the string literals below looks like it
-- has been altered (padding next to the quotes); confirm the exact patterns
-- and indentation strings against the upstream file before relying on them.
--
--- @string css_text CSS string
--- @bool[opt=false] condensed true to condense each declaration on a line
--- @treturn string the CSS prettified
function util . prettifyCSS ( css_text , condensed )
if not condensed then
-- Get rid of \t so we can use it as a replacement/hiding char
css_text = css_text : gsub ( " \t " , " " )
-- Wrap and indent declarations
-- (each gsub result is assigned to a single variable, which keeps the
-- string and drops the match count)
css_text = css_text : gsub ( " %s*{%s* " , " { \n " )
css_text = css_text : gsub ( " ;%s*}%s* " , " ; \n } \n " )
css_text = css_text : gsub ( " ;%s*([^}]) " , " ; \n %1 " )
css_text = css_text : gsub ( " %s*}%s* " , " \n } \n " )
-- Cleanup declarations
-- (the callback receives each "{ ... }" block and returns its replacement)
css_text = css_text : gsub ( " {[^}]*} " , function ( s )
s = s : gsub ( " %s*:%s* " , " : " )
-- Temporarily hide/replace ',' in declaration so they
-- are not matched and made multi-lines by followup gsub
s = s : gsub ( " %s*,%s* " , " \t " )
return s
end )
-- Have each selector (separated by ',') on a new line
css_text = css_text : gsub ( " %s*,%s* " , " , \n " )
-- Restore hidden ',' in declarations
css_text = css_text : gsub ( " \t " , " , " )
else
-- Go thru previous method to have something standard to work on
css_text = util.prettifyCSS ( css_text )
-- And condense that
css_text = css_text : gsub ( " { \n " , " { " )
css_text = css_text : gsub ( " ; \n " , " ; " )
css_text = css_text : gsub ( " \n } " , " } " )
css_text = css_text : gsub ( " , \n " , " , " )
end
return css_text
end
--- Escape list for shell usage
-- Each argument is wrapped in single quotes, and every single quote inside
-- an argument is written as '\'' (close the quotes, an escaped quote, reopen
-- the quotes). The escaped arguments are joined with a single space.
--- @table args the list of arguments to escape
--- @treturn string the escaped and concatenated arguments
function util.shell_escape(args)
    local escaped_args = {}
    for i, arg in ipairs(args) do
        -- tostring() lets numbers be passed as arguments too. As an operand
        -- of "..", the gsub call contributes only its first result.
        escaped_args[i] = "'" .. tostring(arg):gsub("'", "'\\''") .. "'"
    end
    return table.concat(escaped_args, " ")
end
--- Clear all the elements from a table without reassignment.
-- Sets the entries at integer indices 0 through #t to nil, in place.
-- Entries stored under any other key are left untouched.
--- @table t the table to be cleared
function util.clearTable(t)
    local index = #t
    while index >= 0 do
        t[index] = nil
        index = index - 1
    end
end
--- Dumps a table into a file.
-- The file is overwritten with the text "return " followed by the output of
-- the dump module for t.
--- @table t the table to be dumped
--- @string file the file to store the table
--- @treturn bool true on success, false otherwise
function util.dumpTable(t, file)
    if not t or not file or file == "" then return false end
    local dump = require("dump")
    local f = io.open(file, "w")
    if not f then
        return false
    end
    -- write() returns a false value on failure.
    local written = f:write("return " .. dump(t))
    f:close()
    return written and true or false
end
--- Encode URL also known as percent-encoding see https://en.wikipedia.org/wiki/Percent-encoding
-- Every newline is first turned into CR LF. Letters, digits and the
-- characters - . _ ~ ! * ' ( ) are kept; every other byte becomes %XX
-- (two uppercase hexadecimal digits).
--- @string url the string to encode
--- @treturn string the encoded string (nothing is returned when url is nil)
--- Taken from https://gist.github.com/liukun/f9ce7d6d14fa45fe9b924a3eed5c3d99
function util.urlEncode(url)
    if url == nil then
        return
    end
    local char_to_hex = function(c)
        return string.format("%%%02X", string.byte(c))
    end
    url = url:gsub("\n", "\r\n")
    url = url:gsub("([^%w%-%.%_%~%!%*%'%(%)])", char_to_hex)
    return url
end
--- Decode URL (reverse process to util.urlEncode())
-- Every %XX sequence (two hexadecimal digits) is replaced by the byte with
-- that value; everything else is left as it is.
--- @string url the string to decode
--- @treturn string the decoded string (nothing is returned when url is nil)
--- Taken from https://gist.github.com/liukun/f9ce7d6d14fa45fe9b924a3eed5c3d99
function util.urlDecode(url)
    if url == nil then
        return
    end
    local hex_to_char = function(x)
        return string.char(tonumber(x, 16))
    end
    url = url:gsub("%%(%x%x)", hex_to_char)
    return url
end
--- Check lua syntax of string
-- The text is only compiled, never run.
--- @string lua_text lua code text
--- @treturn string with parsing error, nil if syntax ok
function util.checkLuaSyntax(lua_text)
    -- loadstring() is the Lua 5.1 / LuaJIT name; later versions only have load().
    local compile = loadstring or load
    local lua_code_ok, err = compile(lua_text)
    if lua_code_ok then
        return nil
    end
    -- Replace: [string "blah blah..."]:3: '=' expected near '123'
    -- with: Line 3: '=' expected near '123'
    err = err:gsub("%[string \".-%\"]:", "Line ")
    return err
end
-- Simple startsWith / endsWith string helpers
-- c.f., http://lua-users.org/wiki/StringRecipes

--- Tells whether a string begins with the given prefix.
--- @string str source string
--- @string start string to match
--- @treturn bool true on success
function util.stringStartsWith(str, start)
    -- Compare the leading #start bytes of str against start.
    local head = str:sub(1, #start)
    return head == start
end
--- Tells whether a string ends with the given suffix.
--- @string str source string
--- @string ending string to match
--- @treturn bool true on success
function util.stringEndsWith(str, ending)
    -- The empty suffix needs its own check: str:sub(-0) is the whole string,
    -- not the empty string.
    return ending == "" or str:sub(-#ending) == ending
end
2015-02-01 09:40:34 +00:00
-- Hand the module table, with the helper functions defined above, back to whoever loads this file.
return util