2
0
mirror of https://github.com/koreader/koreader synced 2024-11-13 19:11:25 +00:00
koreader/frontend/sort.lua
NiLuJe 7863a7ad70
Misc: Natural sorting refactor (#10023)
* Move natural sorting algo to a dedicated sort module to avoid code duplication
* Use a slightly more accurate algorithm, and speed it up by caching intermediary strings
* Calibre: Use natural sorting in metadata search (fix #10009)
2023-01-16 19:36:22 +01:00

134 lines
4.3 KiB
Lua

--[[--
This module contains a collection of comparison functions (or factories for comparison functions) for `table.sort`.
@module sort
]]
local sort = {}
--[[
Natural sorting functions, for use with table.sort
<http://notebook.kulchenko.com/algorithms/alphanumeric-natural-sorting-for-humans-in-lua>
--]]
-- Original implementation by Paul Kulchenko
--[[
local function addLeadingZeroes(d)
local dec, n = string.match(d, "(%.?)0*(.+)")
return #dec > 0 and ("%.12f"):format(d) or ("%s%03d%s"):format(dec, #n, n)
end
function sort.natsort(a, b)
return tostring(a):gsub("%.?%d+", addLeadingZeroes)..("%3d"):format(#b)
< tostring(b):gsub("%.?%d+", addLeadingZeroes)..("%3d"):format(#a)
end
--]]
-- Hardened (but more expensive) implementation by Egor Skriptunoff, with an UTF-8 tweak by Paul Kulchenko
--[[
local function natsort_conv(s)
local res, dot = "", ""
for n, m, c in tostring(s):gmatch("(0*(%d*))(.?)") do
if n == "" then
dot, c = "", dot..c
else
res = res..(dot == "" and ("%03d%s"):format(#m, m)
or "."..n)
dot, c = c:match("(%.?)(.*)")
end
res = res..c:gsub("[%z\1-\127\192-\255]", "\0%0")
end
return res
end
--]]
-- The above conversion is *fairly* expensive,
-- and table.sort ensures that it'll be called on identical strings multiple times,
-- so keeping a cache of massaged strings makes sense.
-- <https://github.com/koreader/koreader/pull/10023#discussion_r1069776657>
-- We can rely on LRU to avoid explicit cache maintenance concerns
-- (given the type of content we massage, the memory impact is fairly insignificant).
-- The extra persistence this affords us also happens to help with the FM use-case ;).
-- Dumb persistent hash-map => cold, ~200 to 250ms; hot: ~150ms (which roughly matches sorting by numerical file attributes).
-- (Numbers are from the FM sorting 350 entries (mostly composed of author names) on an H2O; an uncached run takes ~650ms).
--[[
local natsort_cache = {}
function sort.natsort(a, b)
local ca, cb = natsort_cache[a], natsort_cache[b]
if not ca then
ca = natsort_conv(a)
natsort_cache[a] = ca
end
if not cb then
cb = natsort_conv(b)
natsort_cache[b] = cb
end
return ca < cb or ca == cb and a < b
end
--]]
-- LRU => cold, ~200 to 250ms; hot ~150 to 175ms (which is barely any slower than a dumb hash-map, yay, LRU and LuaJIT magic).
--[[
local lru = require("ffi/lru")
local natsort_cache = lru.new(1024, nil, false)
function sort.natsort(a, b)
local ca, cb = natsort_cache:get(a), natsort_cache:get(b)
if not ca then
ca = natsort_conv(a)
natsort_cache:set(a, ca)
end
if not cb then
cb = natsort_conv(b)
natsort_cache:set(b, cb)
end
return ca < cb or ca == cb and a < b
end
--]]
--[[--
Generates a natural sorting comparison function for table.sort.
@param cache Optional, hashmap used to cache the processed strings to speed up sorting
@return The cmp function to feed to `table.sort`
@return The cache used (same object as the passed one, if any; will be created if not)
@usage
-- t is an array of strings, we don't want to keep the cache around
table.sort(t, sort.natsort_cmp())
-- t is an array of arrays, we want to sort the strings in the "text" field of the inner arrays, and we want to keep the cache around.
local cmp, cache
cmp, cache = sort.natsort_cmp(cache)
table.sort(t, function(a, b) return cmp(a.text, b.text) end)
]]
function sort.natsort_cmp(cache)
if not cache then
cache = {}
end
local function natsort_conv(s)
local res, dot = "", ""
for n, m, c in tostring(s):gmatch("(0*(%d*))(.?)") do
if n == "" then
dot, c = "", dot..c
else
res = res..(dot == "" and ("%03d%s"):format(#m, m)
or "."..n)
dot, c = c:match("(%.?)(.*)")
end
res = res..c:gsub("[%z\1-\127\192-\255]", "\0%0")
end
cache[s] = res
return res
end
local function natsort(a, b)
local ca, cb = cache[a] or natsort_conv(a), cache[b] or natsort_conv(b)
return ca < cb or ca == cb and a < b
end
return natsort, cache
end
return sort