mirror of
https://github.com/koreader/koreader
synced 2024-10-31 21:20:20 +00:00
7863a7ad70
* Move natural sorting algo to a dedicated sort module to avoid code duplication * Use a slightly more accurate algorithm, and speed it up by caching intermediary strings * Calibre: Use natural sorting in metadata search (fix #10009)
134 lines
4.3 KiB
Lua
134 lines
4.3 KiB
Lua
--[[--
|
|
This module contains a collection of comparison functions (or factories for comparison functions) for `table.sort`.
|
|
@module sort
|
|
]]
|
|
|
|
-- Module table: the comparison helpers are attached to it, and it is returned at the end of the file.
local sort = {}
|
|
|
|
--[[
|
|
Natural sorting functions, for use with table.sort
|
|
<http://notebook.kulchenko.com/algorithms/alphanumeric-natural-sorting-for-humans-in-lua>
|
|
--]]
|
|
-- Original implementation by Paul Kulchenko
|
|
--[[
|
|
local function addLeadingZeroes(d)
|
|
local dec, n = string.match(d, "(%.?)0*(.+)")
|
|
return #dec > 0 and ("%.12f"):format(d) or ("%s%03d%s"):format(dec, #n, n)
|
|
end
|
|
function sort.natsort(a, b)
|
|
return tostring(a):gsub("%.?%d+", addLeadingZeroes)..("%3d"):format(#b)
|
|
< tostring(b):gsub("%.?%d+", addLeadingZeroes)..("%3d"):format(#a)
|
|
end
|
|
--]]
|
|
-- Hardened (but more expensive) implementation by Egor Skriptunoff, with a UTF-8 tweak by Paul Kulchenko
|
|
--[[
|
|
local function natsort_conv(s)
|
|
local res, dot = "", ""
|
|
for n, m, c in tostring(s):gmatch("(0*(%d*))(.?)") do
|
|
if n == "" then
|
|
dot, c = "", dot..c
|
|
else
|
|
res = res..(dot == "" and ("%03d%s"):format(#m, m)
|
|
or "."..n)
|
|
dot, c = c:match("(%.?)(.*)")
|
|
end
|
|
res = res..c:gsub("[%z\1-\127\192-\255]", "\0%0")
|
|
end
|
|
return res
|
|
end
|
|
--]]
|
|
-- The above conversion is *fairly* expensive,
|
|
-- and table.sort ensures that it'll be called on identical strings multiple times,
|
|
-- so keeping a cache of massaged strings makes sense.
|
|
-- <https://github.com/koreader/koreader/pull/10023#discussion_r1069776657>
|
|
-- We can rely on LRU to avoid explicit cache maintenance concerns
|
|
-- (given the type of content we massage, the memory impact is fairly insignificant).
|
|
-- The extra persistence this affords us also happens to help with the FM use-case ;).
|
|
|
|
-- Dumb persistent hash-map => cold, ~200 to 250ms; hot: ~150ms (which roughly matches sorting by numerical file attributes).
|
|
-- (Numbers are from the FM sorting 350 entries (mostly composed of author names) on an H2O; an uncached run takes ~650ms).
|
|
--[[
|
|
local natsort_cache = {}
|
|
|
|
function sort.natsort(a, b)
|
|
local ca, cb = natsort_cache[a], natsort_cache[b]
|
|
if not ca then
|
|
ca = natsort_conv(a)
|
|
natsort_cache[a] = ca
|
|
end
|
|
if not cb then
|
|
cb = natsort_conv(b)
|
|
natsort_cache[b] = cb
|
|
end
|
|
|
|
return ca < cb or ca == cb and a < b
|
|
end
|
|
--]]
|
|
|
|
-- LRU => cold, ~200 to 250ms; hot ~150 to 175ms (which is barely any slower than a dumb hash-map, yay, LRU and LuaJIT magic).
|
|
--[[
|
|
local lru = require("ffi/lru")
|
|
local natsort_cache = lru.new(1024, nil, false)
|
|
|
|
function sort.natsort(a, b)
|
|
local ca, cb = natsort_cache:get(a), natsort_cache:get(b)
|
|
if not ca then
|
|
ca = natsort_conv(a)
|
|
natsort_cache:set(a, ca)
|
|
end
|
|
if not cb then
|
|
cb = natsort_conv(b)
|
|
natsort_cache:set(b, cb)
|
|
end
|
|
|
|
return ca < cb or ca == cb and a < b
|
|
end
|
|
--]]
|
|
|
|
--[[--
Builds a natural-order comparison function ("file2" sorts before "file10") for `table.sort`.

Every compared value is converted once into a sortable key; that key is memoized in `cache`
(indexed by the original value), so comparing the same value again is a plain table lookup.

@param cache Optional table used to memoize the converted keys, to speed up sorting
@return The comparison function to pass to `table.sort`
@return The cache in use (the very table that was passed in, or a freshly created one)

@usage

-- t is an array of strings, and the cache is thrown away afterwards
table.sort(t, sort.natsort_cmp())

-- t is an array of tables sorted on their "text" field, and the cache is kept around for later sorts
local cmp, cache
cmp, cache = sort.natsort_cmp(cache)
table.sort(t, function(a, b) return cmp(a.text, b.text) end)
]]
function sort.natsort_cmp(cache)
    cache = cache or {}

    -- Converts a value into its sortable key, and stores that key in the cache.
    local function build_key(value)
        local parts, count = {}, 0
        -- A "." that directly followed a run of digits is held back here
        -- until the next match tells us what comes after it.
        local pending_dot = ""
        for digits, significant, tail in tostring(value):gmatch("(0*(%d*))(.?)") do
            if digits == "" then
                -- No digits in this match: a held-back dot is emitted as regular text.
                tail = pending_dot .. tail
                pending_dot = ""
            else
                count = count + 1
                if pending_dot == "" then
                    -- Plain digit run: leading zeroes are dropped, and the remaining digits
                    -- are prefixed with their zero-padded count.
                    parts[count] = ("%03d%s"):format(#significant, significant)
                else
                    -- Digit run right after "<digits>.": kept verbatim, leading zeroes included.
                    parts[count] = "." .. digits
                end
                -- Split the trailing character into "a dot to hold back" and "anything else".
                pending_dot, tail = tail:match("(%.?)(.*)")
            end
            -- Prefix each byte in the 0-127 and 192-255 ranges with a NUL byte
            -- (bytes 128-191 are left untouched).
            count = count + 1
            parts[count] = (tail:gsub("[%z\1-\127\192-\255]", "\0%0"))
        end

        local key = table.concat(parts)
        cache[value] = key
        return key
    end

    local function natsort(a, b)
        local key_a = cache[a] or build_key(a)
        local key_b = cache[b] or build_key(b)
        if key_a < key_b then
            return true
        end
        if key_a ~= key_b then
            return false
        end
        -- Identical keys (e.g. the values only differ by leading zeroes): compare the raw values.
        return a < b
    end

    return natsort, cache
end
|
|
|
|
-- Export the module table.
return sort
|