Version 20230910.04. Install the Lua utf8 library. Fix conversion of Unicode code points to UTF-8 characters.

master
arkiver 9 months ago
parent 12abd58d4d
commit 3add4f891c

@ -1 +1,2 @@
FROM atdr.meo.ws/archiveteam/grab-base FROM atdr.meo.ws/archiveteam/grab-base
RUN luarocks install utf8

@ -59,7 +59,7 @@ if not WGET_AT:
# #
# Update this each time you make a non-cosmetic change. # Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker. # It will be added to the WARC files and reported to the tracker.
VERSION = '20230910.03' VERSION = '20230910.04'
TRACKER_ID = 'reddit' TRACKER_ID = 'reddit'
TRACKER_HOST = 'legacy-api.arpa.li' TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 100 MULTI_ITEM_SIZE = 100

@ -1,6 +1,7 @@
local urlparse = require("socket.url") local urlparse = require("socket.url")
local http = require("socket.http") local http = require("socket.http")
local cjson = require("cjson") local cjson = require("cjson")
local utf8 = require("utf8")
local item_names = os.getenv('item_names') local item_names = os.getenv('item_names')
local item_dir = os.getenv('item_dir') local item_dir = os.getenv('item_dir')
@ -283,7 +284,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
url_ = string.gsub( url_ = string.gsub(
url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])",
function (s) function (s)
return unicode_codepoint_as_utf8(tonumber(s, 16)) return utf8.char(tonumber(s, 16))
end end
) )
end end

Loading…
Cancel
Save