diff --git a/Dockerfile b/Dockerfile index c3197f3..34a1687 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1 +1,2 @@ FROM atdr.meo.ws/archiveteam/grab-base +RUN luarocks install utf8 diff --git a/pipeline.py b/pipeline.py index 8dc4986..f514f82 100644 --- a/pipeline.py +++ b/pipeline.py @@ -59,7 +59,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20230910.03' +VERSION = '20230910.04' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 100 diff --git a/reddit.lua b/reddit.lua index 84f52fb..e2edce3 100644 --- a/reddit.lua +++ b/reddit.lua @@ -1,6 +1,7 @@ local urlparse = require("socket.url") local http = require("socket.http") local cjson = require("cjson") +local utf8 = require("utf8") local item_names = os.getenv('item_names') local item_dir = os.getenv('item_dir') @@ -283,7 +284,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) url_ = string.gsub( url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", function (s) - return unicode_codepoint_as_utf8(tonumber(s, 16)) + return utf8.char(tonumber(s, 16)) end ) end