From 1b3690d994a024c126cd837a850a852e8117243b Mon Sep 17 00:00:00 2001 From: arkiver Date: Tue, 30 Mar 2021 22:20:43 +0200 Subject: [PATCH] Version 20210330.04. Only decode unicode characters in URLs that are not v.redd.it URLs. --- pipeline.py | 2 +- reddit.lua | 24 +++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pipeline.py b/pipeline.py index d6f1154..90ede67 100644 --- a/pipeline.py +++ b/pipeline.py @@ -60,7 +60,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20210330.03' +VERSION = '20210330.04' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 20 diff --git a/reddit.lua b/reddit.lua index d6b84a1..d384b35 100644 --- a/reddit.lua +++ b/reddit.lua @@ -243,18 +243,20 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) local origurl = url local url = string.match(urla, "^([^#]+)") local url_ = string.match(url, "^(.-)%.?$") - url_ = string.gsub( - url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", - function (s) - local i = tonumber(s, 16) - if i < 128 then - return string.char(i) - else - -- should not have these - abort_item() + if not string.find(url, "v.redd.it") then + url_ = string.gsub( + url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", + function (s) + local i = tonumber(s, 16) + if i < 128 then + return string.char(i) + else + -- should not have these + abort_item() + end end - end - ) + ) + end while string.find(url_, "&") do url_ = string.gsub(url_, "&", "&") end