diff --git a/pipeline.py b/pipeline.py index 4fcdb3a..528e5b7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -54,7 +54,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20200727.03' +VERSION = '20200728.01' USER_AGENT = 'Archive Team' TRACKER_ID = 'reddittest' TRACKER_HOST = 'trackerproxy.meo.ws' diff --git a/reddit.lua b/reddit.lua index 81ee793..cd3ca5e 100644 --- a/reddit.lua +++ b/reddit.lua @@ -92,6 +92,11 @@ allowed = function(url, parenturl) parenturl and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/") and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/") + ) + or not ( + string.match(url, "^https?://[^/]*redd%.it/") + or string.match(url, "^https?://[^/]*reddit%.com/") + or string.match(url, "^https?://[^/]*redditmedia%.com/") ) then return false end @@ -274,19 +279,38 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then - for s in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do - local post_data = '{"token":"' .. s .. '"}' - local comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)") - if comment_id == nil then - comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)") - end - if comment_id == nil then - comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)") + local comments_data = nil + if string.match(url, "^https?://www%.reddit%.com/") then + comments_data = string.match(html, '%s*window%.___r%s*=%s*({.+});%s*%s*