From b2654e93171aecbe0e4ebcff1791a842b98d0264 Mon Sep 17 00:00:00 2001 From: arkiver Date: Tue, 9 May 2023 05:43:21 +0200 Subject: [PATCH] Version 20230509.01. Support for new design. --- pipeline.py | 2 +- reddit.lua | 46 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0b74a08..bef2158 100644 --- a/pipeline.py +++ b/pipeline.py @@ -61,7 +61,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20221021.01' +VERSION = '20230509.01' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 20 diff --git a/reddit.lua b/reddit.lua index d00d0ad..8876245 100644 --- a/reddit.lua +++ b/reddit.lua @@ -115,6 +115,7 @@ allowed = function(url, parenturl) or string.match(url, "^https?://old%.reddit%.com/gold%?") or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com") or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit") + or string.match(url, "/%?context=1$") or ( string.match(url, "^https?://gateway%.reddit%.com/") and not string.match(url, "/morecomments/") @@ -148,6 +149,10 @@ allowed = function(url, parenturl) return false end + if string.match(url, "^https?://www%.reddit%.com/svc/") then + return true + end + local tested = {} for s in string.gmatch(url, "([^/]+)") do if tested[s] == nil then @@ -259,10 +264,15 @@ end wget.callbacks.get_urls = function(file, url, is_css, iri) local urls = {} local html = nil + local no_more_svc = false downloaded[url] = true local function check(urla) + if no_more_svc + and string.match(urla, "^https?://[^/]+/svc/") then + return nil + end local origurl = url local url = string.match(urla, "^([^#]+)") local url_ = string.match(url, "^(.-)%.?$") @@ -334,7 +344,8 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) end if string.match(url, "^https?://www%.reddit%.com/") - and 
not string.match(url, "/api/") then + and not string.match(url, "/api/") + and not string.match(url, "^https?://[^/]+/svc/") then check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/")) end @@ -357,9 +368,9 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) and not string.match(url, "%.mpd") ) then html = read_file(file) - if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then + --[[if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then check(url .. "?utm_source=reddit&utm_medium=web2x&context=3") - end + end]] if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then html = string.gsub(html, '\\"', '"') elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/") @@ -385,7 +396,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) post_data=post_data}) end end - elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") + --[[elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/user/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then @@ -435,7 +446,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) post_data=post_data }) end - end + end]] end if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then for s in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do @@ -508,6 +519,19 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) checknewshorturl(url) end end + if string.match(url, "^https?://www%.reddit%.com/svc/") then + for src_url, cursor in string.gmatch(html, '<[^>]+src="([^"]+)"[^>]*>%s*<[^>]+cursor="([^"]+)"') do + src_url = string.gsub(src_url, "&amp;", "&") + local requested_s = src_url .. 
 cursor + if not requested_children[requested_s] then + print("posting with cursor", cursor) + table.insert(urls, {url= + urlparse.absolute(url, src_url), + post_data="cursor=" .. cursor-- .. "&csrf_token=" .. csrf_token + }) + end + end + end if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then json = load_json_file(html) if not json or not json["data"] or not json["data"]["children"] then @@ -537,8 +561,20 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) if crosspost_parent and crosspost_parent ~= string.match(url, "(t[0-9]_[a-z0-9]+)") then is_crosspost = true end + local id = child["data"]["id"] + local subreddit = child["data"]["subreddit"] + if child["kind"] == "t1" then + check("https://www.reddit.com/svc/shreddit/comments/" .. subreddit .. "/" .. child["data"]["link_id"] .. "/t1_" .. id .. "?render-mode=partial&shredtop=") + elseif child["kind"] == "t3" then + check("https://www.reddit.com/svc/shreddit/comments/" .. subreddit .. "/t3_" .. id .. "?render-mode=partial") + else + io.stdout:write("Kind is not supported.\n") + io.stdout:flush() + abort_item() + end end end + no_more_svc = true for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"%s]+)') do checknewurl(newurl) end