Version 20230509.01. Support for new design.

pull/17/head
arkiver 1 year ago
parent 7f4db17348
commit b2654e9317

@ -61,7 +61,7 @@ if not WGET_AT:
# #
# Update this each time you make a non-cosmetic change. # Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker. # It will be added to the WARC files and reported to the tracker.
VERSION = '20221021.01' VERSION = '20230509.01'
TRACKER_ID = 'reddit' TRACKER_ID = 'reddit'
TRACKER_HOST = 'legacy-api.arpa.li' TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 20 MULTI_ITEM_SIZE = 20

@ -115,6 +115,7 @@ allowed = function(url, parenturl)
or string.match(url, "^https?://old%.reddit%.com/gold%?") or string.match(url, "^https?://old%.reddit%.com/gold%?")
or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com") or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com")
or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit") or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit")
or string.match(url, "/%?context=1$")
or ( or (
string.match(url, "^https?://gateway%.reddit%.com/") string.match(url, "^https?://gateway%.reddit%.com/")
and not string.match(url, "/morecomments/") and not string.match(url, "/morecomments/")
@ -148,6 +149,10 @@ allowed = function(url, parenturl)
return false return false
end end
if string.match(url, "^https?://www%.reddit%.com/svc/") then
return true
end
local tested = {} local tested = {}
for s in string.gmatch(url, "([^/]+)") do for s in string.gmatch(url, "([^/]+)") do
if tested[s] == nil then if tested[s] == nil then
@ -259,10 +264,15 @@ end
wget.callbacks.get_urls = function(file, url, is_css, iri) wget.callbacks.get_urls = function(file, url, is_css, iri)
local urls = {} local urls = {}
local html = nil local html = nil
local no_more_svc = false
downloaded[url] = true downloaded[url] = true
local function check(urla) local function check(urla)
if no_more_svc
and string.match(urla, "^https?://[^/]+/svc/") then
return nil
end
local origurl = url local origurl = url
local url = string.match(urla, "^([^#]+)") local url = string.match(urla, "^([^#]+)")
local url_ = string.match(url, "^(.-)%.?$") local url_ = string.match(url, "^(.-)%.?$")
@ -334,7 +344,8 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
end end
if string.match(url, "^https?://www%.reddit%.com/") if string.match(url, "^https?://www%.reddit%.com/")
and not string.match(url, "/api/") then and not string.match(url, "/api/")
and not string.match(url, "^https?://[^/]+/svc/") then
check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/")) check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/"))
end end
@ -357,9 +368,9 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
and not string.match(url, "%.mpd") and not string.match(url, "%.mpd")
) then ) then
html = read_file(file) html = read_file(file)
if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then --[[if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then
check(url .. "?utm_source=reddit&utm_medium=web2x&context=3") check(url .. "?utm_source=reddit&utm_medium=web2x&context=3")
end end]]
if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then
html = string.gsub(html, '\\"', '"') html = string.gsub(html, '\\"', '"')
elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/") elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/")
@ -385,7 +396,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
post_data=post_data}) post_data=post_data})
end end
end end
elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") --[[elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
or string.match(url, "^https?://www%.reddit%.com/user/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/user/[^/]+/comments/[^/]")
or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/comments/[^/]")
or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
@ -435,7 +446,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
post_data=post_data post_data=post_data
}) })
end end
end end]]
end end
if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then
for s in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do for s in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do
@ -508,6 +519,19 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
checknewshorturl(url) checknewshorturl(url)
end end
end end
if string.match(url, "^https?://www%.reddit%.com/svc/") then
for src_url, cursor in string.gmatch(html, '<faceplate%-partial[^>]+src="([^"]+)"[^>]*>%s*<input%s+type="hidden"%s+name="cursor"%s+value="([^"]+)"%s*/>') do
src_url = string.gsub(src_url, "&amp;", "&")
local requested_s = src_url .. cursor
if not requested_children[requested_s] then
print("posting with cursor", cursor)
table.insert(urls, {url=
urlparse.absolute(url, src_url),
post_data="cursor=" .. cursor-- .. "&csrf_token=" .. csrf_token
})
end
end
end
if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then
json = load_json_file(html) json = load_json_file(html)
if not json or not json["data"] or not json["data"]["children"] then if not json or not json["data"] or not json["data"]["children"] then
@ -537,8 +561,20 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
if crosspost_parent and crosspost_parent ~= string.match(url, "(t[0-9]_[a-z0-9]+)") then if crosspost_parent and crosspost_parent ~= string.match(url, "(t[0-9]_[a-z0-9]+)") then
is_crosspost = true is_crosspost = true
end end
local id = child["data"]["id"]
local subreddit = child["data"]["subreddit"]
if child["kind"] == "t1" then
check("https://www.reddit.com/svc/shreddit/comments/" .. subreddit .. "/" .. child["data"]["link_id"] .. "/t1_" .. id .. "?render-mode=partial&shredtop=")
elseif child["kind"] == "t3" then
check("https://www.reddit.com/svc/shreddit/comments/" .. subreddit .. "/t3_" .. id .. "?render-mode=partial")
else
io.stdout:write("Kind is not supported.\n")
io.stdout:flush()
abort_item()
end
end end
end end
no_more_svc = true
for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"%s]+)') do for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"%s]+)') do
checknewurl(newurl) checknewurl(newurl)
end end

Loading…
Cancel
Save