Version 20200728.01. Ignore non-reddit URLs. Fix extraction of tokens for morecomments.

pull/5/head
arkiver 4 years ago
parent 9a6417ecbc
commit 450d4e0413

@@ -54,7 +54,7 @@ if not WGET_AT:
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20200727.03'
VERSION = '20200728.01'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'reddittest'
TRACKER_HOST = 'trackerproxy.meo.ws'

@@ -92,6 +92,11 @@ allowed = function(url, parenturl)
parenturl
and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
)
or not (
string.match(url, "^https?://[^/]*redd%.it/")
or string.match(url, "^https?://[^/]*reddit%.com/")
or string.match(url, "^https?://[^/]*redditmedia%.com/")
) then
return false
end
@@ -274,19 +279,38 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
or string.match(url, "^https?://www%.reddit%.com/comments/[^/]")
or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
for s in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do
local post_data = '{"token":"' .. s .. '"}'
local comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)")
if comment_id == nil then
comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)")
end
if comment_id == nil then
comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)")
local comments_data = nil
if string.match(url, "^https?://www%.reddit%.com/") then
comments_data = string.match(html, '<script%s+id="data">%s*window%.___r%s*=%s*({.+});%s*</script>%s*<script>')
if comments_data == nil then
print("Could not find comments data.")
abortgrab = true
end
if comment_id == nil then
print("Could not find comment ID.")
comments_data = load_json_file(comments_data)["moreComments"]["models"]
elseif string.match(url, "^https?://gateway%.reddit%.com/") then
comments_data = load_json_file(html)["moreComments"]
end
if comments_data == nil then
print("Error handling comments data.")
abortgrab = true
end
local comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)")
if comment_id == nil then
comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)")
end
if comment_id == nil then
comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)")
end
if comment_id == nil then
print("Could not find comment ID.")
abortgrab = true
end
for _, d in pairs(comments_data) do
if d["token"] == nil then
print("Could not find token.")
abortgrab = true
end
local post_data = '{"token":"' .. d["token"] .. '"}'
if not requested_children[post_data] then
requested_children[post_data] = true
table.insert(urls, {url=

Loading…
Cancel
Save