Version 20200726.01. Fully support new and old design for posts.

pull/5/head
arkiver 4 years ago
parent 56571306dd
commit 2f6a602313

@ -1,168 +1,2 @@
https://www.reddit.com/
https://www.redditstatic.com/desktop2x/img/snoo-upvote.png
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Regular-e50c34178d20d5fa4ab3c1f6c67901a9.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Italic-0b0b9b2b7159c9bc6463e7ab3b0e8bd0.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Italic-5267af566ab853eb9d74db1a78a46c67.woff
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-b85bf848c28799f5ad34ee29db68051c.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-c34ba754b7235b49d33b294ff7a54179.woff
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-Italic-5a241c76c24e463ef9bcc5855d20209b.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-Italic-255b4934a1f414dd312aa89382d65114.woff
https://www.redditstatic.com/desktop2x/fonts/NotoMono/Regular-b16bb0524a7e7ee597970333c0c67180.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoMono/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff
https://www.redditstatic.com/desktop2x/fonts/NotoMono/el-Regular-29d72243d2cd6145b28bcb80dc33f0e4.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoMono/el-Regular-06ee3f893717454d11a16c3e8d0aa9f9.woff
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Regular-116bb6d508f5307861d3b1269bc597e7.woff2
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Medium-c4b185e25a4dde85a29f902cd5ce5360.woff2
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Medium-1051a531d3e1ee3483a6533158557139.woff
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Bold-875de5047556e7c822519d95d7ee692d.woff2
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Bold-c34ba754b7235b49d33b294ff7a54179.woff
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.eot
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.woff
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.ttf
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.svg
https://www.redditstatic.com/desktop2x/Legacy~runtime~Reddit.97787cdd6c63f5bae0e3.js
https://www.redditstatic.com/desktop2x/Legacy~RedesignContentFonts.b488720bff09b2af6ac0.js
https://www.redditstatic.com/desktop2x/Legacy~RedesignSystemFonts.6f01c338e1546906d45d.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Poll~Reddit~RedesignChat.b7dece05c8943cdea084.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Governance~Reddit.91fc1d178146f81897d1.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Governance~Reddit.b2f8d4f01894b2592d5b.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Client~Governance~Reddit.f0c54af17c78c7813e02.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Gifts~Poll~Reddit.465b9d2661dafd9d1341.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Governance~Reddit.faa615b571967bf75e18.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Reddit.2d2f290c34f50d2de6eb.js
https://www.redditstatic.com/desktop2x/Legacy~Governance~Profile~ProfileHomepage~ProfilePostComments~R2CommentsPage~R2Listing~Reddit.1e5b73bb32dd79ae00af.js
https://www.redditstatic.com/desktop2x/Legacy~ChatPage~Client~Gifts~Governance~Reddit.1b90b6f863290aa3e6d6.js
https://www.redditstatic.com/desktop2x/Legacy~Chat~Client~Gifts~Governance~Reddit.c28cb0086d650fcaf481.js
https://www.redditstatic.com/desktop2x/Legacy~Chat~Client~Governance~Reddit.8b3e1712da474ac3ca10.js
https://www.redditstatic.com/desktop2x/Legacy~Chat~Governance~Reddit~RedesignChat.f6ae3f8ed29ed729ce89.js
https://www.redditstatic.com/desktop2x/Legacy~Client~Governance~Reddit.342d915109c0445b1569.js
https://www.redditstatic.com/desktop2x/Legacy~Governance~Reddit.4be452fa5e67799d348a.js
https://www.redditstatic.com/desktop2x/Legacy~Reddit.4a01b13741b20152b396.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPag~2698b78e.7a3127ba5ec30ed4e1c0.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~CollectionCommentsPage~CommentsPage~Frontpage~PostCreation~RedesignChat~RichTextEditor~~f6a0790c.b36927a082075e6f9b86.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~CollectionCommentsPage~CommentsPage.f88935fd981121681471.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GlobalModalContainer~GovernanceReleaseNotesMod~6b4ca950.94d657605a47d3304551.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GovernanceReleaseNotesModal~ModListing~ModQueu~db251346.e51c88db6b855beb60a4.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Frontpage~ModListing~ModQueuePages~ModerationPages~Multireddit~N~0ef8faec.67b48672a7164c7362bc.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GovernanceReleaseNotesModal~ModListing~ModQueu~1084d5fc.a27786f9cac978bb4a0a.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPages~Multi~fc7712a4.441c314ec50477ea7912.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPages~Multi~d27514f2.1da837e692736e5e6c43.js
https://www.redditstatic.com/desktop2x/Legacy~CommentsPage.51d37b5d4496c188cc12.js
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-57x57.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-60x60.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-72x72.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-76x76.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-114x114.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-120x120.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-144x144.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-152x152.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-180x180.png
https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-32x32.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-96x96.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-16x16.png
https://www.redditstatic.com/desktop2x/img/favicon/manifest.json
https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png
https://www.redditstatic.com/desktop2x/js/ads.js
https://www.redditstatic.com/desktop2x/Legacy~Subreddit.32e5fa17c24840b8cbca.js
https://www.redditstatic.com/desktop2x/Legacy~Frontpage.cae047d3c2afb9e86a73.js
https://www.redditstatic.com/desktop2x/chunkCSS/Reddit.583754539e6661085608.css
https://www.redditstatic.com/desktop2x/img/gold/badges/award-gold-medium.png?v=2
https://about.reddit.com/
https://www.redditinc.com/
http://www.w3.org/2000/svg
https://www.redditstatic.com/desktop2x/img/favicon/ms-icon-144x144.png
https://s.imgur.com/min/sharePlayer.css?1554398656
https://s.imgur.com/min/imageViewerInline.js?1554398656
https://i.imgur.com/favicon.ico
https://imgur.com/favicon.ico
https://s.imgur.com/min/sharePlayer.js?1554398656
https://s.imgur.com/images/share-player-ffbg.png
https://s.imgur.com/images/loaders/ddddd1_181817/24.gif
https://s.imgur.com/images/favicon-32x32.png
https://s.imgur.com/images/favicon-96x96.png
https://s.imgur.com/images/favicon-16x16.png
https://s.imgur.com/min/global.css?1554398656
https://s.imgur.com/min/gallery.css?1554398656
https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js
https://s.imgur.com/include/js/ext/jquery.2.1.1.min.js
https://s.imgur.com/min/react15.js?1554398656
https://s.imgur.com/min/global.js?1554398656
https://s.imgur.com/min/advertising.js?1554398656
https://s.imgur.com/min/px.js?ch=1
https://s.imgur.com/min/px.js?ch=2
https://s.imgur.com/min/runSlots.js?1554398656
https://s.imgur.com/min/gallery.js?1554398656
https://s.imgur.com/include/fonts/imgur.eot?7
https://s.imgur.com/include/fonts/imgur.woff?7
https://s.imgur.com/include/fonts/imgur.ttf?7
https://s.imgur.com/include/fonts/imgur.svg?7
https://s.imgur.com/include/fonts/proxima-nova-regular.eot
https://s.imgur.com/include/fonts/proxima-nova-regular.eot?
https://s.imgur.com/include/fonts/proxima-nova-regular.woff2
https://s.imgur.com/include/fonts/proxima-nova-regular.woff
https://s.imgur.com/include/fonts/proxima-nova-regular.ttf
https://s.imgur.com/include/fonts/proxima-nova-regular.svg
https://s.imgur.com/include/fonts/proxima-nova-bold.eot
https://s.imgur.com/include/fonts/proxima-nova-bold.eot?
https://s.imgur.com/include/fonts/proxima-nova-bold.woff2
https://s.imgur.com/include/fonts/proxima-nova-bold.woff
https://s.imgur.com/include/fonts/proxima-nova-bold.ttf
https://s.imgur.com/include/fonts/proxima-nova-bold.svg
https://s.imgur.com/images/site-sprite.png?1430420391
https://s.imgur.com/images/button-icons.png
https://s.imgur.com/images/imgur-logo.svg?1
https://s.imgur.com/images/svg/comment-notification.svg
https://s.imgur.com/images/svg/stars-notoriety.svg
https://s.imgur.com/images/house-cta/cta-background.jpg
https://s.imgur.com/images/house-cta/snowflake1.png
https://s.imgur.com/images/house-cta/snowflake2.png
https://s.imgur.com/images/house-cta/snowflake3.png
https://s.imgur.com/images/loaders/181817_ffffff/48.gif
https://s.imgur.com/images/house-cta/cta-sms-stars.png
https://s.imgur.com/images/house-cta/cta-sms-phone.png
https://s.imgur.com/images/icons/close-outline.svg
https://s.imgur.com/images/datepicker/datepicker_t.png
https://s.imgur.com/images/datepicker/datepicker_b.png
https://s.imgur.com/images/datepicker/datepicker_l.png
https://s.imgur.com/images/datepicker/datepicker_r.png
https://s.imgur.com/images/datepicker/datepicker_tl.png
https://s.imgur.com/images/datepicker/datepicker_tr.png
https://s.imgur.com/images/datepicker/datepicker_bl.png
https://s.imgur.com/images/datepicker/datepicker_br.png
https://s.imgur.com/images/imgur.gif
https://s.imgur.com/images/loaders/ddddd1_181817/48.gif
https://s.imgur.com/images/loaders/ddddd1_2b2b2b/24.gif
https://s.imgur.com/images/tipsy.png
https://s.imgur.com/include/magnify.cur
https://s.imgur.com/images/icons/icon-cake.svg
https://s.imgur.com/images/loaders/ddddd1_2b2b2b/48.gif
https://s.imgur.com/images/loaders/ddddd1_121211/48.gif
https://s.imgur.com/images/report-ad-layouts.png
https://s.imgur.com/images/radiobox_checkmark_small.png
https://s.imgur.com/images/icons/volume-high.png
https://s.imgur.com/images/icons/volume-mute.png
https://s.imgur.com/images/icons/full-screen.png
https://s.imgur.com/images/icons/full-screen-minimize.png
https://s.imgur.com/images/reactionGifPromoAsset.png
https://s.imgur.com/images/calendar.png
https://imgur.com/6JayaOr.png?
https://i.imgur.com/6JayaOr.png?
https://s.imgur.com/images/buttons-sprite.png
https://s.imgur.com/images/giraffe-tophat.png
https://s.imgur.com/images/icons/Teal-Folder.svg
https://s.imgur.com/images/icons/Outline-Folder.svg
https://s.imgur.com/images/loaders/ddddd1_121211/16.gif
https://s.imgur.com/images/fp-edit.png
https://s.imgur.com/images/dot-dot-dot.svg
http://www.redditblog.com/
https://www.redditblog.com/
https://redditblog.com/
https://www.reddithelp.com/
https://www.reddithelp.com/en
http://redditgifts.com/
https://www.redditgifts.com/
https://www.reddithelp.com/
https://about.reddit.com/
https://old.reddit.com/static/opensearch.xml
https://reddit.com/static/pixel.png

@ -54,8 +54,8 @@ if not WGET_AT:
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20200701.01'
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; WOW64; Trident/4.0; SLCC1)'
VERSION = '20200726.01'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'reddit'
TRACKER_HOST = 'trackerproxy.meo.ws'
@ -216,6 +216,7 @@ class WgetArgs(object):
'--timeout', '30',
'--tries', 'inf',
'--domains', 'reddit.com',
'--header', 'Cookie: over18=1',
'--span-hosts',
'--waitretry', '30',
'--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
@ -237,18 +238,21 @@ class WgetArgs(object):
])
item_name = item['item_name']
item_type, item_value = item_name.split(':', 1)
item_type, item_value = item_name.split('.', 1)
item['item_type'] = item_type
item['item_value'] = item_value
if item_type == 'posts':
if item_type in ('posts', 'comments'):
start, end = item_value.split('-')
for i in range(int(start), int(end)+1):
post_id = self.int_to_str(i)
wget_args.extend(['--warc-header', 'reddit-post: {}'.format(post_id)])
wget_args.append('https://www.reddit.com/comments/{}'.format(post_id))
#wget_args.append('https://old.reddit.com/comments/{}'.format(post_id))
if item_type == 'posts':
wget_args.extend(['--warc-header', 'reddit-post: {}'.format(post_id)])
wget_args.append('https://www.reddit.com/api/info.json?id=t3_{}'.format(post_id))
elif item_type == 'comments':
wget_args.extend(['--warc-header', 'reddit-comment: {}'.format(post_id)])
wget_args.append('https://www.reddit.com/api/info.json?id=t1_{}'.format(post_id))
else:
raise Exception('Unknown item')

@ -15,6 +15,7 @@ local abortgrab = false
local posts = {}
local requested_children = {}
local thumbs = {}
for ignore in io.open("ignore-list", "r"):lines() do
downloaded[ignore] = true
@ -46,19 +47,50 @@ processed = function(url)
return false
end
allowed = function(url, parenturl, source)
allowed = function(url, parenturl)
local match = string.match(url, "^https?://[^%.]+%.thumbs%.redditmedia%.com/([^%.]+)%.")
if match
and parenturl
and string.match(parenturl, "^https?://www%.reddit%.com/api/info%.json%?id=") then
thumbs[match] = true
end
if match and not thumbs[match] then
return false
end
if string.match(url, "'+")
or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
or string.match(url, "^https?://[^/]*reddit%.com/login")
or string.match(url, "^https?://[^/]*reddit%.com/register")
or string.match(url, "%?sort=")
or string.match(url, "^https?://[^/]*reddit%.app%.link/")
or string.match(url, "^https?://out%.reddit%.com/r/")
or (string.match(url, "^https?://gateway%.reddit%.com/") and not string.match(url, "/morecomments/"))
or string.match(url, "/%.rss$")
or (parenturl and string.match(url, "^https?://amp%.reddit%.com/")) then
or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
or string.match(url, "^https?://[^/]*reddit%.com/login")
or string.match(url, "^https?://[^/]*reddit%.com/register")
or string.match(url, "%?sort=")
or string.match(url, "%?limit=500$")
or string.match(url, "%?ref=readnext$")
or string.match(url, "^https?://[^/]*reddit%.app%.link/")
or string.match(url, "^https?://out%.reddit%.com/r/")
or string.match(url, "^https?://emoji%.redditmedia%.com/")
or string.match(url, "^https?://styles%.redditmedia%.com/")
or string.match(url, "^https?://[^%.]+%.redd%.it/award_images/")
or (
string.match(url, "^https?://gateway%.reddit%.com/")
and not string.match(url, "/morecomments/")
)
or string.match(url, "/%.rss$")
or (
parenturl
and string.match(url, "^https?://amp%.reddit%.com/")
)
or (
item_type == "posts"
and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?$")
)
or (
parenturl
and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
) then
return false
end
@ -77,10 +109,13 @@ allowed = function(url, parenturl, source)
return false
end
if string.match(url, "^https?://[^/]*redditmedia%.com/")
or string.match(url, "^https?://www%.reddit%.com/api/morechildren$")
or string.match(url, "^https?://v%.redd%.it/[^/]+/[^/]+$")
or string.match(url, "^https?://preview%.redd%.it/[^/]+/[^/]+$") then
if (string.match(url, "^https?://[^/]*redditmedia%.com/")
or string.match(url, "^https?://old%.reddit%.com/api/morechildren$")
or string.match(url, "^https?://v%.redd%.it/")
or string.match(url, "^https?://i%.redd%.it/")
or string.match(url, "^https?://[^%.]*preview%.redd%.it/.")
)
and not string.match(item_type, "comment") then
return true
end
@ -89,16 +124,6 @@ allowed = function(url, parenturl, source)
return true
end
end
if parenturl
and string.match(parenturl, "^https?://www%.reddit%.com/")
and source ~= "download_child_p"
and not string.match(url, "^https?://[^/]*reddit%.com/")
and not string.match(url, "^https?://[^/]*youtube%.com")
and not string.match(url, "^https?://[^/]*youtu%.be")
and not string.match(url, "^https?://[^/]*redd%.it/") then
return true
end
return false
end
@ -107,18 +132,27 @@ wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_pars
local url = urlpos["url"]["url"]
local html = urlpos["link_expect_html"]
if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") then
if item_type == "comments" then
return false
end
if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
or string.match(url, "^https?://[^/]*redditstatic%.com/")
or string.match(url, "^https?://old%.reddit%.com/static/")
or string.match(url, "^https?://www%.reddit%.com/static/")
or string.match(url, "^https?://styles%.redditmedia%.com/")
or string.match(url, "^https?://emoji%.redditmedia%.com/")
or string.match(url, "/%.rss$") then
return false
end
if string.match(parent["url"], "^https?://www%.reddit%.com/comments/[a-z0-9]+") then
if string.match(parent["url"], "^https?://old%.reddit%.com/comments/[a-z0-9]+") then
return true
end
if not processed(url)
and (allowed(url, parent["url"], "download_child_p") or (allowed(parent["url"], nil, "download_child_p") and html == 0)) then
and (allowed(url, parent["url"]) or (allowed(parent["url"]) and html == 0)) then
addedtolist[url] = true
print('b ' .. html .. ' ' .. url)
return true
end
@ -137,12 +171,11 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
local url_ = string.gsub(string.match(url, "^(.-)%.?$"), "&amp;", "&")
if not processed(url_)
and string.match(url_, "^https?://.+")
and allowed(url_, origurl, "get_urls")
and allowed(url_, origurl)
and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then
table.insert(urls, { url=url_ })
addedtolist[url_] = true
addedtolist[url] = true
print('a ' .. url)
end
end
@ -170,79 +203,100 @@ print('a ' .. url)
if string.match(newurl, "^%?") then
check(string.match(url, "^(https?://[^%?]+)") .. newurl)
elseif not (string.match(newurl, "^https?:\\?/\\?//?/?")
or string.match(newurl, "^[/\\]")
or string.match(newurl, "^%./")
or string.match(newurl, "^[jJ]ava[sS]cript:")
or string.match(newurl, "^[mM]ail[tT]o:")
or string.match(newurl, "^vine:")
or string.match(newurl, "^android%-app:")
or string.match(newurl, "^ios%-app:")
or string.match(newurl, "^%${")) then
or string.match(newurl, "^[/\\]")
or string.match(newurl, "^%./")
or string.match(newurl, "^[jJ]ava[sS]cript:")
or string.match(newurl, "^[mM]ail[tT]o:")
or string.match(newurl, "^vine:")
or string.match(newurl, "^android%-app:")
or string.match(newurl, "^ios%-app:")
or string.match(newurl, "^data:")
or string.match(newurl, "^%${")) then
check(string.match(url, "^(https?://.+/)") .. newurl)
end
end
if string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[a-z0-9]+") then
posts[string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([a-z0-9]+)")] = true
if string.match(url, "^https?://www%.reddit%.com/") then
check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/"))
--elseif string.match(url, "^https?://old%.reddit%.com/") then
-- check(string.gsub(url, "^https?://old%.reddit%.com/", "https://www.reddit.com/"))
end
if allowed(url, nil, "get_urls")
and status_code < 300
and not string.match(url, "^https?://[^/]*redditmedia%.com/")
and not string.match(url, "^https?://[^/]*redditstatic%.com/")
and not string.match(url, "^https?://out%.reddit%.com/")
and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts$")
and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*$") then
if allowed(url)
and status_code < 300
and not string.match(url, "^https?://[^/]*redditmedia%.com/")
and not string.match(url, "^https?://[^/]*redditstatic%.com/")
and not string.match(url, "^https?://out%.reddit%.com/")
and not string.match(url, "^https?://[^%.]*preview%.redd%.it/")
and not string.match(url, "^https?://i%.redd%.it/")
and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts")
and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.mp4") then
html = read_file(file)
if string.match(url, "^https?://www%.reddit%.com/api/morechildren$") then
if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then
html = string.gsub(html, '\\"', '"')
elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/")
or string.match(url, "^https?://www%.reddit%.com/r/[^/]+/duplicates/") then
elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/")
or string.match(url, "^https?://old%.reddit%.com/r/[^/]+/duplicates/") then
html = string.gsub(html, "<div%s+class='spacer'>%s*<div%s+class=\"titlebox\">.-</div>%s*</div>%s*<div%s+class='spacer'>%s*<div%s+id=\"ad_[0-9]+\"%s*class=\"ad%-container%s*\">", "")
end
if string.match(url, "^https?://www%.reddit%.com/") then
for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*[0-9]+,%s*'[^']+'%))") do
local link_id, sort, children, depth, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*([0-9]+),%s*'([^']+)'%)$")
if string.match(url, "^https?://old%.reddit%.com/") then
for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*'[^']+'%))") do
local link_id, sort, children, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*'([^']+)'%)$")
local id = string.match(children, "^([^,]+)")
local subreddit = string.match(html, 'data%-subreddit="([^"]+)"')
local post_data = "link_id=" .. link_id .. "&sort=" .. sort .. "&children=" .. string.gsub(children, ",", "%%2C") .. "&depth=" .. depth .. "&id=t1_" .. id .. "&limit_children=" .. limit_children .. "&r=" .. subreddit .. "&renderstyle=html"
if requested_children[post_data] == nil then
local post_data =
"link_id=" .. link_id ..
"&sort=" .. sort ..
"&children=" .. string.gsub(children, ",", "%%2C") ..
"&id=t1_" .. id ..
"&limit_children=" .. limit_children ..
"&r=" .. subreddit ..
"&renderstyle=html"
if not requested_children[post_data] then
requested_children[post_data] = true
table.insert(urls, {url="https://www.reddit.com/api/morechildren",
table.insert(urls, {url="https://old.reddit.com/api/morechildren",
post_data=post_data})
end
end
--elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
-- or string.match(url, "^https?://www%.reddit%.com/comments/[^/]")
-- or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
-- for s in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do
-- local post_data = '{"token":"' .. s .. '"}'
-- local comment_id = nil
-- if string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") then
-- comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)")
-- elseif string.match(url, "^https?://www%.reddit%.com/comments/[^/]") then
-- comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)")
-- elseif string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
-- comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)")
-- end
-- if requested_children[post_data] == nil then
-- requested_children[post_data] = true
-- table.insert(urls, {url="https://gateway.reddit.com/desktopapi/v1/morecomments/t3_" .. comment_id .. "?rtj=only&allow_over18=1&include=",
-- post_data=post_data})
-- end
-- end
elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
or string.match(url, "^https?://www%.reddit%.com/comments/[^/]")
or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
for s in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do
local post_data = '{"token":"' .. s .. '"}'
local comment_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)")
if comment_id == nil then
comment_id = string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)")
end
if comment_id == nil then
comment_id = string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)")
end
if comment_id == nil then
print("Could not find comment ID.")
abortgrab = true
end
if not requested_children[post_data] then
requested_children[post_data] = true
table.insert(urls, {url=
"https://gateway.reddit.com/desktopapi/v1/morecomments/t3_" .. comment_id ..
"?emotes_as_images=true" ..
"&rtj=only" ..
"&allow_over18=1" ..
"&include=",
post_data=post_data
})
end
end
end
-- For responses from the new-design "morecomments" gateway endpoint, queue
-- every "permalink" value found in the body as a www.reddit.com URL.
if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then
  for s in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do
    -- The prefix is a literal URL, not a Lua pattern: "https?://" would put a
    -- real "?" into the URL, which no "^https?://" check can ever match.
    -- presumably s is a site-relative path beginning with "/" (confirm
    -- against a sample gateway response).
    check("https://www.reddit.com" .. s)
  end
end
if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.mpd$") then
if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.mpd") then
for s in string.gmatch(html, "<BaseURL>([^<]+)</BaseURL>") do
checknewshorturl(s)
end
end
if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.m3u8$") then
if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.m3u8") then
for s in string.gmatch(html, "(.-)\n") do
if not string.match(s, "^#") then
checknewshorturl(s)
@ -279,6 +333,11 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. " \n")
io.stdout:flush()
-- Record the ID of the item whose info.json lookup was just fetched. The
-- captured ID (the part after "t<digit>_") is stored as a key in `posts`.
-- Every dot in the host name is escaped so that only the literal host
-- www.reddit.com matches.
local match = string.match(url["url"], "^https?://www%.reddit%.com/api/info%.json%?id=t[0-9]_([a-z0-9]+)$")
if match then
  posts[match] = true
end
if status_code == 204 then
return wget.actions.EXIT
end
@ -292,25 +351,17 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
elseif not string.match(newloc, "^https?://") then
newloc = string.match(url["url"], "^(https?://.+/)") .. newloc
end
if downloaded[newloc] == true or addedtolist[newloc] == true then
if processed(newloc) or not allowed(newloc, url["url"]) then
return wget.actions.EXIT
end
end
if downloaded[url["url"]] and http_stat["rderrmsg"] then
io.stdout:write("Url was already downloaded.\n")
io.stdout:write(http_stat["rderrmsg"] .. "\n")
io.stdout:write("Skipping URL.\n")
io.stdout:flush()
return wget.actions.EXIT
end
if (status_code >= 200 and status_code <= 399) then
downloaded[url["url"]] = true
downloaded[string.gsub(url["url"], "https?://", "http://")] = true
end
if abortgrab == true then
if abortgrab then
io.stdout:write("ABORTING...\n")
return wget.actions.ABORT
end
@ -321,23 +372,26 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
io.stdout:write("Server returned " .. http_stat.statcode .. " (" .. err .. "). Sleeping.\n")
io.stdout:flush()
local maxtries = 8
if not allowed(url["url"], nil, "httploop_result") then
if not allowed(url["url"]) then
maxtries = 0
end
if tries >= maxtries then
io.stdout:write("\nI give up...\n")
io.stdout:flush()
tries = 0
if allowed(url["url"], nil, "httploop_result") then
if allowed(url["url"]) then
return wget.actions.ABORT
else
return wget.actions.EXIT
end
else
os.execute("sleep " .. math.floor(math.pow(2, tries)))
tries = tries + 1
return wget.actions.CONTINUE
end
os.execute("sleep " .. math.floor(math.pow(2, tries)))
tries = tries + 1
return wget.actions.CONTINUE
end
-- Stop processing immediately for API "info" lookups on any reddit.com
-- subdomain.
-- NOTE(review): this pattern expects ".../api/info?id=...", but the other
-- references to this endpoint in this change use ".../api/info.json?id=...".
-- If those are the only info URLs ever requested, this branch never fires;
-- confirm whether "info%.json%?id=" was intended before changing it, since
-- enabling it returns before the `tries = 0` reset that follows.
if string.match(url["url"], "^https?://[^/]+%.reddit%.com/api/info%?id=t[0-9]_[a-z0-9]+$") then
return wget.actions.EXIT
end
tries = 0
@ -352,7 +406,7 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
end
wget.callbacks.before_exit = function(exit_status, exit_status_string)
if abortgrab == true then
if abortgrab then
return wget.exits.IO_FAIL
end
return exit_status

Loading…
Cancel
Save