Version 20190405.01; support www.reddit.com; support videos; support outlinks

pull/1/head
Arkiver2 5 years ago
parent 9d1ea0c688
commit 8902255c76

@ -0,0 +1,168 @@
https://www.reddit.com/
https://www.redditstatic.com/desktop2x/img/snoo-upvote.png
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Regular-e50c34178d20d5fa4ab3c1f6c67901a9.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Italic-0b0b9b2b7159c9bc6463e7ab3b0e8bd0.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Italic-5267af566ab853eb9d74db1a78a46c67.woff
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-b85bf848c28799f5ad34ee29db68051c.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-c34ba754b7235b49d33b294ff7a54179.woff
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-Italic-5a241c76c24e463ef9bcc5855d20209b.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoSans/Bold-Italic-255b4934a1f414dd312aa89382d65114.woff
https://www.redditstatic.com/desktop2x/fonts/NotoMono/Regular-b16bb0524a7e7ee597970333c0c67180.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoMono/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff
https://www.redditstatic.com/desktop2x/fonts/NotoMono/el-Regular-29d72243d2cd6145b28bcb80dc33f0e4.woff2
https://www.redditstatic.com/desktop2x/fonts/NotoMono/el-Regular-06ee3f893717454d11a16c3e8d0aa9f9.woff
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Regular-116bb6d508f5307861d3b1269bc597e7.woff2
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Regular-e6bbcdd30d3bd4d6b170bcb6d3552cab.woff
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Medium-c4b185e25a4dde85a29f902cd5ce5360.woff2
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Medium-1051a531d3e1ee3483a6533158557139.woff
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Bold-875de5047556e7c822519d95d7ee692d.woff2
https://www.redditstatic.com/desktop2x/fonts/IBMPlexSans/Bold-c34ba754b7235b49d33b294ff7a54179.woff
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.eot
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.woff
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.ttf
https://www.redditstatic.com/desktop2x/fonts/redesignIcon/redesignFont.da087541a3f91c4af004a7c765fb21f4.svg
https://www.redditstatic.com/desktop2x/Legacy~runtime~Reddit.97787cdd6c63f5bae0e3.js
https://www.redditstatic.com/desktop2x/Legacy~RedesignContentFonts.b488720bff09b2af6ac0.js
https://www.redditstatic.com/desktop2x/Legacy~RedesignSystemFonts.6f01c338e1546906d45d.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Poll~Reddit~RedesignChat.b7dece05c8943cdea084.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Governance~Reddit.91fc1d178146f81897d1.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Governance~Reddit.b2f8d4f01894b2592d5b.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Client~Governance~Reddit.f0c54af17c78c7813e02.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Gifts~Poll~Reddit.465b9d2661dafd9d1341.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Governance~Reddit.faa615b571967bf75e18.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Reddit.2d2f290c34f50d2de6eb.js
https://www.redditstatic.com/desktop2x/Legacy~Governance~Profile~ProfileHomepage~ProfilePostComments~R2CommentsPage~R2Listing~Reddit.1e5b73bb32dd79ae00af.js
https://www.redditstatic.com/desktop2x/Legacy~ChatPage~Client~Gifts~Governance~Reddit.1b90b6f863290aa3e6d6.js
https://www.redditstatic.com/desktop2x/Legacy~Chat~Client~Gifts~Governance~Reddit.c28cb0086d650fcaf481.js
https://www.redditstatic.com/desktop2x/Legacy~Chat~Client~Governance~Reddit.8b3e1712da474ac3ca10.js
https://www.redditstatic.com/desktop2x/Legacy~Chat~Governance~Reddit~RedesignChat.f6ae3f8ed29ed729ce89.js
https://www.redditstatic.com/desktop2x/Legacy~Client~Governance~Reddit.342d915109c0445b1569.js
https://www.redditstatic.com/desktop2x/Legacy~Governance~Reddit.4be452fa5e67799d348a.js
https://www.redditstatic.com/desktop2x/Legacy~Reddit.4a01b13741b20152b396.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPag~2698b78e.7a3127ba5ec30ed4e1c0.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~CollectionCommentsPage~CommentsPage~Frontpage~PostCreation~RedesignChat~RichTextEditor~~f6a0790c.b36927a082075e6f9b86.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~CollectionCommentsPage~CommentsPage.f88935fd981121681471.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GlobalModalContainer~GovernanceReleaseNotesMod~6b4ca950.94d657605a47d3304551.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GovernanceReleaseNotesModal~ModListing~ModQueu~db251346.e51c88db6b855beb60a4.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Frontpage~ModListing~ModQueuePages~ModerationPages~Multireddit~N~0ef8faec.67b48672a7164c7362bc.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~GovernanceReleaseNotesModal~ModListing~ModQueu~1084d5fc.a27786f9cac978bb4a0a.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPages~Multi~fc7712a4.441c314ec50477ea7912.js
https://www.redditstatic.com/desktop2x/Legacy~CollectionCommentsPage~CommentsPage~Explore~Frontpage~ModListing~ModQueuePages~ModerationPages~Multi~d27514f2.1da837e692736e5e6c43.js
https://www.redditstatic.com/desktop2x/Legacy~CommentsPage.51d37b5d4496c188cc12.js
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-57x57.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-60x60.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-72x72.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-76x76.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-114x114.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-120x120.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-144x144.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-152x152.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-180x180.png
https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-32x32.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-96x96.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-16x16.png
https://www.redditstatic.com/desktop2x/img/favicon/manifest.json
https://www.redditstatic.com/desktop2x/img/renderTimingPixel.png
https://www.redditstatic.com/desktop2x/js/ads.js
https://www.redditstatic.com/desktop2x/Legacy~Subreddit.32e5fa17c24840b8cbca.js
https://www.redditstatic.com/desktop2x/Legacy~Frontpage.cae047d3c2afb9e86a73.js
https://www.redditstatic.com/desktop2x/chunkCSS/Reddit.583754539e6661085608.css
https://www.redditstatic.com/desktop2x/img/gold/badges/award-gold-medium.png?v=2
https://about.reddit.com/
https://www.redditinc.com/
http://www.w3.org/2000/svg
https://www.redditstatic.com/desktop2x/img/favicon/ms-icon-144x144.png
https://s.imgur.com/min/sharePlayer.css?1554398656
https://s.imgur.com/min/imageViewerInline.js?1554398656
https://i.imgur.com/favicon.ico
https://imgur.com/favicon.ico
https://s.imgur.com/min/sharePlayer.js?1554398656
https://s.imgur.com/images/share-player-ffbg.png
https://s.imgur.com/images/loaders/ddddd1_181817/24.gif
https://s.imgur.com/images/favicon-32x32.png
https://s.imgur.com/images/favicon-96x96.png
https://s.imgur.com/images/favicon-16x16.png
https://s.imgur.com/min/global.css?1554398656
https://s.imgur.com/min/gallery.css?1554398656
https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js
https://s.imgur.com/include/js/ext/jquery.2.1.1.min.js
https://s.imgur.com/min/react15.js?1554398656
https://s.imgur.com/min/global.js?1554398656
https://s.imgur.com/min/advertising.js?1554398656
https://s.imgur.com/min/px.js?ch=1
https://s.imgur.com/min/px.js?ch=2
https://s.imgur.com/min/runSlots.js?1554398656
https://s.imgur.com/min/gallery.js?1554398656
https://s.imgur.com/include/fonts/imgur.eot?7
https://s.imgur.com/include/fonts/imgur.woff?7
https://s.imgur.com/include/fonts/imgur.ttf?7
https://s.imgur.com/include/fonts/imgur.svg?7
https://s.imgur.com/include/fonts/proxima-nova-regular.eot
https://s.imgur.com/include/fonts/proxima-nova-regular.eot?
https://s.imgur.com/include/fonts/proxima-nova-regular.woff2
https://s.imgur.com/include/fonts/proxima-nova-regular.woff
https://s.imgur.com/include/fonts/proxima-nova-regular.ttf
https://s.imgur.com/include/fonts/proxima-nova-regular.svg
https://s.imgur.com/include/fonts/proxima-nova-bold.eot
https://s.imgur.com/include/fonts/proxima-nova-bold.eot?
https://s.imgur.com/include/fonts/proxima-nova-bold.woff2
https://s.imgur.com/include/fonts/proxima-nova-bold.woff
https://s.imgur.com/include/fonts/proxima-nova-bold.ttf
https://s.imgur.com/include/fonts/proxima-nova-bold.svg
https://s.imgur.com/images/site-sprite.png?1430420391
https://s.imgur.com/images/button-icons.png
https://s.imgur.com/images/imgur-logo.svg?1
https://s.imgur.com/images/svg/comment-notification.svg
https://s.imgur.com/images/svg/stars-notoriety.svg
https://s.imgur.com/images/house-cta/cta-background.jpg
https://s.imgur.com/images/house-cta/snowflake1.png
https://s.imgur.com/images/house-cta/snowflake2.png
https://s.imgur.com/images/house-cta/snowflake3.png
https://s.imgur.com/images/loaders/181817_ffffff/48.gif
https://s.imgur.com/images/house-cta/cta-sms-stars.png
https://s.imgur.com/images/house-cta/cta-sms-phone.png
https://s.imgur.com/images/icons/close-outline.svg
https://s.imgur.com/images/datepicker/datepicker_t.png
https://s.imgur.com/images/datepicker/datepicker_b.png
https://s.imgur.com/images/datepicker/datepicker_l.png
https://s.imgur.com/images/datepicker/datepicker_r.png
https://s.imgur.com/images/datepicker/datepicker_tl.png
https://s.imgur.com/images/datepicker/datepicker_tr.png
https://s.imgur.com/images/datepicker/datepicker_bl.png
https://s.imgur.com/images/datepicker/datepicker_br.png
https://s.imgur.com/images/imgur.gif
https://s.imgur.com/images/loaders/ddddd1_181817/48.gif
https://s.imgur.com/images/loaders/ddddd1_2b2b2b/24.gif
https://s.imgur.com/images/tipsy.png
https://s.imgur.com/include/magnify.cur
https://s.imgur.com/images/icons/icon-cake.svg
https://s.imgur.com/images/loaders/ddddd1_2b2b2b/48.gif
https://s.imgur.com/images/loaders/ddddd1_121211/48.gif
https://s.imgur.com/images/report-ad-layouts.png
https://s.imgur.com/images/radiobox_checkmark_small.png
https://s.imgur.com/images/icons/volume-high.png
https://s.imgur.com/images/icons/volume-mute.png
https://s.imgur.com/images/icons/full-screen.png
https://s.imgur.com/images/icons/full-screen-minimize.png
https://s.imgur.com/images/reactionGifPromoAsset.png
https://s.imgur.com/images/calendar.png
https://imgur.com/6JayaOr.png?
https://i.imgur.com/6JayaOr.png?
https://s.imgur.com/images/buttons-sprite.png
https://s.imgur.com/images/giraffe-tophat.png
https://s.imgur.com/images/icons/Teal-Folder.svg
https://s.imgur.com/images/icons/Outline-Folder.svg
https://s.imgur.com/images/loaders/ddddd1_121211/16.gif
https://s.imgur.com/images/fp-edit.png
https://s.imgur.com/images/dot-dot-dot.svg
http://www.redditblog.com/
https://www.redditblog.com/
https://redditblog.com/
https://www.reddithelp.com/
https://www.reddithelp.com/en
http://redditgifts.com/
https://www.redditgifts.com/
https://www.reddithelp.com/
https://about.reddit.com/

@ -69,7 +69,7 @@ if not WGET_LUA:
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20190222.01'
VERSION = '20190405.01'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'reddit'
TRACKER_HOST = 'tracker.archiveteam.org'
@ -202,8 +202,10 @@ class MoveFiles(SimpleTask):
if os.path.exists('%(item_dir)s/%(warc_file_base)s.warc' % item):
raise Exception('Please compile wget with zlib support!')
os.rename('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item,
'%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item)
#os.rename('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item,
# '%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item)
os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item,
'%(data_dir)s/%(warc_file_base)s.warc.gz' % item)
os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item,
'%(data_dir)s/%(warc_file_base)s_data.txt' % item)
@ -246,7 +248,7 @@ class WgetArgs(object):
'-U', USER_AGENT,
'-nv',
'--lua-script', 'reddit.lua',
'--load-cookies', 'cookies',
'--load-cookies', 'cookies.txt',
'-o', ItemInterpolation('%(item_dir)s/wget.log'),
'--no-check-certificate',
'--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
@ -322,12 +324,12 @@ pipeline = Pipeline(
'warc_file_base': ItemValue('warc_file_base')
}
),
Deduplicate(),
PrepareStatsForTracker(
defaults={'downloader': downloader, 'version': VERSION},
file_groups={
'data': [
ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')
#ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
]
},
id_function=stats_id_function,
@ -341,7 +343,8 @@ pipeline = Pipeline(
downloader=downloader,
version=VERSION,
files=[
ItemInterpolation('%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz'),
ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz'),
#ItemInterpolation('%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz'),
ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
],
rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),

@ -15,6 +15,7 @@ local abortgrab = false
local posts = {}
local requested_children = {}
local outlinks = {}
for ignore in io.open("ignore-list", "r"):lines() do
downloaded[ignore] = true
@ -42,10 +43,14 @@ end
allowed = function(url, parenturl)
if string.match(url, "'+")
or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
or string.match(url, "^https?://[^/]*reddit%.com/login")
or string.match(url, "^https?://[^/]*reddit%.com/register")
or string.match(url, "%?sort=")
or string.match(url, "^https?://www%.reddit%.com/") --TEMP
or string.match(url, "^https?://[^/]*reddit%.app%.link/")
or string.match(url, "^https?://out%.reddit%.com/r/")
or (string.match(url, "^https?://gateway%.reddit%.com/") and not string.match(url, "/morecomments/"))
or string.match(url, "/%.rss$") then
return false
end
@ -67,7 +72,8 @@ allowed = function(url, parenturl)
if string.match(url, "^https?://i%.redd%.it/")
or string.match(url, "^https?://[^/]*redditmedia%.com/")
or string.match(url, "^https://old.reddit.com/api/morechildren$") then
or string.match(url, "^https?://old%.reddit%.com/api/morechildren$")
or string.match(url, "^https?://v%.redd%.it/[^/]+/[^/]+$") then
return true
end
@ -76,6 +82,17 @@ allowed = function(url, parenturl)
return true
end
end
-- Outlink rule: accept an off-site URL when it was found on a www.reddit.com
-- page or on a page that was itself recorded as an outlink.
if parenturl and (string.match(parenturl, "^https?://www%.reddit%.com/") or outlinks[parenturl]) then
  -- Targets on reddit.com and YouTube are never accepted by this rule.
  local is_excluded = string.match(url, "^https?://[^/]*reddit%.com/")
    or string.match(url, "^https?://[^/]*youtube%.com")
    or string.match(url, "^https?://[^/]*youtu%.be")
  if not is_excluded then
    -- Only record the URL as an outlink when its parent is not one already.
    -- URLs found on an outlink page are accepted but not recorded, so links
    -- on those pages are not accepted by this rule in turn.
    if outlinks[parenturl] == nil then
      outlinks[url] = true
    end
    return true
  end
end
return false
end
@ -106,7 +123,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
local function check(urla)
local origurl = url
local url = string.match(urla, "^([^#]+)")
local url_ = string.gsub(url, "&amp;", "&")
local url_ = string.gsub(string.match(url, "^(.-)%.?$"), "&amp;", "&")
if (downloaded[url_] ~= true and addedtolist[url_] ~= true)
and allowed(url_, origurl) then
table.insert(urls, { url=url_ })
@ -158,7 +175,10 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
if allowed(url, nil)
and not string.match(url, "^https?://[^/]*redditmedia%.com/")
and not string.match(url, "^https?://[^/]*redditstatic%.com/") then
and not string.match(url, "^https?://[^/]*redditstatic%.com/")
and not string.match(url, "^https?://out%.reddit%.com/")
and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts$")
and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*$") then
html = read_file(file)
if string.match(url, "^https://old.reddit.com/api/morechildren$") then
html = string.gsub(html, '\\"', '"')
@ -175,6 +195,42 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
post_data=post_data})
end
end
elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
or string.match(url, "^https?://www%.reddit%.com/comments/[^/]")
or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
-- The ID used in the request URL depends only on the current page URL, so it
-- is resolved once here instead of being re-matched for every token.
-- Exactly one of these patterns matches, because the enclosing branch already
-- tested the same three URL shapes.
local thread_id = string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([^/]+)")
  or string.match(url, "^https?://www%.reddit%.com/comments/([^/]+)")
  or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_([^%?]+)")
-- Every "token" value found in the page becomes the JSON body of a POST to
-- the gateway morecomments endpoint for this thread.
for token in string.gmatch(html, '"token"%s*:%s*"([^"]+)"') do
  local post_data = '{"token":"' .. token .. '"}'
  -- requested_children is keyed by request body so each token is queued once.
  if requested_children[post_data] == nil then
    requested_children[post_data] = true
    table.insert(urls, {
      url="https://gateway.reddit.com/desktopapi/v1/morecomments/t3_" .. thread_id .. "?rtj=only&allow_over18=1&include=",
      post_data=post_data
    })
  end
end
end
-- Queue the comment permalinks listed in a morecomments API response.
-- The captured values are presumably site-relative paths, since a host is
-- prepended before the result is handed to check().
if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then
  for permalink in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do
    -- The prefix has to be a real URL. "https?://" is pattern syntax and, used
    -- as a literal, produces a URL that none of the "^https?://" filters match.
    check("https://www.reddit.com" .. permalink)
  end
end
-- v.redd.it .mpd manifest: hand every <BaseURL> element's text to
-- checknewshorturl so the referenced media files are considered for download.
if url:match("^https?://v%.redd%.it/[^/]+/[^%.]+%.mpd$") then
  for base_url in html:gmatch("<BaseURL>([^<]+)</BaseURL>") do
    checknewshorturl(base_url)
  end
end
-- v.redd.it .m3u8 playlist: every line that is not a "#" tag/comment line is
-- handed to checknewshorturl as a URI to consider for download.
if string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]+%.m3u8$") then
  -- Split on runs of CR/LF rather than requiring a trailing "\n". This keeps
  -- the last line when the file does not end with a newline, strips "\r" from
  -- CRLF-terminated playlists, and skips empty lines.
  for line in string.gmatch(html, "[^\r\n]+") do
    if not string.match(line, "^#") then
      checknewshorturl(line)
    end
  end
end
for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do
checknewurl(newurl)

Loading…
Cancel
Save