mirror of
https://github.com/ArchiveTeam/reddit-grab
synced 2024-11-12 01:10:50 +00:00
Handle NULL byte separated multi items. Support unicode chars in JSON permalink.
commit 3d20ca90af
parent 7c5ea717a8
pipeline.py (55 lines changed)
@@ -41,12 +41,7 @@ if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):

 WGET_AT = find_executable(
     'Wget+AT',
-    [
-        'GNU Wget 1.20.3-at.20200401.01',
-        'GNU Wget 1.20.3-at.20200804.01',
-        'GNU Wget 1.20.3-at.20200902.01',
-        'GNU Wget 1.20.3-at.20201030.01'
-    ],
+    ['GNU Wget 1.20.3-at.20201030.01'],
     ['./wget-at']
 )
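The accepted-version list is pinned down to the single current wget-at build. For context, a minimal sketch of how a find_executable helper of this shape can verify a binary against pinned version strings; the real helper is defined earlier in pipeline.py and may differ, and the subprocess usage here is an assumption:

import subprocess

def find_executable_sketch(name, accepted_versions, paths, version_arg='--version'):
    # Return the first candidate path whose --version output contains one of
    # the accepted version strings; None if no candidate qualifies.
    for path in paths:
        try:
            output = subprocess.run([path, version_arg],
                                    capture_output=True, text=True).stdout
        except OSError:
            continue  # candidate not present or not executable
        if any(version in output for version in accepted_versions):
            return path
    return None

# After this commit only one build is accepted:
# find_executable_sketch('Wget+AT', ['GNU Wget 1.20.3-at.20201030.01'], ['./wget-at'])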
@@ -112,7 +107,8 @@ class PrepareDirectories(SimpleTask):

     def process(self, item):
         item_name = item['item_name']
-        escaped_item_name = item_name.replace(':', '_').replace('/', '_').replace('~', '_')
+        item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()
+        escaped_item_name = item_name_hash
         dirname = '/'.join((item['data_dir'], escaped_item_name))

         if os.path.isdir(dirname):
@@ -121,8 +117,11 @@ class PrepareDirectories(SimpleTask):
         os.makedirs(dirname)

         item['item_dir'] = dirname
-        item['warc_file_base'] = '%s-%s-%s' % (self.warc_prefix, escaped_item_name[:50],
-            time.strftime('%Y%m%d-%H%M%S'))
+        item['warc_file_base'] = '-'.join([
+            self.warc_prefix,
+            item_name_hash,
+            time.strftime('%Y%m%d-%H%M%S')
+        ])

         open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close()
         open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()
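Directory and WARC names are now derived from a SHA-1 of the full item name: the digest has a fixed length and is always filesystem-safe, so the old character escaping and the [:50] truncation become unnecessary once item names can be NULL-joined lists. A small worked example (the 'reddit' prefix and 'data' directory are illustrative):

import hashlib
import time

item_name = 'post:abc123\0comment:def456'  # multi-item name with a NULL separator
item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()

# 40 hex chars, nothing that needs escaping in paths or filenames
dirname = '/'.join(('data', item_name_hash))
warc_file_base = '-'.join(['reddit', item_name_hash, time.strftime('%Y%m%d-%H%M%S')])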
@@ -227,11 +226,12 @@ class WgetArgs(object):
             '--waitretry', '30',
             '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
             '--warc-header', 'operator: Archive Team',
-            '--warc-header', 'reddit-dld-script-version: ' + VERSION,
-            '--warc-header', ItemInterpolation('reddit-item: %(item_name)s'),
+            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
+            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
             '--warc-dedup-url-agnostic',
             '--warc-compression-use-zstd',
-            '--warc-zstd-dict-no-include'
+            '--warc-zstd-dict-no-include',
+            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8'
         ]

         dict_data = ZstdDict.get_dict()
@@ -243,21 +243,21 @@ class WgetArgs(object):
                 '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
             ])

-        item_name = item['item_name']
-        item_type, item_value = item_name.split(':', 1)
+        for item_name in item['item_name'].split('\0'):
+            wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
+            wget_args.append('item-name://'+item_name)
+            item_type, item_value = item_name.split(':', 1)
+            if item_type in ('post', 'comment'):
+                if item_type == 'post':
+                    wget_args.extend(['--warc-header', 'reddit-post: '+item_value])
+                    wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)
+                elif item_type == 'comment':
+                    wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])
+                    wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)
+            else:
+                raise Exception('Unknown item')

-        item['item_type'] = item_type
-        item['item_value'] = item_value
-
-        if item_type in ('post', 'comment'):
-            if item_type == 'post':
-                wget_args.extend(['--warc-header', 'reddit-post: {}'.format(item_value)])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t3_{}'.format(item_value))
-            elif item_type == 'comment':
-                wget_args.extend(['--warc-header', 'reddit-comment: {}'.format(item_value)])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t1_{}'.format(item_value))
-        else:
-            raise Exception('Unknown item')
+        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

         if 'bind_address' in globals():
             wget_args.extend(['--bind-address', globals()['bind_address']])
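A queued item is now a NULL-joined list of type:id names. The loop tags the WARC with one x-wget-at-project-item-name header per member, records an item-name:// marker URL, and seeds one info.json URL per member; a newline-joined copy is exported separately because environment variables cannot carry NULL bytes. A minimal sketch of the expansion (IDs made up):

wget_args = []
multi_item = 'post:abc123\0comment:def456'

for item_name in multi_item.split('\0'):
    wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
    wget_args.append('item-name://' + item_name)
    item_type, item_value = item_name.split(':', 1)
    # t3_ marks posts (links) and t1_ marks comments in reddit's "fullname" scheme
    prefix = {'post': 't3_', 'comment': 't1_'}[item_type]
    wget_args.append('https://www.reddit.com/api/info.json?id=' + prefix + item_value)

item_name_newline = multi_item.replace('\0', '\n')  # passed to the Lua side via env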
@@ -294,8 +294,7 @@ pipeline = Pipeline(
         accept_on_exit_code=[0, 4, 8],
         env={
             'item_dir': ItemValue('item_dir'),
-            'item_value': ItemValue('item_value'),
-            'item_type': ItemValue('item_type'),
+            'item_names': ItemValue('item_name_newline'),
             'warc_file_base': ItemValue('warc_file_base'),
         }
     ),
reddit.lua (73 lines changed)
@@ -4,11 +4,18 @@ local urlparse = require("socket.url")
 local http = require("socket.http")
 JSON = (loadfile "JSON.lua")()

-local item_type = os.getenv('item_type')
-local item_value = os.getenv('item_value')
+local item_names = os.getenv('item_names')
 local item_dir = os.getenv('item_dir')
 local warc_file_base = os.getenv('warc_file_base')

+local item_types = {}
+for s in string.gmatch(item_names, "([^\n]+)") do
+  local t, n = string.match(s, "^([^:]+):(.+)$")
+  item_types[n] = t
+end
+
+local item_type = nil
+
 if urlparse == nil or http == nil then
   io.stdout:write("socket not corrently installed.\n")
   io.stdout:flush()
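With several items per process, the single item_type/item_value pair is replaced by a lookup table keyed on ID, rebuilt from the newline-separated item_names environment variable that pipeline.py now exports. The same parsing expressed in Python (IDs illustrative):

import os

os.environ['item_names'] = 'post:abc123\ncomment:def456'  # as set by the pipeline

item_types = {}
for s in os.environ['item_names'].split('\n'):
    t, n = s.split(':', 1)  # 'post:abc123' -> ('post', 'abc123')
    item_types[n] = t

assert item_types == {'abc123': 'post', 'def456': 'comment'}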
@@ -70,12 +77,13 @@ allowed = function(url, parenturl)
   end
   if string.match(url, "'+")
       or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
+      or string.match(urlparse.unescape(url), "[<>\\%*%$;%^%[%],%(%){}]")
       or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
       or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
       or string.match(url, "^https?://[^/]*reddit%.com/login")
       or string.match(url, "^https?://[^/]*reddit%.com/register")
       or string.match(url, "%?sort=")
       or string.match(url, "%?utm_source=reddit")
       or string.match(url, "%?limit=500$")
       or string.match(url, "%?ref=readnext$")
       or string.match(url, "^https?://[^/]*reddit%.app%.link/")
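The added line also rejects URLs whose disallowed characters only show up after percent-decoding, which would otherwise slip past the raw-string check. The same idea in Python, with urllib.parse.unquote standing in for Lua-socket's urlparse.unescape (example URL made up):

from urllib.parse import unquote

url = 'https://www.reddit.com/r/sub/%5Bbracketed%5D'
bad = set('<>\\*$;^[],(){}')

print(any(c in bad for c in url))           # False: encoded form looks clean
print(any(c in bad for c in unquote(url)))  # True: decoded form is caught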
@@ -222,21 +230,27 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
     elseif string.match(newurl, "^https?:\\/\\?/") then
       check(string.gsub(newurl, "\\", ""))
     elseif string.match(newurl, "^\\/\\/") then
-      check(string.match(url, "^(https?:)") .. string.gsub(newurl, "\\", ""))
+      checknewurl(string.gsub(newurl, "\\", ""))
     elseif string.match(newurl, "^//") then
-      check(string.match(url, "^(https?:)") .. newurl)
+      check(urlparse.absolute(url, newurl))
     elseif string.match(newurl, "^\\/") then
-      check(string.match(url, "^(https?://[^/]+)") .. string.gsub(newurl, "\\", ""))
+      checknewurl(string.gsub(newurl, "\\", ""))
     elseif string.match(newurl, "^/") then
-      check(string.match(url, "^(https?://[^/]+)") .. newurl)
+      check(urlparse.absolute(url, newurl))
+    elseif string.match(newurl, "^%.%./") then
+      if string.match(url, "^https?://[^/]+/[^/]+/") then
+        check(urlparse.absolute(url, newurl))
+      else
+        checknewurl(string.match(newurl, "^%.%.(/.+)$"))
+      end
     elseif string.match(newurl, "^%./") then
-      checknewurl(string.match(newurl, "^%.(.+)"))
+      check(urlparse.absolute(url, newurl))
     end
   end

   local function checknewshorturl(newurl)
     if string.match(newurl, "^%?") then
-      check(string.match(url, "^(https?://[^%?]+)") .. newurl)
+      check(urlparse.absolute(url, newurl))
     elseif not (string.match(newurl, "^https?:\\?/\\?//?/?")
       or string.match(newurl, "^[/\\]")
       or string.match(newurl, "^%./")
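check(urlparse.absolute(url, newurl)) hands relative-reference resolution to the URL library instead of splicing prefixes out of string.match captures, and the new ../ branch covers parent-relative links the old code dropped. Python's urljoin resolves the same way (URLs illustrative):

from urllib.parse import urljoin

base = 'https://www.reddit.com/r/sub/comments/abc/'
print(urljoin(base, '//old.reddit.com/x'))  # https://old.reddit.com/x
print(urljoin(base, '/api/info.json'))      # https://www.reddit.com/api/info.json
print(urljoin(base, '../other/'))           # https://www.reddit.com/r/sub/comments/other/
print(urljoin(base, './next'))              # https://www.reddit.com/r/sub/comments/abc/next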
@@ -248,7 +262,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
       or string.match(newurl, "^data:")
       or string.match(newurl, "^irc:")
       or string.match(newurl, "^%${")) then
-      check(string.match(url, "^(https?://.+/)") .. newurl)
+      check(urlparse.absolute(url, "/" .. newurl))
     end
   end

@@ -369,6 +383,22 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
         end
       end
     end
+    if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then
+      json = load_json_file(html)
+      if not json or not json["data"] or not json["data"]["children"] then
+        io.stdout:write("Could not load JSON.\n")
+        io.stdout:flush()
+        abortgrab = true
+      end
+      for _, child in pairs(json["data"]["children"]) do
+        if not child["data"] or not child["data"]["permalink"] then
+          io.stdout:write("Permalink is missing.\n")
+          io.stdout:flush()
+          abortgrab = true
+        end
+        checknewurl(child["data"]["permalink"])
+      end
+    end
     for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do
       checknewurl(newurl)
     end
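This block is the "unicode chars in JSON permalink" half of the commit: permalinks in info.json responses are now taken from parsed JSON instead of scraped out of the raw body, so \uXXXX escape sequences decode into real characters before the URLs are queued. The equivalent in Python (response body illustrative):

import json

body = '{"data": {"children": [{"data": {"permalink": "/r/sub/comments/abc/t\\u00e9st/"}}]}}'
for child in json.loads(body)['data']['children']:
    permalink = child['data']['permalink']
    print(permalink)  # /r/sub/comments/abc/tést/ -- handed to checknewurl()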
@@ -402,22 +432,29 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
   local match = string.match(url["url"], "^https?://www%.reddit%.com/api/info%.json%?id=t[0-9]_([a-z0-9]+)$")
   if match then
     posts[match] = true
+    if not item_types[match] then
+      io.stdout:write("Type for ID not found.\n")
+      io.stdout:flush()
+      abortgrab = true
+    end
+    item_type = item_types[match]
   end

   if status_code == 204 then
     return wget.actions.EXIT
   end

-  if (status_code >= 300 and status_code <= 399) then
-    local newloc = string.match(http_stat["newloc"], "^([^#]+)")
-    if string.match(newloc, "^//") then
-      newloc = string.match(url["url"], "^(https?:)") .. string.match(newloc, "^//(.+)")
-    elseif string.match(newloc, "^/") then
-      newloc = string.match(url["url"], "^(https?://[^/]+)") .. newloc
-    elseif not string.match(newloc, "^https?://") then
-      newloc = string.match(url["url"], "^(https?://.+/)") .. newloc
-    end
+  if status_code >= 300 and status_code <= 399 then
+    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
+    if string.match(newloc, "inactive%.min")
+        or string.match(newloc, "ReturnUrl")
+        or string.match(newloc, "adultcontent") then
+      io.stdout:write("Found invalid redirect.\n")
+      io.stdout:flush()
+      abortgrab = true
+    end
     if processed(newloc) or not allowed(newloc, url["url"]) then
       tries = 0
       return wget.actions.EXIT
     end
   end
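Redirect targets now go through the same urlparse.absolute resolution, and a guard aborts the item when the redirect points at a login, return-URL, or adult-content interstitial instead of content. The guard's shape in Python (patterns taken from the diff, URL made up):

import re

def is_invalid_redirect(newloc):
    # Mirrors the Lua patterns: inactive%.min, ReturnUrl, adultcontent
    return any(re.search(p, newloc)
               for p in (r'inactive\.min', r'ReturnUrl', r'adultcontent'))

print(is_invalid_redirect('https://www.reddit.com/over18?adultcontent=1'))  # True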