Handle NULL-byte-separated multi items. Support unicode chars in JSON permalink.

arkiver 2021-01-08 23:18:48 +01:00
parent 7c5ea717a8
commit 3d20ca90af
2 changed files with 82 additions and 46 deletions

View File

@@ -41,12 +41,7 @@ if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
 WGET_AT = find_executable(
     'Wget+AT',
-    [
-        'GNU Wget 1.20.3-at.20200401.01',
-        'GNU Wget 1.20.3-at.20200804.01',
-        'GNU Wget 1.20.3-at.20200902.01',
-        'GNU Wget 1.20.3-at.20201030.01'
-    ],
+    ['GNU Wget 1.20.3-at.20201030.01'],
     ['./wget-at']
 )
@@ -112,7 +107,8 @@ class PrepareDirectories(SimpleTask):
     def process(self, item):
         item_name = item['item_name']
-        escaped_item_name = item_name.replace(':', '_').replace('/', '_').replace('~', '_')
+        item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()
+        escaped_item_name = item_name_hash
         dirname = '/'.join((item['data_dir'], escaped_item_name))
         if os.path.isdir(dirname):
@@ -121,8 +117,11 @@ class PrepareDirectories(SimpleTask):
         os.makedirs(dirname)
         item['item_dir'] = dirname
-        item['warc_file_base'] = '%s-%s-%s' % (self.warc_prefix, escaped_item_name[:50],
-            time.strftime('%Y%m%d-%H%M%S'))
+        item['warc_file_base'] = '-'.join([
+            self.warc_prefix,
+            item_name_hash,
+            time.strftime('%Y%m%d-%H%M%S')
+        ])
         open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close()
         open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()
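
Both hunks serve the new multi-item format: a NULL-separated bundle of item names contains characters that are unsafe in paths, so the working directory and the WARC base name are now derived from a single SHA-1 hex digest of the whole bundle, which is always 40 safe ASCII characters and needs no `[:50]` truncation. A minimal sketch of the naming scheme, with a made-up prefix and made-up item IDs:

    import hashlib
    import time

    def warc_file_base(item_name, warc_prefix='reddit'):
        # One digest covers the whole (possibly NULL-separated) bundle, so the
        # result is a fixed-length, filesystem-safe name component.
        item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()
        return '-'.join([warc_prefix, item_name_hash, time.strftime('%Y%m%d-%H%M%S')])

    print(warc_file_base('post:kr6w2d\0comment:gi8nraq'))
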
@@ -227,11 +226,12 @@ class WgetArgs(object):
             '--waitretry', '30',
             '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
             '--warc-header', 'operator: Archive Team',
-            '--warc-header', 'reddit-dld-script-version: ' + VERSION,
-            '--warc-header', ItemInterpolation('reddit-item: %(item_name)s'),
+            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
+            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
             '--warc-dedup-url-agnostic',
             '--warc-compression-use-zstd',
-            '--warc-zstd-dict-no-include'
+            '--warc-zstd-dict-no-include',
+            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8'
         ]
 
         dict_data = ZstdDict.get_dict()
@@ -243,21 +243,21 @@ class WgetArgs(object):
                 '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
             ])
 
-        item_name = item['item_name']
-        item_type, item_value = item_name.split(':', 1)
+        for item_name in item['item_name'].split('\0'):
+            wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
+            wget_args.append('item-name://'+item_name)
+            item_type, item_value = item_name.split(':', 1)
-        if item_type in ('post', 'comment'):
-            if item_type == 'post':
-                wget_args.extend(['--warc-header', 'reddit-post: '+item_value])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)
-            elif item_type == 'comment':
-                wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)
-            else:
-                raise Exception('Unknown item')
-        item['item_type'] = item_type
-        item['item_value'] = item_value
+            if item_type in ('post', 'comment'):
+                if item_type == 'post':
+                    wget_args.extend(['--warc-header', 'reddit-post: {}'.format(item_value)])
+                    wget_args.append('https://www.reddit.com/api/info.json?id=t3_{}'.format(item_value))
+                elif item_type == 'comment':
+                    wget_args.extend(['--warc-header', 'reddit-comment: {}'.format(item_value)])
+                    wget_args.append('https://www.reddit.com/api/info.json?id=t1_{}'.format(item_value))
+            else:
+                raise Exception('Unknown item')
+        item['item_name_newline'] = item['item_name'].replace('\0', '\n')
 
         if 'bind_address' in globals():
             wget_args.extend(['--bind-address', globals()['bind_address']])
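
This hunk is the core of the NULL-byte change on the pipeline side: one tracker item now carries several `type:id` entries joined by `\0`, each contributing its own WARC headers and seed URLs, and the bundle is re-joined with newlines for export to the Lua script, since environment variables cannot contain NUL bytes. A standalone sketch of the expansion, with hypothetical IDs and a stand-in `wget_args` list:

    def expand_multi_item(packed_item_name):
        # Expand 'type:id' entries packed into one tracker item (IDs are made up).
        wget_args = []
        for item_name in packed_item_name.split('\0'):
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'post':
                wget_args.append('https://www.reddit.com/api/info.json?id=t3_{}'.format(item_value))
            elif item_type == 'comment':
                wget_args.append('https://www.reddit.com/api/info.json?id=t1_{}'.format(item_value))
            else:
                raise Exception('Unknown item')
        # NUL is not a legal byte in an environment variable, so the bundle is
        # re-joined with newlines before being exported to the Lua script.
        return wget_args, packed_item_name.replace('\0', '\n')

    args, item_names_env = expand_multi_item('post:kr6w2d\0comment:gi8nraq')
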
@@ -294,8 +294,7 @@ pipeline = Pipeline(
         accept_on_exit_code=[0, 4, 8],
         env={
             'item_dir': ItemValue('item_dir'),
-            'item_value': ItemValue('item_value'),
-            'item_type': ItemValue('item_type'),
+            'item_names': ItemValue('item_name_newline'),
             'warc_file_base': ItemValue('warc_file_base'),
         }
     ),

View File

@@ -4,11 +4,18 @@ local urlparse = require("socket.url")
 local http = require("socket.http")
 JSON = (loadfile "JSON.lua")()
 
-local item_type = os.getenv('item_type')
-local item_value = os.getenv('item_value')
+local item_names = os.getenv('item_names')
 local item_dir = os.getenv('item_dir')
 local warc_file_base = os.getenv('warc_file_base')
 
+local item_types = {}
+for s in string.gmatch(item_names, "([^\n]+)") do
+  local t, n = string.match(s, "^([^:]+):(.+)$")
+  item_types[n] = t
+end
+
+local item_type = nil
+
 if urlparse == nil or http == nil then
   io.stdout:write("socket not corrently installed.\n")
   io.stdout:flush()
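
The Lua side unpacks the newline-separated `item_names` into a lookup table keyed by item ID, and `item_type` becomes per-response state instead of a fixed environment variable. The same parsing, sketched in Python since the patterns translate directly (the sample value is made up):

    item_names = 'post:kr6w2d\ncomment:gi8nraq'

    item_types = {}
    for s in item_names.split('\n'):
        if s:  # gmatch("[^\n]+") never yields empty strings; skip them here too
            t, n = s.split(':', 1)
            item_types[n] = t

    print(item_types)  # {'kr6w2d': 'post', 'gi8nraq': 'comment'}
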
@@ -70,12 +77,13 @@ allowed = function(url, parenturl)
   end
 
   if string.match(url, "'+")
-    or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
+    or string.match(urlparse.unescape(url), "[<>\\%*%$;%^%[%],%(%){}]")
     or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
     or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
     or string.match(url, "^https?://[^/]*reddit%.com/login")
     or string.match(url, "^https?://[^/]*reddit%.com/register")
     or string.match(url, "%?sort=")
     or string.match(url, "%?utm_source=reddit")
     or string.match(url, "%?limit=500$")
     or string.match(url, "%?ref=readnext$")
     or string.match(url, "^https?://[^/]*reddit%.app%.link/")
@@ -222,21 +230,27 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
     elseif string.match(newurl, "^https?:\\/\\?/") then
       check(string.gsub(newurl, "\\", ""))
     elseif string.match(newurl, "^\\/\\/") then
-      check(string.match(url, "^(https?:)") .. string.gsub(newurl, "\\", ""))
+      checknewurl(string.gsub(newurl, "\\", ""))
     elseif string.match(newurl, "^//") then
-      check(string.match(url, "^(https?:)") .. newurl)
+      check(urlparse.absolute(url, newurl))
     elseif string.match(newurl, "^\\/") then
-      check(string.match(url, "^(https?://[^/]+)") .. string.gsub(newurl, "\\", ""))
+      checknewurl(string.gsub(newurl, "\\", ""))
     elseif string.match(newurl, "^/") then
-      check(string.match(url, "^(https?://[^/]+)") .. newurl)
+      check(urlparse.absolute(url, newurl))
+    elseif string.match(newurl, "^%.%./") then
+      if string.match(url, "^https?://[^/]+/[^/]+/") then
+        check(urlparse.absolute(url, newurl))
+      else
+        checknewurl(string.match(newurl, "^%.%.(/.+)$"))
+      end
     elseif string.match(newurl, "^%./") then
-      checknewurl(string.match(newurl, "^%.(.+)"))
+      check(urlparse.absolute(url, newurl))
     end
   end
 
   local function checknewshorturl(newurl)
     if string.match(newurl, "^%?") then
-      check(string.match(url, "^(https?://[^%?]+)") .. newurl)
+      check(urlparse.absolute(url, newurl))
     elseif not (string.match(newurl, "^https?:\\?/\\?//?/?")
       or string.match(newurl, "^[/\\]")
       or string.match(newurl, "^%./")
@@ -248,7 +262,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
       or string.match(newurl, "^data:")
       or string.match(newurl, "^irc:")
       or string.match(newurl, "^%${")) then
-      check(string.match(url, "^(https?://.+/)") .. newurl)
+      check(urlparse.absolute(url, "/" .. newurl))
     end
   end
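
The rewritten helpers delegate URL resolution to LuaSocket's `urlparse.absolute` instead of stitching prefixes together from `string.match` captures, which is what makes scheme-relative, root-relative, and `../` links resolve uniformly. Python's `urllib.parse.urljoin` is the direct analogue; the base URL here is made up:

    from urllib.parse import urljoin

    base = 'https://www.reddit.com/r/France/comments/abc/été/'

    # One resolver covers all the cases the old code pattern-matched by hand.
    print(urljoin(base, '//i.redd.it/x.png'))   # scheme-relative
    print(urljoin(base, '/r/France/'))          # root-relative
    print(urljoin(base, '../def/'))             # parent-relative
    print(urljoin(base, './comments.json'))     # same-directory
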
@@ -369,6 +383,22 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
         end
       end
     end
 
+    if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then
+      json = load_json_file(html)
+      if not json or not json["data"] or not json["data"]["children"] then
+        io.stdout:write("Could not load JSON.\n")
+        io.stdout:flush()
+        abortgrab = true
+      end
+      for _, child in pairs(json["data"]["children"]) do
+        if not child["data"] or not child["data"]["permalink"] then
+          io.stdout:write("Permalink is missing.\n")
+          io.stdout:flush()
+          abortgrab = true
+        end
+        checknewurl(child["data"]["permalink"])
+      end
+    end
+
     for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do
       checknewurl(newurl)
     end
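
This new block is where unicode permalinks enter the queue: each `children[i].data.permalink` from the info.json response goes through `checknewurl`, which now resolves it with `urlparse.absolute` rather than string concatenation. A sketch of the same extraction in Python; the field names follow Reddit's public info API, and the sample body is fabricated:

    import json
    from urllib.parse import urljoin

    def permalinks_from_info_json(body, base_url):
        data = json.loads(body)
        if 'data' not in data or 'children' not in data['data']:
            raise ValueError('Could not load JSON.')
        urls = []
        for child in data['data']['children']:
            permalink = child.get('data', {}).get('permalink')
            if permalink is None:
                raise ValueError('Permalink is missing.')
            # Permalinks are root-relative and may contain unicode characters.
            urls.append(urljoin(base_url, permalink))
        return urls

    body = '{"data": {"children": [{"data": {"permalink": "/r/France/comments/abc/été/"}}]}}'
    print(permalinks_from_info_json(body, 'https://www.reddit.com/'))
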
@@ -402,22 +432,29 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
 
   local match = string.match(url["url"], "^https?://www%.reddit%.com/api/info%.json%?id=t[0-9]_([a-z0-9]+)$")
   if match then
     posts[match] = true
+    if not item_types[match] then
+      io.stdout:write("Type for ID not found.\n")
+      io.stdout:flush()
+      abortgrab = true
+    end
+    item_type = item_types[match]
   end
 
   if status_code == 204 then
     return wget.actions.EXIT
   end
 
-  if (status_code >= 300 and status_code <= 399) then
-    local newloc = string.match(http_stat["newloc"], "^([^#]+)")
-    if string.match(newloc, "^//") then
-      newloc = string.match(url["url"], "^(https?:)") .. string.match(newloc, "^//(.+)")
-    elseif string.match(newloc, "^/") then
-      newloc = string.match(url["url"], "^(https?://[^/]+)") .. newloc
-    elseif not string.match(newloc, "^https?://") then
-      newloc = string.match(url["url"], "^(https?://.+/)") .. newloc
+  if status_code >= 300 and status_code <= 399 then
+    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
+    if string.match(newloc, "inactive%.min")
+      or string.match(newloc, "ReturnUrl")
+      or string.match(newloc, "adultcontent") then
+      io.stdout:write("Found invalid redirect.\n")
+      io.stdout:flush()
+      abortgrab = true
     end
     if processed(newloc) or not allowed(newloc, url["url"]) then
       tries = 0
       return wget.actions.EXIT
     end
   end
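
With multiple items per job, the current `item_type` is resolved by capturing the ID out of each info.json URL and consulting `item_types`, aborting on unknown IDs; the redirect handler likewise drops its manual location-splicing in favor of `urlparse.absolute` plus an invalid-target blacklist. A compact Python rendition of both, with `ID_PATTERN` translated from the Lua pattern and helper names of my own:

    import re
    from urllib.parse import urljoin

    ID_PATTERN = re.compile(r'^https?://www\.reddit\.com/api/info\.json\?id=t[0-9]_([a-z0-9]+)$')

    def resolve_item_type(url, item_types):
        m = ID_PATTERN.match(url)
        if not m:
            return None
        if m.group(1) not in item_types:
            raise RuntimeError('Type for ID not found.')
        return item_types[m.group(1)]

    def resolve_redirect(url, newloc):
        target = urljoin(url, newloc)  # replaces the old manual prefix stitching
        if any(s in target for s in ('inactive.min', 'ReturnUrl', 'adultcontent')):
            raise RuntimeError('Found invalid redirect.')
        return target

    print(resolve_item_type('https://www.reddit.com/api/info.json?id=t3_kr6w2d', {'kr6w2d': 'post'}))
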