You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
4.4 KiB
Lua
159 lines
4.4 KiB
Lua
9 years ago
|
dofile("urlcode.lua")
|
||
|
dofile("table_show.lua")
|
||
|
|
||
|
-- Running counters: url_count is bumped once per response in
-- httploop_result; tries counts consecutive retryable responses there.
local url_count, tries = 0, 0

-- Item description handed over through the environment. item_value is
-- spliced into the URL patterns used by the callbacks below.
local item_type, item_value = os.getenv('item_type'), os.getenv('item_value')

-- URL sets (url -> true): downloaded holds URLs already fetched,
-- addedtolist holds URLs already handed back to wget by get_urls.
local downloaded, addedtolist = {}, {}
|
||
|
|
||
|
--- Read a whole file into a string.
-- @param file path to open; nil or false means "nothing to read"
-- @return the complete file contents, or "" when no path was given
-- Raises (through assert) when the file cannot be opened.
function read_file(file)
  if not file then
    return ""
  end

  local handle = assert(io.open(file))
  local contents = handle:read("*all")
  handle:close()
  return contents
end
|
||
|
|
||
|
--- Decide whether wget should follow a link it has discovered.
-- Only `urlpos` is consulted; the other callback arguments are ignored.
-- @param urlpos table providing urlpos["url"]["url"] and
--   urlpos["link_expect_html"]
-- @return false when the URL is already in `downloaded` or `addedtolist`;
--   otherwise true when the URL belongs to the current item or when
--   link_expect_html is 0, and false in every other case.
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local link = urlpos["url"]["url"]
  local expect_html = urlpos["link_expect_html"]

  -- Anything fetched or queued before is never requested again.
  if downloaded[link] == true or addedtolist[link] == true then
    return false
  end

  -- An item URL contains item_value, preceded by a character outside
  -- [a-z0-9] and followed by one [0-9a-z] character, but nowhere by two.
  local item_pattern = "[^a-z0-9]"..item_value.."[0-9a-z]"
  local belongs_to_item = string.match(link, item_pattern)
    and not string.match(link, item_pattern.."[0-9a-z]")

  if belongs_to_item or expect_html == 0 then
    return true
  end
  return false
end
|
||
|
|
||
|
|
||
|
--- Collect further URLs to fetch from a page wget has just downloaded.
-- The page is scanned only when its own URL belongs to the current item
-- (item_value followed by one [0-9a-z] character, never by two).
-- @param file path of the downloaded body, passed to read_file
-- @param url URL the body was fetched from
-- @param is_css unused
-- @param iri unused
-- @return array of `{ url = ... }` tables for wget to enqueue
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local urls = {}

  -- The page handed to this callback has been fetched.
  downloaded[url] = true

  local item_pattern = "[^a-z0-9]"..item_value.."[0-9a-z]"
  local overlong_pattern = item_pattern.."[0-9a-z]"

  -- Queue `candidate` when it is new and is either an item URL or hosted
  -- on redditmedia.com, and item_value is never followed by two characters.
  local function check(candidate)
    if downloaded[candidate] == true or addedtolist[candidate] == true then
      return
    end
    if string.match(candidate, overlong_pattern) then
      return
    end
    if not (string.match(candidate, item_pattern)
        or string.match(candidate, "redditmedia%.com")) then
      return
    end

    -- Normalise before queueing. URLs lifted from HTML attributes carry
    -- "&amp;" for "&"; the previous code matched and substituted the same
    -- literal "&", which changed nothing. The fragment is stripped as well,
    -- also for URLs that needed entity decoding (formerly an either/or).
    local target = string.gsub(candidate, "&amp;", "&")
    -- Falls back to the full string when there is no fragment, so the
    -- table index below can never be nil.
    target = string.match(target, "(https?:[^#]+)#") or target

    local already_known = downloaded[target] == true
      or addedtolist[target] == true

    addedtolist[candidate] = true
    addedtolist[target] = true

    -- Do not hand wget a URL that was already fetched or queued under
    -- its normalised form.
    if not already_known then
      urls[#urls + 1] = { url = target }
    end
  end

  if string.match(url, item_pattern)
      and not string.match(url, overlong_pattern) then
    local html = read_file(file)

    -- scheme://host of the current page, used to resolve root-relative
    -- links. No trailing slash is required, so a bare "http://host" page
    -- URL no longer makes the concatenation below fail on nil.
    local site_root = string.match(url, "^(https?://[^/]+)")

    -- Absolute links in double- then single-quoted attributes.
    local function scan_absolute(quote)
      local pattern = quote.."(https?://[^"..quote.."]+)"..quote
      for found in string.gmatch(html, pattern) do
        check(found)
      end
    end

    -- Quoted values that start with "/": protocol-relative ("//host/..")
    -- links get an http: scheme, root-relative ones get the page's host.
    local function scan_relative(quote)
      local pattern = quote.."(/[^"..quote.."]+)"..quote
      for path in string.gmatch(html, pattern) do
        if string.sub(path, 1, 2) == "//" then
          check("http:"..path)
        elseif site_root then
          check(site_root..path)
        end
      end
    end

    scan_absolute('"')
    scan_absolute("'")
    scan_relative('"')
    scan_relative("'")
  end

  return urls
end
|
||
|
|
||
|
|
||
|
--- Per-response hook: log progress, remember successful downloads and
-- decide whether the request should be repeated.
-- @param url table whose `url` field is the requested URL
-- @param err unused
-- @param http_stat table whose `statcode` field is the HTTP status (0 is
--   handled like a server error)
-- @return wget.actions.CONTINUE after a retryable failure (status 0, any
--   5xx, or a 4xx other than 403/404), wget.actions.ABORT once six such
--   failures happened in a row, wget.actions.NOTHING otherwise
wget.callbacks.httploop_result = function(url, err, http_stat)
  -- NEW for 2014: Slightly more verbose messages because people keep
  -- complaining that it's not moving or not working
  -- Declared local: the original assignment leaked a global.
  local status_code = http_stat["statcode"]

  url_count = url_count + 1
  io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. ". \n")
  io.stdout:flush()

  if status_code >= 200 and status_code <= 399 then
    -- Successful fetches are recorded under their http:// form. The
    -- pattern is anchored so that only the scheme of the URL itself is
    -- rewritten, not an "https://" embedded in a path or query string.
    local recorded = string.gsub(url.url, "^https://", "http://")
    downloaded[recorded] = true
  end

  local retryable = status_code == 0
    or status_code >= 500
    or (status_code >= 400 and status_code ~= 404 and status_code ~= 403)

  if retryable then
    io.stdout:write("\nServer returned "..status_code..". Sleeping.\n")
    io.stdout:flush()

    os.execute("sleep 10")

    tries = tries + 1

    if tries >= 6 then
      io.stdout:write("\nI give up...\n")
      io.stdout:flush()
      tries = 0
      return wget.actions.ABORT
    end
    return wget.actions.CONTINUE
  end

  -- Any non-retryable response ends the current failure streak.
  tries = 0

  -- Optional delay between requests; 0 leaves it switched off.
  local sleep_time = 0

  if sleep_time > 0.001 then
    os.execute("sleep " .. sleep_time)
  end

  return wget.actions.NOTHING
end
|