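-- reddit.lua: Lua hooks for archiving reddit threads with wget.
-- Judging from the callbacks used, this script targets ArchiveTeam's
-- wget-lua fork, which provides the wget.callbacks.* hooks and the
-- wget.actions.* return codes seen below.
--
-- Input arrives via two environment variables:
--   item_type  - the item type (read here but not otherwise used)
--   item_value - a base36 thread-id prefix; a URL belongs to this item when
--                the prefix is followed by exactly one more base36 character
--
-- Hypothetical invocation, for illustration only (item_value and the URL
-- are made-up examples; the real pipeline wrapper is not part of this file):
--   item_type=posts item_value=2fjc1 \
--     wget-lua --lua-script=reddit.lua "http://www.reddit.com/comments/2fjc10"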
dofile("urlcode.lua")
dofile("table_show.lua")
local url_count = 0
local tries = 0
local item_type = os.getenv('item_type')
local item_value = os.getenv('item_value')
local downloaded = {}
local addedtolist = {}
-- Do not download these urls:
downloaded["http://pixel.redditmedia.com/pixel/of_destiny.png?v=q1Ga4BM4n71zceWwjRg4266wx1BqgGjx8isnnrLeBUv%2FXq%2Bk60QeBpQruPDKFQFv%2FDWVNxp63YPBIKv8pMk%2BhrkV3HA5b7GO"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_doom.png"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_delight.png"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_discovery.png"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_diversity.png"] = true
downloaded["http://pixel.redditmedia.com/click"] = true
downloaded["https://stats.redditmedia.com/"] = true
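-- Slurp an entire file into a string; returns "" when no filename is given.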
read_file = function(file)
if file then
local f = assert(io.open(file))
local data = f:read("*all")
f:close()
return data
else
return ""
end
end
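-- wget-lua hook: decide, per link found on a page, whether the child URL
-- should be queued. A URL qualifies when it contains item_value followed by
-- exactly one extra base36 character (two extra characters means a longer
-- thread id, presumably belonging to another item); sort/ref/count listing
-- variants, RSS feeds, the mobile site and the thumbnail host are skipped.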
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
local url = urlpos["url"]["url"]
local html = urlpos["link_expect_html"]
if downloaded[url] == true or addedtolist[url] == true then
return false
end
if downloaded[url] ~= true and addedtolist[url] ~= true then
if string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "%?sort=") or string.match(url, "%?ref=") or string.match(url, "%?count=") or string.match(url, "%.rss") or string.match(url, "%?originalUrl=") or string.match(url, "m%.reddit%.com") or string.match(url, "thumbs%.redditmedia%.com")) then
addedtolist[url] = true
return true
else
return false
end
else
return false
end
end
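-- wget-lua hook: after a file has been downloaded, return any extra URLs to
-- queue. The saved HTML is re-scanned by hand, presumably to catch links in
-- scripts and attributes that wget's own extractor would miss.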
wget.callbacks.get_urls = function(file, url, is_css, iri)
local urls = {}
local html = nil
if downloaded[url] ~= true then
downloaded[url] = true
end
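-- Queue a candidate URL once, decoding "&amp;" entities and dropping any
-- "#fragment" part, subject to the same item_value/blacklist filter as
-- download_child_p (plus an allowance for redditmedia.com assets).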
local function check(url)
if (downloaded[url] ~= true and addedtolist[url] ~= true) and (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") or string.match(url, "redditmedia%.com")) and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "thumbs%.redditmedia%.com") or string.match(url, "%?sort=") or string.match(url, "%?ref=") or string.match(url, "%?count=") or string.match(url, "%.rss") or string.match(url, "%?originalUrl=") or string.match(url, "m%.reddit%.com")) then
if string.match(url, "&") then
table.insert(urls, { url=string.gsub(url, "&", "&") })
addedtolist[url] = true
addedtolist[string.gsub(url, "&", "&")] = true
elseif string.match(url, "#") then
table.insert(urls, { url=string.match(url, "(https?//:[^#]+)#") })
addedtolist[url] = true
addedtolist[string.match(url, "(https?//:[^#]+)#")] = true
else
table.insert(urls, { url=url })
addedtolist[url] = true
end
end
end
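-- Only parse pages that belong to this item; /related/ listings are skipped.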
if string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "/related/"..item_value)) then
html = read_file(file)
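-- Thumbnail URLs are protocol-relative ("//host/..."); pin them to http://.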
for newurl in string.gmatch(html, '"thumbnail[^"]+"[^"]+"[^"]+"[^"]+"(//[^"]+)"') do
if downloaded[string.gsub(newurl, "//", "http://")] ~= true and addedtolist[string.gsub(newurl, "//", "http://")] ~= true then
table.insert(urls, { url=string.gsub(newurl, "//", "http://") })
addedtolist[string.gsub(newurl, "//", "http://")] = true
end
end
for newurl in string.gmatch(html, '"(https?://[^"]+)"') do
check(newurl)
end
for newurl in string.gmatch(html, "'(https?://[^']+)'") do
check(newurl)
end
for newurl in string.gmatch(html, '("/[^"]+)"') do
if string.match(newurl, '"//') then
check(string.gsub(newurl, '"//', 'http://'))
elseif not string.match(newurl, '"//') then
check(string.match(url, "(https?://[^/]+)/")..string.match(newurl, '"(/.+)'))
end
end
for newurl in string.gmatch(html, "('/[^']+)'") do
if string.match(newurl, "'//") then
check(string.gsub(newurl, "'//", "http://"))
elseif not string.match(newurl, "'//") then
check(string.match(url, '(https?://[^/]+)/')..string.match(newurl, "'(/.+)"))
end
end
end
return urls
end
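-- wget-lua hook: inspect the result of every HTTP transaction and tell wget
-- whether to continue, retry, or abort. Successful (2xx/3xx) URLs are
-- recorded in downloaded{}, with https URLs stored under their http:// form.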
wget.callbacks.httploop_result = function(url, err, http_stat)
-- NEW for 2014: Slightly more verbose messages because people keep
-- complaining that it's not moving or not working
local status_code = http_stat["statcode"]
url_count = url_count + 1
io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. ". \n")
io.stdout:flush()
if (status_code >= 200 and status_code <= 399) then
if string.match(url.url, "https://") then
local newurl = string.gsub(url.url, "https://", "http://")
downloaded[newurl] = true
else
downloaded[url.url] = true
end
end
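-- Back off and retry on server errors (5xx, and 4xx other than 403/404) and
-- on network failures (status 0); give up after six attempts. A permanent
-- failure on a URL that matches the item aborts the whole item.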
if status_code >= 500 or
(status_code >= 400 and status_code ~= 404 and status_code ~= 403) then
io.stdout:write("\nServer returned "..http_stat.statcode..". Sleeping.\n")
io.stdout:flush()
os.execute("sleep 10")
tries = tries + 1
if tries >= 6 then
io.stdout:write("\nI give up...\n")
io.stdout:flush()
tries = 0
if string.match(url["url"], "[^a-z0-9]"..item_value.."[0-9a-z]") and not string.match(url["url"], "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") then
return wget.actions.ABORT
else
return wget.actions.EXIT
end
else
return wget.actions.CONTINUE
end
elseif status_code == 0 then
io.stdout:write("\nServer returned "..http_stat.statcode..". Sleeping.\n")
io.stdout:flush()
os.execute("sleep 10")
tries = tries + 1
if tries >= 6 then
io.stdout:write("\nI give up...\n")
io.stdout:flush()
tries = 0
return wget.actions.ABORT
else
return wget.actions.CONTINUE
end
end
tries = 0
local sleep_time = 0
if sleep_time > 0.001 then
os.execute("sleep " .. sleep_time)
end
return wget.actions.NOTHING
end