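-- reddit.lua: Lua hooks for archiving reddit threads with wget.
-- Judging from the callbacks used, this script targets ArchiveTeam's
-- wget-lua fork, which provides the wget.callbacks.* hooks and the
-- wget.actions.* return codes seen below.
--
-- Input arrives via two environment variables:
--   item_type  - the item type (read here but not otherwise used)
--   item_value - a base36 thread-id prefix; a URL belongs to this item when
--                the prefix is followed by exactly one more base36 character
--
-- Hypothetical invocation, for illustration only (item_value and the URL
-- are made-up examples; the real pipeline wrapper is not part of this file):
--   item_type=posts item_value=2fjc1 \
--     wget-lua --lua-script=reddit.lua "http://www.reddit.com/comments/2fjc10"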
dofile("urlcode.lua")
dofile("table_show.lua")
local url_count = 0
local tries = 0
local item_type = os.getenv('item_type')
local item_value = os.getenv('item_value')
local downloaded = {}
local addedtolist = {}
-- Do not download these urls:
downloaded["http://pixel.redditmedia.com/pixel/of_destiny.png?v=q1Ga4BM4n71zceWwjRg4266wx1BqgGjx8isnnrLeBUv%2FXq%2Bk60QeBpQruPDKFQFv%2FDWVNxp63YPBIKv8pMk%2BhrkV3HA5b7GO"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_doom.png"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_delight.png"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_discovery.png"] = true
downloaded["http://pixel.redditmedia.com/pixel/of_diversity.png"] = true
downloaded["http://pixel.redditmedia.com/click"] = true
downloaded["https://stats.redditmedia.com/"] = true
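-- Slurp an entire file into a string; returns "" when no filename is given.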
read_file = function(file)
if file then
local f = assert(io.open(file))
local data = f:read("*all")
f:close()
return data
else
return ""
end
end
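-- wget-lua hook: decide, per link found on a page, whether the child URL
-- should be queued. A URL qualifies when it contains item_value followed by
-- exactly one extra base36 character (two extra characters means a longer
-- thread id, presumably belonging to another item); sort/ref/count listing
-- variants, RSS feeds, the mobile site and the thumbnail host are skipped.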
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
local url = urlpos["url"]["url"]
local html = urlpos["link_expect_html"]
if downloaded[url] == true or addedtolist[url] == true then
return false
end
if downloaded[url] ~= true and addedtolist[url] ~= true then
if string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "%?sort=") or string.match(url, "%?ref=") or string.match(url, "%?count=") or string.match(url, "%.rss") or string.match(url, "%?originalUrl=") or string.match(url, "m%.reddit%.com") or string.match(url, "thumbs%.redditmedia%.com")) then
addedtolist[url] = true
return true
else
return false
end
else
return false
end
end
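-- wget-lua hook: after a file has been downloaded, return any extra URLs to
-- queue. The saved HTML is re-scanned by hand, presumably to catch links in
-- scripts and attributes that wget's own extractor would miss.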
wget.callbacks.get_urls = function(file, url, is_css, iri)
local urls = {}
local html = nil
if downloaded[url] ~= true then
downloaded[url] = true
end
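-- Queue a candidate URL once, decoding "&amp;" entities and dropping any
-- "#fragment" part, subject to the same item_value/blacklist filter as
-- download_child_p (plus an allowance for redditmedia.com assets).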
local function check(url)
if (downloaded[url] ~= true and addedtolist[url] ~= true) and (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") or string.match(url, "redditmedia%.com")) and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "thumbs%.redditmedia%.com") or string.match(url, "%?sort=") or string.match(url, "%?ref=") or string.match(url, "%?count=") or string.match(url, "%.rss") or string.match(url, "%?originalUrl=") or string.match(url, "m%.reddit%.com")) then
if string.match(url, "&") then
table.insert(urls, { url=string.gsub(url, "&", "&") })
addedtolist[url] = true
addedtolist[string.gsub(url, "&", "&")] = true
elseif string.match(url, "#") then
table.insert(urls, { url=string.match(url, "(https?//:[^#]+)#") })
addedtolist[url] = true
addedtolist[string.match(url, "(https?//:[^#]+)#")] = true
else
table.insert(urls, { url=url })
addedtolist[url] = true
end
end
end
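-- Only parse pages that belong to this item; /related/ listings are skipped.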
if string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "/related/"..item_value)) then
html = read_file(file)
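-- Thumbnail URLs are protocol-relative ("//host/..."); pin them to http://.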
for newurl in string.gmatch(html, '"thumbnail[^"]+"[^"]+"[^"]+"[^"]+"(//[^"]+)"') do
if downloaded[string.gsub(newurl, "//", "http://")] ~= true and addedtolist[string.gsub(newurl, "//", "http://")] ~= true then
table.insert(urls, { url=string.gsub(newurl, "//", "http://") })
addedtolist[string.gsub(newurl, "//", "http://")] = true
end
end
for newurl in string.gmatch(html, '"(https?://[^"]+)"') do
check(newurl)
end
for newurl in string.gmatch(html, "'(https?://[^']+)'") do
check(newurl)
end
for newurl in string.gmatch(html, '("/[^"]+)"') do
if string.match(newurl, '"//') then
check(string.gsub(newurl, '"//', 'http://'))
elseif not string.match(newurl, '"//') then
check(string.match(url, "(https?://[^/]+)/")..string.match(newurl, '"(/.+)'))
end
end
for newurl in string.gmatch(html, "('/[^']+)'") do
if string.match(newurl, "'//") then
check(string.gsub(newurl, "'//", "http://"))
elseif not string.match(newurl, "'//") then
check(string.match(url, '(https?://[^/]+)/')..string.match(newurl, "'(/.+)"))
end
end
end
return urls
end
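-- wget-lua hook: inspect the result of every HTTP transaction and tell wget
-- whether to continue, retry, or abort. Successful (2xx/3xx) URLs are
-- recorded in downloaded{}, with https URLs stored under their http:// form.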
wget.callbacks.httploop_result = function(url, err, http_stat)
-- NEW for 2014: Slightly more verbose messages because people keep
-- complaining that it's not moving or not working
local status_code = http_stat["statcode"]
url_count = url_count + 1
io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. ". \n")
io.stdout:flush()
if (status_code >= 200 and status_code <= 399) then
if string.match(url.url, "https://") then
local newurl = string.gsub(url.url, "https://", "http://")
downloaded[newurl] = true
else
downloaded[url.url] = true
end
end
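-- Back off and retry on server errors (5xx, and 4xx other than 403/404) and
-- on network failures (status 0); give up after six attempts. A permanent
-- failure on a URL that matches the item aborts the whole item.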
if status_code >= 500 or
(status_code >= 400 and status_code ~= 404 and status_code ~= 403) then
io.stdout:write("\nServer returned "..http_stat.statcode..". Sleeping.\n")
io.stdout:flush()
os.execute("sleep 10")
tries = tries + 1
if tries >= 6 then
io.stdout:write("\nI give up...\n")
io.stdout:flush()
tries = 0
if string.match(url["url"], "[^a-z0-9]"..item_value.."[0-9a-z]") and not string.match(url["url"], "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") then
return wget.actions.ABORT
else
return wget.actions.EXIT
end
else
return wget.actions.CONTINUE
end
elseif status_code == 0 then
io.stdout:write("\nServer returned "..http_stat.statcode..". Sleeping.\n")
io.stdout:flush()
os.execute("sleep 10")
tries = tries + 1
if tries >= 6 then
io.stdout:write("\nI give up...\n")
io.stdout:flush()
tries = 0
return wget.actions.ABORT
else
return wget.actions.CONTINUE
end
end
tries = 0
local sleep_time = 0
if sleep_time > 0.001 then
os.execute("sleep " .. sleep_time)
end
return wget.actions.NOTHING
end