-- Helper scripts loaded from the working directory. JSON.lua returns the
-- decoder object, which is kept in the global `JSON`.
dofile("table_show.lua")
dofile("urlcode.lua")
JSON = (loadfile "JSON.lua")()

-- Item configuration, read from the environment (nil when a variable is unset).
local item_type = os.getenv('item_type')
local item_value = os.getenv('item_value')
local item_dir = os.getenv('item_dir')
local warc_file_base = os.getenv('warc_file_base')

local url_count = 0
local tries = 0
-- Sets keyed by URL: `downloaded` holds URLs already fetched (or ignored),
-- `addedtolist` holds URLs already handed to wget for fetching.
local downloaded = {}
local addedtolist = {}
local abortgrab = false

-- Set of identifiers; a URL containing one of them is accepted by `allowed`.
local posts = {}
local requested_children = {}
-- Set of thumbnail names that were discovered through an api/info.json page.
local thumbs = {}

-- Every line of the file "ignore-list" is treated as an already-downloaded URL.
-- io.lines closes the file once the last line has been read and raises an
-- error naming the file when it cannot be opened.
for ignore in io.lines("ignore-list") do
  downloaded[ignore] = true
end

--- Decode JSON.
-- `file` is handed directly to JSON:decode; no file is opened here.
-- @param file value to decode, or nil/false
-- @return the result of JSON:decode(file), or nil when `file` is nil/false
load_json_file = function(file)
  if file then
    return JSON:decode(file)
  else
    return nil
  end
end

--- Read the whole contents of a file.
-- Raises (via assert) when the file cannot be opened.
-- @param file path of the file to read, or nil/false
-- @return the file contents, or "" when `file` is nil/false
read_file = function(file)
  if file then
    local f = assert(io.open(file))
    local data = f:read("*all")
    f:close()
    return data
  else
    return ""
  end
end

--- Tell whether a URL was already downloaded or already queued.
-- @param url URL to look up
-- @return true when `url` is a key of `downloaded` or `addedtolist`, else false
processed = function(url)
  if downloaded[url] or addedtolist[url] then
    return true
  end
  return false
end

--- Decide whether a URL belongs to the current item and may be fetched.
-- Side effect: registers thumbnail names in `thumbs` (see below).
-- @param url candidate URL
-- @param parenturl URL of the page the candidate was found on (may be nil)
-- @return true when the URL may be fetched, false otherwise
allowed = function(url, parenturl)
  -- Thumbnails are only accepted once their name has been seen on a link whose
  -- parent is an api/info.json?id= page; that sighting registers the name.
  local match = string.match(url, "^https?://[^%.]+%.thumbs%.redditmedia%.com/([^%.]+)%.")
  if match and parenturl
    and string.match(parenturl, "^https?://www%.reddit%.com/api/info%.json%?id=") then
    thumbs[match] = true
  end
  if match and not thumbs[match] then
    return false
  end

  -- Hard rejections: URLs with suspicious characters and a list of URL shapes
  -- that are never fetched.
  if string.match(url, "'+")
    or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/login")
    or string.match(url, "^https?://[^/]*reddit%.com/register")
    or string.match(url, "%?sort=")
    or string.match(url, "%?limit=500$")
    or string.match(url, "%?ref=readnext$")
    or string.match(url, "^https?://[^/]*reddit%.app%.link/")
    or string.match(url, "^https?://out%.reddit%.com/r/")
    or string.match(url, "^https?://emoji%.redditmedia%.com/")
    or string.match(url, "^https?://styles%.redditmedia%.com/")
    or string.match(url, "^https?://old%.reddit%.com/gallery/")
    or string.match(url, "^https?://old%.reddit%.com/gold%?")
    or string.match(url, "^https?://[^%.]+%.redd%.it/award_images/")
    or (
      -- gateway.reddit.com is rejected except for its morecomments endpoint.
      string.match(url, "^https?://gateway%.reddit%.com/")
      and not string.match(url, "/morecomments/")
    )
    or string.match(url, "/%.rss$")
    or (
      -- amp.reddit.com is rejected whenever a parent URL is given.
      parenturl
      and string.match(url, "^https?://amp%.reddit%.com/")
    )
    or (
      -- For "posts" items, URLs with one extra identifier segment after the
      -- post slug are rejected.
      item_type == "posts"
      and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?$")
    )
    or (
      -- Do not follow a duplicates page from another duplicates page.
      parenturl
      and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
      and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
    ) then
    return false
  end

  -- Reject URLs in which the same "/"-separated piece occurs more than six
  -- times (presumably a guard against endlessly nested relative links).
  local tested = {}
  for s in string.gmatch(url, "([^/]+)") do
    if tested[s] == nil then
      tested[s] = 0
    end
    if tested[s] == 6 then
      return false
    end
    tested[s] = tested[s] + 1
  end

  -- Reject the parent URL with its trailing slash removed.
  if url .. "/" == parenturl then
    return false
  end

  -- The comment-expansion endpoints are always accepted.
  if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/")
    or string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then
    return true
  end

  -- Media hosts are accepted unless the item type contains "comment".
  if (
    string.match(url, "^https?://[^/]*redditmedia%.com/")
    or string.match(url, "^https?://v%.redd%.it/")
    or string.match(url, "^https?://i%.redd%.it/")
    or string.match(url, "^https?://[^%.]*preview%.redd%.it/.")
  )
  and not string.match(item_type, "comment") then
    return true
  end

  -- Otherwise accept the URL only when some run of lowercase letters/digits in
  -- it is a key of `posts`.
  for s in string.gmatch(url, "([a-z0-9]+)") do
    if posts[s] then
      return true
    end
  end

  return false
end

--- wget callback: decide whether a link found by wget itself is followed.
-- Marks the URL in `addedtolist` when it is accepted through `allowed`.
-- @param urlpos table describing the link; reads urlpos["url"]["url"] and
--   urlpos["link_expect_html"]
-- @param parent table describing the parent page; reads parent["url"]
-- @return true to follow the link, false to skip it
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local url = urlpos["url"]["url"]
  local html = urlpos["link_expect_html"]

  -- Nothing is followed automatically for "comments" items.
  if item_type == "comments" then
    return false
  end

  if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
    or string.match(url, "^https?://[^/]*redditstatic%.com/")
    or string.match(url, "^https?://old%.reddit%.com/static/")
    or string.match(url, "^https?://www%.reddit%.com/static/")
    or string.match(url, "^https?://styles%.redditmedia%.com/")
    or string.match(url, "^https?://emoji%.redditmedia%.com/")
    or string.match(url, "/%.rss$") then
    return false
  end

  -- Every remaining link found on an old.reddit.com/comments/<id> page is
  -- followed, without consulting `processed` or `allowed`.
  if string.match(parent["url"], "^https?://old%.reddit%.com/comments/[a-z0-9]+") then
    return true
  end

  -- Follow new URLs that are allowed themselves, or that sit on an allowed
  -- page with link_expect_html == 0 (presumably a non-HTML page requisite).
  if not processed(url)
    and (allowed(url, parent["url"]) or (allowed(parent["url"]) and html == 0)) then
    addedtolist[url] = true
    return true
  end

  return false
end

-- wget callback that collects further URLs to fetch for a downloaded page.
-- Marks `url` as downloaded as soon as it is entered.
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local urls = {}
  local html = nil

  downloaded[url] = true

  -- Queue one absolute URL: drop the fragment and a single trailing ".",
  -- then append it to `urls` when it is new and allowed.
  local function check(urla)
    local origurl = url
    local url = string.match(urla, "^([^#]+)")
    -- NOTE(review): as written this gsub replaces "&" with "&" and changes
    -- nothing; presumably it was meant to decode "&amp;" - confirm.
    local url_ = string.gsub(string.match(url, "^(.-)%.?$"), "&", "&")
    if not processed(url_)
      and string.match(url_, "^https?://.+")
      and allowed(url_, origurl)
      and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then
      table.insert(urls, { url=url_ })
      addedtolist[url_] = true
      addedtolist[url] = true
    end
  end

  -- Turn an absolute, scheme-relative or root-relative link (optionally with
  -- backslash-escaped slashes) into an absolute URL and pass it to `check`.
  local function checknewurl(newurl)
    if string.match(newurl, "^https?:////") then
      check(string.gsub(newurl, ":////", "://"))
    elseif string.match(newurl, "^https?://") then
      check(newurl)
    elseif string.match(newurl, "^https?:\\/\\?/") then
      check(string.gsub(newurl, "\\", ""))
    elseif string.match(newurl, "^\\/\\/") then
      check(string.match(url, "^(https?:)") .. string.gsub(newurl, "\\", ""))
    elseif string.match(newurl, "^//") then
      check(string.match(url, "^(https?:)") .. newurl)
    elseif string.match(newurl, "^\\/") then
      check(string.match(url, "^(https?://[^/]+)") .. string.gsub(newurl, "\\", ""))
    elseif string.match(newurl, "^/") then
      check(string.match(url, "^(https?://[^/]+)") .. newurl)
    elseif string.match(newurl, "^%./") then
      checknewurl(string.match(newurl, "^%.(.+)"))
    end
  end

  local function checknewshorturl(newurl)
    if string.match(newurl, "^%?") then
      check(string.match(url, "^(https?://[^%?]+)") ..
newurl) elseif not (string.match(newurl, "^https?:\\?/\\?//?/?") or string.match(newurl, "^[/\\]") or string.match(newurl, "^%./") or string.match(newurl, "^[jJ]ava[sS]cript:") or string.match(newurl, "^[mM]ail[tT]o:") or string.match(newurl, "^vine:") or string.match(newurl, "^android%-app:") or string.match(newurl, "^ios%-app:") or string.match(newurl, "^data:") or string.match(newurl, "^irc:") or string.match(newurl, "^%${")) then check(string.match(url, "^(https?://.+/)") .. newurl) end end if string.match(url, "^https?://www%.reddit%.com/") and not string.match(url, "/api/") then check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/")) end local match = string.match(url, "^https?://preview%.redd%.it/([a-zA-Z0-9]+%.[a-zA-Z0-9]+)") if match then check("https://i.redd.it/" .. match) end if allowed(url) and status_code < 300 and not string.match(url, "^https?://[^/]*redditmedia%.com/") and not string.match(url, "^https?://[^/]*redditstatic%.com/") and not string.match(url, "^https?://out%.reddit%.com/") and not string.match(url, "^https?://[^%.]*preview%.redd%.it/") and not string.match(url, "^https?://i%.redd%.it/") and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts") and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.mp4") then html = read_file(file) if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then html = string.gsub(html, '\\"', '"') elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/") or string.match(url, "^https?://old%.reddit%.com/r/[^/]+/duplicates/") then html = string.gsub(html, "