-- Helper libraries loaded from the working directory.
dofile("table_show.lua")
dofile("urlcode.lua")
JSON = (loadfile "JSON.lua")()

-- Item description handed over through the environment.
local item_type = os.getenv('item_type')
local item_value = os.getenv('item_value')
local item_dir = os.getenv('item_dir')
local warc_file_base = os.getenv('warc_file_base')

-- Counters/flags; not referenced in the part of the file visible here.
local url_count = 0
local tries = 0
local abortgrab = false

-- Set of URLs that are considered fetched (seeded from "ignore-list" below,
-- extended by get_urls for every URL it is called with).
local downloaded = {}
-- Set of URLs that have already been handed to wget for queuing.
local addedtolist = {}

-- Set of post ids; any URL containing one of these ids as an [a-z0-9]+ token
-- is accepted by allowed(). Filled in get_urls from .../comments/<id> URLs.
local posts = {}
-- Not referenced in the part of the file visible here.
local requested_children = {}

-- Seed the "already downloaded" set with one URL per line of the ignore list.
-- io.lines closes the file once the last line has been read and raises an
-- error naming the file if it cannot be opened (the previous
-- io.open(...):lines() form leaked the handle and failed with an opaque
-- "attempt to index a nil value").
for ignore in io.lines("ignore-list") do
  downloaded[ignore] = true
end

-- Decode a JSON document.
-- The argument is passed straight to JSON:decode, i.e. it is the JSON text
-- itself (not a file name).
-- Returns the decoded value, or nil when `file` is nil/false.
load_json_file = function(file)
  if file then
    return JSON:decode(file)
  else
    return nil
  end
end

-- Read the complete contents of the file at path `file`.
-- Returns the contents as a string, or "" when `file` is nil/false.
-- Raises (via assert) when the file cannot be opened.
read_file = function(file)
  if file then
    local f = assert(io.open(file))
    local data = f:read("*all")
    f:close()
    return data
  else
    return ""
  end
end

-- Tell whether `url` has already been downloaded or queued.
-- Returns true or false.
processed = function(url)
  if downloaded[url] or addedtolist[url] then
    return true
  end
  return false
end

-- Decide whether `url` is in scope for this grab.
--   url       : candidate URL (string)
--   parenturl : URL of the page the candidate was found on, or nil
--   source    : name of the caller; "download_child_p" disables the
--               off-site rule at the end
-- Returns true when the URL may be fetched, false otherwise.
allowed = function(url, parenturl, source)
  -- Hard rejections: URLs containing an apostrophe or other characters that
  -- indicate a badly extracted link, plus a list of reddit URL shapes that
  -- are never wanted (context/depth permalinks, login/register, ?sort=,
  -- app links, out.reddit.com redirects, gateway.reddit.com except
  -- morecomments, .rss feeds, and amp.reddit.com when reached from a page).
  if string.match(url, "'+")
    or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/login")
    or string.match(url, "^https?://[^/]*reddit%.com/register")
    or string.match(url, "%?sort=")
    or string.match(url, "^https?://[^/]*reddit%.app%.link/")
    or string.match(url, "^https?://out%.reddit%.com/r/")
    or (string.match(url, "^https?://gateway%.reddit%.com/")
      and not string.match(url, "/morecomments/"))
    or string.match(url, "/%.rss$")
    or (parenturl
      and string.match(url, "^https?://amp%.reddit%.com/")) then
    return false
  end

  -- Reject URLs in which the same "/"-separated segment occurs more than six
  -- times; this guards against endlessly self-nesting paths.
  local tested = {}
  for s in string.gmatch(url, "([^/]+)") do
    if tested[s] == nil then
      tested[s] = 0
    end
    if tested[s] == 6 then
      return false
    end
    tested[s] = tested[s] + 1
  end

  -- The candidate is just the parent without its trailing slash.
  if url .. "/" == parenturl then
    return false
  end

  -- Always accepted: redditmedia hosts, the morechildren API endpoint and
  -- two-segment v.redd.it / preview.redd.it URLs.
  if string.match(url, "^https?://[^/]*redditmedia%.com/")
    or string.match(url, "^https?://www%.reddit%.com/api/morechildren$")
    or string.match(url, "^https?://v%.redd%.it/[^/]+/[^/]+$")
    or string.match(url, "^https?://preview%.redd%.it/[^/]+/[^/]+$") then
    return true
  end

  -- Accepted when any lowercase-alphanumeric token of the URL is a known
  -- post id.
  for s in string.gmatch(url, "([a-z0-9]+)") do
    if posts[s] then
      return true
    end
  end

  -- Off-site links found on www.reddit.com pages are accepted, except when
  -- the caller is download_child_p and except for reddit/youtube/redd.it
  -- hosts.
  if parenturl
    and string.match(parenturl, "^https?://www%.reddit%.com/")
    and source ~= "download_child_p"
    and not string.match(url, "^https?://[^/]*reddit%.com/")
    and not string.match(url, "^https?://[^/]*youtube%.com")
    and not string.match(url, "^https?://[^/]*youtu%.be")
    and not string.match(url, "^https?://[^/]*redd%.it/") then
    return true
  end

  return false
end

-- wget-lua hook called for each link wget extracted from a fetched document.
-- Only `urlpos` (the link) and `parent` (the document it came from) are used.
-- Returns true or false; presumably true means "queue this link" (confirm
-- against the wget-lua hook documentation).
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local url = urlpos["url"]["url"]
  local html = urlpos["link_expect_html"]

  -- Same junk-character filter that allowed() applies.
  if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") then
    return false
  end

  -- Everything linked from a short /comments/<id> page is accepted.
  if string.match(parent["url"], "^https?://www%.reddit%.com/comments/[a-z0-9]+") then
    return true
  end

  -- Accept a not-yet-seen link when it is in scope itself, or when its parent
  -- is in scope and link_expect_html is 0 (presumably a non-HTML page
  -- requisite; confirm).
  if not processed(url)
    and (allowed(url, parent["url"], "download_child_p")
      or (allowed(parent["url"], nil, "download_child_p") and html == 0)) then
    addedtolist[url] = true
    print('b ' .. html .. ' ' .. url)
    return true
  end

  return false
end

-- wget-lua hook that collects additional URLs for a fetched document.
-- The body of this function continues past this point in the file; the
-- statements below are unchanged.
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local urls = {}
  local html = nil

  downloaded[url] = true

  local function check(urla)
    local origurl = url
    local url = string.match(urla, "^([^#]+)")
    -- NOTE(review): as written this gsub replaces "&" with "&" (a no-op);
    -- possibly an HTML-entity decode was intended. Confirm.
    local url_ = string.gsub(string.match(url, "^(.-)%.?$"), "&", "&")
    if not processed(url_)
      and string.match(url_, "^https?://.+")
      and allowed(url_, origurl, "get_urls")
      and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then
      table.insert(urls, { url=url_ })
      addedtolist[url_] = true
      addedtolist[url] = true
      print('a ' ..
url) end end local function checknewurl(newurl) if string.match(newurl, "^https?:////") then check(string.gsub(newurl, ":////", "://")) elseif string.match(newurl, "^https?://") then check(newurl) elseif string.match(newurl, "^https?:\\/\\?/") then check(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^\\/\\/") then check(string.match(url, "^(https?:)") .. string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^//") then check(string.match(url, "^(https?:)") .. newurl) elseif string.match(newurl, "^\\/") then check(string.match(url, "^(https?://[^/]+)") .. string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^/") then check(string.match(url, "^(https?://[^/]+)") .. newurl) elseif string.match(newurl, "^%./") then checknewurl(string.match(newurl, "^%.(.+)")) end end local function checknewshorturl(newurl) if string.match(newurl, "^%?") then check(string.match(url, "^(https?://[^%?]+)") .. newurl) elseif not (string.match(newurl, "^https?:\\?/\\?//?/?") or string.match(newurl, "^[/\\]") or string.match(newurl, "^%./") or string.match(newurl, "^[jJ]ava[sS]cript:") or string.match(newurl, "^[mM]ail[tT]o:") or string.match(newurl, "^vine:") or string.match(newurl, "^android%-app:") or string.match(newurl, "^ios%-app:") or string.match(newurl, "^%${")) then check(string.match(url, "^(https?://.+/)") .. 
newurl) end end if string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[a-z0-9]+") then posts[string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/([a-z0-9]+)")] = true end if allowed(url, nil, "get_urls") and status_code < 300 and not string.match(url, "^https?://[^/]*redditmedia%.com/") and not string.match(url, "^https?://[^/]*redditstatic%.com/") and not string.match(url, "^https?://out%.reddit%.com/") and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*%.ts$") and not string.match(url, "^https?://v%.redd%.it/[^/]+/[^%.]*$") then html = read_file(file) if string.match(url, "^https?://www%.reddit%.com/api/morechildren$") then html = string.gsub(html, '\\"', '"') elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/") or string.match(url, "^https?://www%.reddit%.com/r/[^/]+/duplicates/") then html = string.gsub(html, "