dofile("table_show.lua")
dofile("urlcode.lua")
local urlparse = require("socket.url")
local http = require("socket.http")
JSON = (loadfile "JSON.lua")()

-- Job configuration handed in through the environment.
local item_names = os.getenv('item_names')
local item_dir = os.getenv('item_dir')
local warc_file_base = os.getenv('warc_file_base')

-- item_names holds one "type:name" entry per line; map each name to its type.
local item_types = {}
for s in string.gmatch(item_names, "([^\n]+)") do
  local t, n = string.match(s, "^([^:]+):(.+)$")
  item_types[n] = t
end
local item_type = nil

-- Declared before the socket check so that the assignment below updates this
-- local instead of creating an unrelated global that the local would shadow.
local abortgrab = false

if urlparse == nil or http == nil then
  io.stdout:write("socket not corrently installed.\n")
  io.stdout:flush()
  abortgrab = true
end

local url_count = 0
local tries = 0
local downloaded = {}          -- URLs already fetched (or listed in ignore-list)
local addedtolist = {}         -- URLs already queued
local posts = {}               -- ids; a URL containing one of them is accepted
local requested_children = {}  -- morechildren POST bodies already queued
local thumbs = {}              -- thumbnail ids accepted via api/info.json
local outlinks = {}            -- URLs outside the reddit domains

-- Pre-seed the "downloaded" set. io.lines raises a descriptive error when the
-- file cannot be opened and closes the handle once the last line is read.
for ignore in io.lines("ignore-list") do
  downloaded[ignore] = true
end

--- Decode a JSON document.
-- @param file JSON text (despite the name, this is passed straight to
--   JSON:decode), or nil
-- @return the decoded value, or nil when `file` is nil/false
load_json_file = function(file)
  if file then
    return JSON:decode(file)
  else
    return nil
  end
end

--- Read a whole file into a string.
-- @param file path of the file to read, or nil
-- @return the file contents, or "" when `file` is nil/false
-- Raises (via assert) when the file cannot be opened.
read_file = function(file)
  if file then
    local f = assert(io.open(file))
    local data = f:read("*all")
    f:close()
    return data
  else
    return ""
  end
end

--- Tell whether a URL was already downloaded or already queued.
-- @param url URL string
-- @return true or false
processed = function(url)
  if downloaded[url] or addedtolist[url] then
    return true
  end
  return false
end

--- Decide whether a URL should be fetched.
-- Side effects: may add an entry to `thumbs` (thumbnail id seen on an
-- api/info.json page) and to `outlinks` (URL outside the reddit domains).
-- @param url candidate URL
-- @param parenturl URL of the page the candidate was found on (may be nil)
-- @return true when the URL should be fetched, false otherwise
allowed = function(url, parenturl)
  -- Thumbnails are only accepted once their id has been seen with an
  -- api/info.json page as parent.
  local match = string.match(url, "^https?://[^%.]+%.thumbs%.redditmedia%.com/([^%.]+)%.")
  if match and parenturl and string.match(parenturl, "^https?://www%.reddit%.com/api/info%.json%?id=") then
    thumbs[match] = true
  end
  if match and not thumbs[match] then
    return false
  end

  -- Explicit reject list.
  if string.match(url, "'+")
    or string.match(urlparse.unescape(url), "[<>\\%$%^%[%]%(%){}]")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/login")
    or string.match(url, "^https?://[^/]*reddit%.com/register")
    or string.match(url, "%?sort=")
    or string.match(url, "%?limit=500$")
    or string.match(url, "%?ref=readnext$")
    or string.match(url, "^https?://[^/]*reddit%.app%.link/")
    or string.match(url, "^https?://out%.reddit%.com/r/")
    or string.match(url, "^https?://emoji%.redditmedia%.com/")
    or string.match(url, "^https?://styles%.redditmedia%.com/")
    or string.match(url, "^https?://old%.reddit%.com/gallery/")
    or string.match(url, "^https?://old%.reddit%.com/gold%?")
    or string.match(url, "^https?://[^%.]+%.redd%.it/award_images/")
    or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com")
    or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit")
    or (
      string.match(url, "^https?://gateway%.reddit%.com/")
      and not string.match(url, "/morecomments/")
    )
    or string.match(url, "/%.rss$")
    or (
      parenturl
      and string.match(url, "^https?://amp%.reddit%.com/")
    )
    or (
      item_type == "post"
      and (
        string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?$")
        or string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?%?utm_source=")
      )
    )
    or (
      parenturl
      and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
      and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/")
    )
    or (
      parenturl
      and string.match(parenturl, "^https?://[^/]*reddit%.com/user/[^/]+/duplicates/")
      and string.match(url, "^https?://[^/]*reddit%.com/user/[^/]+/duplicates/")
    ) then
    return false
  end

  -- Reject URLs in which the same "/"-separated piece shows up more than six
  -- times.
  local tested = {}
  for s in string.gmatch(url, "([^/]+)") do
    if tested[s] == nil then
      tested[s] = 0
    end
    if tested[s] == 6 then
      return false
    end
    tested[s] = tested[s] + 1
  end

  -- Anything outside redd.it / reddit.com / redditmedia.com is rejected; it is
  -- also recorded in `outlinks` unless it is a redditstatic.com URL.
  if not (
    string.match(url, "^https?://[^/]*redd%.it/")
    or string.match(url, "^https?://[^/]*reddit%.com/")
    or string.match(url, "^https?://[^/]*redditmedia%.com/")
  ) then
    if not string.match(url, "^https?://[^/]*redditstatic%.com/") then
      outlinks[url] = true
    end
    return false
  end

  -- Skip the parent URL with its trailing slash removed.
  if url .. "/" == parenturl then
    return false
  end

  if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/")
    or string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then
    return true
  end

  -- Media hosts are accepted unless the current item is a comment. item_type
  -- starts out as nil, so guard the match instead of raising on a nil subject;
  -- nil counts as "not a comment", as it does in download_child_p.
  if (
    string.match(url, "^https?://[^/]*redditmedia%.com/")
    or string.match(url, "^https?://v%.redd%.it/")
    or string.match(url, "^https?://i%.redd%.it/")
    or string.match(url, "^https?://[^%.]*preview%.redd%.it/.")
  )
  and not (item_type ~= nil and string.match(item_type, "comment")) then
    return true
  end

  -- Accept when any lowercase alphanumeric run in the URL is a known post id.
  for s in string.gmatch(url, "([a-z0-9]+)") do
    if posts[s] then
      return true
    end
  end

  return false
end

--- Callback registered as wget.callbacks.download_child_p: decide whether a
-- link discovered by wget itself should be followed.
-- @param urlpos table; urlpos["url"]["url"] and urlpos["link_expect_html"]
--   are read
-- @param parent table; parent["url"] is read
-- The remaining parameters are not used.
-- @return true to follow the link, false to skip it
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local url = urlpos["url"]["url"]
  local html = urlpos["link_expect_html"]

  if item_type == "comment" then
    return false
  end

  if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
    or string.match(url, "^https?://[^/]*redditstatic%.com/")
    or string.match(url, "^https?://old%.reddit%.com/static/")
    or string.match(url, "^https?://www%.reddit%.com/static/")
    or string.match(url, "^https?://styles%.redditmedia%.com/")
    or string.match(url, "^https?://emoji%.redditmedia%.com/")
    or string.match(url, "/%.rss$") then
    return false
  end

  -- Everything linked from an old.reddit.com/comments/<id> page is followed.
  if string.match(parent["url"], "^https?://old%.reddit%.com/comments/[a-z0-9]+") then
    return true
  end

  -- Otherwise follow new URLs that are allowed themselves, or whose parent is
  -- allowed while link_expect_html is 0.
  if not processed(url)
    and (allowed(url, parent["url"]) or (allowed(parent["url"]) and html == 0)) then
    addedtolist[url] = true
    return true
  end

  return false
end

-- Registered as wget.callbacks.get_urls; marks `url` as downloaded and builds the `urls` list.
wget.callbacks.get_urls = function(file, url, is_css, iri) local urls = {} local html = nil downloaded[url] = true local function check(urla) local origurl = url local url = string.match(urla, "^([^#]+)") local url_ = string.gsub(string.match(url, "^(.-)%.?$"), "&", "&") if not processed(url_) and string.match(url_, "^https?://.+") and allowed(url_, origurl) and not (string.match(url_, "[^/]$") and processed(url_ ..
"/")) then table.insert(urls, { url=url_ }) addedtolist[url_] = true addedtolist[url] = true end end local function checknewurl(newurl) if string.match(newurl, "^https?:////") then check(string.gsub(newurl, ":////", "://")) elseif string.match(newurl, "^https?://") then check(newurl) elseif string.match(newurl, "^https?:\\/\\?/") then check(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^\\/\\/") then checknewurl(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^//") then check(urlparse.absolute(url, newurl)) elseif string.match(newurl, "^\\/") then checknewurl(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^/") then check(urlparse.absolute(url, newurl)) elseif string.match(newurl, "^%.%./") then if string.match(url, "^https?://[^/]+/[^/]+/") then check(urlparse.absolute(url, newurl)) else checknewurl(string.match(newurl, "^%.%.(/.+)$")) end elseif string.match(newurl, "^%./") then check(urlparse.absolute(url, newurl)) end end local function checknewshorturl(newurl) if string.match(newurl, "^%?") then check(urlparse.absolute(url, newurl)) elseif not ( string.match(newurl, "^https?:\\?/\\?//?/?") or string.match(newurl, "^[/\\]") or string.match(newurl, "^%./") or string.match(newurl, "^[jJ]ava[sS]cript:") or string.match(newurl, "^[mM]ail[tT]o:") or string.match(newurl, "^vine:") or string.match(newurl, "^android%-app:") or string.match(newurl, "^ios%-app:") or string.match(newurl, "^data:") or string.match(newurl, "^irc:") or string.match(newurl, "^%${") ) then check(urlparse.absolute(url, newurl)) end end if string.match(url, "^https?://www%.reddit%.com/") and not string.match(url, "/api/") then check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/")) end local match = string.match(url, "^https?://preview%.redd%.it/([a-zA-Z0-9]+%.[a-zA-Z0-9]+)") if match then check("https://i.redd.it/" .. 
match) end if allowed(url) and status_code < 300 and not string.match(url, "^https?://[^/]*redditmedia%.com/") and not string.match(url, "^https?://[^/]*redditstatic%.com/") and not string.match(url, "^https?://out%.reddit%.com/") and not string.match(url, "^https?://[^%.]*preview%.redd%.it/") and not string.match(url, "^https?://i%.redd%.it/") and not ( string.match(url, "^https?://v%.redd%.it/") and not string.match(url, "%.m3u8") and not string.match(url, "%.mpd") ) then html = read_file(file) if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then html = string.gsub(html, '\\"', '"') elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/") or string.match(url, "^https?://old%.reddit%.com/r/[^/]+/duplicates/") then html = string.gsub(html, "%s*.-%s*%s*%s*", "") end if string.match(url, "^https?://old%.reddit%.com/") then for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*'[^']+'%))") do local link_id, sort, children, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*'([^']+)'%)$") local id = string.match(children, "^([^,]+)") local subreddit = string.match(html, 'data%-subreddit="([^"]+)"') local post_data = "link_id=" .. link_id .. "&sort=" .. sort .. "&children=" .. string.gsub(children, ",", "%%2C") .. "&id=t1_" .. id .. "&limit_children=" .. limit_children .. "&r=" .. subreddit .. 
"&renderstyle=html" if not requested_children[post_data] then requested_children[post_data] = true table.insert(urls, {url="https://old.reddit.com/api/morechildren", post_data=post_data}) end end elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/user/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then local comments_data = nil if string.match(url, "^https?://www%.reddit%.com/") then comments_data = string.match(html, '%s*window%.___r%s*=%s*({.+});%s*%s*