local urlparse = require("socket.url") local http = require("socket.http") local cjson = require("cjson") local utf8 = require("utf8") local item_names = os.getenv('item_names') local item_dir = os.getenv('item_dir') local warc_file_base = os.getenv('warc_file_base') local item_type = nil local item_name = nil local item_value = nil local selftext = nil local retry_url = true local item_types = {} for s in string.gmatch(item_names, "([^\n]+)") do local t, n = string.match(s, "^([^:]+):(.+)$") item_types[n] = t end if urlparse == nil or http == nil then io.stdout:write("socket not corrently installed.\n") io.stdout:flush() abortgrab = true end local url_count = 0 local tries = 0 local downloaded = {} local addedtolist = {} local abortgrab = false local killgrab = false local posts = {} local requested_children = {} local is_crosspost = false local outlinks = {} local reddit_media_urls = {} local bad_items = {} for ignore in io.open("ignore-list", "r"):lines() do downloaded[ignore] = true end abort_item = function(item) abortgrab = true if not item then item = item_name end if not bad_items[item] then io.stdout:write("Aborting item " .. item .. ".\n") io.stdout:flush() bad_items[item] = true end end kill_grab = function(item) io.stdout:write("Aborting crawling.\n") killgrab = true end read_file = function(file) if file then local f = assert(io.open(file)) local data = f:read("*all") f:close() return data else return "" end end processed = function(url) if downloaded[url] or addedtolist[url] then return true end return false end allowed = function(url, parenturl) if item_type == "url" then if url ~= item_value then reddit_media_urls["url:" .. url] = true return false end return true end --[[if string.match(url, "^https?://www%.reddit%.com/svc/") then return true end]] if string.match(url, "'+") or string.match(urlparse.unescape(url), "[<>\\%$%^%[%]%(%){}]") or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+") or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+") or string.match(url, "^https?://[^/]*reddit%.com/login") or string.match(url, "^https?://[^/]*reddit%.com/register") or string.match(url, "^https?://[^/]*reddit%.com/r/undefined/") or ( string.match(url, "%?sort=") and not string.match(url, "/svc/") ) or string.match(url, "%?limit=500$") or string.match(url, "%?ref=readnext$") or string.match(url, "/tailwind%-build%.css$") or string.match(url, "^https?://v%.redd%.it/.+%?source=fallback$") or string.match(url, "^https?://[^/]*reddit%.app%.link/") or string.match(url, "^https?://out%.reddit%.com/r/") or string.match(url, "^https?://old%.reddit%.com/gallery/") or string.match(url, "^https?://old%.reddit%.com/gold%?") or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com") or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit") or string.match(url, "/%?context=1$") or string.match(url, '/"$') or string.match(url, "^https?://[^/]+/message/compose") or string.match(url, "www%.reddit%.com/avatar[/]?$") or ( string.match(url, "^https?://gateway%.reddit%.com/") and not string.match(url, "/morecomments/") ) or string.match(url, "/%.rss$") or ( parenturl and string.match(url, "^https?://amp%.reddit%.com/") ) or ( parenturl and string.match(url, "^https?://v%.redd%.it/[^/]+/HLSPlaylist%.m3u8") ) or ( item_type == "post" and ( string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?$") or string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?%?utm_source=") ) ) or ( parenturl and string.match(parenturl, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/") and string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/duplicates/") ) or ( parenturl and string.match(parenturl, "^https?://[^/]*reddit%.com/user/[^/]+/duplicates/") and string.match(url, "^https?://[^/]*reddit%.com/user/[^/]+/duplicates/") ) or ( parenturl and string.match(parenturl, "^https?://[^/]+/r/EASportsFC/") and string.match(url, "^https?://[^/]+/r/FIFA/") ) then return false end local tested = {} for s in string.gmatch(url, "([^/]+)") do if tested[s] == nil then tested[s] = 0 end if tested[s] == 6 then return false end tested[s] = tested[s] + 1 end if not ( string.match(url, "^https?://[^/]*redd%.it/") or string.match(url, "^https?://[^/]*reddit%.com/") or string.match(url, "^https?://[^/]*redditmedia%.com/") or string.match(url, "^https?://[^/]*redditstatic%.com/") ) then local temp = "" for c in string.gmatch(url, "(.)") do local b = string.byte(c) if b < 32 or b > 126 then c = string.format("%%%02X", b) end temp = temp .. c end url = temp outlinks[url] = true return false end if url .. "/" == parenturl then return false end if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") or string.match(url, "^https?://old%.reddit%.com/api/morechildren$") or string.match(url, "^https?://[^/]*reddit%.com/video/") then return true end if ( string.match(url, "^https?://[^/]*redditmedia%.com/") or string.match(url, "^https?://v%.redd%.it/") or string.match(url, "^https?://[^/]*reddit%.com/video/") or string.match(url, "^https?://i%.redd%.it/") or string.match(url, "^https?://[^%.]*preview%.redd%.it/.") ) and not string.match(item_type, "comment") and not string.match(url, "^https?://[^/]*redditmedia%.com/mediaembed/") and not is_crosspost then if parenturl and string.match(parenturl, "^https?://www%.reddit.com/api/info%.json%?id=t") and not string.match(url, "^https?://v%.redd%.it/") and not string.match(url, "^https?://[^/]*reddit%.com/video/") and not string.find(url, "thumbs.") then return false end if not string.match(url, "^https?://v%.redd%.it/") or string.match(url, "%.mp4$") or string.match(url, "%.ts$") then reddit_media_urls["url:" .. url] = true return false end return true end for s in string.gmatch(url, "([a-z0-9]+)") do if posts[s] then return true end end return false end wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason) local url = urlpos["url"]["url"] local html = urlpos["link_expect_html"] if item_type == "comment" or item_type == "url" then return false end if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") or string.match(url, "^https?://[^/]*redditstatic%.com/") or string.match(url, "^https?://old%.reddit%.com/static/") or string.match(url, "^https?://www%.reddit%.com/static/") or string.match(url, "^https?://styles%.redditmedia%.com/") or string.match(url, "^https?://emoji%.redditmedia%.com/") or string.match(url, "/%.rss$") then return false end if string.match(parent["url"], "^https?://old%.reddit%.com/comments/[a-z0-9]+") then return true end url = string.gsub(url, "&", "&") if not processed(url) and (allowed(url, parent["url"]) or (allowed(parent["url"]) and html == 0)) then addedtolist[url] = true return true end return false end wget.callbacks.get_urls = function(file, url, is_css, iri) local urls = {} local html = nil local no_more_svc = false downloaded[url] = true if abortgrab then return {} end local function check(urla) if no_more_svc and string.match(urla, "^https?://[^/]+/svc/") then return nil end local origurl = url local url = string.match(urla, "^([^#]+)") local url_ = string.match(url, "^(.-)%.?$") if not string.find(url, "old.reddit.com") then url_ = string.gsub( url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", function (s) return utf8.char(tonumber(s, 16)) end ) end while string.find(url_, "&") do url_ = string.gsub(url_, "&", "&") end if not processed(url_) and string.match(url_, "^https?://.+") and allowed(url_, origurl) and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then table.insert(urls, { url=url_ }) addedtolist[url_] = true addedtolist[url] = true end end local function checknewurl(newurl) if string.match(newurl, "^https?:////") then check(string.gsub(newurl, ":////", "://")) elseif string.match(newurl, "^https?://") then check(newurl) elseif string.match(newurl, "^https?:\\/\\?/") then check(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^\\/\\/") then checknewurl(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^//") then check(urlparse.absolute(url, newurl)) elseif string.match(newurl, "^\\/") then checknewurl(string.gsub(newurl, "\\", "")) elseif string.match(newurl, "^/") then check(urlparse.absolute(url, newurl)) elseif string.match(newurl, "^%.%./") then if string.match(url, "^https?://[^/]+/[^/]+/") then check(urlparse.absolute(url, newurl)) else checknewurl(string.match(newurl, "^%.%.(/.+)$")) end elseif string.match(newurl, "^%./") then check(urlparse.absolute(url, newurl)) end end local function checknewshorturl(newurl) if string.match(newurl, "^%?") then check(urlparse.absolute(url, newurl)) elseif not ( string.match(newurl, "^https?:\\?/\\?//?/?") or string.match(newurl, "^[/\\]") or string.match(newurl, "^%./") or string.match(newurl, "^[jJ]ava[sS]cript:") or string.match(newurl, "^[mM]ail[tT]o:") or string.match(newurl, "^vine:") or string.match(newurl, "^android%-app:") or string.match(newurl, "^ios%-app:") or string.match(newurl, "^data:") or string.match(newurl, "^irc:") or string.match(newurl, "^%${") ) then check(urlparse.absolute(url, newurl)) end end if string.match(url, "^https?://www%.reddit%.com/") and not string.match(url, "/api/") and not string.match(url, "^https?://[^/]+/svc/") then check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/")) end local match = string.match(url, "^https?://preview%.redd%.it/([a-zA-Z0-9]+%.[a-zA-Z0-9]+)") if match then check("https://i.redd.it/" .. match) end if string.match(url, "is_lit_ssr=") and not string.match(url, "/svc/shreddit/more%-comments/") then check(string.gsub(url, "([%?&]is_lit_ssr=)[a-z]+", "%1true")) check(string.gsub(url, "([%?&]is_lit_ssr=)[a-z]+", "%1false")) end if allowed(url) and status_code < 300 and item_type ~= "url" and not string.match(url, "^https?://[^/]*redditmedia%.com/") and not string.match(url, "^https?://[^/]*redditstatic%.com/") and not string.match(url, "^https?://out%.reddit%.com/") and not string.match(url, "^https?://[^%.]*preview%.redd%.it/") and not string.match(url, "^https?://i%.redd%.it/") and not ( string.match(url, "^https?://v%.redd%.it/") and not string.match(url, "%.m3u8") and not string.match(url, "%.mpd") ) then html = read_file(file) --[[if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then check(url .. "?utm_source=reddit&utm_medium=web2x&context=3") end]] if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then html = string.gsub(html, '\\"', '"') elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/") or string.match(url, "^https?://old%.reddit%.com/r/[^/]+/duplicates/") then html = string.gsub(html, "%s*.-%s*%s*%s*", "") end if string.match(url, "^https?://old%.reddit%.com/") then for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*'[^']+'%))") do local link_id, sort, children, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*'([^']+)'%)$") local id = string.match(children, "^([^,]+)") local subreddit = string.match(html, 'data%-subreddit="([^"]+)"') local post_data = "link_id=" .. link_id .. "&sort=" .. sort .. "&children=" .. string.gsub(children, ",", "%%2C") .. "&id=t1_" .. id .. "&limit_children=" .. limit_children .. "&r=" .. subreddit .. "&renderstyle=html" if not requested_children[post_data] then requested_children[post_data] = true print("posting for modechildren with", post_data) table.insert(urls, { url="https://old.reddit.com/api/morechildren", post_data=post_data, headers={ ["Content-Type"]="application/x-www-form-urlencoded; charset=UTF-8", ["X-Requested-With"]="XMLHttpRequest" } }) end end --[[elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/user/[^/]+/comments/[^/]") or string.match(url, "^https?://www%.reddit%.com/comments/[^/]") or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then local comments_data = nil if string.match(url, "^https?://www%.reddit%.com/") then comments_data = string.match(html, '%s*window%.___r%s*=%s*({.+});%s*%s*