From 16836ba20140dda9c3d2b9de1329c2188d2d187f Mon Sep 17 00:00:00 2001
From: arkiver
Date: Fri, 8 Jan 2021 22:40:49 +0100
Subject: [PATCH] Support single comment and post items. Queue outlinks to URLs project.

---
Review notes (this area is ignored by `git am`):

* The line structure of this patch was restored from a copy in which all
  lines had been collapsed together. Indentation is reconstructed with the
  two-space style; if context lines do not match the target file, apply
  with `git apply --ignore-whitespace`.
* What the patch does: renames the item types "posts"/"comments" to
  "post"/"comment", moves the "not a reddit host" test in allowed() below
  the path-segment repetition check so that non-reddit URLs are recorded
  in the new `outlinks` table (redditstatic.com excluded), and adds a
  wget `finish` callback that POSTs the collected outlinks, joined by NUL
  bytes, to the URLs-project endpoint, retrying up to 10 times with an
  exponentially growing sleep and setting `abortgrab` when every attempt
  fails.
* Changed relative to the original patch: the message typo "corrently"
  is now "correctly", and `math.pow(2, tries)` is now `2 ^ tries`
  (math.pow is deprecated in Lua 5.3; the `^` operator exists in every
  Lua version). Both changes are line-count neutral, so the hunk headers
  and the diffstat below are unchanged; the post-image blob id on the
  `index` line no longer matches exactly.
* NOTE(review): `require` raises an error when a module cannot be loaded
  instead of returning nil, so the `urlparse == nil or http == nil` guard
  presumably never fires - confirm, and consider `pcall(require, ...)`.
* NOTE(review): `abortgrab = true` in the guard runs before any
  declaration of `abortgrab` visible in this patch. If the file declares
  `local abortgrab` further down, this assignment writes a separate
  global that the rest of the script never reads - verify against the
  full file.

 reddit.lua | 59 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/reddit.lua b/reddit.lua
index 32e2edc..7ac6407 100644
--- a/reddit.lua
+++ b/reddit.lua
@@ -1,5 +1,7 @@
 dofile("table_show.lua")
 dofile("urlcode.lua")
+local urlparse = require("socket.url")
+local http = require("socket.http")
 JSON = (loadfile "JSON.lua")()
 
 local item_type = os.getenv('item_type')
@@ -7,6 +9,12 @@ local item_value = os.getenv('item_value')
 local item_dir = os.getenv('item_dir')
 local warc_file_base = os.getenv('warc_file_base')
 
+if urlparse == nil or http == nil then
+  io.stdout:write("socket not correctly installed.\n")
+  io.stdout:flush()
+  abortgrab = true
+end
+
 local url_count = 0
 local tries = 0
 local downloaded = {}
@@ -17,6 +25,8 @@ local posts = {}
 local requested_children = {}
 local thumbs = {}
 
+local outlinks = {}
+
 for ignore in io.open("ignore-list", "r"):lines() do
   downloaded[ignore] = true
 end
@@ -85,7 +95,7 @@ allowed = function(url, parenturl)
       and string.match(url, "^https?://amp%.reddit%.com/")
     )
     or (
-      item_type == "posts"
+      item_type == "post"
       and (
         string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?$")
         or string.match(url, "^https?://[^/]*reddit%.com/r/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]+/?%?utm_source=")
@@ -100,11 +110,6 @@ allowed = function(url, parenturl)
       parenturl
       and string.match(parenturl, "^https?://[^/]*reddit%.com/user/[^/]+/duplicates/")
       and string.match(url, "^https?://[^/]*reddit%.com/user/[^/]+/duplicates/")
-    )
-    or not (
-      string.match(url, "^https?://[^/]*redd%.it/")
-      or string.match(url, "^https?://[^/]*reddit%.com/")
-      or string.match(url, "^https?://[^/]*redditmedia%.com/")
     ) then
     return false
   end
@@ -120,6 +125,17 @@ allowed = function(url, parenturl)
     tested[s] = tested[s] + 1
   end
 
+  if not (
+    string.match(url, "^https?://[^/]*redd%.it/")
+    or string.match(url, "^https?://[^/]*reddit%.com/")
+    or string.match(url, "^https?://[^/]*redditmedia%.com/")
+  ) then
+    if not string.match(url, "^https?://[^/]*redditstatic%.com/") then
+      outlinks[url] = true
+    end
+    return false
+  end
+
   if url .. "/" == parenturl then
     return false
   end
@@ -151,7 +167,7 @@ wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_pars
   local url = urlpos["url"]["url"]
   local html = urlpos["link_expect_html"]
 
-  if item_type == "comments" then
+  if item_type == "comment" then
     return false
   end
 
@@ -455,6 +471,35 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
   return wget.actions.NOTHING
 end
 
+wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
+  local items = nil
+  for item, _ in pairs(outlinks) do
+    print('found item', item)
+    if items == nil then
+      items = item
+    else
+      items = items .. "\0" .. item
+    end
+  end
+  if items ~= nil then
+    local tries = 0
+    while tries < 10 do
+      local body, code, headers, status = http.request(
+        "http://blackbird-amqp.meo.ws:23038/urls-t05crln9brluand/",
+        items
+      )
+      if code == 200 or code == 409 then
+        break
+      end
+      os.execute("sleep " .. math.floor(2 ^ tries))
+      tries = tries + 1
+    end
+    if tries == 10 then
+      abortgrab = true
+    end
+  end
+end
+
 wget.callbacks.before_exit = function(exit_status, exit_status_string)
   if abortgrab then
     return wget.exits.IO_FAIL