From 10eaa7c50c6fcde23e10ec16c41c767a1462cd4f Mon Sep 17 00:00:00 2001
From: arkiver
Date: Wed, 23 Mar 2022 16:16:58 +0100
Subject: [PATCH] Version 20220323.01. Fix backfeed. Fix maxtries use.

---
 pipeline.py |  2 +-
 reddit.lua  | 46 ++++++++++++++++++++++++++++------------------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/pipeline.py b/pipeline.py
index 4970be9..5d92e99 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -59,7 +59,7 @@ if not WGET_AT:
 #
 # Update this each time you make a non-cosmetic change.
 # It will be added to the WARC files and reported to the tracker.
-VERSION = '20220312.01'
+VERSION = '20220323.01'
 TRACKER_ID = 'reddit'
 TRACKER_HOST = 'legacy-api.arpa.li'
 MULTI_ITEM_SIZE = 20
diff --git a/reddit.lua b/reddit.lua
index ff823d6..5950923 100644
--- a/reddit.lua
+++ b/reddit.lua
@@ -648,6 +648,33 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
 end
 
 wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
+  -- Sends `newurls .. "\0"` via http.request to the backfeed URL for `key`.
+  -- Makes at most `maxtries` attempts, sleeping 2^tries seconds after each
+  -- non-200 response; sets `abortgrab` when no attempt returned 200.
+  local function submit_backfeed(newurls, key)
+    local tries = 0
+    local maxtries = 4
+    while tries < maxtries do
+      local body, code, headers, status = http.request(
+        "https://legacy-api.arpa.li/backfeed/legacy/" .. key,
+        newurls .. "\0"
+      )
+      print(body)
+      if code == 200 then
+        io.stdout:write("Submitted discovered URLs.\n")
+        io.stdout:flush()
+        break
+      end
+      io.stdout:write("Failed to submit discovered URLs." .. tostring(code) .. tostring(body) .. "\n")
+      io.stdout:flush()
+      os.execute("sleep " .. math.floor(math.pow(2, tries)))
+      tries = tries + 1
+    end
+    if tries == maxtries then
+      abortgrab = true
+    end
+  end
+
   local file = io.open(item_dir .. '/' .. warc_file_base .. '_bad-items.txt', 'w')
   for url, _ in pairs(bad_items) do
     file:write(url .. "\n")
@@ -663,24 +690,7 @@ wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total
     end
   end
   if items ~= nil then
-    local tries = 0
-    while tries < 10 do
-      local body, code, headers, status = http.request(
-        "https://legacy-api.arpa.li/backfeed/legacy/urls-bbpritdbwn5hnp7",
-        items .. "\0"
-      )
-      print(body)
-      if code == 200 then
-        break
-      end
-      io.stdout:write("Could not queue items.\n")
-      io.stdout:flush()
-      os.execute("sleep " .. math.floor(math.pow(2, tries)))
-      tries = tries + 1
-    end
-    if tries == 10 then
-      abort_item()
-    end
+    submit_backfeed(items, "reddit-ldayno5jboa5c0o")
   end
 end
 