From f1ef7d169771c785c3182fccb1b04fbf1c3eb663 Mon Sep 17 00:00:00 2001 From: arkiver Date: Mon, 19 Jun 2023 18:28:52 +0200 Subject: [PATCH] Version 20230619.02. Accept 404 on mediaembed URL. --- pipeline.py | 2 +- reddit.lua | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pipeline.py b/pipeline.py index babb706..4b6739a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -59,7 +59,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20230619.01' +VERSION = '20230619.02' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 40 diff --git a/reddit.lua b/reddit.lua index 2e1e914..5abbb4d 100644 --- a/reddit.lua +++ b/reddit.lua @@ -657,13 +657,18 @@ wget.callbacks.write_to_warc = function(url, http_stat) end end if ( - http_stat["len"] == 0 - and status_code == 200 - ) or ( - status_code ~= 200 - and status_code ~= 301 - and status_code ~= 302 - and status_code ~= 308 + ( + http_stat["len"] == 0 + and status_code == 200 + ) or ( + status_code ~= 200 + and status_code ~= 301 + and status_code ~= 302 + and status_code ~= 308 + ) + ) and not ( + string.match(url["url"], "^https?://[^/]*redditmedia%.com/mediaembed/") + and status_code == 404 ) then print("Not writing to WARC.") retry_url = true