From 6e1584155001c1948d9c1f3c98f69c7235433480 Mon Sep 17 00:00:00 2001 From: arkiver Date: Wed, 7 Apr 2021 00:38:20 +0200 Subject: [PATCH] Version 20210407.01. Improve video archiving. Detect if video is still being processed by reddit. --- pipeline.py | 2 +- reddit.lua | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pipeline.py b/pipeline.py index 90ede67..e9ecd69 100644 --- a/pipeline.py +++ b/pipeline.py @@ -60,7 +60,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20210330.04' +VERSION = '20210407.01' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 20 diff --git a/reddit.lua b/reddit.lua index d384b35..42f1c91 100644 --- a/reddit.lua +++ b/reddit.lua @@ -178,6 +178,7 @@ allowed = function(url, parenturl) if (string.match(url, "^https?://[^/]*redditmedia%.com/") or string.match(url, "^https?://v%.redd%.it/") + or string.match(url, "^https?://[^/]*reddit%.com/video/") or string.match(url, "^https?://i%.redd%.it/") or string.match(url, "^https?://[^%.]*preview%.redd%.it/.") ) @@ -185,6 +186,7 @@ allowed = function(url, parenturl) if parenturl and string.match(parenturl, "^https?://www%.reddit.com/api/info%.json%?id=t") and not string.match(url, "^https?://v%.redd%.it/") + and not string.match(url, "^https?://[^/]*reddit%.com/video/") and not string.find(url, "thumbs.") then return false end @@ -243,7 +245,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) local origurl = url local url = string.match(urla, "^([^#]+)") local url_ = string.match(url, "^(.-)%.?$") - if not string.find(url, "v.redd.it") then + if not string.find(url, "old.reddit.com") then url_ = string.gsub( url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", function (s) @@ -261,9 +263,9 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) url_ = string.gsub(url_, "&amp;", "&") end if not processed(url_) 
- and string.match(url_, "^https?://.+") - and allowed(url_, origurl) - and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then + and string.match(url_, "^https?://.+") + and allowed(url_, origurl) + and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then table.insert(urls, { url=url_ }) addedtolist[url_] = true addedtolist[url] = true @@ -510,6 +512,11 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) end selftext = child["data"]["selftext"] checknewurl(child["data"]["permalink"]) + if child["data"]["is_video"] and not child["data"]["secure_media"] then + io.stdout:write("Video still being processed.\n") + io.stdout:flush() + abort_item() + end end end for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do