From 6e1584155001c1948d9c1f3c98f69c7235433480 Mon Sep 17 00:00:00 2001 From: arkiver Date: Wed, 7 Apr 2021 00:38:20 +0200 Subject: [PATCH] Version 20210407.01. Improve video archiving. Detect if video is still being processed by reddit. --- pipeline.py | 2 +- reddit.lua | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pipeline.py b/pipeline.py index 90ede67..e9ecd69 100644 --- a/pipeline.py +++ b/pipeline.py @@ -60,7 +60,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20210330.04' +VERSION = '20210407.01' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 20 diff --git a/reddit.lua b/reddit.lua index d384b35..42f1c91 100644 --- a/reddit.lua +++ b/reddit.lua @@ -178,6 +178,7 @@ allowed = function(url, parenturl) if (string.match(url, "^https?://[^/]*redditmedia%.com/") or string.match(url, "^https?://v%.redd%.it/") + or string.match(url, "^https?://[^/]*reddit%.com/video/") or string.match(url, "^https?://i%.redd%.it/") or string.match(url, "^https?://[^%.]*preview%.redd%.it/.") ) @@ -185,6 +186,7 @@ allowed = function(url, parenturl) if parenturl and string.match(parenturl, "^https?://www%.reddit.com/api/info%.json%?id=t") and not string.match(url, "^https?://v%.redd%.it/") + and not string.match(url, "^https?://[^/]*reddit%.com/video/") and not string.find(url, "thumbs.") then return false end @@ -243,7 +245,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) local origurl = url local url = string.match(urla, "^([^#]+)") local url_ = string.match(url, "^(.-)%.?$") - if not string.find(url, "v.redd.it") then + if not string.find(url, "old.reddit.com") then url_ = string.gsub( url_, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", function (s) @@ -261,9 +263,9 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) url_ = string.gsub(url_, "&amp;", "&") end if not processed(url_) 
- and string.match(url_, "^https?://.+") - and allowed(url_, origurl) - and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then + and string.match(url_, "^https?://.+") + and allowed(url_, origurl) + and not (string.match(url_, "[^/]$") and processed(url_ .. "/")) then table.insert(urls, { url=url_ }) addedtolist[url_] = true addedtolist[url] = true @@ -510,6 +512,11 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) end selftext = child["data"]["selftext"] checknewurl(child["data"]["permalink"]) + if child["data"]["is_video"] and not child["data"]["secure_media"] then + io.stdout:write("Video still being processed.\n") + io.stdout:flush() + abort_item() + end end end for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do