Version 20220729.01. Queue media URLs back to the reddit project and download them individually.

arkiver 2022-07-28 18:09:04 +02:00
parent edacb2065a
commit f81b2ce97e
2 changed files with 57 additions and 58 deletions


@@ -42,8 +42,7 @@ if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
 WGET_AT = find_executable(
     'Wget+AT',
     [
-        'GNU Wget 1.20.3-at.20211001.01',
-        'GNU Wget 1.21.3-at.20220503.02'
+        'GNU Wget 1.21.3-at.20220608.02'
     ],
     [
         './wget-at',
@@ -60,7 +59,7 @@ if not WGET_AT:
 #
 # Update this each time you make a non-cosmetic change.
 # It will be added to the WARC files and reported to the tracker.
-VERSION = '20220605.01'
+VERSION = '20220729.01'
 TRACKER_ID = 'reddit'
 TRACKER_HOST = 'legacy-api.arpa.li'
 MULTI_ITEM_SIZE = 20
@@ -279,13 +278,15 @@ class WgetArgs(object):
         wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
         wget_args.append('item-name://'+item_name)
         item_type, item_value = item_name.split(':', 1)
-        if item_type in ('post', 'comment'):
-            if item_type == 'post':
-                wget_args.extend(['--warc-header', 'reddit-post: '+item_value])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)
-            elif item_type == 'comment':
-                wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)
+        if item_type == 'post':
+            wget_args.extend(['--warc-header', 'reddit-post: '+item_value])
+            wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)
+        elif item_type == 'comment':
+            wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])
+            wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)
+        elif item_type == 'url':
+            wget_args.extend(['--warc-header', 'reddit-media-url: '+item_value])
+            wget_args.append(item_value)
         else:
             raise Exception('Unknown item')

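Note: the dispatch above maps each tracker item name of the form type:value to the seed URL that wget-at fetches; the new url branch passes the queued media URL through verbatim instead of wrapping it in an info.json lookup. A minimal sketch in Python, assuming a hypothetical seed_url_for helper (pipeline.py extends wget_args inline rather than returning a URL):

# seed_url_for is a hypothetical helper, not a function in pipeline.py.
def seed_url_for(item_name):
    item_type, item_value = item_name.split(':', 1)
    if item_type == 'post':
        # t3_ is reddit's "fullname" prefix for posts.
        return 'https://www.reddit.com/api/info.json?id=t3_' + item_value
    elif item_type == 'comment':
        # t1_ is the fullname prefix for comments.
        return 'https://www.reddit.com/api/info.json?id=t1_' + item_value
    elif item_type == 'url':
        # New in 20220729.01: the media URL is fetched as-is.
        return item_value
    raise Exception('Unknown item')

assert seed_url_for('url:https://i.redd.it/example.jpg') == 'https://i.redd.it/example.jpg'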

@@ -34,10 +34,10 @@ local killgrab = false
 local posts = {}
 local requested_children = {}
-local thumbs = {}
 local is_crosspost = false
 local outlinks = {}
+local reddit_media_urls = {}
 local bad_items = {}
@@ -89,15 +89,8 @@ processed = function(url)
 end
 
 allowed = function(url, parenturl)
-  local match = string.match(url, "^https?://[^%.]+%.thumbs%.redditmedia%.com/([^%.]+)%.")
-  if match
-    and parenturl
-    and string.match(parenturl, "^https?://www%.reddit%.com/api/info%.json%?id=") then
-    thumbs[match] = true
-  end
-  if match and not thumbs[match] then
-    return false
+  if item_type == "url" then
+    return true
   end
   if string.match(url, "'+")
@@ -113,11 +106,8 @@ allowed = function(url, parenturl)
     or string.match(url, "^https?://v%.redd%.it/.+%?source=fallback$")
     or string.match(url, "^https?://[^/]*reddit%.app%.link/")
     or string.match(url, "^https?://out%.reddit%.com/r/")
-    or string.match(url, "^https?://emoji%.redditmedia%.com/")
-    or string.match(url, "^https?://styles%.redditmedia%.com/")
     or string.match(url, "^https?://old%.reddit%.com/gallery/")
     or string.match(url, "^https?://old%.reddit%.com/gold%?")
-    or string.match(url, "^https?://[^%.]+%.redd%.it/award_images/")
     or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com")
     or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit")
     or (
@@ -189,17 +179,20 @@ allowed = function(url, parenturl)
   end
   if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/")
-    or string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then
+    or string.match(url, "^https?://old%.reddit%.com/api/morechildren$")
+    or string.match(url, "^https?://[^/]*reddit%.com/video/") then
     return true
   end
-  if (string.match(url, "^https?://[^/]*redditmedia%.com/")
+  if (
+    string.match(url, "^https?://[^/]*redditmedia%.com/")
     or string.match(url, "^https?://v%.redd%.it/")
-    or string.match(url, "^https?://[^/]*reddit%.com/video/")
     or string.match(url, "^https?://i%.redd%.it/")
     or string.match(url, "^https?://[^%.]*preview%.redd%.it/.")
     )
-    and not string.match(item_type, "comment")
     and not string.match(url, "^https?://[^/]*redditmedia%.com/mediaembed/")
     and not is_crosspost then
     if parenturl
       and string.match(parenturl, "^https?://www%.reddit.com/api/info%.json%?id=t")
@@ -208,6 +201,12 @@ allowed = function(url, parenturl)
       and not string.find(url, "thumbs.") then
       return false
     end
+    if not string.match(url, "^https?://v%.redd%.it/")
+      or string.match(url, "%.mp4$")
+      or string.match(url, "%.ts$") then
+      reddit_media_urls[url] = true
+      return false
+    end
     return true
   end
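Note: the block added above is the queue-back rule itself: a matched media URL is recorded in reddit_media_urls and rejected from the current crawl, except v.redd.it URLs that are not .mp4/.ts files (playlists and manifests), which are still fetched inline so their segments can be discovered. A sketch of the same predicate in Python, with should_queue as a hypothetical name (the Lua sets reddit_media_urls[url] = true and returns false instead):

import re

def should_queue(url):
    # Everything off v.redd.it is queued back as a 'url' item; on
    # v.redd.it only .mp4/.ts files are queued, so playlists and
    # manifests are still crawled inline to discover the segments.
    if not re.match(r'^https?://v\.redd\.it/', url):
        return True
    return url.endswith('.mp4') or url.endswith('.ts')

assert should_queue('https://i.redd.it/example.jpg') is True
assert should_queue('https://v.redd.it/abc/DASH_720.mp4') is True
assert should_queue('https://v.redd.it/abc/DASHPlaylist.mpd') is False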
@@ -224,7 +223,7 @@ wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_pars
   local url = urlpos["url"]["url"]
   local html = urlpos["link_expect_html"]
-  if item_type == "comment" then
+  if item_type == "comment" or item_type == "url" then
     return false
   end
@@ -342,6 +341,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
   if allowed(url)
     and status_code < 300
+    and item_type ~= "url"
     and not string.match(url, "^https?://[^/]*redditmedia%.com/")
     and not string.match(url, "^https?://[^/]*redditstatic%.com/")
     and not string.match(url, "^https?://out%.reddit%.com/")
@@ -570,6 +570,9 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
   end
   local match = string.match(url["url"], "^https?://www%.reddit.com/api/info%.json%?id=t[0-9]_([a-z0-9]+)$")
+  if not match and item_types[url["url"]] then
+    match = url["url"]
+  end
   if match then
     abortgrab = false
     selftext = nil
@@ -583,6 +586,8 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
     item_type = item_types[match]
     item_value = match
     item_name = item_type .. ":" .. item_value
+    io.stdout:write("Archiving item " .. item_name .. ".\n")
+    io.stdout:flush()
   end
   if status_code == 204 then
@@ -624,26 +629,14 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
     or status_code == 0 then
     io.stdout:write("Server returned " .. http_stat.statcode .. " (" .. err .. "). Sleeping.\n")
     io.stdout:flush()
-    local maxtries = 8
-    if not allowed(url["url"]) then
-      maxtries = 0
-    end
-    if tries >= maxtries then
-      io.stdout:write("\nI give up...\n")
-      io.stdout:flush()
-      tries = 0
-      if allowed(url["url"]) then
-        return wget.actions.ABORT
-      else
-        return wget.actions.EXIT
-      end
-    end
+    abort_item()
     os.execute("sleep " .. math.floor(math.pow(2, tries)))
     tries = tries + 1
     return wget.actions.CONTINUE
   end
-  if string.match(url["url"], "^https?://[^/]+%.reddit%.com/api/info%?id=t[0-9]_[a-z0-9]+$") then
+  if string.match(url["url"], "^https?://[^/]+%.reddit%.com/api/info%?id=t[0-9]_[a-z0-9]+$")
+    or item_type == "url" then
     return wget.actions.EXIT
   end
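Note: the rewritten error path above drops the old eight-try give-up logic (ABORT/EXIT) in favour of aborting the item and backing off exponentially before continuing. A sketch of the backoff in Python, assuming tries starts at 0 as in the Lua:

import time

# Hypothetical Python equivalent of the Lua sleep line above:
# wait 2^tries seconds (1, 2, 4, 8, ...) between failed attempts.
def backoff(tries):
    time.sleep(2 ** tries)
    return tries + 1

# Usage: tries = backoff(tries) after each server error.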
@@ -691,26 +684,31 @@ wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total
     file:write(url .. "\n")
   end
   file:close()
-  local newurls = nil
-  local count = 0
-  local key = "urls-f1zr02i96okrkdv"
-  for newurl, _ in pairs(outlinks) do
-    print('found item', newurl)
-    if newurls == nil then
-      newurls = newurl
-    else
-      newurls = newurls .. "\0" .. newurl
-    end
-    count = count + 1
-    if count == 100 then
-      submit_backfeed(newurls, key)
-      newurls = nil
-      count = 0
-    end
-  end
-  if newurls ~= nil then
-    submit_backfeed(newurls, key)
-  end
+  for key, data in pairs({
+    ["reddit-v5fj9elcyh0rzck"] = reddit_media_urls,
+    ["urls-f1zr02i96okrkdv"] = outlinks
+  }) do
+    print('queuing for', string.match(key, "^(.+)%-"))--, "on shard", shard)
+    local items = nil
+    local count = 0
+    for item, _ in pairs(data) do
+      print("found item", item)
+      if items == nil then
+        items = item
+      else
+        items = items .. "\0" .. item
+      end
+      count = count + 1
+      if count == 100 then
+        submit_backfeed(items, key)
+        items = nil
+        count = 0
+      end
+    end
+    if items ~= nil then
+      submit_backfeed(items, key)
+    end
+  end
 end
 
 wget.callbacks.before_exit = function(exit_status, exit_status_string)
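Note: the rewritten finish callback generalises the old outlinks-only backfeed: each key names a backfeed queue, its value is the set of items to queue, and items are joined NUL-separated and flushed in batches of 100 per submit_backfeed call. A Python sketch of the batching, with submit_backfeed stubbed out for illustration (the real function in reddit.lua posts the batch to the tracker):

# submit_backfeed stands in for the Lua function of the same name;
# here it only reports what would be sent.
def submit_backfeed(items, key):
    print('submitting %d item(s) to %s' % (items.count('\0') + 1, key))

def queue_all(data_by_key):
    for key, data in data_by_key.items():
        items = None
        count = 0
        for item in sorted(data):
            # Join items with NUL separators, as the Lua does.
            items = item if items is None else items + '\0' + item
            count += 1
            if count == 100:  # flush a full batch
                submit_backfeed(items, key)
                items = None
                count = 0
        if items is not None:  # flush the remainder
            submit_backfeed(items, key)

queue_all({
    'reddit-v5fj9elcyh0rzck': {'https://i.redd.it/example.jpg'},
    'urls-f1zr02i96okrkdv': {'https://example.com/outlink'},
})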