Handle NULL byte separated multi items. Support Unicode chars in JSON permalink.

4 years ago · 3d20ca90af
parent 7c5ea717a8
commit 3d20ca90af
2 changed files with 83 additions and 47 deletions
--- a/pipeline.py
+++ b/pipeline.py
@ -41,12 +41,7 @@ if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):

 WGET_AT = find_executable(
    'Wget+AT',
-    [
-        'GNU Wget 1.20.3-at.20200401.01',
-        'GNU Wget 1.20.3-at.20200804.01',
-        'GNU Wget 1.20.3-at.20200902.01',
-        'GNU Wget 1.20.3-at.20201030.01'
-    ],
+    ['GNU Wget 1.20.3-at.20201030.01'],
    ['./wget-at']
 )

@ -112,7 +107,8 @@ class PrepareDirectories(SimpleTask):

    def process(self, item):
        item_name = item['item_name']
-        escaped_item_name = item_name.replace(':', '_').replace('/', '_').replace('~', '_')
+        item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()
+        escaped_item_name = item_name_hash
        dirname = '/'.join((item['data_dir'], escaped_item_name))

        if os.path.isdir(dirname):
@ -121,8 +117,11 @@ class PrepareDirectories(SimpleTask):
        os.makedirs(dirname)

        item['item_dir'] = dirname
-        item['warc_file_base'] = '%s-%s-%s' % (self.warc_prefix, escaped_item_name[:50],
-            time.strftime('%Y%m%d-%H%M%S'))
+        item['warc_file_base'] = '-'.join([
+            self.warc_prefix,
+            item_name_hash,
+            time.strftime('%Y%m%d-%H%M%S')
+        ])

        open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close()
        open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()
@ -227,11 +226,12 @@ class WgetArgs(object):
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
-            '--warc-header', 'reddit-dld-script-version: ' + VERSION,
-            '--warc-header', ItemInterpolation('reddit-item: %(item_name)s'),
+            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
+            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
-            '--warc-zstd-dict-no-include'
+            '--warc-zstd-dict-no-include',
+            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8'
        ]

        dict_data = ZstdDict.get_dict()
@ -243,22 +243,22 @@ class WgetArgs(object):
            '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
        ])

-        item_name = item['item_name']
+        for item_name in item['item_name'].split('\0'):
+          wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
+          wget_args.append('item-name://'+item_name)
          item_type, item_value = item_name.split(':', 1)
-
-        item['item_type'] = item_type
-        item['item_value'] = item_value
-
          if item_type in ('post', 'comment'):
              if item_type == 'post':
-                wget_args.extend(['--warc-header', 'reddit-post: {}'.format(item_value)])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t3_{}'.format(item_value))
+                  wget_args.extend(['--warc-header', 'reddit-post: '+item_value])
+                  wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)
              elif item_type == 'comment':
-                wget_args.extend(['--warc-header', 'reddit-comment: {}'.format(item_value)])
-                wget_args.append('https://www.reddit.com/api/info.json?id=t1_{}'.format(item_value))
+                  wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])
+                  wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)
          else:
              raise Exception('Unknown item')

+        item['item_name_newline'] = item['item_name'].replace('\0', '\n')
+
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
@ -294,8 +294,7 @@ pipeline = Pipeline(
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
-            'item_value': ItemValue('item_value'),
-            'item_type': ItemValue('item_type'),
+            'item_names': ItemValue('item_name_newline'),
            'warc_file_base': ItemValue('warc_file_base'),
        }
    ),
--- a/reddit.lua
+++ b/reddit.lua
@ -4,11 +4,18 @@ local urlparse = require("socket.url")
 local http = require("socket.http")
 JSON = (loadfile "JSON.lua")()

-local item_type = os.getenv('item_type')
-local item_value = os.getenv('item_value')
+local item_names = os.getenv('item_names')
 local item_dir = os.getenv('item_dir')
 local warc_file_base = os.getenv('warc_file_base')

+local item_types = {}
+for s in string.gmatch(item_names, "([^\n]+)") do
+  local t, n = string.match(s, "^([^:]+):(.+)$")
+  item_types[n] = t
+end
+
+local item_type = nil
+
 if urlparse == nil or http == nil then
  io.stdout:write("socket not corrently installed.\n")
  io.stdout:flush()
@ -70,12 +77,13 @@ allowed = function(url, parenturl)
  end

  if string.match(url, "'+")
-    or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]")
+    or string.match(urlparse.unescape(url), "[<>\\%*%$;%^%[%],%(%){}]")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?context=[0-9]+&depth=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/[^%?]+%?depth=[0-9]+&context=[0-9]+")
    or string.match(url, "^https?://[^/]*reddit%.com/login")
    or string.match(url, "^https?://[^/]*reddit%.com/register")
    or string.match(url, "%?sort=")
+    or string.match(url, "%?utm_source=reddit")
    or string.match(url, "%?limit=500$")
    or string.match(url, "%?ref=readnext$")
    or string.match(url, "^https?://[^/]*reddit%.app%.link/")
@ -222,21 +230,27 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
    elseif string.match(newurl, "^https?:\\/\\?/") then
      check(string.gsub(newurl, "\\", ""))
    elseif string.match(newurl, "^\\/\\/") then
-      check(string.match(url, "^(https?:)") .. string.gsub(newurl, "\\", ""))
+      checknewurl(string.gsub(newurl, "\\", ""))
    elseif string.match(newurl, "^//") then
-      check(string.match(url, "^(https?:)") .. newurl)
+      check(urlparse.absolute(url, newurl))
    elseif string.match(newurl, "^\\/") then
-      check(string.match(url, "^(https?://[^/]+)") .. string.gsub(newurl, "\\", ""))
+      checknewurl(string.gsub(newurl, "\\", ""))
    elseif string.match(newurl, "^/") then
-      check(string.match(url, "^(https?://[^/]+)") .. newurl)
+      check(urlparse.absolute(url, newurl))
+    elseif string.match(newurl, "^%.%./") then
+      if string.match(url, "^https?://[^/]+/[^/]+/") then
+        check(urlparse.absolute(url, newurl))
+      else
+        checknewurl(string.match(newurl, "^%.%.(/.+)$"))
+      end
    elseif string.match(newurl, "^%./") then
-      checknewurl(string.match(newurl, "^%.(.+)"))
+      check(urlparse.absolute(url, newurl))
    end
  end

  local function checknewshorturl(newurl)
    if string.match(newurl, "^%?") then
-      check(string.match(url, "^(https?://[^%?]+)") .. newurl)
+      check(urlparse.absolute(url, newurl))
    elseif not (string.match(newurl, "^https?:\\?/\\?//?/?")
      or string.match(newurl, "^[/\\]")
      or string.match(newurl, "^%./")
@ -248,7 +262,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
      or string.match(newurl, "^data:")
      or string.match(newurl, "^irc:")
      or string.match(newurl, "^%${")) then
-      check(string.match(url, "^(https?://.+/)") .. newurl)
+      check(urlparse.absolute(url, "/" .. newurl))
    end
  end

@ -369,6 +383,22 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
        end
      end
    end
+    if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then
+      json = load_json_file(html)
+      if not json or not json["data"] or not json["data"]["children"] then
+        io.stdout:write("Could not load JSON.\n")
+        io.stdout:flush()
+        abortgrab = true
+      end
+      for _, child in pairs(json["data"]["children"]) do
+        if not child["data"] or not child["data"]["permalink"] then
+          io.stdout:write("Permalink is missing.\n")
+          io.stdout:flush()
+          abortgrab = true
+        end
+        checknewurl(child["data"]["permalink"])
+      end
+    end
    for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do
      checknewurl(newurl)
    end
@ -402,22 +432,29 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
  local match = string.match(url["url"], "^https?://www%.reddit.com/api/info%.json%?id=t[0-9]_([a-z0-9]+)$")
  if match then
    posts[match] = true
+    if not item_types[match] then
+      io.stdout:write("Type for ID not found.\n")
+      io.stdout:flush()
+      abortgrab = true
+    end
+    item_type = item_types[match]
  end

  if status_code == 204 then
    return wget.actions.EXIT
  end

-  if (status_code >= 300 and status_code <= 399) then
-    local newloc = string.match(http_stat["newloc"], "^([^#]+)")
-    if string.match(newloc, "^//") then
-      newloc = string.match(url["url"], "^(https?:)") .. string.match(newloc, "^//(.+)")
-    elseif string.match(newloc, "^/") then
-      newloc = string.match(url["url"], "^(https?://[^/]+)") .. newloc
-    elseif not string.match(newloc, "^https?://") then
-      newloc = string.match(url["url"], "^(https?://.+/)") .. newloc
+  if status_code >= 300 and status_code <= 399 then
+    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
+    if string.match(newloc, "inactive%.min")
+      or string.match(newloc, "ReturnUrl")
+      or string.match(newloc, "adultcontent") then
+      io.stdout:write("Found invalid redirect.\n")
+      io.stdout:flush()
+      abortgrab = true
    end
    if processed(newloc) or not allowed(newloc, url["url"]) then
+      tries = 0
      return wget.actions.EXIT
    end
  end