@ -5,6 +5,7 @@ import hashlib
import os . path
import random
from seesaw . config import realize , NumberConfigValue
from seesaw . externalprocess import ExternalProcess
from seesaw . item import ItemInterpolation , ItemValue
from seesaw . task import SimpleTask , LimitConcurrent
from seesaw . tracker import GetItemFromTracker , PrepareStatsForTracker , \
@ -15,8 +16,15 @@ import subprocess
import sys
import time
import string
import requests
import re
import random
# Fail fast with an actionable message when warcio is not installed.
# Catch only ImportError: the original bare `except:` would also have
# swallowed unrelated errors raised while importing warcio.
try:
    import warcio
    from warcio.archiveiterator import ArchiveIterator
    from warcio.warcwriter import WARCWriter
except ImportError:
    raise Exception("Please install warcio with 'sudo pip install warcio --upgrade'.")
import seesaw
from seesaw . externalprocess import WgetDownload
@ -24,10 +32,12 @@ from seesaw.pipeline import Pipeline
from seesaw . project import Project
from seesaw . util import find_executable
from tornado import httpclient
# check the seesaw version
# Refuse to run on older seesaw releases that lack features this
# pipeline depends on.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')
###########################################################################
@ -37,21 +47,21 @@ if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
# Locate a Wget+Lua executable that
# 1. does not crash with --version, and
# 2. prints one of the required version strings.
WGET_LUA = find_executable(
    'Wget+Lua',
    ['GNU Wget 1.14.lua.20130523-9a5c', 'GNU Wget 1.14.lua.20160530-955376b'],
    [
        './wget-lua',
        './wget-lua-warrior',
        './wget-lua-local',
        '../wget-lua',
        '../../wget-lua',
        '/home/warrior/wget-lua',
        '/usr/bin/wget-lua'
    ]
)

if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')
###########################################################################
@ -59,7 +69,7 @@ if not WGET_LUA:
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
# Version of this script.  Bump on every non-cosmetic change: it is
# written into the WARC headers and reported to the tracker.
VERSION = '20190222.01'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'reddit'
TRACKER_HOST = 'tracker.archiveteam.org'
@ -73,7 +83,7 @@ TRACKER_HOST = 'tracker.archiveteam.org'
# each item.
class CheckIP ( SimpleTask ) :
def __init__ ( self ) :
SimpleTask . __init__ ( self , " CheckIP " )
SimpleTask . __init__ ( self , ' CheckIP ' )
self . _counter = 0
def process ( self , item ) :
@ -106,39 +116,98 @@ class CheckIP(SimpleTask):
class PrepareDirectories(SimpleTask):
    """Create a clean per-item working directory and empty output files.

    The directory (and the WARC file base name) use the SHA-1 of the item
    name, so unusual characters in item names can never yield a bad path.
    """

    def __init__(self, warc_prefix):
        SimpleTask.__init__(self, 'PrepareDirectories')
        # Prefix for the generated WARC file names (e.g. 'reddit').
        self.warc_prefix = warc_prefix

    def process(self, item):
        item_name = item['item_name']
        item_hash = hashlib.sha1(item_name.encode('utf-8')).hexdigest()

        dirname = '/'.join((item['data_dir'], item_hash))

        # Start from an empty directory even if a previous attempt left files.
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        os.makedirs(dirname)

        item['item_dir'] = dirname
        item['warc_file_base'] = '%s-%s-%s' % (self.warc_prefix, item_hash,
                                               time.strftime('%Y%m%d-%H%M%S'))

        # Pre-create the files that wget-lua will append to.
        open('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, 'w').close()
        open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()
class Deduplicate(SimpleTask):
    """Rewrite the WARC, replacing repeated response payloads with revisits.

    A response whose WARC-Payload-Digest was already seen earlier in the
    same WARC is converted into a 'revisit' record pointing at the first
    copy, which can shrink the uploaded file considerably.
    """

    def __init__(self):
        SimpleTask.__init__(self, 'Deduplicate')

    def process(self, item):
        # digest -> (WARC-Record-ID, WARC-Date, WARC-Target-URI) of first copy
        digests = {}
        input_filename = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
        output_filename = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
        with open(input_filename, 'rb') as f_in, \
                open(output_filename, 'wb') as f_out:
            writer = WARCWriter(filebuf=f_out, gzip=True)
            for record in ArchiveIterator(f_in):
                url = record.rec_headers.get_header('WARC-Target-URI')
                if url is not None and url.startswith('<'):
                    # Some wget builds write the URI as '<url>'; strip the
                    # angle brackets so headers stay consistent.
                    url = re.search('^<(.+)>$', url).group(1)
                    record.rec_headers.replace_header('WARC-Target-URI', url)
                if record.rec_headers.get_header('WARC-Type') == 'response':
                    digest = record.rec_headers.get_header('WARC-Payload-Digest')
                    # Guard against a missing digest: keying on None would
                    # make unrelated records alias each other.
                    if digest is not None and digest in digests:
                        writer.write_record(
                            self._record_response_to_revisit(writer, record,
                                                             digests[digest])
                        )
                    else:
                        if digest is not None:
                            digests[digest] = (
                                record.rec_headers.get_header('WARC-Record-ID'),
                                record.rec_headers.get_header('WARC-Date'),
                                record.rec_headers.get_header('WARC-Target-URI')
                            )
                        writer.write_record(record)
                elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                    # Point the warcinfo record at the new, deduplicated file.
                    record.rec_headers.replace_header('WARC-Filename',
                                                      output_filename)
                    writer.write_record(record)
                else:
                    writer.write_record(record)

    def _record_response_to_revisit(self, writer, record, duplicate):
        """Build a 'revisit' record referring to *duplicate* (id, date, uri)."""
        warc_headers = record.rec_headers
        warc_headers.replace_header('WARC-Refers-To', duplicate[0])
        warc_headers.replace_header('WARC-Refers-To-Date', duplicate[1])
        warc_headers.replace_header('WARC-Refers-To-Target-URI', duplicate[2])
        warc_headers.replace_header('WARC-Type', 'revisit')
        warc_headers.replace_header('WARC-Truncated', 'length')
        warc_headers.replace_header('WARC-Profile',
                                    'http://netpreserve.org/warc/1.0/'
                                    'revisit/identical-payload-digest')
        warc_headers.remove_header('WARC-Block-Digest')
        warc_headers.remove_header('Content-Length')
        return writer.create_warc_record(
            record.rec_headers.get_header('WARC-Target-URI'),
            'revisit',
            warc_headers=warc_headers,
            http_headers=record.http_headers
        )
class MoveFiles(SimpleTask):
    """Move the finished, deduplicated WARC and data file to data_dir."""

    def __init__(self):
        SimpleTask.__init__(self, 'MoveFiles')

    def process(self, item):
        # An uncompressed .warc means wget was built without zlib support;
        # refuse to continue rather than upload an uncompressed file.
        if os.path.exists('%(item_dir)s/%(warc_file_base)s.warc' % item):
            raise Exception('Please compile wget with zlib support!')

        os.rename('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item,
                  '%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item)
        os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item,
                  '%(data_dir)s/%(warc_file_base)s_data.txt' % item)

        shutil.rmtree('%(item_dir)s' % item)
def get_hash ( filename ) :
@ -163,62 +232,54 @@ def stats_id_function(item):
class WgetArgs ( object ) :
post_chars = string . digits + string . ascii_lowercase
def int_to_str ( self , i ) :
d , m = divmod ( i , 36 )
if d > 0 :
return self . int_to_str ( d ) + self . post_chars [ m ]
return self . post_chars [ m ]
def realize ( self , item ) :
wget_args = [
WGET_LUA ,
" -U " , USER_AGENT ,
" -nv " ,
" --lua-script " , " reddit.lua " ,
" --load-cookies " , " cookies " ,
" -o " , ItemInterpolation ( " %(item_dir)s /wget.log " ) ,
" --no-check-certificate " ,
" --output-document " , ItemInterpolation ( " %(item_dir)s /wget.tmp " ) ,
" --truncate-output " ,
" -e " , " robots=off " ,
" --rotate-dns " ,
" --recursive " , " --level=inf " ,
" --no-parent " ,
" --page-requisites " ,
" --timeout " , " 30 " ,
" --tries " , " inf " ,
" --domains " , " reddit.com,redditmedia.com " ,
" --span-hosts " ,
" --waitretry " , " 30 " ,
" --warc-file " , ItemInterpolation ( " %(item_dir)s / %(warc_file_base)s " ) ,
" --warc-header " , " operator: Archive Team " ,
" --warc-header " , " reddit-dld-script-version: " + VERSION ,
" --warc-header " , ItemInterpolation ( " reddit-user: %(item_name)s " ) ,
' -U ' , USER_AGENT ,
' -nv ' ,
' --lua-script ' , ' reddit.lua ' ,
' --load-cookies ' , ' cookies ' ,
' -o ' , ItemInterpolation ( ' %(item_dir)s /wget.log ' ) ,
' --no-check-certificate ' ,
' --output-document ' , ItemInterpolation ( ' %(item_dir)s /wget.tmp ' ) ,
' --truncate-output ' ,
' -e ' , ' robots=off ' ,
' --rotate-dns ' ,
' --recursive ' , ' --level=inf ' ,
' --no-parent ' ,
' --page-requisites ' ,
' --timeout ' , ' 30 ' ,
' --tries ' , ' inf ' ,
' --domains ' , ' reddit.com ' ,
' --span-hosts ' ,
' --waitretry ' , ' 30 ' ,
' --warc-file ' , ItemInterpolation ( ' %(item_dir)s / %(warc_file_base)s ' ) ,
' --warc-header ' , ' operator: Archive Team ' ,
' --warc-header ' , ' reddit-dld-script-version: ' + VERSION ,
' --warc-header ' , ItemInterpolation ( ' reddit-item: %(item_name)s ' )
]
item_name = item [ ' item_name ' ]
assert ' : ' in item_name
item_type , item_value = item_name . split ( ' : ' , 1 )
item [ ' item_type ' ] = item_type
item [ ' item_value ' ] = item_value
assert item_type in ( ' 36comments ' )
if item_type == ' 36comments ' :
suffixes = string . digits + string . ascii_lowercase
for url in [ ' http://redd.it/ {0} {1} ' . format ( item_value , a ) for a in suffixes ] :
wget_args . append ( url )
# for suffix in suffixes:
# commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix)
# html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'})
# print('Downloaded', html.status_code, getattr(html, 'reason'))
# sys.stdout.flush()
# if html.status_code == 200:
# if not html.text:
# raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
# else:
# for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
# if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
# wget_args.append(origurl)
# elif html.status_code == 404:
# print('This url is 404.')
# else:
# raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
if item_type in ( ' posts ' ) :
start , end = item_value . split ( ' - ' )
for i in range ( int ( start ) , int ( end ) + 1 ) :
post_id = self . int_to_str ( i )
wget_args . extend ( [ ' --warc-header ' , ' reddit-post: {} ' . format ( post_id ) ] )
wget_args . append ( ' https://www.reddit.com/comments/ {} ' . format ( post_id ) )
wget_args . append ( ' https://old.reddit.com/comments/ {} ' . format ( post_id ) )
else :
raise Exception ( ' Unknown item ' )
@ -237,59 +298,67 @@ class WgetArgs(object):
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title='reddit',
    project_html='''
    <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
    <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
    <p>Archiving everything from reddit.</p>
    '''
)
# The ordered list of tasks each item flows through: fetch item name from
# the tracker, download with wget-lua, deduplicate the WARC, report stats,
# move files, rsync-upload (bounded concurrency), then confirm completion.
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix='reddit'),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        # 4 (network failure) and 8 (server error) are acceptable wget exits.
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base')
        }
    ),
    Deduplicate(),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation(
                    '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(min=1, max=20, default='20',
                          name='shared:rsync_threads', title='Rsync threads',
                          description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--sockopts=SO_SNDBUF=8388608,SO_RCVBUF=8388608',
                '--recursive',
                '--partial',
                '--partial-dir', '.rsync-tmp',
                '--min-size', '1',
                '--no-compress',
                '--compress-level', '0'
            ]
        ),
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')
    )
)