reddit-grab/pipeline.py

# encoding=utf8
import datetime
from distutils.version import StrictVersion
import hashlib
import os.path
import random
import re
from seesaw.config import realize, NumberConfigValue
from seesaw.externalprocess import ExternalProcess
from seesaw.item import ItemInterpolation, ItemValue
from seesaw.task import SimpleTask, LimitConcurrent
from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \
    UploadWithTracker, SendDoneToTracker
import shutil
import socket
import subprocess
import sys
import time
import string

import seesaw
from seesaw.externalprocess import WgetDownload
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

from tornado import httpclient

import requests
import zstandard

# Refuse to load the pipeline when the installed seesaw is older than 0.8.5.
# NOTE(review): StrictVersion comes from distutils, which is deprecated
# (PEP 632) - confirm the target Python runtime still provides it.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')


###########################################################################
# Find a usable Wget-AT executable.
#
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints a version string at or above the required minimum version

class HigherVersion:
    """Matcher that accepts any version string at or above a minimum.

    It exposes a ``search`` method, so it can be handed to code that would
    otherwise call ``search`` on a compiled regular expression.
    """

    def __init__(self, expression, min_version):
        """Store the minimum version and compile the extraction pattern.

        expression: regular expression whose matches are version strings.
        min_version: lowest acceptable version, compared as a plain string.
        """
        self._min_version = min_version
        self._expression = re.compile(expression)

    def search(self, text):
        """Return True for the first match in ``text`` that is new enough.

        Matches are compared with ``min_version`` using ordinary string
        ordering. When nothing qualifies the method falls off the end and
        therefore returns None.
        """
        candidates = self._expression.findall(text)
        for found in candidates:
            if found < self._min_version:
                # Too old; keep looking at the remaining matches.
                continue
            print('Found version {}.'.format(found))
            return True

# Locate the Wget-AT binary. The candidate paths below are handed to
# find_executable together with a HigherVersion matcher that accepts any
# build whose version string is at or above the minimum given here.
WGET_AT = find_executable(
    'Wget+AT',
    HigherVersion(
        # The trailing negated class requires the version token to end right
        # after the two-digit build number. The hyphen is written last so it
        # is a literal: in the previous spelling `\.-_` it formed a range
        # from '.' to '_', which left '-' outside the class and allowed a
        # longer token such as "...01-foo" to be accepted.
        r'(GNU Wget 1\.[0-9]{2}\.[0-9]{1}-at\.[0-9]{8}\.[0-9]{2})[^0-9a-zA-Z._-]',
        'GNU Wget 1.21.3-at.20230623.01'
    ),
    [
        './wget-at',
        '/home/warrior/data/wget-at-gnutls'
    ]
)

# Nothing else in this pipeline can work without the binary, so stop at
# import time when no candidate qualified.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
# Pipeline release identifier; written into the WARC headers and passed to
# the tracker tasks below.
VERSION = '20231127.02'
# Project name used in the tracker URLs and in the
# x-wget-at-project-name WARC header.
TRACKER_ID = 'reddit'
# Host part of every tracker URL built in this file.
TRACKER_HOST = 'legacy-api.arpa.li'
# Value interpolated into the "multi=" segment of the GetItemFromTracker URL.
MULTI_ITEM_SIZE = 100


###########################################################################
# This section defines project-specific tasks.
#
# Simple tasks (tasks that do not need any concurrency) are based on the
# SimpleTask class and have a process(item) method that is called for
# each item.
class CheckIP(SimpleTask):
    """Task that periodically checks for a DNS-rewriting firewall or proxy.

    Several unrelated host names are resolved; if they do not all yield
    different addresses the task raises and the item does not proceed.
    """

    def __init__(self):
        SimpleTask.__init__(self, 'CheckIP')
        # Number of items that may still pass before the next real check.
        self._counter = 0

    def process(self, item):
        """Run the DNS check, or just count down when one ran recently.

        Raises:
            Exception: when the probed hosts do not resolve to distinct
                addresses.
        """
        if self._counter > 0:
            # Checked recently enough; skip the lookups for this item.
            self._counter -= 1
            return

        item.log_output('Checking IP address.')

        # facebook.com is intentionally not part of the probe list.
        probe_hosts = (
            'twitter.com',
            'youtube.com',
            'microsoft.com',
            'icanhas.cheezburger.com',
            'archiveteam.org',
        )
        ip_set = {socket.gethostbyname(host) for host in probe_hosts}

        # Every probed host must have produced its own address.
        if len(ip_set) != len(probe_hosts):
            complaint = 'Are you behind a firewall/proxy? That is a big no-no!'
            item.log_output('Got IP addresses: {0}'.format(ip_set))
            item.log_output(complaint)
            raise Exception(complaint)

        # Check passed: let the next ten items through without checking.
        self._counter = 10


class PrepareDirectories(SimpleTask):
    """Task that creates a fresh working directory and output file names."""

    def __init__(self, warc_prefix):
        """warc_prefix: first component of every WARC base name."""
        SimpleTask.__init__(self, 'PrepareDirectories')
        self.warc_prefix = warc_prefix

    def process(self, item):
        """Set ``item_dir`` and ``warc_file_base`` and create empty outputs.

        The directory name is the SHA-1 of the item name, so it stays a
        simple path component regardless of what the item name contains.
        """
        name_digest = hashlib.sha1(item['item_name'].encode('utf8')).hexdigest()
        work_dir = '/'.join([item['data_dir'], name_digest])

        # Start from an empty directory even if an earlier attempt left one.
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        item['item_dir'] = work_dir
        timestamp = time.strftime('%Y%m%d-%H%M%S')
        item['warc_file_base'] = '-'.join(
            (self.warc_prefix, name_digest, timestamp))

        # Create both output files up front as empty placeholders.
        for suffix in ('.warc.zst', '_data.txt'):
            placeholder = '%(item_dir)s/%(warc_file_base)s' % item + suffix
            with open(placeholder, 'w'):
                pass

class MoveFiles(SimpleTask):
    """Task that moves finished outputs to the data dir and cleans up."""

    def __init__(self):
        SimpleTask.__init__(self, 'MoveFiles')

    def process(self, item):
        """Rename the WARC and data file into ``data_dir``.

        The WARC gains the dictionary project and id in its file name. The
        per-item working directory is removed afterwards.
        """
        # (source template, destination template), both filled from the item.
        relocations = (
            ('%(item_dir)s/%(warc_file_base)s.warc.zst',
             '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
            ('%(item_dir)s/%(warc_file_base)s_data.txt',
             '%(data_dir)s/%(warc_file_base)s_data.txt'),
        )
        for source_template, target_template in relocations:
            os.rename(source_template % item, target_template % item)

        shutil.rmtree('%(item_dir)s' % item)


class SetBadUrls(SimpleTask):
    """Task that drops aborted sub-items from a multi-item's name."""

    def __init__(self):
        SimpleTask.__init__(self, 'SetBadUrls')

    def process(self, item):
        """Remove every entry listed in the bad-items file from the item.

        The untouched name is kept in ``item_name_original``. Entries are
        matched case-insensitively against the NUL-separated item names;
        an entry that matches nothing makes ``list.index`` raise ValueError.
        """
        item['item_name_original'] = item['item_name']

        names = item['item_name'].split('\0')
        # Lower-cased mirror of ``names``, kept index-aligned for lookups.
        lowered = [name.lower() for name in names]

        bad_items_path = '%(item_dir)s/%(warc_file_base)s_bad-items.txt' % item
        with open(bad_items_path, 'r') as bad_items:
            for line in bad_items:
                aborted = line.strip().lower()
                position = lowered.index(aborted)
                item.log_output('Item {} is aborted.'.format(aborted))
                # Delete from both lists so their indexes stay in step.
                del names[position]
                del lowered[position]

        item['item_name'] = '\0'.join(names)


class MaybeSendDoneToTracker(SendDoneToTracker):
    """SendDoneToTracker variant that skips items with an empty name."""

    def enqueue(self, item):
        """Report the item through the parent class when a name is left.

        An item whose ``item_name`` is empty is completed directly instead
        of going through the parent's ``enqueue``.
        """
        remaining_names = item['item_name']
        if len(remaining_names) > 0:
            return SendDoneToTracker.enqueue(self, item)
        return self.complete_item(item)


def get_hash(filename):
    """Return the hexadecimal SHA-1 digest of the file's raw bytes."""
    digest = hashlib.sha1()
    with open(filename, 'rb') as in_file:
        digest.update(in_file.read())
    return digest.hexdigest()

# SHA-1 digests of the pipeline script and the Lua script, both read from
# the current working directory when this module is loaded. They are
# reported by stats_id_function.
CWD = os.getcwd()
PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py'))
LUA_SHA1 = get_hash(os.path.join(CWD, 'reddit.lua'))

def stats_id_function(item):
    """Return identifying details about the running code.

    The result holds the SHA-1 of the pipeline script, the SHA-1 of the Lua
    script and the Python version string. ``item`` is not consulted.
    """
    return dict(
        pipeline_hash=PIPELINE_SHA1,
        lua_hash=LUA_SHA1,
        python_version=sys.version,
    )


class ZstdDict(object):
    """Class-level cache of the zstd dictionary used for WARC compression.

    The dictionary is looked up on the tracker and downloaded on demand.
    It is then reused for 1800 seconds before the tracker is asked again.
    """
    # time.time() at which the cached dictionary was last fetched or
    # confirmed to still be current.
    created = 0
    # Cached {'id': ..., 'dict': <bytes>} mapping, or None before first use.
    data = None
    # Seconds after which a stalled HTTP request is abandoned. Without a
    # timeout, requests waits indefinitely on an unresponsive server.
    request_timeout = 60

    @classmethod
    def get_dict(cls):
        """Return the current dictionary as ``{'id': ..., 'dict': bytes}``.

        Raises:
            requests.exceptions.RequestException: on HTTP errors, connection
                problems or timeouts.
            ValueError: when the download does not match the advertised
                SHA-256.
        """
        if cls.data is not None and time.time() - cls.created < 1800:
            return cls.data
        response = requests.get(
            'https://legacy-api.arpa.li/dictionary',
            params={
                'project': 'reddit'
            },
            timeout=cls.request_timeout
        )
        response.raise_for_status()
        response = response.json()
        # Same dictionary id as the cached one: only refresh the timestamp.
        if cls.data is not None and response['id'] == cls.data['id']:
            cls.created = time.time()
            return cls.data
        print('Downloading latest dictionary.')
        response_dict = requests.get(
            response['url'],
            timeout=cls.request_timeout
        )
        response_dict.raise_for_status()
        raw_data = response_dict.content
        # The advertised hash is checked against the bytes as downloaded,
        # before any decompression.
        if hashlib.sha256(raw_data).hexdigest() != response['sha256']:
            raise ValueError('Hash of downloaded dictionary does not match.')
        # Payloads starting with the zstd frame magic are themselves
        # compressed and are unpacked before being cached.
        if raw_data[:4] == b'\x28\xB5\x2F\xFD':
            raw_data = zstandard.ZstdDecompressor().decompress(raw_data)
        cls.data = {
            'id': response['id'],
            'dict': raw_data
        }
        cls.created = time.time()
        return cls.data


class WgetArgs(object):
    """Builds the Wget-AT command line for one (multi-)item."""

    # Digit alphabet for base-36 conversion: 0-9 followed by a-z.
    post_chars = string.digits + string.ascii_lowercase

    def int_to_str(self, i):
        """Return ``i`` written in base 36 using ``post_chars``."""
        alphabet = self.post_chars
        digits = []
        # Collect digits from least to most significant, then reverse.
        while True:
            i, remainder = divmod(i, 36)
            digits.append(alphabet[remainder])
            if not i > 0:
                break
        return ''.join(reversed(digits))

    def realize(self, item):
        """Return the realized argument list for ``item``.

        Besides building the list this writes the zstd dictionary into the
        item directory and sets ``dict_id``, ``dict_project`` and
        ``item_name_newline`` on the item.

        Raises:
            Exception: when a sub-item has a type other than post, comment
                or url.
        """
        # A user agent is drawn at random from the 'user-agents' file.
        with open('user-agents', 'r') as agents_file:
            agent_lines = agents_file.readlines()
        user_agent = random.choice(agent_lines).strip()

        wget_args = [
            WGET_AT,
            '-U', user_agent,
            '-nv',
            '--host-lookups', 'dns',
            '--hosts-file', '/dev/null',
            '--resolvconf-file', '/dev/null',
            '--dns-servers', '9.9.9.10,149.112.112.10,2620:fe::10,2620:fe::fe:10',
            '--reject-reserved-subnets',
            '--load-cookies', 'cookies.txt',
            '--content-on-error',
            '--no-http-keep-alive',
            '--lua-script', 'reddit.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'reddit.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
            '--ciphers', 'SECURE128'
        ]

        # Store the compression dictionary next to the item's other files
        # and record which dictionary was used.
        dict_data = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as dict_file:
            dict_file.write(dict_data['dict'])
        item['dict_id'] = dict_data['id']
        item['dict_project'] = 'reddit'
        wget_args += [
            '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
        ]

        # item type -> (WARC header prefix, URL prefix). A 'url' item is
        # fetched as-is, hence its empty URL prefix.
        item_kinds = {
            'post': (
                'reddit-post: ',
                'https://www.reddit.com/api/info.json?id=t3_'),
            'comment': (
                'reddit-comment: ',
                'https://www.reddit.com/api/info.json?id=t1_'),
            'url': (
                'reddit-media-url: ',
                ''),
        }
        for item_name in item['item_name'].split('\0'):
            wget_args += [
                '--warc-header', 'x-wget-at-project-item-name: ' + item_name,
                'item-name://' + item_name,
            ]
            item_type, item_value = item_name.split(':', 1)
            if item_type not in item_kinds:
                raise Exception('Unknown item')
            header_prefix, url_prefix = item_kinds[item_type]
            wget_args += [
                '--warc-header', header_prefix + item_value,
                url_prefix + item_value,
            ]

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        # Use a bind address only when one exists in the module globals.
        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Only a title and an HTML description (logo, links, one-line summary) are
# supplied here; no deadline is passed.
project = Project(
    title='reddit',
    project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
        <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
        <p>Archiving everything from reddit.</p>
    '''
)

# The task chain, listed in the order the tasks are given to Pipeline.
# NOTE(review): `downloader` is not defined anywhere in this file; it is
# presumably injected into the module globals by the seesaw runner - confirm.
pipeline = Pipeline(
    # Occasional DNS sanity check (see CheckIP).
    CheckIP(),
    # Item request URL; the "multi=" segment carries MULTI_ITEM_SIZE.
    GetItemFromTracker('http://{}/{}/multi={}/'
        .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),
        downloader, VERSION),
    # Creates item_dir / warc_file_base and the empty output files.
    PrepareDirectories(warc_prefix='reddit'),
    # Command line comes from WgetArgs.realize. The env values expose the
    # item directory, the newline-separated item names and the WARC base
    # name to the child process (presumably for reddit.lua - confirm).
    # NOTE(review): per the GNU Wget manual, exit status 4 is a network
    # failure and 8 a server error response; both are accepted here.
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_names': ItemValue('item_name_newline'),
            'warc_file_base': ItemValue('warc_file_base'),
        }
    ),
    # Drops aborted sub-items from item_name using the bad-items file.
    SetBadUrls(),
    # Stats are computed from the WARC while it is still in item_dir,
    # i.e. before MoveFiles renames it.
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')
            ]
        },
        id_function=stats_id_function,
    ),
    # Moves the outputs into data_dir and removes item_dir.
    MoveFiles(),
    # Upload concurrency is capped by the shared 'rsync_threads' setting.
    # NOTE(review): default is the string '20' while min/max are ints -
    # confirm NumberConfigValue accepts that.
    LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',
        name='shared:rsync_threads', title='Rsync threads',
        description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            # Same file names that MoveFiles produced in data_dir.
            files=[
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--min-size', '1',
                '--no-compress',
                '--compress-level', '0'
            ]
        ),
    ),
    # Skips the parent's enqueue when SetBadUrls left no item names.
    MaybeSendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')
    )
)
pipeline.py 2015-07-05 10:03:02 +00:00			`# encoding=utf8`
			`import datetime`
			`from distutils.version import StrictVersion`
			`import hashlib`
			`import os.path`
			`import random`
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 2023-10-16 22:18:22 +00:00			`import re`
pipeline.py 2015-07-05 10:03:02 +00:00			`from seesaw.config import realize, NumberConfigValue`
rewrite 2019-02-22 00:15:18 +00:00			`from seesaw.externalprocess import ExternalProcess`
pipeline.py 2015-07-05 10:03:02 +00:00			`from seesaw.item import ItemInterpolation, ItemValue`
			`from seesaw.task import SimpleTask, LimitConcurrent`
			`from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \`
			`UploadWithTracker, SendDoneToTracker`
			`import shutil`
			`import socket`
			`import subprocess`
			`import sys`
			`import time`
			`import string`
rewrite 2019-02-22 00:15:18 +00:00
pipeline.py 2015-07-05 10:03:02 +00:00			`import seesaw`
			`from seesaw.externalprocess import WgetDownload`
			`from seesaw.pipeline import Pipeline`
			`from seesaw.project import Project`
			`from seesaw.util import find_executable`

rewrite 2019-02-22 00:15:18 +00:00			`from tornado import httpclient`

Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`import requests`
			`import zstandard`
pipeline.py 2015-07-05 10:03:02 +00:00
rewrite 2019-02-22 00:15:18 +00:00			`if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):`
			`raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')`
pipeline.py 2015-07-05 10:03:02 +00:00

			`###########################################################################`
			`# Find a useful Wget+Lua executable.`
			`#`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`# WGET_AT will be set to the first path that`
pipeline.py 2015-07-05 10:03:02 +00:00			`# 1. does not crash with --version, and`
			`# 2. prints the required version string`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 2023-10-16 22:18:22 +00:00			`class HigherVersion:`
			`def __init__(self, expression, min_version):`
			`self._expression = re.compile(expression)`
			`self._min_version = min_version`

			`def search(self, text):`
			`for result in self._expression.findall(text):`
			`if result >= self._min_version:`
			`print('Found version {}.'.format(result))`
			`return True`

Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`WGET_AT = find_executable(`
			`'Wget+AT',`
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 2023-10-16 22:18:22 +00:00			`HigherVersion(`
Version 20231017.02. Use --secure-protocol=TLSv1_3. 2023-10-17 20:59:48 +00:00			`r'(GNU Wget 1\.[0-9]{2}\.[0-9]{1}-at\.[0-9]{8}\.[0-9]{2})[^0-9a-zA-Z\.-_]',`
Version 20230727.02. Only allow GNU Wget 1.21.3-at.20230623.01. Use Wget-AT option --reject-reserved-subnets. Remove old Wget files. Update README to latest. 2023-07-27 15:39:42 +00:00			`'GNU Wget 1.21.3-at.20230623.01'`
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 2023-10-16 22:18:22 +00:00			`),`
Support new wget-at location 2021-02-03 01:18:34 +00:00			`[`
Version 20230607.01. Use GNU Wget 1.21.3-at.20230605.01 and arguments around DNS. 2023-06-07 13:46:23 +00:00			`'./wget-at',`
Version 20231118.01. Switch to gnutls. 2023-11-18 15:25:31 +00:00			`'/home/warrior/data/wget-at-gnutls'`
Support new wget-at location 2021-02-03 01:18:34 +00:00			`]`
pipeline.py 2015-07-05 10:03:02 +00:00			`)`

Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`if not WGET_AT:`
			`raise Exception('No usable Wget+At found.')`
pipeline.py 2015-07-05 10:03:02 +00:00

			`###########################################################################`
			`# The version number of this pipeline definition.`
			`#`
			`# Update this each time you make a non-cosmetic change.`
			`# It will be added to the WARC files and reported to the tracker.`
Version 20231127.02. New --ciphers value. 2023-11-28 01:35:20 +00:00			`VERSION = '20231127.02'`
Version 20210108.07. Use tracker reddit. 2021-01-08 22:41:15 +00:00			`TRACKER_ID = 'reddit'`
Update tracker host 2021-02-03 13:31:32 +00:00			`TRACKER_HOST = 'legacy-api.arpa.li'`
Version 20230910.03. Increase hardcoded multi item size to 100, for soft limiting on tracker side. 2023-09-10 03:37:31 +00:00			`MULTI_ITEM_SIZE = 100`
pipeline.py 2015-07-05 10:03:02 +00:00

			`###########################################################################`
			`# This section defines project-specific tasks.`
			`#`
			`# Simple tasks (tasks that do not need any concurrency) are based on the`
			`# SimpleTask class and have a process(item) method that is called for`
			`# each item.`
			`class CheckIP(SimpleTask):`
			`def __init__(self):`
rewrite 2019-02-22 00:15:18 +00:00			`SimpleTask.__init__(self, 'CheckIP')`
pipeline.py 2015-07-05 10:03:02 +00:00			`self._counter = 0`

			`def process(self, item):`
			`# NEW for 2014! Check if we are behind firewall/proxy`

			`if self._counter <= 0:`
			`item.log_output('Checking IP address.')`
			`ip_set = set()`

			`ip_set.add(socket.gethostbyname('twitter.com'))`
Version 20211004.01. Do not check facebook.com while down at the moment. 2021-10-04 19:04:03 +00:00			`#ip_set.add(socket.gethostbyname('facebook.com'))`
pipeline.py 2015-07-05 10:03:02 +00:00			`ip_set.add(socket.gethostbyname('youtube.com'))`
			`ip_set.add(socket.gethostbyname('microsoft.com'))`
			`ip_set.add(socket.gethostbyname('icanhas.cheezburger.com'))`
			`ip_set.add(socket.gethostbyname('archiveteam.org'))`

Version 20211004.02. Fix incomplete facebook.com fix. 2021-10-04 19:09:21 +00:00			`if len(ip_set) != 5:`
pipeline.py 2015-07-05 10:03:02 +00:00			`item.log_output('Got IP addresses: {0}'.format(ip_set))`
			`item.log_output(`
			`'Are you behind a firewall/proxy? That is a big no-no!')`
			`raise Exception(`
			`'Are you behind a firewall/proxy? That is a big no-no!')`

			`# Check only occasionally`
			`if self._counter <= 0:`
			`self._counter = 10`
			`else:`
			`self._counter -= 1`


			`class PrepareDirectories(SimpleTask):`
			`def __init__(self, warc_prefix):`
rewrite 2019-02-22 00:15:18 +00:00			`SimpleTask.__init__(self, 'PrepareDirectories')`
pipeline.py 2015-07-05 10:03:02 +00:00			`self.warc_prefix = warc_prefix`

			`def process(self, item):`
rewrite 2019-02-22 00:15:18 +00:00			`item_name = item['item_name']`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()`
			`escaped_item_name = item_name_hash`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`dirname = '/'.join((item['data_dir'], escaped_item_name))`
pipeline.py 2015-07-05 10:03:02 +00:00
			`if os.path.isdir(dirname):`
			`shutil.rmtree(dirname)`

			`os.makedirs(dirname)`

rewrite 2019-02-22 00:15:18 +00:00			`item['item_dir'] = dirname`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`item['warc_file_base'] = '-'.join([`
			`self.warc_prefix,`
			`item_name_hash,`
			`time.strftime('%Y%m%d-%H%M%S')`
			`])`
pipeline.py 2015-07-05 10:03:02 +00:00
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close()`
rewrite 2019-02-22 00:15:18 +00:00			`open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()`

pipeline.py 2015-07-05 10:03:02 +00:00			`class MoveFiles(SimpleTask):`
			`def __init__(self):`
rewrite 2019-02-22 00:15:18 +00:00			`SimpleTask.__init__(self, 'MoveFiles')`
pipeline.py 2015-07-05 10:03:02 +00:00
			`def process(self, item):`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`os.rename('%(item_dir)s/%(warc_file_base)s.warc.zst' % item,`
			`'%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst' % item)`
rewrite 2019-02-22 00:15:18 +00:00			`os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item,`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`'%(data_dir)s/%(warc_file_base)s_data.txt' % item)`
pipeline.py 2015-07-05 10:03:02 +00:00
rewrite 2019-02-22 00:15:18 +00:00			`shutil.rmtree('%(item_dir)s' % item)`
pipeline.py 2015-07-05 10:03:02 +00:00

Version 20210130.01. Support &amp; in URL. Properly abort selected items. 2021-01-30 02:01:06 +00:00			`class SetBadUrls(SimpleTask):`
			`def __init__(self):`
			`SimpleTask.__init__(self, 'SetBadUrls')`

			`def process(self, item):`
			`item['item_name_original'] = item['item_name']`
			`items = item['item_name'].split('\0')`
			`items_lower = [s.lower() for s in items]`
			`with open('%(item_dir)s/%(warc_file_base)s_bad-items.txt' % item, 'r') as f:`
			`for aborted_item in f:`
			`aborted_item = aborted_item.strip().lower()`
			`index = items_lower.index(aborted_item)`
			`item.log_output('Item {} is aborted.'.format(aborted_item))`
			`items.pop(index)`
			`items_lower.pop(index)`
			`item['item_name'] = '\0'.join(items)`


			`class MaybeSendDoneToTracker(SendDoneToTracker):`
			`def enqueue(self, item):`
			`if len(item['item_name']) == 0:`
			`return self.complete_item(item)`
			`return super(MaybeSendDoneToTracker, self).enqueue(item)`


pipeline.py 2015-07-05 10:03:02 +00:00			`def get_hash(filename):`
			`with open(filename, 'rb') as in_file:`
			`return hashlib.sha1(in_file.read()).hexdigest()`

			`CWD = os.getcwd()`
			`PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py'))`
			`LUA_SHA1 = get_hash(os.path.join(CWD, 'reddit.lua'))`

			`def stats_id_function(item):`
			`d = {`
			`'pipeline_hash': PIPELINE_SHA1,`
			`'lua_hash': LUA_SHA1,`
			`'python_version': sys.version,`
			`}`

			`return d`


Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`class ZstdDict(object):`
			`created = 0`
			`data = None`

			`@classmethod`
			`def get_dict(cls):`
			`if cls.data is not None and time.time() - cls.created < 1800:`
			`return cls.data`
			`response = requests.get(`
20210225.01: update dict url 2021-02-25 01:58:24 +00:00			`'https://legacy-api.arpa.li/dictionary',`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`params={`
Version 20200726.06. Fix project name for ZSTD dictionary request. 2020-07-26 21:44:05 +00:00			`'project': 'reddit'`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`}`
			`)`
			`response.raise_for_status()`
			`response = response.json()`
			`if cls.data is not None and response['id'] == cls.data['id']:`
			`cls.created = time.time()`
			`return cls.data`
			`print('Downloading latest dictionary.')`
			`response_dict = requests.get(response['url'])`
			`response_dict.raise_for_status()`
			`raw_data = response_dict.content`
			`if hashlib.sha256(raw_data).hexdigest() != response['sha256']:`
			`raise ValueError('Hash of downloaded dictionary does not match.')`
			`if raw_data[:4] == b'\x28\xB5\x2F\xFD':`
			`raw_data = zstandard.ZstdDecompressor().decompress(raw_data)`
			`cls.data = {`
			`'id': response['id'],`
			`'dict': raw_data`
			`}`
			`cls.created = time.time()`
			`return cls.data`


pipeline.py 2015-07-05 10:03:02 +00:00			`class WgetArgs(object):`
rewrite 2019-02-22 00:15:18 +00:00			`post_chars = string.digits + string.ascii_lowercase`

			`def int_to_str(self, i):`
			`d, m = divmod(i, 36)`
			`if d > 0:`
			`return self.int_to_str(d) + self.post_chars[m]`
			`return self.post_chars[m]`

pipeline.py 2015-07-05 10:03:02 +00:00			`def realize(self, item):`
Version 20210114.01. Use a random user-agent. 2021-01-14 00:34:07 +00:00			`with open('user-agents', 'r') as f:`
			`user_agent = random.choice(list(f)).strip()`
pipeline.py 2015-07-05 10:03:02 +00:00			`wget_args = [`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`WGET_AT,`
Version 20210114.01. Use a random user-agent. 2021-01-14 00:34:07 +00:00			`'-U', user_agent,`
rewrite 2019-02-22 00:15:18 +00:00			`'-nv',`
Version 20230607.01. Use GNU Wget 1.21.3-at.20230605.01 and arguments around DNS. 2023-06-07 13:46:23 +00:00			`'--host-lookups', 'dns',`
			`'--hosts-file', '/dev/null',`
			`'--resolvconf-file', '/dev/null',`
			`'--dns-servers', '9.9.9.10,149.112.112.10,2620:fe::10,2620:fe::fe:10',`
Version 20230727.02. Only allow GNU Wget 1.21.3-at.20230623.01. Use Wget-AT option --reject-reserved-subnets. Remove old Wget files. Update README to latest. 2023-07-27 15:39:42 +00:00			`'--reject-reserved-subnets',`
Version 20210114.04. Support cookies. 2021-01-14 15:54:55 +00:00			`'--load-cookies', 'cookies.txt',`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`'--content-on-error',`
Version 20210115.01. Use Connection: Close header. 2021-01-15 13:52:09 +00:00			`'--no-http-keep-alive',`
rewrite 2019-02-22 00:15:18 +00:00			`'--lua-script', 'reddit.lua',`
			`'-o', ItemInterpolation('%(item_dir)s/wget.log'),`
			`'--no-check-certificate',`
			`'--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),`
			`'--truncate-output',`
			`'-e', 'robots=off',`
			`'--rotate-dns',`
			`'--recursive', '--level=inf',`
			`'--no-parent',`
			`'--page-requisites',`
			`'--timeout', '30',`
			`'--tries', 'inf',`
			`'--domains', 'reddit.com',`
			`'--span-hosts',`
			`'--waitretry', '30',`
			`'--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),`
			`'--warc-header', 'operator: Archive Team',`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`'--warc-header', 'x-wget-at-project-version: ' + VERSION,`
			`'--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`'--warc-dedup-url-agnostic',`
			`'--warc-compression-use-zstd',`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`'--warc-zstd-dict-no-include',`
Version 20231127.01. Use --ciphers SECURE256. 2023-11-27 00:37:38 +00:00			`'--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',`
Version 20231127.02. New --ciphers value. 2023-11-28 01:35:20 +00:00			`'--ciphers', 'SECURE128'`
pipeline.py 2015-07-05 10:03:02 +00:00			`]`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`dict_data = ZstdDict.get_dict()`
			`with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:`
			`f.write(dict_data['dict'])`
			`item['dict_id'] = dict_data['id']`
Version 20200726.06. Fix project name for ZSTD dictionary request. 2020-07-26 21:44:05 +00:00			`item['dict_project'] = 'reddit'`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`wget_args.extend([`
			`'--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),`
			`])`

Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`for item_name in item['item_name'].split('\0'):`
			`wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])`
			`wget_args.append('item-name://'+item_name)`
			`item_type, item_value = item_name.split(':', 1)`
Version 20220729.01. Queue media URLs back to reddit project and download individually. 2022-07-28 16:09:04 +00:00			`if item_type == 'post':`
			`wget_args.extend(['--warc-header', 'reddit-post: '+item_value])`
			`wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)`
			`elif item_type == 'comment':`
			`wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])`
			`wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)`
			`elif item_type == 'url':`
			`wget_args.extend(['--warc-header', 'reddit-media-url: '+item_value])`
			`wget_args.append(item_value)`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`else:`
			`raise Exception('Unknown item')`

			`item['item_name_newline'] = item['item_name'].replace('\0', '\n')`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00
pipeline.py 2015-07-05 10:03:02 +00:00			`if 'bind_address' in globals():`
			`wget_args.extend(['--bind-address', globals()['bind_address']])`
			`print('')`
			`print('* Wget will bind address at {0} *'.format(`
			`globals()['bind_address']))`
			`print('')`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00
pipeline.py 2015-07-05 10:03:02 +00:00			`return realize(wget_args, item)`

			`###########################################################################`
			`# Initialize the project.`
			`#`
			`# This will be shown in the warrior management panel. The logo should not`
			`# be too big. The deadline is optional.`
			`project = Project(`
rewrite 2019-02-22 00:15:18 +00:00			`title='reddit',`
			`project_html='''`
			`<img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>`
			`<h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> · <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>`
			`<p>Archiving everything from reddit.</p>`
			`'''`
pipeline.py 2015-07-05 10:03:02 +00:00			`)`

			`pipeline = Pipeline(`
			`CheckIP(),`
Use multi items. 2021-01-08 21:40:09 +00:00			`GetItemFromTracker('http://{}/{}/multi={}/'`
			`.format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),`
			`downloader, VERSION),`
rewrite 2019-02-22 00:15:18 +00:00			`PrepareDirectories(warc_prefix='reddit'),`
pipeline.py 2015-07-05 10:03:02 +00:00			`WgetDownload(`
			`WgetArgs(),`
			`max_tries=2,`
rewrite 2019-02-22 00:15:18 +00:00			`accept_on_exit_code=[0, 4, 8],`
pipeline.py 2015-07-05 10:03:02 +00:00			`env={`
rewrite 2019-02-22 00:15:18 +00:00			`'item_dir': ItemValue('item_dir'),`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 2021-01-08 22:18:48 +00:00			`'item_names': ItemValue('item_name_newline'),`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`'warc_file_base': ItemValue('warc_file_base'),`
pipeline.py 2015-07-05 10:03:02 +00:00			`}`
			`),`
Version 20210130.01. Support &amp; in URL. Properly abort selected items. 2021-01-30 02:01:06 +00:00			`SetBadUrls(),`
pipeline.py 2015-07-05 10:03:02 +00:00			`PrepareStatsForTracker(`
rewrite 2019-02-22 00:15:18 +00:00			`defaults={'downloader': downloader, 'version': VERSION},`
pipeline.py 2015-07-05 10:03:02 +00:00			`file_groups={`
rewrite 2019-02-22 00:15:18 +00:00			`'data': [`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')`
pipeline.py 2015-07-05 10:03:02 +00:00			`]`
			`},`
			`id_function=stats_id_function,`
			`),`
Version 20210108.02. 2021-01-08 21:42:03 +00:00			`MoveFiles(),`
Use default upload concurrent of 2. 2020-06-30 23:12:32 +00:00			`LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',`
rewrite 2019-02-22 00:15:18 +00:00			`name='shared:rsync_threads', title='Rsync threads',`
			`description='The maximum number of concurrent uploads.'),`
pipeline.py 2015-07-05 10:03:02 +00:00			`UploadWithTracker(`
rewrite 2019-02-22 00:15:18 +00:00			`'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),`
pipeline.py 2015-07-05 10:03:02 +00:00			`downloader=downloader,`
			`version=VERSION,`
			`files=[`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),`
rewrite 2019-02-22 00:15:18 +00:00			`ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')`
pipeline.py 2015-07-05 10:03:02 +00:00			`],`
rewrite 2019-02-22 00:15:18 +00:00			`rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),`
pipeline.py 2015-07-05 10:03:02 +00:00			`rsync_extra_args=[`
rewrite 2019-02-22 00:15:18 +00:00			`'--recursive',`
			`'--min-size', '1',`
			`'--no-compress',`
			`'--compress-level', '0'`
pipeline.py 2015-07-05 10:03:02 +00:00			`]`
Use wget-at with ZSTD. 2020-06-30 23:11:06 +00:00			`),`
pipeline.py 2015-07-05 10:03:02 +00:00			`),`
Version 20210130.01. Support &amp; in URL. Properly abort selected items. 2021-01-30 02:01:06 +00:00			`MaybeSendDoneToTracker(`
rewrite 2019-02-22 00:15:18 +00:00			`tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),`
			`stats=ItemValue('stats')`
pipeline.py 2015-07-05 10:03:02 +00:00			`)`
			`)`