reddit-grab/pipeline.py

# encoding=utf8
import datetime
from distutils.version import StrictVersion
import hashlib
import os.path
import random
import re
from seesaw.config import realize, NumberConfigValue
from seesaw.externalprocess import ExternalProcess
from seesaw.item import ItemInterpolation, ItemValue
from seesaw.task import SimpleTask, LimitConcurrent
from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \
    UploadWithTracker, SendDoneToTracker
import shutil
import socket
import subprocess
import sys
import time
import string

import seesaw
from seesaw.externalprocess import WgetDownload
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

from tornado import httpclient

import requests
import zstandard

if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')


###########################################################################
# Find a usable Wget-AT executable.
#
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints a version string at or above the required minimum.

class HigherVersion:
    """Version matcher exposing a regex-like ``search`` method.

    Every match of ``expression`` found in the searched text is compared,
    as a plain string, against ``min_version``.
    """

    def __init__(self, expression, min_version):
        self._min_version = min_version
        self._expression = re.compile(expression)

    def search(self, text):
        """Return True when some match in ``text`` is >= the minimum version.

        The first qualifying match is printed. When nothing qualifies the
        method returns None.
        """
        candidates = self._expression.findall(text)
        found = next(
            (candidate for candidate in candidates
             if candidate >= self._min_version),
            None,
        )
        if found is None:
            return None
        print('Found version {}.'.format(found))
        return True

# Locate a Wget-AT binary among the candidate paths whose reported version
# satisfies the HigherVersion check; abort start-up if none qualifies.
WGET_AT = find_executable(
    'Wget+AT',
    HigherVersion(
        # The trailing negated class demands that the version is followed by
        # a character that cannot extend it. The hyphen is placed last so it
        # is a literal '-' rather than forming a '.'-to-'_' range.
        r'(GNU Wget 1\.[0-9]{2}\.[0-9]{1}-at\.[0-9]{8}\.[0-9]{2})[^0-9a-zA-Z._-]',
        'GNU Wget 1.21.3-at.20231213.03'
    ),
    [
        './wget-at',
        '/home/warrior/data/wget-at-gnutls'
    ]
)

if not WGET_AT:
    raise Exception('No usable Wget+At found.')


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
# Release identifier of this pipeline; written into a WARC header and handed
# to the tracker tasks.
VERSION = '20240216.01'
# Project name: used in the tracker URLs and in a WARC header.
TRACKER_ID = 'reddit'
# Host name used to build the tracker URLs.
TRACKER_HOST = 'legacy-api.arpa.li'
# Value substituted into the ``multi=`` segment of the item-request URL.
MULTI_ITEM_SIZE = 100


###########################################################################
# This section defines project-specific tasks.
#
# Simple tasks (tasks that do not need any concurrency) are based on the
# SimpleTask class and have a process(item) method that is called for
# each item.
class CheckIP(SimpleTask):
    """Task that periodically checks that DNS answers look unfiltered.

    Five well-known host names are resolved; if they do not produce five
    distinct addresses the item is failed with an exception.
    """

    def __init__(self):
        SimpleTask.__init__(self, 'CheckIP')
        self._counter = 0

    def process(self, item):
        # Between checks, only count down and let the item through.
        if self._counter > 0:
            self._counter -= 1
            return

        item.log_output('Checking IP address.')
        hostnames = (
            'twitter.com',
            'youtube.com',
            'microsoft.com',
            'icanhas.cheezburger.com',
            'archiveteam.org',
        )
        addresses = {socket.gethostbyname(name) for name in hostnames}

        # Unrelated hosts sharing an address suggests a firewall or proxy
        # is rewriting DNS answers.
        if len(addresses) != 5:
            warning = 'Are you behind a firewall/proxy? That is a big no-no!'
            item.log_output('Got IP addresses: {0}'.format(addresses))
            item.log_output(warning)
            raise Exception(warning)

        # Check passed: skip the lookups for the next 10 items.
        self._counter = 10


class PrepareDirectories(SimpleTask):
    """Create a clean working directory and empty placeholder files."""

    def __init__(self, warc_prefix):
        SimpleTask.__init__(self, 'PrepareDirectories')
        self.warc_prefix = warc_prefix

    def process(self, item):
        # Directory and file names are derived from the SHA-1 of the item
        # name rather than from the name itself.
        digest = hashlib.sha1(item['item_name'].encode('utf8')).hexdigest()
        work_dir = '/'.join((item['data_dir'], digest))

        # Always start from an empty directory.
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        item['item_dir'] = work_dir
        timestamp = time.strftime('%Y%m%d-%H%M%S')
        item['warc_file_base'] = '-'.join(
            [self.warc_prefix, digest, timestamp])

        # Create (or truncate) the WARC and data files.
        stem = '%(item_dir)s/%(warc_file_base)s' % item
        for suffix in ('.warc.zst', '_data.txt'):
            with open(stem + suffix, 'w'):
                pass

class MoveFiles(SimpleTask):
    """Move the WARC and data file into ``data_dir`` and drop ``item_dir``."""

    def __init__(self):
        SimpleTask.__init__(self, 'MoveFiles')

    def process(self, item):
        # (source, destination) templates, interpolated with the item's
        # values; the WARC gains the dictionary project and id in its name.
        moves = (
            ('%(item_dir)s/%(warc_file_base)s.warc.zst',
             '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
            ('%(item_dir)s/%(warc_file_base)s_data.txt',
             '%(data_dir)s/%(warc_file_base)s_data.txt'),
        )
        for source, destination in moves:
            os.rename(source % item, destination % item)

        shutil.rmtree('%(item_dir)s' % item)


class SetBadUrls(SimpleTask):
    """Remove the names listed in the bad-items file from ``item_name``."""

    def __init__(self):
        SimpleTask.__init__(self, 'SetBadUrls')

    def process(self, item):
        # Keep the untouched name around before filtering it.
        item['item_name_original'] = item['item_name']
        remaining = item['item_name'].split('\0')
        lowered = [name.lower() for name in remaining]

        bad_items_path = '%(item_dir)s/%(warc_file_base)s_bad-items.txt' % item
        with open(bad_items_path, 'r') as bad_items_file:
            for line in bad_items_file:
                # Names are matched case-insensitively; a listed name that
                # is not part of this item raises ValueError from index().
                needle = line.strip().lower()
                position = lowered.index(needle)
                item.log_output('Item {} is aborted.'.format(needle))
                del remaining[position]
                del lowered[position]

        item['item_name'] = '\0'.join(remaining)


class MaybeSendDoneToTracker(SendDoneToTracker):
    """SendDoneToTracker that completes items with an empty name directly."""

    def enqueue(self, item):
        if len(item['item_name']) > 0:
            return super(MaybeSendDoneToTracker, self).enqueue(item)
        # No item names are left, so skip the parent's enqueue step.
        return self.complete_item(item)


def get_hash(filename):
    """Return the hexadecimal SHA-1 digest of the file's full contents."""
    digest = hashlib.sha1()
    with open(filename, 'rb') as in_file:
        digest.update(in_file.read())
    return digest.hexdigest()

# SHA-1 digests of this pipeline script and of the Lua script, computed once
# at import time relative to the current working directory.
CWD = os.getcwd()
PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py'))
LUA_SHA1 = get_hash(os.path.join(CWD, 'reddit.lua'))

def stats_id_function(item):
    """Return the script hashes and Python version as a dict.

    ``item`` is accepted but not used.
    """
    return dict(
        pipeline_hash=PIPELINE_SHA1,
        lua_hash=LUA_SHA1,
        python_version=sys.version,
    )


class ZstdDict(object):
    """Class-level cache of the ZSTD dictionary used for WARC compression."""

    # time.time() value of the last successful download or id re-check.
    created = 0
    # Cached {'id': ..., 'dict': <bytes>} mapping, or None before first use.
    data = None
    # Seconds allowed for connecting and for each read; without a timeout a
    # stalled connection would block the caller indefinitely.
    _REQUEST_TIMEOUT = 60

    @classmethod
    def get_dict(cls):
        """Return the current dictionary as ``{'id': ..., 'dict': bytes}``.

        A cached copy younger than 1800 seconds is returned as-is.
        Otherwise the dictionary endpoint is queried; if it reports the id
        that is already cached, only the timestamp is refreshed. A new
        dictionary is downloaded, checked against the advertised SHA-256
        and, when it starts with the ZSTD frame magic, decompressed.

        Raises:
            requests.RequestException: on HTTP errors or timeouts.
            ValueError: if the downloaded data does not match its hash.
        """
        if cls.data is not None and time.time() - cls.created < 1800:
            return cls.data
        response = requests.get(
            'https://legacy-api.arpa.li/dictionary',
            params={
                'project': 'reddit'
            },
            timeout=cls._REQUEST_TIMEOUT
        )
        response.raise_for_status()
        info = response.json()
        if cls.data is not None and info['id'] == cls.data['id']:
            # Same dictionary as cached: just restart the freshness window.
            cls.created = time.time()
            return cls.data
        print('Downloading latest dictionary.')
        response_dict = requests.get(
            info['url'],
            timeout=cls._REQUEST_TIMEOUT
        )
        response_dict.raise_for_status()
        raw_data = response_dict.content
        # The hash is verified on the bytes as downloaded, before any
        # decompression.
        if hashlib.sha256(raw_data).hexdigest() != info['sha256']:
            raise ValueError('Hash of downloaded dictionary does not match.')
        if raw_data[:4] == b'\x28\xB5\x2F\xFD':
            raw_data = zstandard.ZstdDecompressor().decompress(raw_data)
        cls.data = {
            'id': info['id'],
            'dict': raw_data
        }
        cls.created = time.time()
        return cls.data


class WgetArgs(object):
    """Builds the Wget-AT command line for one (multi-)item."""

    post_chars = string.digits + string.ascii_lowercase

    def int_to_str(self, i):
        """Encode ``i`` in base 36 using the characters of ``post_chars``."""
        encoded = ''
        while True:
            i, remainder = divmod(i, 36)
            encoded = self.post_chars[remainder] + encoded
            if i <= 0:
                return encoded

    def realize(self, item):
        """Return the realized argument list for ``item``.

        Besides building the arguments this writes the ZSTD dictionary to
        ``<item_dir>/zstdict`` and sets ``dict_id``, ``dict_project`` and
        ``item_name_newline`` on the item.

        Raises:
            Exception: if an item name has a type other than ``post``,
                ``comment`` or ``url``.
        """
        # A user agent is picked at random for every invocation.
        with open('user-agents', 'r') as agents_file:
            user_agent = random.choice(agents_file.readlines()).strip()

        wget_args = [WGET_AT, '-U', user_agent, '-nv']
        # Name resolution.
        wget_args += [
            '--host-lookups', 'dns',
            '--hosts-file', '/dev/null',
            '--resolvconf-file', '/dev/null',
            '--dns-servers', '9.9.9.10,149.112.112.10,2620:fe::10,2620:fe::fe:10',
            '--reject-reserved-subnets',
        ]
        # Request behaviour, scripting and output handling.
        wget_args += [
            '--load-cookies', 'cookies.txt',
            '--content-on-error',
            '--no-http-keep-alive',
            '--lua-script', 'reddit.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
        ]
        # Crawl scope and retry policy.
        wget_args += [
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'reddit.com',
            '--span-hosts',
            '--waitretry', '30',
        ]
        # WARC output.
        wget_args += [
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
        ]
        # Extra request header and TLS settings.
        wget_args += [
            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
            '--secure-protocol', 'TLSv1_2',
            #'--ciphers', '+ECDHE-RSA:+AES-256-CBC:+SHA384'
        ]

        # Store the compression dictionary next to the item and remember
        # which dictionary was used.
        dictionary = ZstdDict.get_dict()
        dictionary_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dictionary_path, 'wb') as dictionary_file:
            dictionary_file.write(dictionary['dict'])
        item['dict_id'] = dictionary['id']
        item['dict_project'] = 'reddit'
        wget_args += [
            '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
        ]

        # item type -> (WARC header prefix, URL prefix); a 'url' item is
        # fetched as given, hence the empty URL prefix.
        item_kinds = {
            'post': ('reddit-post: ',
                     'https://www.reddit.com/api/info.json?id=t3_'),
            'comment': ('reddit-comment: ',
                        'https://www.reddit.com/api/info.json?id=t1_'),
            'url': ('reddit-media-url: ', ''),
        }
        for name in item['item_name'].split('\0'):
            wget_args += [
                '--warc-header', 'x-wget-at-project-item-name: ' + name,
                'item-name://' + name,
            ]
            kind, value = name.split(':', 1)
            if kind not in item_kinds:
                raise Exception('Unknown item')
            header_prefix, url_prefix = item_kinds[kind]
            wget_args += [
                '--warc-header', header_prefix + value,
                url_prefix + value,
            ]

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        # Honour a module-level ``bind_address`` if one is present.
        if 'bind_address' in globals():
            address = globals()['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Project metadata: a title plus an HTML snippet containing the logo, the
# website and leaderboard links, and a one-line description.
project = Project(
    title='reddit',
    project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
        <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
        <p>Archiving everything from reddit.</p>
    '''
)

# The tasks below are listed in the order: check, fetch, prepare, download,
# filter, stats, move, upload, report.
# NOTE(review): `downloader` is not defined in this file - presumably it is
# injected into the module namespace by the seesaw runner; confirm.
pipeline = Pipeline(
    # Periodic DNS sanity check.
    CheckIP(),
    # Request work from the tracker; the URL carries the multi-item size.
    GetItemFromTracker('http://{}/{}/multi={}/'
        .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),
        downloader, VERSION),
    # Create the per-item working directory and placeholder files.
    PrepareDirectories(warc_prefix='reddit'),
    # Run Wget-AT with the arguments produced by WgetArgs.
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        # Besides success (0), GNU Wget's exit codes 4 (network failure) and
        # 8 (server issued an error response) are accepted.
        accept_on_exit_code=[0, 4, 8],
        # Item values handed over as `env`; presumably read by reddit.lua
        # from the process environment - confirm.
        env={
            'item_dir': ItemValue('item_dir'),
            'item_names': ItemValue('item_name_newline'),
            'warc_file_base': ItemValue('warc_file_base'),
        }
    ),
    # Drop the names listed in the bad-items file from the item name.
    SetBadUrls(),
    # Build the stats for the tracker from the WARC file.
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')
            ]
        },
        id_function=stats_id_function,
    ),
    # Move the results into data_dir and remove the working directory.
    MoveFiles(),
    # Upload the moved files, with concurrency bounded by a configurable
    # value between 1 and 20.
    LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',
        name='shared:rsync_threads', title='Rsync threads',
        description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--min-size', '1',
                '--no-compress',
                '--compress-level', '0'
            ]
        ),
    ),
    # Report completion, unless every item name was removed by SetBadUrls.
    MaybeSendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')
    )
)
pipeline.py 9 years ago			`# encoding=utf8`
			`import datetime`
			`from distutils.version import StrictVersion`
			`import hashlib`
			`import os.path`
			`import random`
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 7 months ago			`import re`
pipeline.py 9 years ago			`from seesaw.config import realize, NumberConfigValue`
rewrite 5 years ago			`from seesaw.externalprocess import ExternalProcess`
pipeline.py 9 years ago			`from seesaw.item import ItemInterpolation, ItemValue`
			`from seesaw.task import SimpleTask, LimitConcurrent`
			`from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \`
			`UploadWithTracker, SendDoneToTracker`
			`import shutil`
			`import socket`
			`import subprocess`
			`import sys`
			`import time`
			`import string`
rewrite 5 years ago
pipeline.py 9 years ago			`import seesaw`
			`from seesaw.externalprocess import WgetDownload`
			`from seesaw.pipeline import Pipeline`
			`from seesaw.project import Project`
			`from seesaw.util import find_executable`

rewrite 5 years ago			`from tornado import httpclient`

Use wget-at with ZSTD. 4 years ago			`import requests`
			`import zstandard`
pipeline.py 9 years ago
rewrite 5 years ago			`if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):`
			`raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')`
pipeline.py 9 years ago

			`###########################################################################`
			`# Find a useful Wget+Lua executable.`
			`#`
Use wget-at with ZSTD. 4 years ago			`# WGET_AT will be set to the first path that`
pipeline.py 9 years ago			`# 1. does not crash with --version, and`
			`# 2. prints the required version string`
Use wget-at with ZSTD. 4 years ago
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 7 months ago			`class HigherVersion:`
			`def __init__(self, expression, min_version):`
			`self._expression = re.compile(expression)`
			`self._min_version = min_version`

			`def search(self, text):`
			`for result in self._expression.findall(text):`
			`if result >= self._min_version:`
			`print('Found version {}.'.format(result))`
			`return True`

Use wget-at with ZSTD. 4 years ago			`WGET_AT = find_executable(`
			`'Wget+AT',`
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 7 months ago			`HigherVersion(`
Version 20231017.02. Use --secure-protocol=TLSv1_3. 7 months ago			`r'(GNU Wget 1\.[0-9]{2}\.[0-9]{1}-at\.[0-9]{8}\.[0-9]{2})[^0-9a-zA-Z\.-_]',`
Version 20240216.01. Use fixed minimum Wget version 1.21.3-at.20231213.03. Use TLSv1.2. Fix check on svc comment content check. 3 months ago			`'GNU Wget 1.21.3-at.20231213.03'`
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker. 7 months ago			`),`
Support new wget-at location 3 years ago			`[`
Version 20230607.01. Use GNU Wget 1.21.3-at.20230605.01 and arguments around DNS. 12 months ago			`'./wget-at',`
Version 20231118.01. Switch to gnutls. 6 months ago			`'/home/warrior/data/wget-at-gnutls'`
Support new wget-at location 3 years ago			`]`
pipeline.py 9 years ago			`)`

Use wget-at with ZSTD. 4 years ago			`if not WGET_AT:`
			`raise Exception('No usable Wget+At found.')`
pipeline.py 9 years ago

			`###########################################################################`
			`# The version number of this pipeline definition.`
			`#`
			`# Update this each time you make a non-cosmetic change.`
			`# It will be added to the WARC files and reported to the tracker.`
Version 20240216.01. Use fixed minimum Wget version 1.21.3-at.20231213.03. Use TLSv1.2. Fix check on svc comment content check. 3 months ago			`VERSION = '20240216.01'`
Version 20210108.07. Use tracker reddit. 3 years ago			`TRACKER_ID = 'reddit'`
Update tracker host 3 years ago			`TRACKER_HOST = 'legacy-api.arpa.li'`
Version 20230910.03. Increase hardcoded multi item size to 100, for soft limiting on tracker side. 8 months ago			`MULTI_ITEM_SIZE = 100`
pipeline.py 9 years ago

			`###########################################################################`
			`# This section defines project-specific tasks.`
			`#`
			`# Simple tasks (tasks that do not need any concurrency) are based on the`
			`# SimpleTask class and have a process(item) method that is called for`
			`# each item.`
			`class CheckIP(SimpleTask):`
			`def __init__(self):`
rewrite 5 years ago			`SimpleTask.__init__(self, 'CheckIP')`
pipeline.py 9 years ago			`self._counter = 0`

			`def process(self, item):`
			`# NEW for 2014! Check if we are behind firewall/proxy`

			`if self._counter <= 0:`
			`item.log_output('Checking IP address.')`
			`ip_set = set()`

			`ip_set.add(socket.gethostbyname('twitter.com'))`
Version 20211004.01. Do not check facebook.com while down at the moment. 3 years ago			`#ip_set.add(socket.gethostbyname('facebook.com'))`
pipeline.py 9 years ago			`ip_set.add(socket.gethostbyname('youtube.com'))`
			`ip_set.add(socket.gethostbyname('microsoft.com'))`
			`ip_set.add(socket.gethostbyname('icanhas.cheezburger.com'))`
			`ip_set.add(socket.gethostbyname('archiveteam.org'))`

Version 20211004.02. Fix incomplete facebook.com fix. 3 years ago			`if len(ip_set) != 5:`
pipeline.py 9 years ago			`item.log_output('Got IP addresses: {0}'.format(ip_set))`
			`item.log_output(`
			`'Are you behind a firewall/proxy? That is a big no-no!')`
			`raise Exception(`
			`'Are you behind a firewall/proxy? That is a big no-no!')`

			`# Check only occasionally`
			`if self._counter <= 0:`
			`self._counter = 10`
			`else:`
			`self._counter -= 1`


			`class PrepareDirectories(SimpleTask):`
			`def __init__(self, warc_prefix):`
rewrite 5 years ago			`SimpleTask.__init__(self, 'PrepareDirectories')`
pipeline.py 9 years ago			`self.warc_prefix = warc_prefix`

			`def process(self, item):`
rewrite 5 years ago			`item_name = item['item_name']`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest()`
			`escaped_item_name = item_name_hash`
Use wget-at with ZSTD. 4 years ago			`dirname = '/'.join((item['data_dir'], escaped_item_name))`
pipeline.py 9 years ago
			`if os.path.isdir(dirname):`
			`shutil.rmtree(dirname)`

			`os.makedirs(dirname)`

rewrite 5 years ago			`item['item_dir'] = dirname`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`item['warc_file_base'] = '-'.join([`
			`self.warc_prefix,`
			`item_name_hash,`
			`time.strftime('%Y%m%d-%H%M%S')`
			`])`
pipeline.py 9 years ago
Use wget-at with ZSTD. 4 years ago			`open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close()`
rewrite 5 years ago			`open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()`

pipeline.py 9 years ago			`class MoveFiles(SimpleTask):`
			`def __init__(self):`
rewrite 5 years ago			`SimpleTask.__init__(self, 'MoveFiles')`
pipeline.py 9 years ago
			`def process(self, item):`
Use wget-at with ZSTD. 4 years ago			`os.rename('%(item_dir)s/%(warc_file_base)s.warc.zst' % item,`
			`'%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst' % item)`
rewrite 5 years ago			`os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item,`
Use wget-at with ZSTD. 4 years ago			`'%(data_dir)s/%(warc_file_base)s_data.txt' % item)`
pipeline.py 9 years ago
rewrite 5 years ago			`shutil.rmtree('%(item_dir)s' % item)`
pipeline.py 9 years ago

Version 20210130.01. Support &amp; in URL. Properly abort selected items. 3 years ago			`class SetBadUrls(SimpleTask):`
			`def __init__(self):`
			`SimpleTask.__init__(self, 'SetBadUrls')`

			`def process(self, item):`
			`item['item_name_original'] = item['item_name']`
			`items = item['item_name'].split('\0')`
			`items_lower = [s.lower() for s in items]`
			`with open('%(item_dir)s/%(warc_file_base)s_bad-items.txt' % item, 'r') as f:`
			`for aborted_item in f:`
			`aborted_item = aborted_item.strip().lower()`
			`index = items_lower.index(aborted_item)`
			`item.log_output('Item {} is aborted.'.format(aborted_item))`
			`items.pop(index)`
			`items_lower.pop(index)`
			`item['item_name'] = '\0'.join(items)`


			`class MaybeSendDoneToTracker(SendDoneToTracker):`
			`def enqueue(self, item):`
			`if len(item['item_name']) == 0:`
			`return self.complete_item(item)`
			`return super(MaybeSendDoneToTracker, self).enqueue(item)`


pipeline.py 9 years ago			`def get_hash(filename):`
			`with open(filename, 'rb') as in_file:`
			`return hashlib.sha1(in_file.read()).hexdigest()`

			`CWD = os.getcwd()`
			`PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py'))`
			`LUA_SHA1 = get_hash(os.path.join(CWD, 'reddit.lua'))`

			`def stats_id_function(item):`
			`d = {`
			`'pipeline_hash': PIPELINE_SHA1,`
			`'lua_hash': LUA_SHA1,`
			`'python_version': sys.version,`
			`}`

			`return d`


Use wget-at with ZSTD. 4 years ago			`class ZstdDict(object):`
			`created = 0`
			`data = None`

			`@classmethod`
			`def get_dict(cls):`
			`if cls.data is not None and time.time() - cls.created < 1800:`
			`return cls.data`
			`response = requests.get(`
20210225.01: update dict url 3 years ago			`'https://legacy-api.arpa.li/dictionary',`
Use wget-at with ZSTD. 4 years ago			`params={`
Version 20200726.06. Fix project name for ZSTD dictionary request. 4 years ago			`'project': 'reddit'`
Use wget-at with ZSTD. 4 years ago			`}`
			`)`
			`response.raise_for_status()`
			`response = response.json()`
			`if cls.data is not None and response['id'] == cls.data['id']:`
			`cls.created = time.time()`
			`return cls.data`
			`print('Downloading latest dictionary.')`
			`response_dict = requests.get(response['url'])`
			`response_dict.raise_for_status()`
			`raw_data = response_dict.content`
			`if hashlib.sha256(raw_data).hexdigest() != response['sha256']:`
			`raise ValueError('Hash of downloaded dictionary does not match.')`
			`if raw_data[:4] == b'\x28\xB5\x2F\xFD':`
			`raw_data = zstandard.ZstdDecompressor().decompress(raw_data)`
			`cls.data = {`
			`'id': response['id'],`
			`'dict': raw_data`
			`}`
			`cls.created = time.time()`
			`return cls.data`


pipeline.py 9 years ago			`class WgetArgs(object):`
rewrite 5 years ago			`post_chars = string.digits + string.ascii_lowercase`

			`def int_to_str(self, i):`
			`d, m = divmod(i, 36)`
			`if d > 0:`
			`return self.int_to_str(d) + self.post_chars[m]`
			`return self.post_chars[m]`

pipeline.py 9 years ago			`def realize(self, item):`
Version 20210114.01. Use a random user-agent. 3 years ago			`with open('user-agents', 'r') as f:`
			`user_agent = random.choice(list(f)).strip()`
pipeline.py 9 years ago			`wget_args = [`
Use wget-at with ZSTD. 4 years ago			`WGET_AT,`
Version 20210114.01. Use a random user-agent. 3 years ago			`'-U', user_agent,`
rewrite 5 years ago			`'-nv',`
Version 20230607.01. Use GNU Wget 1.21.3-at.20230605.01 and arguments around DNS. 12 months ago			`'--host-lookups', 'dns',`
			`'--hosts-file', '/dev/null',`
			`'--resolvconf-file', '/dev/null',`
			`'--dns-servers', '9.9.9.10,149.112.112.10,2620:fe::10,2620:fe::fe:10',`
Version 20230727.02. Only allow GNU Wget 1.21.3-at.20230623.01. Use Wget-AT option --reject-reserved-subnets. Remove old Wget files. Update README to latest. 10 months ago			`'--reject-reserved-subnets',`
Version 20210114.04. Support cookies. 3 years ago			`'--load-cookies', 'cookies.txt',`
Use wget-at with ZSTD. 4 years ago			`'--content-on-error',`
Version 20210115.01. Use Connection: Close header. 3 years ago			`'--no-http-keep-alive',`
rewrite 5 years ago			`'--lua-script', 'reddit.lua',`
			`'-o', ItemInterpolation('%(item_dir)s/wget.log'),`
			`'--no-check-certificate',`
			`'--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),`
			`'--truncate-output',`
			`'-e', 'robots=off',`
			`'--rotate-dns',`
			`'--recursive', '--level=inf',`
			`'--no-parent',`
			`'--page-requisites',`
			`'--timeout', '30',`
			`'--tries', 'inf',`
			`'--domains', 'reddit.com',`
			`'--span-hosts',`
			`'--waitretry', '30',`
			`'--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),`
			`'--warc-header', 'operator: Archive Team',`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`'--warc-header', 'x-wget-at-project-version: ' + VERSION,`
			`'--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,`
Use wget-at with ZSTD. 4 years ago			`'--warc-dedup-url-agnostic',`
			`'--warc-compression-use-zstd',`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`'--warc-zstd-dict-no-include',`
Version 20231127.01. Use --ciphers SECURE256. 6 months ago			`'--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',`
Version 20240216.01. Use fixed minimum Wget version 1.21.3-at.20231213.03. Use TLSv1.2. Fix check on svc comment content check. 3 months ago			`'--secure-protocol', 'TLSv1_2',`
Version 20231201.01. Change protocol. 6 months ago			`#'--ciphers', '+ECDHE-RSA:+AES-256-CBC:+SHA384'`
pipeline.py 9 years ago			`]`
Use wget-at with ZSTD. 4 years ago			`dict_data = ZstdDict.get_dict()`
			`with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:`
			`f.write(dict_data['dict'])`
			`item['dict_id'] = dict_data['id']`
Version 20200726.06. Fix project name for ZSTD dictionary request. 4 years ago			`item['dict_project'] = 'reddit'`
Use wget-at with ZSTD. 4 years ago			`wget_args.extend([`
			`'--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),`
			`])`

Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`for item_name in item['item_name'].split('\0'):`
			`wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])`
			`wget_args.append('item-name://'+item_name)`
			`item_type, item_value = item_name.split(':', 1)`
Version 20220729.01. Queue media URLs back to reddit project and download individually. 2 years ago			`if item_type == 'post':`
			`wget_args.extend(['--warc-header', 'reddit-post: '+item_value])`
			`wget_args.append('https://www.reddit.com/api/info.json?id=t3_'+item_value)`
			`elif item_type == 'comment':`
			`wget_args.extend(['--warc-header', 'reddit-comment: '+item_value])`
			`wget_args.append('https://www.reddit.com/api/info.json?id=t1_'+item_value)`
			`elif item_type == 'url':`
			`wget_args.extend(['--warc-header', 'reddit-media-url: '+item_value])`
			`wget_args.append(item_value)`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`else:`
			`raise Exception('Unknown item')`

			`item['item_name_newline'] = item['item_name'].replace('\0', '\n')`
Use wget-at with ZSTD. 4 years ago
pipeline.py 9 years ago			`if 'bind_address' in globals():`
			`wget_args.extend(['--bind-address', globals()['bind_address']])`
			`print('')`
			`print('* Wget will bind address at {0} *'.format(`
			`globals()['bind_address']))`
			`print('')`
Use wget-at with ZSTD. 4 years ago
pipeline.py 9 years ago			`return realize(wget_args, item)`

			`###########################################################################`
			`# Initialize the project.`
			`#`
			`# This will be shown in the warrior management panel. The logo should not`
			`# be too big. The deadline is optional.`
			`project = Project(`
rewrite 5 years ago			`title='reddit',`
			`project_html='''`
			`<img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>`
			`<h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> · <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>`
			`<p>Archiving everything from reddit.</p>`
			`'''`
pipeline.py 9 years ago			`)`

			`pipeline = Pipeline(`
			`CheckIP(),`
Use multi items. 3 years ago			`GetItemFromTracker('http://{}/{}/multi={}/'`
			`.format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),`
			`downloader, VERSION),`
rewrite 5 years ago			`PrepareDirectories(warc_prefix='reddit'),`
pipeline.py 9 years ago			`WgetDownload(`
			`WgetArgs(),`
			`max_tries=2,`
rewrite 5 years ago			`accept_on_exit_code=[0, 4, 8],`
pipeline.py 9 years ago			`env={`
rewrite 5 years ago			`'item_dir': ItemValue('item_dir'),`
Handle NULL byte seperated multi items. Support unicode chars in JSON permalink. 3 years ago			`'item_names': ItemValue('item_name_newline'),`
Use wget-at with ZSTD. 4 years ago			`'warc_file_base': ItemValue('warc_file_base'),`
pipeline.py 9 years ago			`}`
			`),`
Version 20210130.01. Support &amp; in URL. Properly abort selected items. 3 years ago			`SetBadUrls(),`
pipeline.py 9 years ago			`PrepareStatsForTracker(`
rewrite 5 years ago			`defaults={'downloader': downloader, 'version': VERSION},`
pipeline.py 9 years ago			`file_groups={`
rewrite 5 years ago			`'data': [`
Use wget-at with ZSTD. 4 years ago			`ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')`
pipeline.py 9 years ago			`]`
			`},`
			`id_function=stats_id_function,`
			`),`
Version 20210108.02. 3 years ago			`MoveFiles(),`
Use default upload concurrent of 2. 4 years ago			`LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',`
rewrite 5 years ago			`name='shared:rsync_threads', title='Rsync threads',`
			`description='The maximum number of concurrent uploads.'),`
pipeline.py 9 years ago			`UploadWithTracker(`
rewrite 5 years ago			`'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),`
pipeline.py 9 years ago			`downloader=downloader,`
			`version=VERSION,`
			`files=[`
Use wget-at with ZSTD. 4 years ago			`ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),`
rewrite 5 years ago			`ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')`
pipeline.py 9 years ago			`],`
rewrite 5 years ago			`rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),`
pipeline.py 9 years ago			`rsync_extra_args=[`
rewrite 5 years ago			`'--recursive',`
			`'--min-size', '1',`
			`'--no-compress',`
			`'--compress-level', '0'`
pipeline.py 9 years ago			`]`
Use wget-at with ZSTD. 4 years ago			`),`
pipeline.py 9 years ago			`),`
Version 20210130.01. Support &amp; in URL. Properly abort selected items. 3 years ago			`MaybeSendDoneToTracker(`
rewrite 5 years ago			`tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),`
			`stats=ItemValue('stats')`
pipeline.py 9 years ago			`)`
			`)`