mirror of
https://github.com/ArchiveTeam/reddit-grab
synced 2024-11-12 01:10:50 +00:00
Version 20231017.01. Use --secure-protocol=auto. Use new minimum Wget version checker.
This commit is contained in:
parent
a0e35bb72d
commit
b1bf682030
21
pipeline.py
21
pipeline.py
@ -4,6 +4,7 @@ from distutils.version import StrictVersion
|
||||
import hashlib
|
||||
import os.path
|
||||
import random
|
||||
import re
|
||||
from seesaw.config import realize, NumberConfigValue
|
||||
from seesaw.externalprocess import ExternalProcess
|
||||
from seesaw.item import ItemInterpolation, ItemValue
|
||||
@ -39,11 +40,23 @@ if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
|
||||
# 1. does not crash with --version, and
|
||||
# 2. prints the required version string
|
||||
|
||||
class HigherVersion:
    """Version matcher exposing a regex-like ``search`` method.

    The supplied expression is compiled once; ``search`` then accepts a
    piece of text if any value extracted by that expression compares
    greater than or equal to the configured minimum.
    """

    def __init__(self, expression, min_version):
        """Store the minimum version and compile the extraction pattern.

        expression  -- regular expression source handed to ``re.compile``.
        min_version -- lowest acceptable value; compared with ``>=``
                       against each ``findall`` result.
        """
        self._min_version = min_version
        self._expression = re.compile(expression)

    def search(self, text):
        """Return True if ``text`` contains an acceptable version.

        Every ``findall`` result is checked in order using ``>=`` (plain
        string ordering when the pattern yields strings).  The first one
        that passes is reported on stdout and True is returned.  When no
        result passes, None is returned, which is falsy for callers.
        """
        minimum = self._min_version
        candidates = self._expression.findall(text)
        # Pick the first candidate that satisfies the minimum, if any.
        accepted = next(
            (candidate for candidate in candidates if candidate >= minimum),
            None,
        )
        if accepted is None:
            return None
        print('Found version {}.'.format(accepted))
        return True
|
||||
|
||||
WGET_AT = find_executable(
|
||||
'Wget+AT',
|
||||
[
|
||||
HigherVersion(
|
||||
r'(GNU Wget 1.[0-9]{2}.[0-9]{1}-at.[0-9]{8}\.[0-9]{2})[^0-9a-zA-Z\.-_]',
|
||||
'GNU Wget 1.21.3-at.20230623.01'
|
||||
],
|
||||
),
|
||||
[
|
||||
'./wget-at',
|
||||
'/home/warrior/data/wget-at'
|
||||
@ -59,7 +72,7 @@ if not WGET_AT:
|
||||
#
|
||||
# Update this each time you make a non-cosmetic change.
|
||||
# It will be added to the WARC files and reported to the tracker.
|
||||
VERSION = '20230910.05'
|
||||
VERSION = '20231017.01'
|
||||
TRACKER_ID = 'reddit'
|
||||
TRACKER_HOST = 'legacy-api.arpa.li'
|
||||
MULTI_ITEM_SIZE = 100
|
||||
@ -268,7 +281,7 @@ class WgetArgs(object):
|
||||
'--warc-compression-use-zstd',
|
||||
'--warc-zstd-dict-no-include',
|
||||
'--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
|
||||
'--secure-protocol', 'TLSv1_3'
|
||||
'--secure-protocol', 'auto'
|
||||
]
|
||||
dict_data = ZstdDict.get_dict()
|
||||
with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:
|
||||
|
Loading…
Reference in New Issue
Block a user