Update pipeline.py

pull/1/head
Arkiver2 9 years ago
parent ff1bb532c6
commit 61dd537f15

@@ -15,6 +15,8 @@ import subprocess
import sys import sys
import time import time
import string import string
import requests
import re
import seesaw import seesaw
from seesaw.externalprocess import WgetDownload from seesaw.externalprocess import WgetDownload
@@ -198,8 +200,22 @@ class WgetArgs(object):
if item_type == '36comments': if item_type == '36comments':
suffixes = string.digits + string.ascii_lowercase suffixes = string.digits + string.ascii_lowercase
for url in ['https://www.reddit.com/comments/{0}{1}/'.format(item_value, a) for a in suffixes]: for suffix in suffixes:
wget_args.append(url) commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix)
html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'})
print('Downloaded', html.status_code, getattr(html, 'reason'))
sys.stdout.flush()
if html.status_code == 200:
if not html.text:
raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
else:
for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
wget_args.append(origurl)
elif html.status_code == 404:
print('This url is 404.')
else:
raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
else: else:
raise Exception('Unknown item') raise Exception('Unknown item')

Loading…
Cancel
Save