|
|
@ -15,6 +15,8 @@ import subprocess
|
|
|
|
import sys
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
import time
|
|
|
|
import string
|
|
|
|
import string
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
import seesaw
|
|
|
|
import seesaw
|
|
|
|
from seesaw.externalprocess import WgetDownload
|
|
|
|
from seesaw.externalprocess import WgetDownload
|
|
|
@ -198,8 +200,22 @@ class WgetArgs(object):
|
|
|
|
|
|
|
|
|
|
|
|
if item_type == '36comments':
|
|
|
|
if item_type == '36comments':
|
|
|
|
suffixes = string.digits + string.ascii_lowercase
|
|
|
|
suffixes = string.digits + string.ascii_lowercase
|
|
|
|
for url in ['https://www.reddit.com/comments/{0}{1}/'.format(item_value, a) for a in suffixes]:
|
|
|
|
for suffix in suffixes:
|
|
|
|
wget_args.append(url)
|
|
|
|
commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix)
|
|
|
|
|
|
|
|
html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'})
|
|
|
|
|
|
|
|
print('Downloaded', html.status_code, getattr(html, 'reason'))
|
|
|
|
|
|
|
|
sys.stdout.flush()
|
|
|
|
|
|
|
|
if html.status_code == 200:
|
|
|
|
|
|
|
|
if not html.text:
|
|
|
|
|
|
|
|
raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
|
|
|
|
|
|
|
|
if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
|
|
|
|
|
|
|
|
wget_args.append(origurl)
|
|
|
|
|
|
|
|
elif html.status_code == 404:
|
|
|
|
|
|
|
|
print('This url is 404.')
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
raise Exception('Unknown item')
|
|
|
|
raise Exception('Unknown item')
|
|
|
|
|
|
|
|
|
|
|
|