From 61dd537f153f18ed20253bc08f4635af0d88291e Mon Sep 17 00:00:00 2001 From: Arkiver2 Date: Sun, 5 Jul 2015 17:48:53 +0200 Subject: [PATCH] Update pipeline.py --- pipeline.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index b2185bd..5ee1627 100644 --- a/pipeline.py +++ b/pipeline.py @@ -15,6 +15,8 @@ import subprocess import sys import time import string +import requests +import re import seesaw from seesaw.externalprocess import WgetDownload @@ -198,8 +200,22 @@ class WgetArgs(object): if item_type == '36comments': suffixes = string.digits + string.ascii_lowercase - for url in ['https://www.reddit.com/comments/{0}{1}/'.format(item_value, a) for a in suffixes]: - wget_args.append(url) + for suffix in suffixes: + commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix) + html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'}) + print('Downloaded', html.status_code, getattr(html, 'reason')) + sys.stdout.flush() + if html.status_code == 200: + if not html.text: + raise Exception('Something went wrong during the download. ({0})'.format(html.status_code)) + else: + for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text): + if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl): + wget_args.append(origurl) + elif html.status_code == 404: + print('This url is 404.') + else: + raise Exception('Something went wrong during the download. ({0})'.format(html.status_code)) else: raise Exception('Unknown item')