Update pipeline.py

9 years ago · 61dd537f15
parent ff1bb532c6
commit 61dd537f15
1 changed file with 18 additions and 2 deletions
--- a/pipeline.py
+++ b/pipeline.py
@@ -15,6 +15,8 @@ import subprocess
 import sys
 import time
 import string
 import requests
 import re
 import seesaw
 from seesaw.externalprocess import WgetDownload
@@ -198,8 +200,22 @@ class WgetArgs(object):
        if item_type == '36comments':
            suffixes = string.digits + string.ascii_lowercase
-            for url in ['https://www.reddit.com/comments/{0}{1}/'.format(item_value, a) for a in suffixes]:
+            for suffix in suffixes:
-                wget_args.append(url)
+                commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix)
                html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'})
                print('Downloaded', html.status_code, getattr(html, 'reason'))
                sys.stdout.flush()
                if html.status_code == 200:
                    if not html.text:
                        raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
                    else:
                        for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
                            if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
                                wget_args.append(origurl)
                elif html.status_code == 404:
                    print('This url is 404.')
                else:
                    raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
        else:
            raise Exception('Unknown item')