From 61dd537f153f18ed20253bc08f4635af0d88291e Mon Sep 17 00:00:00 2001
From: Arkiver2 <Arkiver@hotmail.com>
Date: Sun, 5 Jul 2015 17:48:53 +0200
Subject: [PATCH] Update pipeline.py

---
 pipeline.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/pipeline.py b/pipeline.py
index b2185bd..5ee1627 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -15,6 +15,8 @@ import subprocess
 import sys
 import time
 import string
+import requests
+import re
 
 import seesaw
 from seesaw.externalprocess import WgetDownload
@@ -198,8 +200,22 @@ class WgetArgs(object):
         
         if item_type == '36comments':
             suffixes = string.digits + string.ascii_lowercase
-            for url in ['https://www.reddit.com/comments/{0}{1}/'.format(item_value, a) for a in suffixes]:
-                wget_args.append(url)
+            for suffix in suffixes:
+                commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix)
+                html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'})
+                print('Downloaded', html.status_code, getattr(html, 'reason'))
+                sys.stdout.flush()
+                if html.status_code == 200:
+                    if not html.text:
+                        raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
+                    else:
+                        for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
+                            if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
+                                wget_args.append(origurl)
+                elif html.status_code == 404:
+                    print('This url is 404.')
+                else:
+                    raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
         else:
             raise Exception('Unknown item')