From 4820339d10fc500245c712d61dde7f64528b01bd Mon Sep 17 00:00:00 2001 From: nemobis Date: Tue, 5 Nov 2013 17:10:44 +0000 Subject: [PATCH] Fix r842, patch by balrog; Schbirid reported python error in CleanHTML git-svn-id: https://wikiteam.googlecode.com/svn/trunk@854 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- dumpgenerator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 8e49c23..f1111d7 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -181,7 +181,7 @@ def getPageTitlesScraper(config={}): print ' Retrieving titles in the namespace', namespace url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()}) - raw = urllib2.urlopen(req) + raw = urllib2.urlopen(req).read() raw = cleanHTML(raw) r_title = r'title="(?P<title>[^>]+)">' @@ -217,7 +217,7 @@ def getPageTitlesScraper(config={}): if not name in checked_suballpages: checked_suballpages.append(name) #to avoid reload dupe subpages links req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()}) - raw2 = urllib2.urlopen(req) + raw2 = urllib2.urlopen(req).read() raw2 = cleanHTML(raw2) rawacum += raw2 #merge it after removed junk print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'