Fix r842 (patch by balrog); Schbirid reported a Python error in cleanHTML

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@854 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
11 years ago · 4820339d10
parent 577e8034e6
commit 4820339d10
1 changed file with 2 additions and 2 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -181,7 +181,7 @@ def getPageTitlesScraper(config={}):
        print '    Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
-        raw = urllib2.urlopen(req)
+        raw = urllib2.urlopen(req).read()
        raw = cleanHTML(raw)
        
        r_title = r'title="(?P<title>[^>]+)">'
@@ -217,7 +217,7 @@ def getPageTitlesScraper(config={}):
                if not name in checked_suballpages:
                    checked_suballpages.append(name) #to avoid reload dupe subpages links
                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
-                    raw2 = urllib2.urlopen(req)
+                    raw2 = urllib2.urlopen(req).read()
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2 #merge it after removed junk
                    print '    Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'