diff --git a/dumpgenerator.py b/dumpgenerator.py index c62070d..fbb10ad 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -287,9 +287,9 @@ def getPageTitlesScraper(config={}, session=None): r_title = r'title="(?P<title>[^>]+)">' r_suballpages = '' - r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">' - r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">' - r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">' + r_suballpages1 = r'&from=(?P<from>[^>"]+)&to=(?P<to>[^>"]+)">' + r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">' + r_suballpages3 = r'&from=(?P<from>[^>"]+)" title="[^>]+">' if re.search(r_suballpages1, raw): r_suballpages = r_suballpages1 elif re.search(r_suballpages2, raw): @@ -299,7 +299,7 @@ def getPageTitlesScraper(config={}, session=None): else: pass # perhaps no subpages - # Should be enought subpages on Special:Allpages + # Should be enough subpages on Special:Allpages deep = 50 c = 0 oldfr = '' @@ -321,8 +321,8 @@ def getPageTitlesScraper(config={}, session=None): name = '%s-%s' % (fr, to) url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % ( config['index'], namespace, fr, to) # do not put urllib.quote in fr or to - # fix, esta regexp no carga bien todas? o falla el r_title en - # este tipo de subpag? (wikiindex) + # fix, this regexp doesn't properly save everything? or does r_title fail on this + # type of subpage? (wikiindex) elif r_suballpages == r_suballpages2: # clean &namespace=\d, sometimes happens fr = fr.split('&namespace=')[0]