Merge pull request #452 from yzqzss/patch-4

Update dumpgenerator.py
Merge pull request #439 from Pokechu22/page-title-scraper-fix
1 changed file with 10 additions and 10 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -170,7 +170,7 @@ def getNamespacesScraper(config={}, session=None):

        # [^>]*? to include selected="selected"
        m = re.compile(
-            r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
+            r'<option [^>]*?value=[\'"](?P<namespaceid>\d+)[\'"][^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
@@ -287,9 +287,9 @@ def getPageTitlesScraper(config={}, session=None):

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
-        r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
-        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
-        r_suballpages3 = r'&amp;from=(?P<from>[^>]+)" title="[^>]+">'
+        r_suballpages1 = r'&amp;from=(?P<from>[^>"]+)&amp;to=(?P<to>[^>"]+)">'
+        r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">'
+        r_suballpages3 = r'&amp;from=(?P<from>[^>"]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
@@ -299,7 +299,7 @@ def getPageTitlesScraper(config={}, session=None):
        else:
            pass  # perhaps no subpages

-        # Should be enought subpages on Special:Allpages
+        # Should be enough subpages on Special:Allpages
        deep = 50
        c = 0
        oldfr = ''
@@ -321,8 +321,8 @@ def getPageTitlesScraper(config={}, session=None):
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
-                # fix, esta regexp no carga bien todas? o falla el r_title en
-                # este tipo de subpag? (wikiindex)
+                # fix, this regexp doesn't properly save everything? or does r_title fail on this
+                # type of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &amp;namespace=\d, sometimes happens
                    fr = fr.split('&amp;namespace=')[0]
@@ -1519,7 +1519,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
        title = u'Image:%s' % (filename)
        try:
            if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
-                r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
+                r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % urllib.parse.quote(title))
                xmlfiledesc = r.text
            else:
                xmlfiledesc = getXMLFileDesc(
@@ -2250,7 +2250,7 @@ def resumePreviousDump(config={}, other={}):
            listdir = os.listdir('%s/images' % (config['path']))
        except:
            pass  # probably directory does not exist
-        listdir.sort()
+        listdir = set(listdir)
        complete = True
        lastfilename = ''
        lastfilename2 = ''
@@ -2528,7 +2528,7 @@ def main(params=[]):
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = ''
        if config['failfast']:
-            retry = 'yes'
+            reply = 'yes'
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input(
                'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %
Author	SHA1	Message	Date
nemobis	c7150784c1	Merge pull request #452 from yzqzss/patch-4 Update dumpgenerator.py	12 months ago
nemobis	a977dc1a8b	Merge pull request #439 from Pokechu22/page-title-scraper-fix Fix infinite loop on page title scraper	12 months ago
nemobis	c56cbf1c12	Merge pull request #453 from yzqzss/patch-5 Speed up file scanning in `images/` dir	12 months ago
nemobis	674381c27c	Merge pull request #448 from yzqzss/patch-1 Match single quotes too when scraping namespaces	12 months ago
nemobis	8167987052	Merge pull request #451 from yzqzss/patch-2 Quote `title` to get correct file description	12 months ago
yzqzss	392fbce083	speed up file scanning: use `set` instead of `list` to speed up the scanning of large numbers of files (>10000) in `images/`.	1 year ago
yzqzss	940d50bbac	Update dumpgenerator.py fix typo	1 year ago
yzqzss	ebac66f557	Update dumpgenerator.py	1 year ago
yzqzss	0be46c7427	quote `title`	1 year ago
yzqzss	331f8e122b	update regex to match `'` and `"` in <option> tag: the new versions of MediaWiki use `'`, older ones use `"`.	1 year ago
Pokechu22	a1bd3b0851	Fix infinite loop on page title scraper	2 years ago