From 142b48cc6926a0ed4b3e0a97a1040932d0ec7c87 Mon Sep 17 00:00:00 2001
From: Fedora
Date: Mon, 7 May 2018 19:01:50 +0000
Subject: [PATCH 1/6] Add timeouts and retries to increase success rate

---
 dumpgenerator.py | 62 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index a045ace..a16173b 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -150,7 +150,7 @@ def getNamespacesScraper(config={}, session=None):
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
         r = session.post(
-            url=config['index'], data={'title': 'Special:Allpages'})
+            url=config['index'], data={'title': 'Special:Allpages'}, timeout=30)
         raw = r.text
         delay(config=config, session=session)
@@ -191,7 +191,8 @@ def getNamespacesAPI(config={}, session=None):
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'namespaces',
-                'format': 'json'}
+                'format': 'json'},
+            timeout=30
         )
         result = getJSON(r)
         delay(config=config, session=session)
@@ -249,7 +250,7 @@ def getPageTitlesAPI(config={}, session=None):
         retryCount = 0
         while retryCount < config["retries"]:
             try:
-                r = session.post(url=config['api'], data=params)
+                r = session.post(url=config['api'], data=params, timeout=30)
                 break
             except ConnectionError as err:
                 print "Connection error: %s" % (str(err),)
@@ -301,7 +302,7 @@ def getPageTitlesScraper(config={}, session=None):
         print '    Retrieving titles in the namespace', namespace
         url = '%s?title=Special:Allpages&namespace=%s' % (
             config['index'], namespace)
-        r = session.get(url=url)
+        r = session.get(url=url, timeout=30)
         raw = r.text
         raw = cleanHTML(raw)
@@ -353,7 +354,7 @@ def getPageTitlesScraper(config={}, session=None):
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url)
+                    r2 = session.get(url=url, timeout=10)
                     raw2 = r2.text
                     raw2 = cleanHTML(raw2)
                     rawacum += raw2  # merge it after removed junk
@@ -386,7 +387,7 @@ def getPageTitles(config={}, session=None):
     titles = []
     if 'api' in config and config['api']:
-        r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
+        r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
         test = getJSON(r)
         if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                 and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
@@ -454,7 +455,8 @@ def getXMLHeader(config={}, session=None):
                     'action': 'query',
                     'meta': 'siteinfo',
                     'siprop': 'namespaces',
-                    'format': 'json'}
+                    'format': 'json'},
+                timeout=120
             )
             config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                 + ':Export'
@@ -550,7 +552,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             return ''  # empty xml
     # FIXME HANDLE HTTP Errors HERE
     try:
-        r = session.post(url=config['index'], data=params, headers=headers)
+        r = session.post(url=config['index'], data=params, headers=headers, timeout=10)
         handleStatusCode(r)
         xml = fixBOM(r)
     except requests.exceptions.ConnectionError as e:
@@ -866,7 +868,8 @@ def getImageNamesScraper(config={}, session=None):
             data={
                 'title': 'Special:Imagelist',
                 'limit': limit,
-                'offset': offset})
+                'offset': offset},
+            timeout=30)
         raw = r.text
         delay(config=config, session=session)
         # delicate wiki
@@ -967,7 +970,7 @@ def getImageNamesAPI(config={}, session=None):
                 'format': 'json',
                 'ailimit': 500}
             # FIXME Handle HTTP Errors HERE
-            r = session.post(url=config['api'], data=params)
+            r = session.post(url=config['api'], data=params, timeout=30)
             handleStatusCode(r)
             jsonimages = getJSON(r)
             delay(config=config, session=session)
@@ -1025,7 +1028,7 @@ def getImageNamesAPI(config={}, session=None):
                 'iiprop': 'user|url',
                 'format': 'json'}
             # FIXME Handle HTTP Errors HERE
-            r = session.post(url=config['api'], data=params)
+            r = session.post(url=config['api'], data=params, timeout=30)
             handleStatusCode(r)
             jsonimages = getJSON(r)
             delay(config=config, session=session)
@@ -1351,11 +1354,22 @@ def getParameters(params=[]):
             print 'Using cookies from %s' % args.cookies
     session = requests.Session()
+    try:
+        from requests.packages.urllib3.util.retry import Retry
+        from requests.adapters import HTTPAdapter
+        # Courtesy datashaman https://stackoverflow.com/a/35504626
+        __retries__ = Retry(total=5,
+                            backoff_factor=2,
+                            status_forcelist=[500, 502, 503, 504])
+        session.mount('https://', HTTPAdapter(max_retries=__retries__))
+        session.mount('http://', HTTPAdapter(max_retries=__retries__))
+    except:
+        # Our urllib3/requests is too old
+        pass
     session.cookies = cj
     session.headers.update({'User-Agent': getUserAgent()})
     if args.user and args.password:
         session.auth = (args.user, args.password)
-    # session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
 
     # check URLs
     for url in [args.api, args.index, args.wiki]:
@@ -1521,7 +1535,8 @@ def checkAPI(api=None, session=None):
                 data={
                     'action': 'query',
                     'meta': 'siteinfo',
-                    'format': 'json'}
+                    'format': 'json'},
+                timeout=30
             )
             if r.url == api:
                 break
@@ -1549,7 +1564,7 @@ def checkIndex(index=None, cookies=None, session=None):
     """ Checking index.php availability """
-    r = session.post(url=index, data={'title': 'Special:Version'})
+    r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
     raw = r.text
     print 'Checking index.php...', index
     # Workaround for issue 71
@@ -1811,7 +1826,7 @@ def saveSpecialVersion(config={}, session=None):
     else:
         print 'Downloading Special:Version with extensions and other related info'
         r = session.post(
-            url=config['index'], data={'title': 'Special:Version'})
+            url=config['index'], data={'title': 'Special:Version'}, timeout=10)
         raw = r.text
         delay(config=config, session=session)
         raw = removeIP(raw=raw)
@@ -1826,7 +1841,7 @@ def saveIndexPHP(config={}, session=None):
         print 'index.html exists, do not overwrite'
     else:
         print 'Downloading index.php (Main Page) as index.html'
-        r = session.post(url=config['index'], data={})
+        r = session.post(url=config['index'], data={}, timeout=10)
         raw = r.text
         delay(config=config, session=session)
         raw = removeIP(raw=raw)
@@ -1851,7 +1866,8 @@ def saveSiteInfo(config={}, session=None):
             'meta': 'siteinfo',
             'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
             'sinumberingroup': 1,
-            'format': 'json'})
+            'format': 'json'},
+        timeout=10)
     # MediaWiki 1.11-1.12
     if not 'query' in getJSON(r):
         r = session.post(
             url=config['api'],
             data={
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
-                'format': 'json'})
+                'format': 'json'},
+            timeout=10)
     # MediaWiki 1.8-1.10
     if not 'query' in getJSON(r):
         r = session.post(
             url=config['api'],
             data={
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces',
-                'format': 'json'})
+                'format': 'json'}
+            timeout=10)
     result = getJSON(r)
     delay(config=config, session=session)
     with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@@ -1896,9 +1914,9 @@ def getWikiEngine(url=''):
         session = requests.Session()
         session.headers.update({'User-Agent': getUserAgent()})
-    r = session.post(url=url)
+    r = session.post(url=url, timeout=30)
     if r.status_code == 405 or r.text == '':
-        r = session.get(url=url)
+        r = session.get(url=url, timeout=120)
     result = r.text
     wikiengine = 'Unknown'
@@ -1981,7 +1999,7 @@ def mwGetAPIAndIndex(url=''):
     index = ''
     session = requests.Session()
     session.headers.update({'User-Agent': getUserAgent()})
-    r = session.post(url=url)
+    r = session.post(url=url, timeout=120)
     result = r.text
     # API
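
[Note on PATCH 1] The retry and timeout setup above can be exercised on its own. A minimal sketch
of the same idea, transport-level retries plus an explicit per-request timeout, using only calls
that appear in the patch; the URL and query below are placeholders, not values from dumpgenerator.py:

    import requests
    from requests.adapters import HTTPAdapter
    try:
        # Retry moved around between requests/urllib3 releases, hence the two import paths.
        from requests.packages.urllib3.util.retry import Retry
    except ImportError:
        from urllib3.util.retry import Retry

    session = requests.Session()
    retries = Retry(total=5,                                # give up after 5 attempts
                    backoff_factor=2,                       # exponential backoff between attempts
                    status_forcelist=[500, 502, 503, 504])  # also retry on these server errors
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))

    # Retry only covers connection and status failures; a server that accepts the connection
    # but never answers still needs the per-request timeout used throughout the patch.
    r = session.get('https://example.org/w/api.php',
                    params={'action': 'query', 'meta': 'siteinfo', 'format': 'json'},
                    timeout=30)
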
From a8cbb357ff859ace128f342d92d148938124e4b4 Mon Sep 17 00:00:00 2001
From: Fedora
Date: Mon, 7 May 2018 19:05:26 +0000
Subject: [PATCH 2/6] First attempt at API-only export

---
 dumpgenerator.py | 137 +++++++++++++++++++++++++++++++--------------
 1 file changed, 98 insertions(+), 39 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index a16173b..13ea271 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -39,11 +39,16 @@ except ImportError:  # Python 2.4 compatibility
     from md5 import new as md5
 import os
 import re
+import subprocess
 try:
     import requests
 except ImportError:
     print "Please install or update the Requests module."
     sys.exit(1)
+try:
+    import wikitools
+except ImportError:
+    print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
 import time
 import urllib
 UTF8Writer = getwriter('utf8')
@@ -514,7 +519,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         if c > 0 and c < maxretries:
             wait = increment * c < maxseconds and increment * \
                 c or maxseconds  # incremental until maxseconds
-            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
+            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
             time.sleep(wait)
             # reducing server load requesting smallest chunks (if curonly then
             # limit = 1 from mother function)
@@ -677,10 +682,9 @@ def cleanXML(xml=''):
 
 def generateXMLDump(config={}, titles=[], start=None, session=None):
-    """ Generates a XML dump for a list of titles """
+    """ Generates a XML dump for a list of titles or from revision IDs """
     # TODO: titles is now unused.
 
-    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
     header, config = getXMLHeader(config=config, session=session)
     footer = '</mediawiki>\n'  # new line at the end
     xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
                                     config['date'],
                                     config['curonly'] and 'current' or 'history')
     xmlfile = ''
     lock = True
-    if start:
-        print "Removing the last chunk of past XML dump: it is probably incomplete."
-        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
-            pass
-    else:
-        # requested complete xml dump
-        lock = False
+
+    if config['xmlrevisions']:
+        print 'Retrieving the XML for every page from the beginning'
         xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
         xmlfile.write(header.encode('utf-8'))
-        xmlfile.close()
-
-    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
-    c = 1
-    for title in readTitles(config, start):
-        if not title.strip():
-            continue
-        if title == start:  # start downloading from start, included
-            lock = False
-        if lock:
-            continue
-        delay(config=config, session=session)
-        if c % 10 == 0:
-            print 'Downloaded %d pages' % (c)
         try:
-            for xml in getXMLPage(config=config, title=title, session=session):
+            for xml in getXMLRevisions(config=config, session=session):
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
-        except PageMissingError:
-            logerror(
-                config=config,
-                text=u'The page "%s" was missing in the wiki (probably deleted)' %
-                (title.decode('utf-8'))
-            )
-        # here, XML is a correct chunk or
-        # an empty string due to a deleted page (logged in errors log) or
-        # an empty string due to an error while retrieving the page from server
-        # (logged in errors log)
-        c += 1
+        except AttributeError:
+            print "This wikitools module version is not working"
+            sys.exit()
+    else:
+        print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+        if start:
+            print "Removing the last chunk of past XML dump: it is probably incomplete."
+            for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+                pass
+        else:
+            # requested complete xml dump
+            lock = False
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
+            xmlfile.close()
+
+        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        c = 1
+        for title in readTitles(config, start):
+            if not title.strip():
+                continue
+            if title == start:  # start downloading from start, included
+                lock = False
+            if lock:
+                continue
+            delay(config=config, session=session)
+            if c % 10 == 0:
+                print 'Downloaded %d pages' % (c)
+            try:
+                for xml in getXMLPage(config=config, title=title, session=session):
+                    xml = cleanXML(xml=xml)
+                    xmlfile.write(xml.encode('utf-8'))
+            except PageMissingError:
+                logerror(
+                    config=config,
+                    text=u'The page "%s" was missing in the wiki (probably deleted)' %
+                    (title.decode('utf-8'))
+                )
+            # here, XML is a correct chunk or
+            # an empty string due to a deleted page (logged in errors log) or
+            # an empty string due to an error while retrieving the page from server
+            # (logged in errors log)
+            c += 1
+
     xmlfile.write(footer)
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
+def getXMLRevisions(config={}, session=None):
+    site = wikitools.wiki.Wiki(config['api'])
+    if config['namespaces']:
+        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
+    else:
+        namespaces = ['*']
+
+    for namespace in namespaces:
+        params = {
+            'action': 'query',
+            'generator': 'allrevisions',
+            'garvnamespace': namespace,
+            'garvlimit': 50,
+            'garvprop': 'ids',
+            'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
+        }
+        request = wikitools.api.APIRequest(site, params)
+        results = request.queryGen()
+        try:
+            for result in results:
+                yield result['query']['export']['*']
+        except wikitools.api.APIError:
+            # Falling back to allpages generator, the wiki is too old
+            params = {
+                'action': 'query',
+                'generator': 'allpages',
+                'gaplimit': 50,
+                'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
+            }
+            # allpages does not accept "*"
+            if namespace is not '*':
+                params['gapnamespace'] = namespace
+            request = wikitools.api.APIRequest(site, params)
+            results = request.queryGen()
+            for result in results:
+                yield result['query']['export']['*']
+
 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
@@ -1303,7 +1359,9 @@ def getParameters(params=[]):
         action='store_true',
         help="generates a full history XML dump (--xml --curonly for current revisions only)")
     groupDownload.add_argument('--curonly', action='store_true',
-                               help='store only the current version of pages')
+                               help='store only the current version of pages; incompatible with --xmlrevisions')
+    groupDownload.add_argument('--xmlrevisions', action='store_true',
+                               help='download all revisions from an API generator')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
     groupDownload.add_argument(
@@ -1502,6 +1560,7 @@ def getParameters(params=[]):
         'images': args.images,
         'logs': False,
         'xml': args.xml,
+        'xmlrevisions': args.xmlrevisions,
         'namespaces': namespaces,
         'exnamespaces': exnamespaces,
         'path': args.path and os.path.normpath(args.path) or '',
@@ -1547,7 +1606,7 @@ def checkAPI(api=None, session=None):
     try:
         result = getJSON(r)
         index = None
-        if result['query']:
+        if result:
             try:
                 index = result['query']['general']['server'] + \
                     result['query']['general']['script']
@@ -1886,7 +1945,7 @@ def saveSiteInfo(config={}, session=None):
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces',
-                'format': 'json'}
+                'format': 'json'},
                 timeout=10)
     result = getJSON(r)
     delay(config=config, session=session)
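
[Note on PATCH 2] The wikitools loop in getXMLRevisions() boils down to one API request per batch.
A rough equivalent with plain requests, assuming a wiki recent enough to offer allrevisions
(roughly MediaWiki 1.27 and later, which is why the patch keeps an allpages fallback); the URL
is a placeholder and continuation handling is omitted:

    import requests

    api = 'https://example.org/w/api.php'  # placeholder
    r = requests.get(api, params={
        'action': 'query',
        'generator': 'allrevisions',
        'garvlimit': 50,     # generator parameters carry the extra "g" prefix
        'garvprop': 'ids',
        'export': 1,         # ask for an XML export of the selected pages
        'format': 'json',
    }, timeout=30)
    data = r.json()
    # The XML chunk is returned under query.export['*']; a 'continue' block, when present,
    # would be fed back into the next request to page through the rest of the wiki.
    xml_chunk = data['query']['export']['*']
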
From be5ca12075c8de3a7d3d297ecff21fd85c661417 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 7 May 2018 20:03:22 +0000
Subject: [PATCH 3/6] Avoid generators in API-only export

---
 dumpgenerator.py | 58 +++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 13ea271..cf67bb8 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -698,7 +698,11 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
         xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
         xmlfile.write(header.encode('utf-8'))
         try:
+            r_timestamp = r'<timestamp>([^<]+)</timestamp>'
             for xml in getXMLRevisions(config=config, session=session):
+                numrevs = len(re.findall(r_timestamp, xml))
+                # Due to how generators work, it's expected this may be less
+                print "%d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
         except AttributeError:
@@ -751,40 +755,43 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
 
 def getXMLRevisions(config={}, session=None):
     site = wikitools.wiki.Wiki(config['api'])
-    if config['namespaces']:
-        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
-    else:
-        namespaces = ['*']
+    #if config['namespaces']:
+    #    namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
+    #else:
+    namespaces = ['*']
 
     for namespace in namespaces:
+        print "Exporting revisions from namespace %s" % namespace
+        # TODO: 500 would be nicer, but need to find the wiki's limits
         params = {
             'action': 'query',
-            'generator': 'allrevisions',
-            'garvnamespace': namespace,
-            'garvlimit': 50,
-            'garvprop': 'ids',
-            'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
+            'list': 'allrevisions',
+            'arvnamespace': '*',
+            'arvlimit': 50,
+            'arvprop': 'ids',
         }
         request = wikitools.api.APIRequest(site, params)
         results = request.queryGen()
         try:
             for result in results:
-                yield result['query']['export']['*']
+                revids = []
+                for page in result['query']['allrevisions']:
+                    for revision in page['revisions']:
+                        revids.append(str(revision['revid']))
+
+                print "50 more revisions listed, until %d" % revids[-1]
+                exportparams = {
+                    'action': 'query',
+                    'revids': '|'.join(revids),
+                    'export': '1',
+                }
+                exportrequest = wikitools.api.APIRequest(site, exportparams)
+                exportresults = exportrequest.queryGen()
+                for exportresult in exportresults:
+                    yield exportresult['query']['export']['*']
         except wikitools.api.APIError:
-            # Falling back to allpages generator, the wiki is too old
-            params = {
-                'action': 'query',
-                'generator': 'allpages',
-                'gaplimit': 50,
-                'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
-            }
-            # allpages does not accept "*"
-            if namespace is not '*':
-                params['gapnamespace'] = namespace
-            request = wikitools.api.APIRequest(site, params)
-            results = request.queryGen()
-            for result in results:
-                yield result['query']['export']['*']
+            print "This wikitools version seems not to work for us. Exiting."
+            sys.exit()
 
 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
@@ -1361,7 +1368,8 @@ def getParameters(params=[]):
     groupDownload.add_argument('--curonly', action='store_true',
                                help='store only the current version of pages; incompatible with --xmlrevisions')
     groupDownload.add_argument('--xmlrevisions', action='store_true',
-                               help='download all revisions from an API generator')
+                               help='download all revisions from an API generator. Ignores the \
+                                     namespace selection')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
     groupDownload.add_argument(
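
[Note on PATCH 3] The rewritten getXMLRevisions() now makes two explicit calls per batch: list up
to 50 revision IDs, then export exactly those revisions. The same exchange written out with plain
requests (placeholder URL; continuation and error handling omitted):

    import requests

    api = 'https://example.org/w/api.php'  # placeholder
    session = requests.Session()

    # Step 1: list a batch of revision IDs.
    listing = session.get(api, params={
        'action': 'query',
        'list': 'allrevisions',
        'arvlimit': 50,
        'arvprop': 'ids',
        'format': 'json',
    }, timeout=30).json()

    revids = []
    for page in listing['query']['allrevisions']:
        for revision in page['revisions']:
            revids.append(str(revision['revid']))

    # Step 2: request an XML export of exactly those revisions.
    export = session.get(api, params={
        'action': 'query',
        'revids': '|'.join(revids),
        'export': '1',
        'format': 'json',
    }, timeout=30).json()
    xml_chunk = export['query']['export']['*']
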
namespace %s" % namespace + # TODO: 500 would be nicer, but need to find the wiki's limits params = { 'action': 'query', - 'generator': 'allrevisions', - 'garvnamespace': namespace, - 'garvlimit': 50, - 'garvprop': 'ids', - 'export': 1 # Just to make sure the parameter is passed. Empty is fine too. + 'list': 'allrevisions', + 'arvnamespace': '*', + 'arvlimit': 50, + 'arvprop': 'ids', } request = wikitools.api.APIRequest(site, params) results = request.queryGen() try: for result in results: - yield result['query']['export']['*'] + revids = [] + for page in result['query']['allrevisions']: + for revision in page['revisions']: + revids.append(str(revision['revid'])) + + print "50 more revisions listed, until %d" % revids[-1] + exportparams = { + 'action': 'query', + 'revids': '|'.join(revids), + 'export': '1', + } + exportrequest = wikitools.api.APIRequest(site, exportparams) + exportresults = exportrequest.queryGen() + for exportresult in exportresults: + yield exportresult['query']['export']['*'] except wikitools.api.APIError: - # Falling back to allpages generator, the wiki is too old - params = { - 'action': 'query', - 'generator': 'allpages', - 'gaplimit': 50, - 'export': 1 # Just to make sure the parameter is passed. Empty is fine too. - } - # allpages does not accept "*" - if namespace is not '*': - params['gapnamespace'] = namespace - request = wikitools.api.APIRequest(site, params) - results = request.queryGen() - for result in results: - yield result['query']['export']['*'] + print "This wikitools version seems not to work for us. Exiting." + sys.exit() def readTitles(config={}, start=None): """ Read title list from a file, from the title "start" """ @@ -1361,7 +1368,8 @@ def getParameters(params=[]): groupDownload.add_argument('--curonly', action='store_true', help='store only the current version of pages; incompatible with --xmlrevisions') groupDownload.add_argument('--xmlrevisions', action='store_true', - help='download all revisions from an API generator') + help='download all revisions from an API generator. Ignores the \ + namespace selection') groupDownload.add_argument( '--images', action='store_true', help="generates an image dump") groupDownload.add_argument( From 33bb1c1f23a45f8e31fdf2cce80254cf3c43fc34 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 7 May 2018 21:19:27 +0000 Subject: [PATCH 4/6] Download image description from API when using --xmlrevisions Fixes https://github.com/WikiTeam/wikiteam/issues/308 Also add --failfast option to sneak in all the hacks I use to run the bulk downloads, so I can more easily sync the repos. --- dumpgenerator.py | 80 +++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index cf67bb8..53fb5c6 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -442,34 +442,38 @@ def getXMLHeader(config={}, session=None): # similar to: does not exist. Not a problem, if we get the . - xml = pme.xml - # Issue 26: Account for missing "Special" namespace. - # Hope the canonical special name has not been removed. 
-    # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
-    except ExportAbortedError:
+    if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+        r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap')
+        xml = r.text
+    else:
         try:
-            if config['api']:
-                print "Trying the local name for the Special namespace instead"
-                r = session.post(
-                    url=config['api'],
-                    data={
-                        'action': 'query',
-                        'meta': 'siteinfo',
-                        'siprop': 'namespaces',
-                        'format': 'json'},
-                    timeout=120
-                )
-                config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
-                    + ':Export'
-                xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+            xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+        except PageMissingError as pme:
+            # The <page> does not exist. Not a problem, if we get the <siteinfo>.
+            xml = pme.xml
+        # Issue 26: Account for missing "Special" namespace.
+        # Hope the canonical special name has not been removed.
+        # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
         except ExportAbortedError:
-            pass
+            try:
+                if config['api']:
+                    print "Trying the local name for the Special namespace instead"
+                    r = session.post(
+                        url=config['api'],
+                        data={
+                            'action': 'query',
+                            'meta': 'siteinfo',
+                            'siprop': 'namespaces',
+                            'format': 'json'},
+                        timeout=120
+                    )
+                    config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+                        + ':Export'
+                    xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+            except PageMissingError as pme:
+                xml = pme.xml
+            except ExportAbortedError:
+                pass
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
@@ ... @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         if c >= maxretries:
             print '    We have retried %d times' % (c)
             print '    MediaWiki error for "%s", network error or whatever...' % (params['pages'])
+            if config['failfast']:
+                print "Exit, it will be for another time"
+                sys.exit()
             # If it's not already what we tried: our last chance, preserve only the last revision...
             # config['curonly'] means that the whole dump is configured to save only the last,
             # params['curonly'] should mean that we've already tried this
@@ -766,7 +773,6 @@ def getXMLRevisions(config={}, session=None):
         params = {
             'action': 'query',
             'list': 'allrevisions',
-            'arvnamespace': '*',
             'arvlimit': 50,
             'arvprop': 'ids',
         }
@@ -779,7 +785,7 @@ def getXMLRevisions(config={}, session=None):
                 for revision in page['revisions']:
                     revids.append(str(revision['revid']))
 
-                print "50 more revisions listed, until %d" % revids[-1]
+                print "50 more revisions listed, until %s" % revids[-1]
                 exportparams = {
                     'action': 'query',
                     'revids': '|'.join(revids),
                     'export': '1',
                 }
@@ -1178,10 +1184,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
         # saving description if any
         try:
             title = u'Image:%s' % (filename)
-            xmlfiledesc = getXMLFileDesc(
-                config=config,
-                title=title,
-                session=session)  # use Image: for backwards compatibility
+            if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+                r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
+                xml = r.text
+            else:
+                xmlfiledesc = getXMLFileDesc(
+                    config=config,
+                    title=title,
+                    session=session)  # use Image: for backwards compatibility
         except PageMissingError:
             xmlfiledesc = ''
             logerror(
@@ -1389,6 +1399,10 @@ def getParameters(params=[]):
         '--get-wiki-engine',
         action='store_true',
         help="returns the wiki engine")
+    groupMeta.add_argument(
+        '--failfast',
+        action='store_true',
+        help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
 
     args = parser.parse_args()
     # print args
@@ -1564,6 +1578,7 @@ def getParameters(params=[]):
         'curonly': args.curonly,
         'date': datetime.datetime.now().strftime('%Y%m%d'),
         'api': api,
+        'failfast': args.failfast,
         'index': index,
         'images': args.images,
         'logs': False,
@@ -2127,7 +2142,10 @@ def main(params=[]):
     # do not enter if resume is requested from begining
     while not other['resume'] and os.path.isdir(config['path']):
         print '\nWarning!: "%s" path exists' % (config['path'])
-        reply = ''
+        if config['failfast']:
+            reply = 'yes'
+        else:
+            reply = ''
         while reply.lower() not in ['yes', 'y', 'no', 'n']:
             reply = raw_input(
                 'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %
                 (config['path'], config['path'], configfilename))
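
[Note on PATCH 4] With --xmlrevisions the image description page is fetched through the API's
export facility instead of Special:Export. Roughly, per file (placeholder URL and a hypothetical
file name; in practice the title should be URL-encoded, which the patch itself does not yet do):

    import requests
    import urllib

    api = 'https://example.org/w/api.php'      # placeholder
    title = u'Image:Example.png'               # hypothetical file page
    r = requests.get(api + '?action=query&export&exportnowrap&titles=' +
                     urllib.quote(title.encode('utf-8')), timeout=30)
    xmlfiledesc = r.text  # a bare <mediawiki> export of the description page
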
From 952fcc6bcf9c4096cbeb4a05324d947bb887c5be Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 7 May 2018 21:55:26 +0000
Subject: [PATCH 5/6] Up version to 0.4.0-alpha to signify disruption

---
 dumpgenerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 53fb5c6..8f4b820 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -54,7 +54,7 @@ import urllib
 UTF8Writer = getwriter('utf8')
 sys.stdout = UTF8Writer(sys.stdout)
 
-__VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
+__VERSION__ = '0.4.0-alpha'  # major, minor, micro: semver.org
 
 class PageMissingError(Exception):
     def __init__(self, title, xml):

From 7c545d05b7effc240c8f20885dbcd7bad5632c94 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Tue, 8 May 2018 17:07:27 +0000
Subject: [PATCH 6/6] Fix UnboundLocalError and catch RetryError with
 --xmlrevisions

  File "./dumpgenerator.py", line 1212, in generateImageDump
    if not re.search(r'</mediawiki>', xmlfiledesc):
  UnboundLocalError: local variable 'xmlfiledesc' referenced before assignment
---
 dumpgenerator.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 8f4b820..5582fd1 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -443,8 +443,16 @@ def getXMLHeader(config={}, session=None):
     # xmlns:x....
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
     if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
-        r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap')
-        xml = r.text
+        xml = None
+        try:
+            r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
+            xml = r.text
+        except requests.exceptions.RetryError:
+            pass
+
+        if not xml:
+            r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
+            xml = r.json()['query']['export']['*']
     else:
         try:
             xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
         except PageMissingError as pme:
@@ -1186,7 +1194,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             title = u'Image:%s' % (filename)
             if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
                 r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
-                xml = r.text
+                xmlfiledesc = r.text
             else:
                 xmlfiledesc = getXMLFileDesc(
                     config=config,