From 11507e931e4d6682d0c0a766fd0ea8833edb86e4 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 14:19:23 +0200 Subject: [PATCH 01/14] Initial switch to mwclient for the xmlrevisions option * Still maintained and available for python 3 as well. * Allows raw API requests as we need. * Does not provide handy generators, we need to do continuation. * Decides on its own which protocol and exact path to use, fails at it. * Appears to use POST by default unless asked otherwise, what to do? --- dumpgenerator.py | 96 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index e2d8082..4a87ded 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -46,9 +46,9 @@ except ImportError: print "Please install or update the Requests module." sys.exit(1) try: - import wikitools + import mwclient except ImportError: - print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions." + print "Please install the mwclient module if you want to use --xmlrevisions." try: from lxml import etree from lxml.builder import E @@ -714,8 +714,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None): print ' %s, %d edits' % (title.strip(), numberofedits) +def makeXmlPageFromRaw(xml): + """ Discard the metadata around a element in string""" + root = etree.XML(xml) + find = etree.XPath("//*[local-name() = 'page']") + # The tag will inherit the namespace, like: + # + # FIXME: pretty_print doesn't seem to work, only adds a newline + return etree.tostring(find(root)[0], pretty_print=True) + + def cleanXML(xml=''): - """ Trim redundant info """ + """ Trim redundant info from the XML however it comes """ # do not touch XML codification, leave AS IS if re.search(r'\n', xml): xml = xml.split('\n')[1] @@ -748,8 +758,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): print "%d more revisions exported" % numrevs xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) - except AttributeError: - print "This wikitools module version is not working" + except AttributeError as e: + print(e) + print "This API library version is not working" sys.exit() else: print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') @@ -797,7 +808,10 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): print 'XML dump saved at...', xmlfilename def getXMLRevisions(config={}, session=None, allpages=False): - site = wikitools.wiki.Wiki(config['api']) + apiurl = urlparse(config['api']) + # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP? + # https://github.com/WikiTeam/wikiteam/issues/358 + site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", "")) if not 'all' in config['namespaces']: namespaces = config['namespaces'] else: @@ -806,6 +820,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): try: for namespace in namespaces: print "Trying to export all revisions from namespace %s" % namespace + # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!) arvparams = { 'action': 'query', 'list': 'allrevisions', @@ -817,46 +832,71 @@ def getXMLRevisions(config={}, session=None, allpages=False): # Skip flags, presumably needed to add which is in the schema. # Also missing: parentid and contentformat. 
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content' - arvrequest = wikitools.api.APIRequest(site, arvparams) - results = arvrequest.queryGen() - for result in results: - for page in result['query']['allrevisions']: + print("Trying to get wikitext from the allrevisions API and to build the XML") + while True: + arvrequest = site.api(**arvparams) + for page in arvrequest['query']['allrevisions']: yield makeXmlFromPage(page) + if 'continue' in arvrequest: + arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + else: + # End of continuation. We are done with this namespace. + break + else: + # FIXME: this is not curonly, just different strategy to do all revisions # Just cycle through revision IDs and use the XML as is + print("Trying to list the revisions and to export them one by one") + # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' - arvrequest = wikitools.api.APIRequest(site, arvparams) - arvresults = arvrequest.queryGen() - for result in arvresults: + # Repeat the arvrequest with new arvparams until done + while True: + # Reset revision IDs from the previous batch from arv revids = [] - for page in result['query']['allrevisions']: + # Get the new ones + arvrequest = site.api(**arvparams) + for page in arvrequest['query']['allrevisions']: for revision in page['revisions']: revids.append(str(revision['revid'])) print "%d more revisions listed, until %s" % (len(revids), revids[-1]) - + # We can now get the XML for one revision at a time + # FIXME: we can actually get them in batches as we used to + # but need to figure out the continuation and avoid that the API + # chooses to give us only the latest for each page exportparams = { 'action': 'query', - 'revids': '|'.join(revids), 'export': '1', } - exportrequest = wikitools.api.APIRequest(site, exportparams) - exportresults = exportrequest.queryGen() - for exportresult in exportresults: - yield exportresult['query']['export']['*'] + for revid in revids: + exportparams['revids'] = revid + exportrequest = site.api(**exportparams) + # This gives us a self-standing element + # but we only need the inner : we can live with + # duplication and non-ordering of page titles, but the + # repeated header is confusing and would not even be valid + xml = exportrequest['query']['export']['*'] + yield makeXmlPageFromRaw(xml) + + if 'continue' in arvrequest: + arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + else: + # End of continuation. We are done with this namespace. + break + except KeyError: - print "Warning. Could not use allrevisions, wiki too old." + print "Warning. Could not use allrevisions. Wiki too old?" 
if config['curonly']: + # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): exportparams = { 'action': 'query', 'titles': title, 'export': '1', } - exportrequest = wikitools.api.APIRequest(site, exportparams) - exportresults = exportrequest.queryGen() - for exportresult in exportresults: - yield exportresult['query']['export']['*'] + exportrequest = site.api(**exportparams) + xml = exportrequest['query']['export']['*'] + yield makeXmlPageFromRaw(xml) else: for title in readTitles(config): pparams = { @@ -867,7 +907,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', 'rawcontinue': 'yes' } - prequest = wikitools.api.APIRequest(site, pparams) + prequest = site.api(**pparams) try: results = prequest.query() pages = results['query']['pages'] @@ -884,8 +924,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): continue yield xml - except wikitools.api.APIError: - print "This wikitools version seems not to work for us. Exiting." + except mwclient.errors.MwClientError: + print "This mwclient version seems not to work for us. Exiting." sys.exit() def makeXmlFromPage(page): From 3760501f74bc8007b56a8d7c1e0581588131feb5 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:25:40 +0200 Subject: [PATCH 02/14] Add a couple comments --- dumpgenerator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dumpgenerator.py b/dumpgenerator.py index 4a87ded..59221bc 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -808,6 +808,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): print 'XML dump saved at...', xmlfilename def getXMLRevisions(config={}, session=None, allpages=False): + # FIXME: actually figure out the various strategies for each MediaWiki version apiurl = urlparse(config['api']) # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP? 
# https://github.com/WikiTeam/wikiteam/issues/358 @@ -889,6 +890,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): + # TODO: as we're doing one page and revision at a time, + # we might as well use xml format and exportnowrap=1 exportparams = { 'action': 'query', 'titles': title, From f10adb71af60697d7c7c8860ae768b21c54840ba Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:27:10 +0200 Subject: [PATCH 03/14] Don't try to add revisions if the namespace has none Traceback (most recent call last): File "dumpgenerator.py", line 2362, in File "dumpgenerator.py", line 2354, in main resumePreviousDump(config=config, other=other) File "dumpgenerator.py", line 1921, in createNewDump getPageTitles(config=config, session=other['session']) File "dumpgenerator.py", line 755, in generateXMLDump for xml in getXMLRevisions(config=config, session=session): File "dumpgenerator.py", line 861, in getXMLRevisions revids.append(str(revision['revid'])) IndexError: list index out of range --- dumpgenerator.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 59221bc..08f6400 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -850,12 +850,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to list the revisions and to export them one by one") # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' + arvrequest = site.api(**arvparams) + # Skip the namespace if it's empty + if len(arvrequest['query']['allrevisions']) < 1: + continue # Repeat the arvrequest with new arvparams until done while True: # Reset revision IDs from the previous batch from arv revids = [] - # Get the new ones - arvrequest = site.api(**arvparams) for page in arvrequest['query']['allrevisions']: for revision in page['revisions']: revids.append(str(revision['revid'])) @@ -879,19 +881,22 @@ def getXMLRevisions(config={}, session=None, allpages=False): yield makeXmlPageFromRaw(xml) if 'continue' in arvrequest: + # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + arvrequest = site.api(**arvparams) else: # End of continuation. We are done with this namespace. break - except KeyError: + # TODO: check whether the KeyError was really for a missing arv API print "Warning. Could not use allrevisions. Wiki too old?" if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): - # TODO: as we're doing one page and revision at a time, - # we might as well use xml format and exportnowrap=1 + # TODO: as we're doing one page and revision at a time, we might + # as well use xml format and exportnowrap=1 to use the string of, + # XML as is, but need to check how well the library handles it. exportparams = { 'action': 'query', 'titles': title, @@ -899,8 +904,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): } exportrequest = site.api(**exportparams) xml = exportrequest['query']['export']['*'] + # Because we got the fancy XML from the JSON format, clean it: yield makeXmlPageFromRaw(xml) else: + # This is the closest to what we usually do with Special:Export: + # take one title at a time and try to get all revisions exported. 
+ # The XML needs to be made manually because the export=1 option + # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): pparams = { 'action': 'query', @@ -908,7 +918,6 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'prop': 'revisions', 'rvlimit': 'max', 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', - 'rawcontinue': 'yes' } prequest = site.api(**pparams) try: From 6b12e20a9d97417c13f00de77de9d8053b4a641e Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:46:05 +0200 Subject: [PATCH 04/14] Actually convert the titles query method to mwclient too --- dumpgenerator.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 08f6400..4054b10 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -892,7 +892,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): # TODO: check whether the KeyError was really for a missing arv API print "Warning. Could not use allrevisions. Wiki too old?" if config['curonly']: - # The raw XML export in the API gets a title and gives the latest revision + # The raw XML export in the API gets a title and gives the latest revision. + # We could also use the allpages API as generator but let's be consistent. for title in readTitles(config): # TODO: as we're doing one page and revision at a time, we might # as well use xml format and exportnowrap=1 to use the string of, @@ -909,6 +910,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): else: # This is the closest to what we usually do with Special:Export: # take one title at a time and try to get all revisions exported. + # It differs from the allrevisions method because it actually needs + # to be input the page titles; otherwise, the requests are similar. # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): @@ -920,21 +923,31 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } prequest = site.api(**pparams) + # The array is called "pages" even if there's only one. + # TODO: we could actually batch titles a bit here if desired. How many? try: - results = prequest.query() - pages = results['query']['pages'] + pages = prequest['query']['pages'] except KeyError: raise PageMissingError(title, xml='') - for page in pages: - try: - xml = makeXmlFromPage(pages[page]) - except PageMissingError: - logerror( - config=config, - text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) - ) - continue - yield xml + # Be ready to iterate if there is continuation. + while True: + # Go through the data we got to build the XML. + for page in pages: + try: + xml = makeXmlFromPage(pages[page]) + except PageMissingError: + logerror( + config=config, + text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) + ) + continue + yield xml + + # Get next batch of revisions if there's more. + if 'continue' in prequest: + pparams['rvcontinue'] = prequest['rvcontinue'] + prequest = site.api(**pparams) + except mwclient.errors.MwClientError: print "This mwclient version seems not to work for us. Exiting." 
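The four patches above share one pattern: build a plain dict of API parameters, hand it to mwclient's raw site.api() call, and drive continuation by hand through the 'continue'/'arvcontinue' keys instead of relying on wikitools' queryGen(). The following is a minimal sketch of that loop, not part of the patches, under the same assumptions as patch 01 (Python 2, mwclient installed, api.php reachable over the default scheme); the wiki URL is a placeholder, and scheme= is only passed explicitly from patch 07 onwards:

    # Sketch only: list=allrevisions via mwclient's raw API call, with manual continuation.
    from urlparse import urlparse  # urllib.parse on Python 3

    import mwclient

    apiurl = urlparse("https://wiki.example.org/w/api.php")  # placeholder URL
    site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""))

    arvparams = {
        'action': 'query',
        'list': 'allrevisions',
        'arvnamespace': 0,   # one namespace at a time, as dumpgenerator.py does
        'arvlimit': 50,
        'arvprop': 'ids|timestamp|user|comment|content',
    }
    while True:
        arvrequest = site.api(**arvparams)
        for page in arvrequest['query']['allrevisions']:
            # Each entry carries the page title plus its batch of revisions;
            # dumpgenerator.py turns this JSON into <page> XML with makeXmlFromPage().
            print("%s: %d revisions in this batch" % (page['title'], len(page['revisions'])))
        if 'continue' in arvrequest:
            # Resume exactly where the previous batch stopped.
            arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
        else:
            break  # no continuation left: this namespace is done

Patch 04 applies the same structure, keyed on rvcontinue, to the per-title prop=revisions fallback; patch 12 later corrects the exact key the rvcontinue value is read from.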
From 0f35d03929883aee374f2d2049df56b4a368bb93 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 16:06:54 +0200 Subject: [PATCH 05/14] Remove rvlimit=max, fails in MediaWiki 1.16 For instance: "Exception Caught: Internal error in ApiResult::setElement: Attempting to add element revisions=50, existing value is 500" https://wiki.rabenthal.net/api.php?action=query&prop=revisions&titles=Hauptseite&rvprop=ids&rvlimit=max --- dumpgenerator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 4054b10..f628843 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -915,11 +915,12 @@ def getXMLRevisions(config={}, session=None, allpages=False): # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): + # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded: + # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} pparams = { 'action': 'query', 'titles': title, 'prop': 'revisions', - 'rvlimit': 'max', 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } prequest = site.api(**pparams) From 9ec6ce42d33d2dcaa12d59e0389467e839e30ace Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 16:42:22 +0200 Subject: [PATCH 06/14] Finish xmlrevisions option for older wikis * Actually proceed to the next page when no continuation. * Provide the same output as with the usual per-page export. Tested on a MediaWiki 1.16 wiki with success. --- dumpgenerator.py | 76 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index f628843..c6717df 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -755,6 +755,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): for xml in getXMLRevisions(config=config, session=session): numrevs = len(re.findall(r_timestamp, xml)) # Due to how generators work, it's expected this may be less + # TODO: get the page title and reuse the usual format "X title, y edits" print "%d more revisions exported" % numrevs xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) @@ -835,7 +836,18 @@ def getXMLRevisions(config={}, session=None, allpages=False): arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content' print("Trying to get wikitext from the allrevisions API and to build the XML") while True: - arvrequest = site.api(**arvparams) + try: + arvrequest = site.api(**arvparams) + except requests.exceptions.ReadTimeout as err: + # Hopefully temporary, just wait a bit and continue with the same request. + # No point putting a limit to retries, we'd need to abort everything. + # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient + # to use the retry adapter we use for our own requests session? 
+ print("ERROR: {}".format(str(err))) + print("Sleeping for 20 seconds") + time.sleep(20) + continue + for page in arvrequest['query']['allrevisions']: yield makeXmlFromPage(page) if 'continue' in arvrequest: @@ -851,6 +863,10 @@ def getXMLRevisions(config={}, session=None, allpages=False): # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' arvrequest = site.api(**arvparams) + exportparams = { + 'action': 'query', + 'export': '1', + } # Skip the namespace if it's empty if len(arvrequest['query']['allrevisions']) < 1: continue @@ -862,14 +878,11 @@ def getXMLRevisions(config={}, session=None, allpages=False): for revision in page['revisions']: revids.append(str(revision['revid'])) print "%d more revisions listed, until %s" % (len(revids), revids[-1]) + # We can now get the XML for one revision at a time # FIXME: we can actually get them in batches as we used to # but need to figure out the continuation and avoid that the API # chooses to give us only the latest for each page - exportparams = { - 'action': 'query', - 'export': '1', - } for revid in revids: exportparams['revids'] = revid exportrequest = site.api(**exportparams) @@ -883,7 +896,16 @@ def getXMLRevisions(config={}, session=None, allpages=False): if 'continue' in arvrequest: # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] - arvrequest = site.api(**arvparams) + try: + arvrequest = site.api(**arvparams) + except requests.exceptions.ReadTimeout as err: + # As above + print("ERROR: {}".format(str(err))) + print("Sleeping for 20 seconds") + time.sleep(20) + # But avoid rewriting the same revisions + arvrequest['query']['allrevisions'] = [] + continue else: # End of continuation. We are done with this namespace. break @@ -894,7 +916,11 @@ def getXMLRevisions(config={}, session=None, allpages=False): if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision. # We could also use the allpages API as generator but let's be consistent. + print("Getting titles to export the latest revision for each") + c = 0 for title in readTitles(config): + # TODO: respect verbose flag, reuse output from getXMLPage + print(' {}'.format(title.strip())) # TODO: as we're doing one page and revision at a time, we might # as well use xml format and exportnowrap=1 to use the string of, # XML as is, but need to check how well the library handles it. @@ -905,6 +931,9 @@ def getXMLRevisions(config={}, session=None, allpages=False): } exportrequest = site.api(**exportparams) xml = exportrequest['query']['export']['*'] + c += 1 + if c % 10 == 0: + print('Downloaded {} pages'.format(c)) # Because we got the fancy XML from the JSON format, clean it: yield makeXmlPageFromRaw(xml) else: @@ -914,16 +943,23 @@ def getXMLRevisions(config={}, session=None, allpages=False): # to be input the page titles; otherwise, the requests are similar. # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). + print("Getting titles to export all the revisions of each") + c = 0 for title in readTitles(config): + print(' {}'.format(title.strip())) # Try and ask everything. 
At least on MediaWiki 1.16, uknown props are discarded: # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} pparams = { 'action': 'query', 'titles': title, 'prop': 'revisions', + 'rvlimit': 50, 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } prequest = site.api(**pparams) + c += 1 + if c % 10 == 0: + print('Downloaded {} pages'.format(c)) # The array is called "pages" even if there's only one. # TODO: we could actually batch titles a bit here if desired. How many? try: @@ -936,18 +972,21 @@ def getXMLRevisions(config={}, session=None, allpages=False): for page in pages: try: xml = makeXmlFromPage(pages[page]) + yield xml except PageMissingError: logerror( config=config, text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) ) continue - yield xml # Get next batch of revisions if there's more. if 'continue' in prequest: + print("Getting more revisions for page {}".format(title)) pparams['rvcontinue'] = prequest['rvcontinue'] prequest = site.api(**pparams) + else: + break except mwclient.errors.MwClientError: @@ -958,30 +997,41 @@ def makeXmlFromPage(page): """ Output an XML document as a string from a page as in the API JSON """ try: p = E.page( - E.title(page['title']), + E.title(to_unicode(page['title'])), E.ns(to_unicode(page['ns'])), E.id(to_unicode(page['pageid'])), ) for rev in page['revisions']: + # Older releases like MediaWiki 1.16 do not return all fields. + if 'userid' in rev: + userid = rev['userid'] + else: + userid = 0 + if 'size' in rev: + size = rev['size'] + else: + size = 0 revision = E.revision( E.id(to_unicode(rev['revid'])), E.parentid(to_unicode(rev['parentid'])), E.timestamp(rev['timestamp']), E.contributor( - E.id(to_unicode(rev['userid'])), + E.id(to_unicode(userid)), E.username(to_unicode(rev['user'])), ), - E.comment(rev['comment']), - E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])), + E.text(rev['*'], space="preserve", bytes=to_unicode(size)), ) + if 'comment' in rev: + revision.append(E.comment(to_unicode(rev['comment']))) if 'contentmodel' in rev: revision.append(E.model(rev['contentmodel'])) # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia). if 'sha1' in rev: revision.append(E.sha1(rev['sha1'])) p.append(revision) - except KeyError: - raise PageMissingError(page['title'], '') + except KeyError as e: + print(e) + raise PageMissingError(page['title'], e) return etree.tostring(p, pretty_print=True) def readTitles(config={}, start=None): From f0436ee57cf92a1c3d303102797111d8400c7c43 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 17:59:03 +0200 Subject: [PATCH 07/14] Make mwclient respect the provided HTTP/HTTPS scheme Fixes https://github.com/WikiTeam/wikiteam/issues/358 --- dumpgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index c6717df..6aa2c9e 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -813,7 +813,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): apiurl = urlparse(config['api']) # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP? 
# https://github.com/WikiTeam/wikiteam/issues/358 - site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", "")) + site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme) if not 'all' in config['namespaces']: namespaces = config['namespaces'] else: From becd01b2714a51cd25e2f2f06800c4df5e2ccfec Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:00:43 +0200 Subject: [PATCH 08/14] Use defined requests.exceptions.ConnectionError Fixes https://github.com/WikiTeam/wikiteam/issues/356 --- dumpgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 6aa2c9e..87eca91 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -273,7 +273,7 @@ def getPageTitlesAPI(config={}, session=None): try: r = session.get(url=config['api'], params=params, timeout=30) break - except ConnectionError as err: + except requests.exceptions.ConnectionError as err: print "Connection error: %s" % (str(err),) retryCount += 1 time.sleep(20) From 0b37b39923c7258ac044730a2920a3263bfbd17f Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:05:42 +0200 Subject: [PATCH 09/14] Define xml header as empty first so that it can fail graciously Fixes https://github.com/WikiTeam/wikiteam/issues/355 --- dumpgenerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dumpgenerator.py b/dumpgenerator.py index 87eca91..1b863ed 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -463,6 +463,7 @@ def getXMLHeader(config={}, session=None): # xmlns:x.... randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ print config['api'] + xml = '' if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"): xml = None try: From 1645c1d83272b726017dfbdcf4a5a5516135e85b Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:18:26 +0200 Subject: [PATCH 10/14] More robust XML header fetch for getXMLHeader() Avoid UnboundLocalError: local variable 'xml' referenced before assignment If the page exists, its XML export is returned by the API; otherwise only the header that we were looking for. Fixes https://github.com/WikiTeam/wikiteam/issues/355 --- dumpgenerator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 1b863ed..07301bf 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -468,10 +468,12 @@ def getXMLHeader(config={}, session=None): xml = None try: print 'Getting the XML header from the API' - r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10) - xml = r.json()['query']['export']['*'] + # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18 + r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10) + xml = r.text if not xml: - r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10) + # Do without a generator, use our usual trick of a random page title + r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10) xml = r.text except requests.exceptions.RetryError: pass From 92da7388b0301cb496526c027b624066295f5f61 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:33:34 +0200 Subject: [PATCH 11/14] Avoid asking allpages API if API not available So that it doesn't have to iterate among non-existing titles. 
Fixes https://github.com/WikiTeam/wikiteam/issues/348 --- dumpgenerator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 07301bf..2fc8374 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -213,7 +213,7 @@ def getNamespacesAPI(config={}, session=None): try: nsquery = result['query']['namespaces'] except KeyError: - print "Error: could not get namespaces from the API request" + print "Error: could not get namespaces from the API request." print "HTTP %d" % r.status_code print r.text return None @@ -1741,6 +1741,7 @@ def getParameters(params=[]): else: if index and not args.wiki: print 'API not available. Trying with index.php only.' + args.api = None else: print 'Error in API. Please, provide a correct path to API' sys.exit(1) From 8b5378f9910d433343aecb35c3f7501c2f465ddd Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 19:33:10 +0200 Subject: [PATCH 12/14] Fix query prop=revisions continuation in MediaWiki 1.22 This wiki has the old query-continue format but it's not exposes here. --- dumpgenerator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 2fc8374..58c55cf 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -984,10 +984,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): continue # Get next batch of revisions if there's more. - if 'continue' in prequest: + if 'continue' in prequest.keys(): print("Getting more revisions for page {}".format(title)) - pparams['rvcontinue'] = prequest['rvcontinue'] + pparams['rvcontinue'] = prequest['continue']['rvcontinue'] prequest = site.api(**pparams) + # mwclient seems to rewrite query-continue + #if 'query-continue' in prequest.keys(): + # pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue'] else: break From 49017e3f209db2e6a897ac19fc6ade92431fcab8 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 20:50:02 +0200 Subject: [PATCH 13/14] Catch HTTP Error 405 and switch from POST to GET for API requests Seen on http://wiki.ainigma.eu/index.php?title=Hlavn%C3%AD_strana: HTTPError: HTTP Error 405: Method Not Allowed --- dumpgenerator.py | 58 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 58c55cf..3ba9222 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -840,7 +840,12 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to get wikitext from the allrevisions API and to build the XML") while True: try: - arvrequest = site.api(**arvparams) + arvrequest = site.api(http_method=config['http_method'], **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + continue except requests.exceptions.ReadTimeout as err: # Hopefully temporary, just wait a bit and continue with the same request. # No point putting a limit to retries, we'd need to abort everything. 
@@ -865,7 +870,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to list the revisions and to export them one by one") # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' - arvrequest = site.api(**arvparams) + try: + arvrequest = site.api(http_method=config['http_method'], **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + continue exportparams = { 'action': 'query', 'export': '1', @@ -888,7 +899,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): # chooses to give us only the latest for each page for revid in revids: exportparams['revids'] = revid - exportrequest = site.api(**exportparams) + try: + exportrequest = site.api(http_method=config['http_method'], **exportparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + exportrequest = site.api(http_method=config['http_method'], **exportparams) + # This gives us a self-standing element # but we only need the inner : we can live with # duplication and non-ordering of page titles, but the @@ -900,7 +918,12 @@ def getXMLRevisions(config={}, session=None, allpages=False): # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] try: - arvrequest = site.api(**arvparams) + arvrequest = site.api(http_method=config['http_method'], **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + arvrequest = site.api(http_method=config['http_method'], **arvparams) except requests.exceptions.ReadTimeout as err: # As above print("ERROR: {}".format(str(err))) @@ -932,7 +955,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'titles': title, 'export': '1', } - exportrequest = site.api(**exportparams) + try: + exportrequest = site.api(http_method=config['http_method'], **exportparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + exportrequest = site.api(http_method=config['http_method'], **exportparams) + xml = exportrequest['query']['export']['*'] c += 1 if c % 10 == 0: @@ -959,7 +989,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'rvlimit': 50, 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } - prequest = site.api(**pparams) + try: + prequest = site.api(http_method=config['http_method'], **pparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + exportrequest = site.api(http_method=config['http_method'], **exportparams) + c += 1 if c % 10 == 0: print('Downloaded {} pages'.format(c)) @@ -987,7 +1024,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): if 'continue' in prequest.keys(): print("Getting more revisions for page {}".format(title)) pparams['rvcontinue'] = prequest['continue']['rvcontinue'] - prequest = site.api(**pparams) 
+ try: + prequest = site.api(http_method=config['http_method'], **pparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + prequest = site.api(http_method=config['http_method'], **pparams) # mwclient seems to rewrite query-continue #if 'query-continue' in prequest.keys(): # pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue'] @@ -1826,6 +1869,7 @@ def getParameters(params=[]): 'date': datetime.datetime.now().strftime('%Y%m%d'), 'api': api, 'failfast': args.failfast, + 'http_method': "POST", 'index': index, 'images': args.images, 'logs': False, From faf0e31b4e33067b80f292186a39c8da91dbab59 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 21:19:01 +0200 Subject: [PATCH 14/14] Don't set apfrom in initial allpages request, use suggested continuation Helps with recent MediaWiki versions like 1.31 where variants of "!" can give a bad title error and the continuation wants apcontinue anyway. --- dumpgenerator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 3ba9222..b197fb6 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -257,14 +257,14 @@ def getPageTitlesAPI(config={}, session=None): c = 0 print ' Retrieving titles in the namespace %d' % (namespace) - apfrom = '!' + apfrom = '' while apfrom: sys.stderr.write('.') # progress params = { 'action': 'query', 'list': 'allpages', 'apnamespace': namespace, - 'apfrom': apfrom.encode('utf-8'), + 'apfrom': apfrom, 'format': 'json', 'aplimit': 500}
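Patch 13 repeats the same POST-to-GET fallback around every site.api() call in getXMLRevisions(). A hypothetical helper along these lines (not part of the patch series; the name apiCall is my own, while site.api(), http_method and config['http_method'] are exactly what the patch already uses) would centralize that logic so each call site stays a one-liner:

    # Hypothetical wrapper, sketching how patch 13's repeated HTTP 405 handling
    # could be factored out; it mirrors the calls the patch already makes.
    import requests

    def apiCall(site, config, **params):
        """Call the MediaWiki API via mwclient, falling back from POST to GET on HTTP 405."""
        try:
            return site.api(http_method=config['http_method'], **params)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 405 and config['http_method'] == "POST":
                print("POST request to the API failed, retrying with GET")
                config['http_method'] = "GET"
                return site.api(http_method=config['http_method'], **params)
            raise

Each try/except block added in patch 13 could then collapse to something like arvrequest = apiCall(site, config, **arvparams), while still remembering the switch to GET for the rest of the run, as the patch does by mutating config['http_method']. A single wrapper would also avoid slips like the one in patch 13's prop=revisions branch, where the GET retry appears to re-issue exportparams into exportrequest instead of pparams into prequest.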