From 9ec6ce42d33d2dcaa12d59e0389467e839e30ace Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 16:42:22 +0200
Subject: [PATCH] Finish xmlrevisions option for older wikis

* Actually proceed to the next page when no continuation.
* Provide the same output as with the usual per-page export.

Tested on a MediaWiki 1.16 wiki with success.
---
 dumpgenerator.py | 76 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index f628843..c6717df 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -755,6 +755,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
             for xml in getXMLRevisions(config=config, session=session):
                 numrevs = len(re.findall(r_timestamp, xml))
                 # Due to how generators work, it's expected this may be less
+                # TODO: get the page title and reuse the usual format "X title, y edits"
                 print "%d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
@@ -835,7 +836,18 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
                 print("Trying to get wikitext from the allrevisions API and to build the XML")
                 while True:
-                    arvrequest = site.api(**arvparams)
+                    try:
+                        arvrequest = site.api(**arvparams)
+                    except requests.exceptions.ReadTimeout as err:
+                        # Hopefully temporary, just wait a bit and continue with the same request.
+                        # No point putting a limit to retries, we'd need to abort everything.
+                        # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
+                        # to use the retry adapter we use for our own requests session?
+                        print("ERROR: {}".format(str(err)))
+                        print("Sleeping for 20 seconds")
+                        time.sleep(20)
+                        continue
+
                     for page in arvrequest['query']['allrevisions']:
                         yield makeXmlFromPage(page)
                     if 'continue' in arvrequest:
@@ -851,6 +863,10 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 # We only need the revision ID, all the rest will come from the raw export
                 arvparams['arvprop'] = 'ids'
                 arvrequest = site.api(**arvparams)
+                exportparams = {
+                    'action': 'query',
+                    'export': '1',
+                }
                 # Skip the namespace if it's empty
                 if len(arvrequest['query']['allrevisions']) < 1:
                     continue
@@ -862,14 +878,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                         for revision in page['revisions']:
                             revids.append(str(revision['revid']))
                     print "%d more revisions listed, until %s" % (len(revids), revids[-1])
+
                     # We can now get the XML for one revision at a time
                     # FIXME: we can actually get them in batches as we used to
                     # but need to figure out the continuation and avoid that the API
                     # chooses to give us only the latest for each page
-                    exportparams = {
-                        'action': 'query',
-                        'export': '1',
-                    }
                     for revid in revids:
                         exportparams['revids'] = revid
                         exportrequest = site.api(**exportparams)
@@ -883,7 +896,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     if 'continue' in arvrequest:
                         # Get the new ones
                         arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
-                        arvrequest = site.api(**arvparams)
+                        try:
+                            arvrequest = site.api(**arvparams)
+                        except requests.exceptions.ReadTimeout as err:
+                            # As above
+                            print("ERROR: {}".format(str(err)))
+                            print("Sleeping for 20 seconds")
+                            time.sleep(20)
+                            # But avoid rewriting the same revisions
+                            arvrequest['query']['allrevisions'] = []
+                            continue
                     else:
                         # End of continuation. We are done with this namespace.
                         break
@@ -894,7 +916,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         if config['curonly']:
             # The raw XML export in the API gets a title and gives the latest revision.
             # We could also use the allpages API as generator but let's be consistent.
+            print("Getting titles to export the latest revision for each")
+            c = 0
             for title in readTitles(config):
+                # TODO: respect verbose flag, reuse output from getXMLPage
+                print(' {}'.format(title.strip()))
                 # TODO: as we're doing one page and revision at a time, we might
                 # as well use xml format and exportnowrap=1 to use the string of,
                 # XML as is, but need to check how well the library handles it.
@@ -905,6 +931,9 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 }
                 exportrequest = site.api(**exportparams)
                 xml = exportrequest['query']['export']['*']
+                c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
                 # Because we got the fancy XML from the JSON format, clean it:
                 yield makeXmlPageFromRaw(xml)
         else:
@@ -914,16 +943,23 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # to be input the page titles; otherwise, the requests are similar.
             # The XML needs to be made manually because the export=1 option
             # refuses to return an arbitrary number of revisions (see above).
+            print("Getting titles to export all the revisions of each")
+            c = 0
             for title in readTitles(config):
+                print(' {}'.format(title.strip()))
                 # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
                 # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
                 pparams = {
                     'action': 'query',
                     'titles': title,
                     'prop': 'revisions',
+                    'rvlimit': 50,
                     'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
                 }
                 prequest = site.api(**pparams)
+                c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
                 # The array is called "pages" even if there's only one.
                 # TODO: we could actually batch titles a bit here if desired. How many?
                 try:
@@ -936,18 +972,21 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 for page in pages:
                     try:
                         xml = makeXmlFromPage(pages[page])
+                        yield xml
                     except PageMissingError:
                         logerror(
                             config=config,
                             text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
                         )
                         continue
-                    yield xml
 
                 # Get next batch of revisions if there's more.
                 if 'continue' in prequest:
+                    print("Getting more revisions for page {}".format(title))
                     pparams['rvcontinue'] = prequest['rvcontinue']
                     prequest = site.api(**pparams)
+                else:
+                    break
 
 
     except mwclient.errors.MwClientError:
@@ -958,30 +997,41 @@ def makeXmlFromPage(page):
     """ Output an XML document as a string from a page as in the API JSON """
     try:
         p = E.page(
-            E.title(page['title']),
+            E.title(to_unicode(page['title'])),
             E.ns(to_unicode(page['ns'])),
             E.id(to_unicode(page['pageid'])),
         )
         for rev in page['revisions']:
+            # Older releases like MediaWiki 1.16 do not return all fields.
+            if 'userid' in rev:
+                userid = rev['userid']
+            else:
+                userid = 0
+            if 'size' in rev:
+                size = rev['size']
+            else:
+                size = 0
             revision = E.revision(
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
                 E.contributor(
-                    E.id(to_unicode(rev['userid'])),
+                    E.id(to_unicode(userid)),
                     E.username(to_unicode(rev['user'])),
                 ),
-                E.comment(rev['comment']),
-                E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
+                E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
             )
+            if 'comment' in rev:
+                revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:
                 revision.append(E.model(rev['contentmodel']))
             # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
             if 'sha1' in rev:
                 revision.append(E.sha1(rev['sha1']))
             p.append(revision)
-    except KeyError:
-        raise PageMissingError(page['title'], '')
+    except KeyError as e:
+        print(e)
+        raise PageMissingError(page['title'], e)
     return etree.tostring(p, pretty_print=True)
 
 def readTitles(config={}, start=None):
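
Note (not part of the patch): the heart of the change above is the retry-and-continue loop around the allrevisions API. Below is a minimal standalone sketch of that pattern in Python, using requests directly instead of mwclient; the function name iter_allrevisions, the example URL, the property list and the fixed 20-second backoff are illustrative assumptions, not code from dumpgenerator.py.

    # Sketch only: same continuation + retry flow as the patch, with plain requests.
    import time
    import requests

    def iter_allrevisions(api_url, namespace=0, session=None):
        """Yield pages (with their revisions) from list=allrevisions."""
        session = session or requests.Session()
        params = {
            'action': 'query',
            'list': 'allrevisions',
            'arvlimit': 50,
            'arvnamespace': namespace,
            'arvprop': 'ids|timestamp|user|comment|content',
            'format': 'json',
        }
        while True:
            try:
                result = session.get(api_url, params=params, timeout=30).json()
            except requests.exceptions.ReadTimeout:
                # Hopefully temporary: wait a bit and retry the same request.
                time.sleep(20)
                continue
            for page in result['query']['allrevisions']:
                yield page
            if 'continue' in result:
                # Resume the listing where the previous batch ended.
                params['arvcontinue'] = result['continue']['arvcontinue']
            else:
                # No continuation token left: this namespace is done.
                break

    # Example use:
    # for page in iter_allrevisions('https://example.org/w/api.php'):
    #     print(page['title'], len(page['revisions']))

As in the patch, a timeout repeats the identical request: arvcontinue has not advanced, so no batch is skipped, and continuation only moves forward once a batch has been yielded.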
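Note (not part of the patch): the makeXmlFromPage() hunk tolerates missing fields because older MediaWiki releases such as 1.16 omit userid, size, comment or sha1 from the revisions JSON. The following is a minimal sketch of the same idea with lxml's E builder; build_revision and the sample revision dict are made up for illustration and are not the script's actual helper.

    # Sketch only: build a <revision> element while tolerating missing fields.
    from lxml import etree
    from lxml.builder import E

    def build_revision(rev):
        # Mandatory fields first; .get() supplies defaults for fields an older
        # API may omit (userid, size).
        revision = E.revision(
            E.id(str(rev['revid'])),
            E.timestamp(rev['timestamp']),
            E.contributor(
                E.id(str(rev.get('userid', 0))),
                E.username(rev.get('user', '')),
            ),
            E.text(rev.get('*', ''), bytes=str(rev.get('size', 0))),
        )
        # Optional elements are appended only when the API returned them.
        if 'comment' in rev:
            revision.append(E.comment(rev['comment']))
        if 'sha1' in rev:
            revision.append(E.sha1(rev['sha1']))
        return revision

    # Example use with a minimal revision as an old API might return it:
    # rev = {'revid': 1, 'timestamp': '2020-02-10T14:42:22Z', 'user': 'Example', '*': 'some wikitext'}
    # print(etree.tostring(build_revision(rev), pretty_print=True))

Appending the optional elements conditionally keeps the output close to the export schema instead of aborting the whole page with a KeyError, which is what the patch fixes.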