From a664b17a9cc202a4207394d9a0f23e7c39016418 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Thu, 13 Feb 2020 17:13:16 +0200
Subject: [PATCH 1/2] Handle deleted contributor name in --xmlrevisions

Avoids failure in https://deployment.wikimedia.beta.wmflabs.org/w/api.php
for revision https://deployment.wikimedia.beta.wmflabs.org/?oldid=2349 .
---
 dumpgenerator.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3278848..b6adcc6 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1033,12 +1033,16 @@ def makeXmlFromPage(page):
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
-                E.contributor(
-                    E.username(to_unicode(rev['user'])),
-                    E.id(to_unicode(userid)),
-                ),
                 E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
             )
+            # The username may be deleted/suppressed
+            if 'user' in rev:
+                revision.append(E.contributor(
+                    E.username(to_unicode(rev['user'])),
+                    E.id(to_unicode(userid)),
+                ))
+            else:
+                revision.append(E.contributor(deleted="deleted"))
             if 'comment' in rev:
                 revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:

From 9ac1e6d0f12d2f066e976ddb82e175a9bff44d63 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Fri, 14 Feb 2020 12:50:02 +0200
Subject: [PATCH 2/2] Implement resume in --xmlrevisions (but not yet with
 list=allrevisions)

Tested with a partial dump of over 100 MB: https://tinyvillage.fandom.com/api.php
(grepped to check that the previously downloaded revisions were kept and the
new ones continued from the expected point; did not validate the final XML).
---
 dumpgenerator.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index b6adcc6..0238d1f 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -715,12 +715,16 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     lock = True
 
     if config['xmlrevisions']:
-        print 'Retrieving the XML for every page from the beginning'
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
-        xmlfile.write(header.encode('utf-8'))
+        if start:
+            print("WARNING: will try to start the download from title: {}".format(start))
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        else:
+            print 'Retrieving the XML for every page from the beginning'
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
         try:
             r_timestamp = r'<timestamp>([^<]+)</timestamp>'
-            for xml in getXMLRevisions(config=config, session=session):
+            for xml in getXMLRevisions(config=config, session=session, start=start):
                 numrevs = len(re.findall(r_timestamp, xml))
                 # Due to how generators work, it's expected this may be less
                 # TODO: get the page title and reuse the usual format "X title, y edits"
@@ -776,7 +780,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
-def getXMLRevisions(config={}, session=None, allpages=False):
+def getXMLRevisions(config={}, session=None, allpages=False, start=None):
     # FIXME: actually figure out the various strategies for each MediaWiki version
     apiurl = urlparse(config['api'])
     # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
@@ -790,7 +794,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
 
     try:
         for namespace in namespaces:
-            print "Trying to export all revisions from namespace %s" % namespace
+            print("Trying to export all revisions from namespace %s" % namespace)
             # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
             arvparams = {
                 'action': 'query',
@@ -910,7 +914,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # We could also use the allpages API as generator but let's be consistent.
             print("Getting titles to export the latest revision for each")
             c = 0
-            for title in readTitles(config):
+            for title in readTitles(config, start=start):
                 # TODO: respect verbose flag, reuse output from getXMLPage
                 print(' {}'.format(title.strip()))
                 # TODO: as we're doing one page and revision at a time, we might
@@ -944,7 +948,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # refuses to return an arbitrary number of revisions (see above).
             print("Getting titles to export all the revisions of each")
             c = 0
-            for title in readTitles(config):
+            for title in readTitles(config, start=start):
                 print(' {}'.format(title.strip()))
                 # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
                 # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
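
As a rough illustration of the change in PATCH 1/2, here is a minimal standalone sketch
of the <contributor> handling, assuming E is lxml.builder's element factory as the E.*
calls in the hunk suggest. The helper name contributor_element and the sample rev dicts
are invented for this example and are not part of dumpgenerator.py.

    from lxml import etree
    from lxml.builder import E

    def contributor_element(rev):
        # Normal case: the API returned a username, so emit
        # <contributor><username>...</username><id>...</id></contributor>.
        if 'user' in rev:
            return E.contributor(
                E.username(u'%s' % rev['user']),
                E.id(u'%s' % rev.get('userid', 0)),
            )
        # The username was deleted/suppressed and 'user' is missing from the
        # revision dict: emit <contributor deleted="deleted"/> instead.
        return E.contributor(deleted="deleted")

    # Invented sample revisions: one with a visible user, one suppressed.
    for rev in ({'user': 'Example', 'userid': 42}, {}):
        print(etree.tostring(contributor_element(rev), pretty_print=True))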
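
Similarly, a minimal sketch of the resume behaviour added in PATCH 2/2, reduced to the
file-handling decision; open_xml_dump, the path and the header string below are
placeholders, not names taken from dumpgenerator.py.

    def open_xml_dump(path, header, start=None):
        # Resuming: append to the existing dump and do not rewrite the header;
        # titles before `start` are assumed to be in the file already.
        if start:
            print("WARNING: will try to start the download from title: {}".format(start))
            return open(path, 'a')
        # Fresh dump: truncate the file and write the XML header first.
        xmlfile = open(path, 'w')
        xmlfile.write(header)
        return xmlfile

    # Invented usage: resume an interrupted dump from a given title.
    xmlfile = open_xml_dump('example-history.xml', '<mediawiki>\n', start='Some title')
    xmlfile.close()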