From a664b17a9cc202a4207394d9a0f23e7c39016418 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Thu, 13 Feb 2020 17:13:16 +0200
Subject: [PATCH 1/2] Handle deleted contributor name in --xmlrevisions

Avoids failure in https://deployment.wikimedia.beta.wmflabs.org/w/api.php
for revision https://deployment.wikimedia.beta.wmflabs.org/?oldid=2349 .
---
 dumpgenerator.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3278848..b6adcc6 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1033,12 +1033,16 @@ def makeXmlFromPage(page):
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
-                E.contributor(
-                    E.username(to_unicode(rev['user'])),
-                    E.id(to_unicode(userid)),
-                ),
                 E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
             )
+            # The username may be deleted/suppressed
+            if 'user' in rev:
+                revision.append(E.contributor(
+                    E.username(to_unicode(rev['user'])),
+                    E.id(to_unicode(userid)),
+                ))
+            else:
+                revision.append(E.contributor(deleted="deleted"))
             if 'comment' in rev:
                 revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:

From 9ac1e6d0f12d2f066e976ddb82e175a9bff44d63 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Fri, 14 Feb 2020 12:50:02 +0200
Subject: [PATCH 2/2] Implement resume in --xmlrevisions (but not yet with
 list=allrevisions)

Tested with a partial dump of over 100 MB: https://tinyvillage.fandom.com/api.php
(grepped to check that the previously downloaded revisions were kept and the
new ones continued from the expected point; did not validate the final XML).
---
 dumpgenerator.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index b6adcc6..0238d1f 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -715,12 +715,16 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     lock = True
 
     if config['xmlrevisions']:
-        print 'Retrieving the XML for every page from the beginning'
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
-        xmlfile.write(header.encode('utf-8'))
+        if start:
+            print("WARNING: will try to start the download from title: {}".format(start))
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        else:
+            print 'Retrieving the XML for every page from the beginning'
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
         try:
             r_timestamp = r'<timestamp>([^<]+)</timestamp>'
-            for xml in getXMLRevisions(config=config, session=session):
+            for xml in getXMLRevisions(config=config, session=session, start=start):
                 numrevs = len(re.findall(r_timestamp, xml))
                 # Due to how generators work, it's expected this may be less
                 # TODO: get the page title and reuse the usual format "X title, y edits"
@@ -776,7 +780,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
-def getXMLRevisions(config={}, session=None, allpages=False):
+def getXMLRevisions(config={}, session=None, allpages=False, start=None):
     # FIXME: actually figure out the various strategies for each MediaWiki version
     apiurl = urlparse(config['api'])
     # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
@@ -790,7 +794,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
 
     try:
         for namespace in namespaces:
-            print "Trying to export all revisions from namespace %s" % namespace
+            print("Trying to export all revisions from namespace %s" % namespace)
             # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
             arvparams = {
                 'action': 'query',
@@ -910,7 +914,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # We could also use the allpages API as generator but let's be consistent.
             print("Getting titles to export the latest revision for each")
             c = 0
-            for title in readTitles(config):
+            for title in readTitles(config, start=start):
                 # TODO: respect verbose flag, reuse output from getXMLPage
                 print(' {}'.format(title.strip()))
                 # TODO: as we're doing one page and revision at a time, we might
@@ -944,7 +948,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # refuses to return an arbitrary number of revisions (see above).
             print("Getting titles to export all the revisions of each")
             c = 0
-            for title in readTitles(config):
+            for title in readTitles(config, start=start):
                 print(' {}'.format(title.strip()))
                 # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
                 # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
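
As a rough illustration of the change in PATCH 1/2, here is a minimal standalone sketch
of the <contributor> handling, assuming E is lxml.builder's element factory as the E.*
calls in the hunk suggest. The helper name contributor_element and the sample rev dicts
are invented for this example and are not part of dumpgenerator.py.

    from lxml import etree
    from lxml.builder import E

    def contributor_element(rev):
        # Normal case: the API returned a username, so emit
        # <contributor><username>...</username><id>...</id></contributor>.
        if 'user' in rev:
            return E.contributor(
                E.username(u'%s' % rev['user']),
                E.id(u'%s' % rev.get('userid', 0)),
            )
        # The username was deleted/suppressed and 'user' is missing from the
        # revision dict: emit <contributor deleted="deleted"/> instead.
        return E.contributor(deleted="deleted")

    # Invented sample revisions: one with a visible user, one suppressed.
    for rev in ({'user': 'Example', 'userid': 42}, {}):
        print(etree.tostring(contributor_element(rev), pretty_print=True))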
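
Similarly, a minimal sketch of the resume behaviour added in PATCH 2/2, reduced to the
file-handling decision; open_xml_dump, the path and the header string below are
placeholders, not names taken from dumpgenerator.py.

    def open_xml_dump(path, header, start=None):
        # Resuming: append to the existing dump and do not rewrite the header;
        # titles before `start` are assumed to be in the file already.
        if start:
            print("WARNING: will try to start the download from title: {}".format(start))
            return open(path, 'a')
        # Fresh dump: truncate the file and write the XML header first.
        xmlfile = open(path, 'w')
        xmlfile.write(header)
        return xmlfile

    # Invented usage: resume an interrupted dump from a given title.
    xmlfile = open_xml_dump('example-history.xml', '<mediawiki>\n', start='Some title')
    xmlfile.close()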