From f10adb71af60697d7c7c8860ae768b21c54840ba Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:27:10 +0200 Subject: [PATCH] Don't try to add revisions if the namespace has none Traceback (most recent call last): File "dumpgenerator.py", line 2362, in File "dumpgenerator.py", line 2354, in main resumePreviousDump(config=config, other=other) File "dumpgenerator.py", line 1921, in createNewDump getPageTitles(config=config, session=other['session']) File "dumpgenerator.py", line 755, in generateXMLDump for xml in getXMLRevisions(config=config, session=session): File "dumpgenerator.py", line 861, in getXMLRevisions revids.append(str(revision['revid'])) IndexError: list index out of range --- dumpgenerator.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 59221bc..08f6400 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -850,12 +850,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to list the revisions and to export them one by one") # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' + arvrequest = site.api(**arvparams) + # Skip the namespace if it's empty + if len(arvrequest['query']['allrevisions']) < 1: + continue # Repeat the arvrequest with new arvparams until done while True: # Reset revision IDs from the previous batch from arv revids = [] - # Get the new ones - arvrequest = site.api(**arvparams) for page in arvrequest['query']['allrevisions']: for revision in page['revisions']: revids.append(str(revision['revid'])) @@ -879,19 +881,22 @@ def getXMLRevisions(config={}, session=None, allpages=False): yield makeXmlPageFromRaw(xml) if 'continue' in arvrequest: + # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + arvrequest = site.api(**arvparams) else: # End of continuation. We are done with this namespace. break - except KeyError: + # TODO: check whether the KeyError was really for a missing arv API print "Warning. Could not use allrevisions. Wiki too old?" if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): - # TODO: as we're doing one page and revision at a time, - # we might as well use xml format and exportnowrap=1 + # TODO: as we're doing one page and revision at a time, we might + # as well use xml format and exportnowrap=1 to use the string of, + # XML as is, but need to check how well the library handles it. exportparams = { 'action': 'query', 'titles': title, @@ -899,8 +904,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): } exportrequest = site.api(**exportparams) xml = exportrequest['query']['export']['*'] + # Because we got the fancy XML from the JSON format, clean it: yield makeXmlPageFromRaw(xml) else: + # This is the closest to what we usually do with Special:Export: + # take one title at a time and try to get all revisions exported. + # The XML needs to be made manually because the export=1 option + # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): pparams = { 'action': 'query', @@ -908,7 +918,6 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'prop': 'revisions', 'rvlimit': 'max', 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', - 'rawcontinue': 'yes' } prequest = site.api(**pparams) try: