From 9ec6ce42d33d2dcaa12d59e0389467e839e30ace Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 16:42:22 +0200
Subject: [PATCH] Finish xmlrevisions option for older wikis

* Actually proceed to the next page when no continuation.
* Provide the same output as with the usual per-page export.

Tested on a MediaWiki 1.16 wiki with success.
---
 dumpgenerator.py | 76 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index f628843..c6717df 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -755,6 +755,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
             for xml in getXMLRevisions(config=config, session=session):
                 numrevs = len(re.findall(r_timestamp, xml))
                 # Due to how generators work, it's expected this may be less
+                # TODO: get the page title and reuse the usual format "X title, y edits"
                 print "%d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
@@ -835,7 +836,18 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
                 print("Trying to get wikitext from the allrevisions API and to build the XML")
                 while True:
-                    arvrequest = site.api(**arvparams)
+                    try:
+                        arvrequest = site.api(**arvparams)
+                    except requests.exceptions.ReadTimeout as err:
+                        # Hopefully temporary, just wait a bit and continue with the same request.
+                        # No point putting a limit to retries, we'd need to abort everything.
+                        # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
+                        # to use the retry adapter we use for our own requests session?
+                        print("ERROR: {}".format(str(err)))
+                        print("Sleeping for 20 seconds")
+                        time.sleep(20)
+                        continue
+
                     for page in arvrequest['query']['allrevisions']:
                         yield makeXmlFromPage(page)
                     if 'continue' in arvrequest:
@@ -851,6 +863,10 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 # We only need the revision ID, all the rest will come from the raw export
                 arvparams['arvprop'] = 'ids'
                 arvrequest = site.api(**arvparams)
+                exportparams = {
+                    'action': 'query',
+                    'export': '1',
+                }
                 # Skip the namespace if it's empty
                 if len(arvrequest['query']['allrevisions']) < 1:
                     continue
@@ -862,14 +878,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                         for revision in page['revisions']:
                             revids.append(str(revision['revid']))
                     print "%d more revisions listed, until %s" % (len(revids), revids[-1])
+
                     # We can now get the XML for one revision at a time
                     # FIXME: we can actually get them in batches as we used to
                     # but need to figure out the continuation and avoid that the API
                     # chooses to give us only the latest for each page
-                    exportparams = {
-                        'action': 'query',
-                        'export': '1',
-                    }
                     for revid in revids:
                         exportparams['revids'] = revid
                         exportrequest = site.api(**exportparams)
@@ -883,7 +896,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     if 'continue' in arvrequest:
                         # Get the new ones
                         arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
-                        arvrequest = site.api(**arvparams)
+                        try:
+                            arvrequest = site.api(**arvparams)
+                        except requests.exceptions.ReadTimeout as err:
+                            # As above
+                            print("ERROR: {}".format(str(err)))
+                            print("Sleeping for 20 seconds")
+                            time.sleep(20)
+                            # But avoid rewriting the same revisions
+                            arvrequest['query']['allrevisions'] = []
+                            continue
                     else:
                         # End of continuation. We are done with this namespace.
                         break
@@ -894,7 +916,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         if config['curonly']:
             # The raw XML export in the API gets a title and gives the latest revision.
             # We could also use the allpages API as generator but let's be consistent.
+            print("Getting titles to export the latest revision for each")
+            c = 0
             for title in readTitles(config):
+                # TODO: respect verbose flag, reuse output from getXMLPage
+                print(' {}'.format(title.strip()))
                 # TODO: as we're doing one page and revision at a time, we might
                 # as well use xml format and exportnowrap=1 to use the string of,
                 # XML as is, but need to check how well the library handles it.
@@ -905,6 +931,9 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 }
                 exportrequest = site.api(**exportparams)
                 xml = exportrequest['query']['export']['*']
+                c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
                 # Because we got the fancy XML from the JSON format, clean it:
                 yield makeXmlPageFromRaw(xml)
         else:
@@ -914,16 +943,23 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # to be input the page titles; otherwise, the requests are similar.
             # The XML needs to be made manually because the export=1 option
             # refuses to return an arbitrary number of revisions (see above).
+            print("Getting titles to export all the revisions of each")
+            c = 0
             for title in readTitles(config):
+                print(' {}'.format(title.strip()))
                 # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
                 # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
                 pparams = {
                     'action': 'query',
                     'titles': title,
                     'prop': 'revisions',
+                    'rvlimit': 50,
                     'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
                 }
                 prequest = site.api(**pparams)
+                c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
                 # The array is called "pages" even if there's only one.
                 # TODO: we could actually batch titles a bit here if desired. How many?
                 try:
@@ -936,18 +972,21 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 for page in pages:
                     try:
                         xml = makeXmlFromPage(pages[page])
+                        yield xml
                     except PageMissingError:
                         logerror(
                             config=config,
                             text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
                         )
                         continue
-                    yield xml
 
                 # Get next batch of revisions if there's more.
                 if 'continue' in prequest:
+                    print("Getting more revisions for page {}".format(title))
                     pparams['rvcontinue'] = prequest['rvcontinue']
                     prequest = site.api(**pparams)
+                else:
+                    break
 
 
     except mwclient.errors.MwClientError:
@@ -958,30 +997,41 @@ def makeXmlFromPage(page):
     """ Output an XML document as a string from a page as in the API JSON """
     try:
         p = E.page(
-            E.title(page['title']),
+            E.title(to_unicode(page['title'])),
             E.ns(to_unicode(page['ns'])),
             E.id(to_unicode(page['pageid'])),
         )
         for rev in page['revisions']:
+            # Older releases like MediaWiki 1.16 do not return all fields.
+            if 'userid' in rev:
+                userid = rev['userid']
+            else:
+                userid = 0
+            if 'size' in rev:
+                size = rev['size']
+            else:
+                size = 0
             revision = E.revision(
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
                 E.contributor(
-                    E.id(to_unicode(rev['userid'])),
+                    E.id(to_unicode(userid)),
                     E.username(to_unicode(rev['user'])),
                 ),
-                E.comment(rev['comment']),
-                E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
+                E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
             )
+            if 'comment' in rev:
+                revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:
                 revision.append(E.model(rev['contentmodel']))
             # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
             if 'sha1' in rev:
                 revision.append(E.sha1(rev['sha1']))
             p.append(revision)
-    except KeyError:
-        raise PageMissingError(page['title'], '')
+    except KeyError as e:
+        print(e)
+        raise PageMissingError(page['title'], e)
     return etree.tostring(p, pretty_print=True)
 
 def readTitles(config={}, start=None):
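
Note (not part of the patch): the heart of the change above is the retry-and-continue loop around the allrevisions API. Below is a minimal standalone sketch of that pattern in Python, using requests directly instead of mwclient; the function name iter_allrevisions, the example URL, the property list and the fixed 20-second backoff are illustrative assumptions, not code from dumpgenerator.py.

    # Sketch only: same continuation + retry flow as the patch, with plain requests.
    import time
    import requests

    def iter_allrevisions(api_url, namespace=0, session=None):
        """Yield pages (with their revisions) from list=allrevisions."""
        session = session or requests.Session()
        params = {
            'action': 'query',
            'list': 'allrevisions',
            'arvlimit': 50,
            'arvnamespace': namespace,
            'arvprop': 'ids|timestamp|user|comment|content',
            'format': 'json',
        }
        while True:
            try:
                result = session.get(api_url, params=params, timeout=30).json()
            except requests.exceptions.ReadTimeout:
                # Hopefully temporary: wait a bit and retry the same request.
                time.sleep(20)
                continue
            for page in result['query']['allrevisions']:
                yield page
            if 'continue' in result:
                # Resume the listing where the previous batch ended.
                params['arvcontinue'] = result['continue']['arvcontinue']
            else:
                # No continuation token left: this namespace is done.
                break

    # Example use:
    # for page in iter_allrevisions('https://example.org/w/api.php'):
    #     print(page['title'], len(page['revisions']))

As in the patch, a timeout repeats the identical request: arvcontinue has not advanced, so no batch is skipped, and continuation only moves forward once a batch has been yielded.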
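Note (not part of the patch): the makeXmlFromPage() hunk tolerates missing fields because older MediaWiki releases such as 1.16 omit userid, size, comment or sha1 from the revisions JSON. The following is a minimal sketch of the same idea with lxml's E builder; build_revision and the sample revision dict are made up for illustration and are not the script's actual helper.

    # Sketch only: build a <revision> element while tolerating missing fields.
    from lxml import etree
    from lxml.builder import E

    def build_revision(rev):
        # Mandatory fields first; .get() supplies defaults for fields an older
        # API may omit (userid, size).
        revision = E.revision(
            E.id(str(rev['revid'])),
            E.timestamp(rev['timestamp']),
            E.contributor(
                E.id(str(rev.get('userid', 0))),
                E.username(rev.get('user', '')),
            ),
            E.text(rev.get('*', ''), bytes=str(rev.get('size', 0))),
        )
        # Optional elements are appended only when the API returned them.
        if 'comment' in rev:
            revision.append(E.comment(rev['comment']))
        if 'sha1' in rev:
            revision.append(E.sha1(rev['sha1']))
        return revision

    # Example use with a minimal revision as an old API might return it:
    # rev = {'revid': 1, 'timestamp': '2020-02-10T14:42:22Z', 'user': 'Example', '*': 'some wikitext'}
    # print(etree.tostring(build_revision(rev), pretty_print=True))

Appending the optional elements conditionally keeps the output close to the export schema instead of aborting the whole page with a KeyError, which is what the patch fixes.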