From 11507e931e4d6682d0c0a766fd0ea8833edb86e4 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 14:19:23 +0200
Subject: [PATCH] Initial switch to mwclient for the xmlrevisions option

* Still maintained, and available for Python 3 as well.
* Allows raw API requests, which we need.
* Does not provide handy generators, so we need to do the continuation ourselves.
* Decides on its own which protocol and exact path to use, and sometimes fails at it.
* Appears to use POST by default unless asked otherwise; what to do about that?
---
 dumpgenerator.py | 96 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 68 insertions(+), 28 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index e2d8082..4a87ded 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -46,9 +46,9 @@ except ImportError:
     print "Please install or update the Requests module."
     sys.exit(1)
 try:
-    import wikitools
+    import mwclient
 except ImportError:
-    print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
+    print "Please install the mwclient module if you want to use --xmlrevisions."
 try:
     from lxml import etree
     from lxml.builder import E
@@ -714,8 +714,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         print '    %s, %d edits' % (title.strip(), numberofedits)
 
 
+def makeXmlPageFromRaw(xml):
+    """ Discard the metadata around a <page> element in a <mediawiki> string """
+    root = etree.XML(xml)
+    find = etree.XPath("//*[local-name() = 'page']")
+    # The tag will inherit the namespace, like:
+    # <page xmlns="http://www.mediawiki.org/xml/export-0.10/">
+    # FIXME: pretty_print doesn't seem to work, only adds a newline
+    return etree.tostring(find(root)[0], pretty_print=True)
+
+
 def cleanXML(xml=''):
-    """ Trim redundant info """
+    """ Trim redundant info from the XML however it comes """
     # do not touch XML codification, leave AS IS
     if re.search(r'</siteinfo>\n', xml):
         xml = xml.split('</siteinfo>\n')[1]
@@ -748,8 +758,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
                 print "%d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
-        except AttributeError:
-            print "This wikitools module version is not working"
+        except AttributeError as e:
+            print(e)
+            print "This API library version is not working"
             sys.exit()
     else:
         print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
@@ -797,7 +808,10 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     print 'XML dump saved at...', xmlfilename
 
 def getXMLRevisions(config={}, session=None, allpages=False):
-    site = wikitools.wiki.Wiki(config['api'])
+    apiurl = urlparse(config['api'])
+    # FIXME: force the protocol we asked for! Or don't verify SSL if we asked for HTTP?
+    # https://github.com/WikiTeam/wikiteam/issues/358
+    site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""))
     if not 'all' in config['namespaces']:
         namespaces = config['namespaces']
     else:
@@ -806,6 +820,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
     try:
         for namespace in namespaces:
             print "Trying to export all revisions from namespace %s" % namespace
+            # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
             arvparams = {
                 'action': 'query',
                 'list': 'allrevisions',
@@ -817,46 +832,71 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 # Skip flags, presumably needed to add <minor/> which is in the schema.
                 # Also missing: parentid and contentformat.
                 arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
-                arvrequest = wikitools.api.APIRequest(site, arvparams)
-                results = arvrequest.queryGen()
-                for result in results:
-                    for page in result['query']['allrevisions']:
+                print("Trying to get wikitext from the allrevisions API and to build the XML")
+                while True:
+                    arvrequest = site.api(**arvparams)
+                    for page in arvrequest['query']['allrevisions']:
                         yield makeXmlFromPage(page)
+                    if 'continue' in arvrequest:
+                        arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
+                    else:
+                        # End of continuation. We are done with this namespace.
+                        break
+
             else:
+                # FIXME: this is not curonly, just a different strategy to do all revisions
                 # Just cycle through revision IDs and use the XML as is
+                print("Trying to list the revisions and to export them one by one")
+                # We only need the revision ID; all the rest will come from the raw export
                 arvparams['arvprop'] = 'ids'
-                arvrequest = wikitools.api.APIRequest(site, arvparams)
-                arvresults = arvrequest.queryGen()
-                for result in arvresults:
+                # Repeat the arvrequest with new arvparams until done
+                while True:
+                    # Reset the revision IDs from the previous batch from arv
                     revids = []
-                    for page in result['query']['allrevisions']:
+                    # Get the new ones
+                    arvrequest = site.api(**arvparams)
+                    for page in arvrequest['query']['allrevisions']:
                         for revision in page['revisions']:
                             revids.append(str(revision['revid']))
                     print "%d more revisions listed, until %s" % (len(revids), revids[-1])
-
+                    # We can now get the XML for one revision at a time
+                    # FIXME: we could actually get them in batches as we used to,
+                    # but we need to figure out the continuation and avoid having
+                    # the API give us only the latest revision for each page
                     exportparams = {
                         'action': 'query',
-                        'revids': '|'.join(revids),
                         'export': '1',
                     }
-                    exportrequest = wikitools.api.APIRequest(site, exportparams)
-                    exportresults = exportrequest.queryGen()
-                    for exportresult in exportresults:
-                        yield exportresult['query']['export']['*']
+                    for revid in revids:
+                        exportparams['revids'] = revid
+                        exportrequest = site.api(**exportparams)
+                        # This gives us a self-standing <mediawiki> element,
+                        # but we only need the inner <page>: we can live with
+                        # duplication and non-ordering of page titles, but the
+                        # repeated header is confusing and would not even be valid
+                        xml = exportrequest['query']['export']['*']
+                        yield makeXmlPageFromRaw(xml)
+
+                    if 'continue' in arvrequest:
+                        arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
+                    else:
+                        # End of continuation. We are done with this namespace.
+                        break
+
     except KeyError:
-        print "Warning. Could not use allrevisions, wiki too old."
+        print "Warning. Could not use allrevisions. Wiki too old?"
         if config['curonly']:
+            # The raw XML export in the API takes a title and gives the latest revision
             for title in readTitles(config):
                 exportparams = {
                     'action': 'query',
                     'titles': title,
                     'export': '1',
                 }
-                exportrequest = wikitools.api.APIRequest(site, exportparams)
-                exportresults = exportrequest.queryGen()
-                for exportresult in exportresults:
-                    yield exportresult['query']['export']['*']
+                exportrequest = site.api(**exportparams)
+                xml = exportrequest['query']['export']['*']
+                yield makeXmlPageFromRaw(xml)
         else:
             for title in readTitles(config):
                 pparams = {
@@ -867,7 +907,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
                     'rawcontinue': 'yes'
                 }
-                prequest = wikitools.api.APIRequest(site, pparams)
+                prequest = site.api(**pparams)
                 try:
                     results = prequest.query()
                     pages = results['query']['pages']
@@ -884,8 +924,8 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                         continue
 
                 yield xml
-    except wikitools.api.APIError:
-        print "This wikitools version seems not to work for us. Exiting."
+    except mwclient.errors.MwClientError:
+        print "This mwclient version seems not to work for us. Exiting."
         sys.exit()
 
 def makeXmlFromPage(page):
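
Notes (editor's sketches, not part of the patch):

1. Continuation. As the commit message says, mwclient does not provide
query generators, so the patch carries the API's "continue" token back
into the next request by hand. A minimal, self-contained sketch of that
loop; the host, path, and arvlimit here are placeholders, not values
taken from the patch:

    import mwclient

    # Hypothetical wiki: mwclient takes a host and a script path,
    # not a full api.php URL.
    site = mwclient.Site('wiki.example.org', '/w/')

    arvparams = {
        'action': 'query',
        'list': 'allrevisions',
        'arvlimit': 50,
        'arvprop': 'ids',
    }
    while True:
        result = site.api(**arvparams)
        for page in result['query']['allrevisions']:
            for revision in page['revisions']:
                print(revision['revid'])
        if 'continue' in result:
            # Resume exactly where the server stopped.
            arvparams['arvcontinue'] = result['continue']['arvcontinue']
        else:
            break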
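2. makeXmlPageFromRaw. The document returned by action=query&export
declares a default XML namespace on <mediawiki>, so a plain //page XPath
would match nothing; hence the local-name() trick in the new helper. A
small sketch of the behaviour on a made-up export string:

    from lxml import etree

    raw = (
        '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">'
        '<siteinfo><sitename>Example</sitename></siteinfo>'
        '<page><title>Foo</title><ns>0</ns></page>'
        '</mediawiki>'
    )
    root = etree.XML(raw)
    find = etree.XPath("//*[local-name() = 'page']")
    # The extracted element inherits the namespace declaration, e.g.
    # <page xmlns="http://www.mediawiki.org/xml/export-0.10/">...</page>
    print(etree.tostring(find(root)[0]))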
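3. Leftover in the per-title branch. site.api() already performs the
request and returns the decoded response, but the hunk at
@@ -867,7 +907,7 @@ still calls prequest.query(), a wikitools method that
a plain dict does not have. That will raise AttributeError, which the
new handler in generateXMLDump catches and reports as "This API library
version is not working". A likely follow-up (a sketch, not part of this
commit) would drop the extra call:

    prequest = site.api(**pparams)
    pages = prequest['query']['pages']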