From b307de6cb791605e87b4c0e1e22ea0d1fcfd05ba Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Sat, 19 May 2018 03:13:42 +0300 Subject: [PATCH] Make --xmlrevisions work on Wikia * Do not try exportnowrap first: it returns a blank page. * Add an allpages option, which simply uses readTitles but cannot resume. FIXME: this only exports the current revision! --- dumpgenerator.py | 55 ++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index bfa77b6..0d22314 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -425,7 +425,7 @@ def getPageTitles(config={}, session=None): print '%d page titles loaded' % (c) return titlesfilename - + def getImageNames(config={}, session=None): """ Get list of image names """ @@ -454,14 +454,14 @@ def getXMLHeader(config={}, session=None): xml = None try: print 'Getting the XML header from the API' - r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10) - xml = r.text + r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10) + xml = r.json()['query']['export']['*'] + if not xml: + r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10) + xml = r.text except requests.exceptions.RetryError: pass - if not xml: - r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10) - xml = r.json()['query']['export']['*'] else: try: xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)]) @@ -783,25 +783,27 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): xmlfile.close() print 'XML dump saved at...', xmlfilename -def getXMLRevisions(config={}, session=None): +def getXMLRevisions(config={}, session=None, allpages=False): site = wikitools.wiki.Wiki(config['api']) - #if config['namespaces']: - # namespaces, namespacenames = getNamespacesAPI(config=config, session=session) - #else: - namespaces = ['*'] + if not 'all' in config['namespaces']: + namespaces = config['namespaces'] + print namespaces + else: + namespaces = ['*'] for namespace in namespaces: print "Exporting revisions from namespace %s" % namespace - # TODO: 500 would be nicer, but need to find the wiki's limits - params = { - 'action': 'query', - 'list': 'allrevisions', - 'arvlimit': 50, - 'arvprop': 'ids', - } - request = wikitools.api.APIRequest(site, params) - results = request.queryGen() try: + # TODO: 500 would be nicer, but need to find the wiki's limits + params = { + 'action': 'query', + 'list': 'allrevisions', + 'arvlimit': 50, + 'arvprop': 'ids', + 'arvnamespace': '*' + } + request = wikitools.api.APIRequest(site, params) + results = request.queryGen() for result in results: revids = [] for page in result['query']['allrevisions']: @@ -818,6 +820,19 @@ def getXMLRevisions(config={}, session=None): exportresults = exportrequest.queryGen() for exportresult in exportresults: yield exportresult['query']['export']['*'] + except KeyError: + print "Error. Is the allrevisions module missing? Trying allpages." + for title in readTitles(config): + exportparams = { + 'action': 'query', + 'titles': title, + 'export': '1', + } + exportrequest = wikitools.api.APIRequest(site, exportparams) + exportresults = exportrequest.queryGen() + for exportresult in exportresults: + yield exportresult['query']['export']['*'] + except wikitools.api.APIError: print "This wikitools version seems not to work for us. Exiting." sys.exit()