Make --xmlrevisions work on Wikia

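* In getXMLHeader, do not try exportnowrap first: it returns a blank page. Request the export with format=json first and fall back to exportnowrap only if that yields nothing.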
* Add an allpages option to getXMLRevisions, which simply exports each title returned by readTitles but cannot resume.

FIXME: this only exports the current revision!
pull/319/head
Federico Leva 6 years ago
parent 680145e6a5
commit b307de6cb7

@ -425,7 +425,7 @@ def getPageTitles(config={}, session=None):
print '%d page titles loaded' % (c)
return titlesfilename
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -454,14 +454,14 @@ def getXMLHeader(config={}, session=None):
xml = None
try:
print 'Getting the XML header from the API'
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
except requests.exceptions.RetryError:
pass
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
@ -783,25 +783,27 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None):
def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api'])
#if config['namespaces']:
# namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
#else:
namespaces = ['*']
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
print namespaces
else:
namespaces = ['*']
for namespace in namespaces:
print "Exporting revisions from namespace %s" % namespace
# TODO: 500 would be nicer, but need to find the wiki's limits
params = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 50,
'arvprop': 'ids',
}
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
try:
# TODO: 500 would be nicer, but need to find the wiki's limits
params = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 50,
'arvprop': 'ids',
'arvnamespace': '*'
}
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
for result in results:
revids = []
for page in result['query']['allrevisions']:
@ -818,6 +820,19 @@ def getXMLRevisions(config={}, session=None):
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except KeyError:
print "Error. Is the allrevisions module missing? Trying allpages."
for title in readTitles(config):
exportparams = {
'action': 'query',
'titles': title,
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()

Loading…
Cancel
Save