Avoid generators in API-only export

pull/287/merge
Federico Leva 6 years ago
parent ebc02a3b45
commit be5ca12075

@ -698,7 +698,11 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
try:
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
for xml in getXMLRevisions(config=config, session=session):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, a lower revision count here is expected
print "%d more revisions exported" % numrevs
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except AttributeError:
@ -751,40 +755,43 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
def getXMLRevisions(config={}, session=None):
site = wikitools.wiki.Wiki(config['api'])
if config['namespaces']:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
else:
namespaces = ['*']
#if config['namespaces']:
# namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
#else:
namespaces = ['*']
for namespace in namespaces:
print "Exporting revisions from namespace %s" % namespace
# TODO: 500 would be nicer, but need to find the wiki's limits
params = {
'action': 'query',
'generator': 'allrevisions',
'garvnamespace': namespace,
'garvlimit': 50,
'garvprop': 'ids',
'export': 1 # Just to make sure the parameter is passed. Empty is fine too.
'list': 'allrevisions',
'arvnamespace': '*',
'arvlimit': 50,
'arvprop': 'ids',
}
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
try:
for result in results:
yield result['query']['export']['*']
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "50 more revisions listed, until %d" % revids[-1]
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except wikitools.api.APIError:
# Falling back to allpages generator, the wiki is too old
params = {
'action': 'query',
'generator': 'allpages',
'gaplimit': 50,
'export': 1 # Just to make sure the parameter is passed. Empty is fine too.
}
# allpages does not accept "*"
if namespace is not '*':
params['gapnamespace'] = namespace
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
for result in results:
yield result['query']['export']['*']
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -1361,7 +1368,8 @@ def getParameters(params=[]):
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages; incompatible with --xmlrevisions')
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator')
help='download all revisions from an API generator. Ignores the \
namespace selection')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(

Loading…
Cancel
Save