Avoid generators in API-only export

pull/287/merge
Federico Leva 6 years ago
parent ebc02a3b45
commit be5ca12075

@ -698,7 +698,11 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8')) xmlfile.write(header.encode('utf-8'))
try: try:
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
for xml in getXMLRevisions(config=config, session=session): for xml in getXMLRevisions(config=config, session=session):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
print "%d more revisions exported" % numrevs
xml = cleanXML(xml=xml) xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8')) xmlfile.write(xml.encode('utf-8'))
except AttributeError: except AttributeError:
@ -751,40 +755,43 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
def getXMLRevisions(config={}, session=None):
    """Yield the export XML of all revisions of a wiki, one batch at a time.

    Revision ids are first listed with ``list=allrevisions`` and each listed
    batch is then exported through a separate ``export`` query on those ids,
    instead of combining a generator with ``export`` in a single request.

    config  -- dict; only config['api'] (the API URL) is read here.
    session -- accepted for interface compatibility; currently unused.

    Yields the content of result['query']['export']['*'] for every export
    response. If wikitools raises APIError, a message is printed and the
    program exits.
    """
    site = wikitools.wiki.Wiki(config['api'])
    # The namespace selection in config is ignored for now: every namespace
    # is requested at once through the '*' wildcard.
    namespaces = ['*']

    for namespace in namespaces:
        print("Exporting revisions from namespace %s" % namespace)
        # TODO: 500 would be nicer, but need to find the wiki's limits
        params = {
            'action': 'query',
            'list': 'allrevisions',
            'arvnamespace': '*',
            'arvlimit': 50,
            'arvprop': 'ids',
        }
        request = wikitools.api.APIRequest(site, params)
        results = request.queryGen()
        try:
            for result in results:
                # Collect ids as strings so they can be joined with '|'.
                revids = [
                    str(revision['revid'])
                    for page in result['query']['allrevisions']
                    for revision in page['revisions']
                ]
                if not revids:
                    # Nothing listed in this batch: nothing to export, and
                    # revids[-1] below would fail on an empty list.
                    continue
                # revids holds strings, so the last id is formatted with %s;
                # the count is the real batch size, not the requested limit.
                print("%d more revisions listed, until %s"
                      % (len(revids), revids[-1]))
                exportparams = {
                    'action': 'query',
                    'revids': '|'.join(revids),
                    'export': '1',
                }
                exportrequest = wikitools.api.APIRequest(site, exportparams)
                exportresults = exportrequest.queryGen()
                for exportresult in exportresults:
                    yield exportresult['query']['export']['*']
        except wikitools.api.APIError:
            print("This wikitools version seems not to work for us. Exiting.")
            sys.exit()
def readTitles(config={}, start=None): def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """ """ Read title list from a file, from the title "start" """
@ -1361,7 +1368,8 @@ def getParameters(params=[]):
groupDownload.add_argument('--curonly', action='store_true', groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages; incompatible with --xmlrevisions') help='store only the current version of pages; incompatible with --xmlrevisions')
groupDownload.add_argument('--xmlrevisions', action='store_true', groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator') help='download all revisions from an API generator. Ignores the \
namespace selection')
groupDownload.add_argument( groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump") '--images', action='store_true', help="generates an image dump")
groupDownload.add_argument( groupDownload.add_argument(

Loading…
Cancel
Save