Actually export all revisions in --xmlrevisions: build XML manually!

pull/319/head
Federico Leva 6 years ago
parent 50c6786f84
commit 7143f7efb1

@@ -49,6 +49,11 @@ try:
import wikitools import wikitools
except ImportError: except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions." print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
try:
from lxml import etree
from lxml.builder import E
except ImportError:
print "Please install the lxml module if you want to use --xmlrevisions."
import time import time
import urllib import urllib
try: try:
@@ -281,7 +286,7 @@ def getPageTitlesAPI(config={}, session=None):
apfrom = jsontitles['continue']['apcontinue'] apfrom = jsontitles['continue']['apcontinue']
elif 'apfrom' in jsontitles['continue']: elif 'apfrom' in jsontitles['continue']:
apfrom = jsontitles['continue']['apfrom'] apfrom = jsontitles['continue']['apfrom']
# print apfrom # print apfrom
# print jsontitles # print jsontitles
allpages = jsontitles['query']['allpages'] allpages = jsontitles['query']['allpages']
@@ -782,39 +787,64 @@ def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api']) site = wikitools.wiki.Wiki(config['api'])
if not 'all' in config['namespaces']: if not 'all' in config['namespaces']:
namespaces = config['namespaces'] namespaces = config['namespaces']
print namespaces
else: else:
namespaces = ['*'] namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
print namespaces
for namespace in namespaces: for namespace in namespaces:
print "Exporting revisions from namespace %s" % namespace print "Exporting revisions from namespace %s" % namespace
try: try:
# TODO: 500 would be nicer, but need to find the wiki's limits # TODO: 500 is nicer than 50, but need to find the wiki's limits
params = { params = {
'action': 'query', 'action': 'query',
'list': 'allrevisions', 'list': 'allrevisions',
'arvlimit': 50, 'arvlimit': 500,
'arvprop': 'ids', # Skip flags, presumably needed to add <minor/> which is in the schema.
'arvnamespace': '*' 'arvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
'arvnamespace': namespace
} }
request = wikitools.api.APIRequest(site, params) request = wikitools.api.APIRequest(site, params)
results = request.queryGen() results = request.queryGen()
for result in results: for result in results:
revids = [] if config['curonly']:
for page in result['query']['allrevisions']: revids = []
for revision in page['revisions']: for page in result['query']['allrevisions']:
revids.append(str(revision['revid'])) for revision in page['revisions']:
revids.append(str(revision['revid']))
print "50 more revisions listed, until %s" % revids[-1]
exportparams = { print "%d more revisions listed, until %s" % (len(revids), revids[-1])
'action': 'query', exportparams = {
'revids': '|'.join(revids), 'action': 'query',
'export': '1', 'revids': '|'.join(revids),
} 'export': '1',
exportrequest = wikitools.api.APIRequest(site, exportparams) }
exportresults = exportrequest.queryGen() exportrequest = wikitools.api.APIRequest(site, exportparams)
for exportresult in exportresults: exportresults = exportrequest.queryGen()
yield exportresult['query']['export']['*'] for exportresult in exportresults:
yield exportresult['query']['export']['*']
else:
# We have to build the XML manually...
for page in result['query']['allrevisions']:
p = E.page(
E.title(page['title']),
E.ns(str(page['ns'])),
E.id(str(page['pageid'])),
)
for rev in page['revisions']:
p.append(
E.revision(
E.id(str(rev['revid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(str(rev['userid'])),
E.username(str(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
E.sha1(rev['sha1']),
)
)
yield etree.tostring(p, pretty_print=True)
except KeyError: except KeyError:
print "Error. Is the allrevisions module missing? Trying allpages." print "Error. Is the allrevisions module missing? Trying allpages."
for title in readTitles(config): for title in readTitles(config):
@@ -828,9 +858,9 @@ def getXMLRevisions(config={}, session=None, allpages=False):
for exportresult in exportresults: for exportresult in exportresults:
yield exportresult['query']['export']['*'] yield exportresult['query']['export']['*']
except wikitools.api.APIError: #except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting." # print "This wikitools version seems not to work for us. Exiting."
sys.exit() # sys.exit()
def readTitles(config={}, start=None): def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """ """ Read title list from a file, from the title "start" """

Loading…
Cancel
Save