Finish xmlrevisions option for older wikis

* Actually proceed to the next page when no continuation.
* Provide the same output as with the usual per-page export.

Tested on a MediaWiki 1.16 wiki with success.
Author: Federico Leva, 4 years ago
Branch: pull/359/head
Commit: 9ec6ce42d3 (parent 0f35d03929)
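
For context, the changes below only affect dumps run with the --xmlrevisions switch (together with --xml); a typical invocation would look roughly like the following, with a placeholder API URL:

python dumpgenerator.py --api=http://wiki.example.org/api.php --xml --xmlrevisions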

@@ -755,6 +755,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
         for xml in getXMLRevisions(config=config, session=session):
             numrevs = len(re.findall(r_timestamp, xml))
             # Due to how generators work, it's expected this may be less
+            # TODO: get the page title and reuse the usual format "X title, y edits"
             print "%d more revisions exported" % numrevs
             xml = cleanXML(xml=xml)
             xmlfile.write(xml.encode('utf-8'))
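
The revision counter printed in this hunk comes from matching <timestamp> elements in each XML chunk the generator yields. A minimal sketch of that counting step; the r_timestamp literal shown here is an assumption based on how the variable is used, since its definition is outside this diff:

import re

# Assumed definition of r_timestamp (defined elsewhere in dumpgenerator.py).
r_timestamp = r'<timestamp>([^<]+)</timestamp>'

xml = (u'<revision><id>1</id><timestamp>2004-01-01T00:00:00Z</timestamp></revision>'
       u'<revision><id>2</id><timestamp>2004-01-02T00:00:00Z</timestamp></revision>')
numrevs = len(re.findall(r_timestamp, xml))
print "%d more revisions exported" % numrevs  # prints: 2 more revisions exported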
@@ -835,7 +836,18 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
                 print("Trying to get wikitext from the allrevisions API and to build the XML")
                 while True:
-                    arvrequest = site.api(**arvparams)
+                    try:
+                        arvrequest = site.api(**arvparams)
+                    except requests.exceptions.ReadTimeout as err:
+                        # Hopefully temporary, just wait a bit and continue with the same request.
+                        # No point putting a limit to retries, we'd need to abort everything.
+                        # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
+                        # to use the retry adapter we use for our own requests session?
+                        print("ERROR: {}".format(str(err)))
+                        print("Sleeping for 20 seconds")
+                        time.sleep(20)
+                        continue
                     for page in arvrequest['query']['allrevisions']:
                         yield makeXmlFromPage(page)
                     if 'continue' in arvrequest:
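
The same wait-and-retry block is added here and, with one extra twist, in a later hunk. As the TODO notes, it could eventually be factored out; a minimal sketch of such a helper, reusing the 20-second sleep from the diff (the function name is hypothetical, not part of the commit):

import time
import requests

def api_with_retry(site, params, wait=20):
    """Call site.api(**params), sleeping and retrying on read timeouts."""
    while True:
        try:
            return site.api(**params)
        except requests.exceptions.ReadTimeout as err:
            # Hopefully temporary: wait a bit and repeat the same request.
            print("ERROR: {}".format(str(err)))
            print("Sleeping for {} seconds".format(wait))
            time.sleep(wait)

Note that the second occurrence below also empties arvrequest['query']['allrevisions'] before retrying, so that revisions already yielded are not exported twice; a shared helper would need to leave room for that.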
@@ -851,6 +863,10 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 # We only need the revision ID, all the rest will come from the raw export
                 arvparams['arvprop'] = 'ids'
                 arvrequest = site.api(**arvparams)
+                exportparams = {
+                    'action': 'query',
+                    'export': '1',
+                }
                 # Skip the namespace if it's empty
                 if len(arvrequest['query']['allrevisions']) < 1:
                     continue
@@ -862,14 +878,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                        for revision in page['revisions']:
                            revids.append(str(revision['revid']))
                    print "%d more revisions listed, until %s" % (len(revids), revids[-1])
                    # We can now get the XML for one revision at a time
                    # FIXME: we can actually get them in batches as we used to
                    # but need to figure out the continuation and avoid that the API
                    # chooses to give us only the latest for each page
-                    exportparams = {
-                        'action': 'query',
-                        'export': '1',
-                    }
                    for revid in revids:
                        exportparams['revids'] = revid
                        exportrequest = site.api(**exportparams)
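
Each revision ID collected above is then fetched one at a time through the export module; the XML ends up as a string under ['query']['export']['*'] in the JSON response, as the curonly branch further down also shows. A rough sketch of a single such request (the wiki host is a placeholder; the real code reuses the site object already built in getXMLRevisions):

import mwclient

site = mwclient.Site('wiki.example.org', path='/w/')  # placeholder wiki
exportparams = {
    'action': 'query',
    'export': '1',
    'revids': '12345',  # one revision ID per request in this code path
}
exportrequest = site.api(**exportparams)
# The export XML arrives wrapped in the JSON response:
xml = exportrequest['query']['export']['*']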
@@ -883,7 +896,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                    if 'continue' in arvrequest:
                        # Get the new ones
                        arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
-                        arvrequest = site.api(**arvparams)
+                        try:
+                            arvrequest = site.api(**arvparams)
+                        except requests.exceptions.ReadTimeout as err:
+                            # As above
+                            print("ERROR: {}".format(str(err)))
+                            print("Sleeping for 20 seconds")
+                            time.sleep(20)
+                            # But avoid rewriting the same revisions
+                            arvrequest['query']['allrevisions'] = []
+                            continue
                    else:
                        # End of continuation. We are done with this namespace.
                        break
@@ -894,7 +916,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         if config['curonly']:
             # The raw XML export in the API gets a title and gives the latest revision.
             # We could also use the allpages API as generator but let's be consistent.
+            print("Getting titles to export the latest revision for each")
+            c = 0
             for title in readTitles(config):
+                # TODO: respect verbose flag, reuse output from getXMLPage
+                print(' {}'.format(title.strip()))
                 # TODO: as we're doing one page and revision at a time, we might
                 # as well use xml format and exportnowrap=1 to use the string of,
                 # XML as is, but need to check how well the library handles it.
@@ -905,6 +931,9 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 }
                 exportrequest = site.api(**exportparams)
                 xml = exportrequest['query']['export']['*']
+                c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
                 # Because we got the fancy XML from the JSON format, clean it:
                 yield makeXmlPageFromRaw(xml)
         else:
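
The TODO above mentions requesting xml format with exportnowrap=1, which makes the API return the export XML directly instead of wrapping it in JSON. A rough sketch of what that alternative could look like with a plain requests call; the URL and parameter handling are assumptions, not what the code does today:

import requests

apiurl = 'http://wiki.example.org/api.php'  # placeholder
r = requests.get(apiurl, params={
    'action': 'query',
    'titles': 'Main Page',
    'export': 1,
    'exportnowrap': 1,  # response body is the raw <mediawiki> export, no JSON wrapper
})
xml = r.text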
@@ -914,16 +943,23 @@ def getXMLRevisions(config={}, session=None, allpages=False):
             # to be input the page titles; otherwise, the requests are similar.
             # The XML needs to be made manually because the export=1 option
             # refuses to return an arbitrary number of revisions (see above).
+            print("Getting titles to export all the revisions of each")
+            c = 0
             for title in readTitles(config):
+                print(' {}'.format(title.strip()))
                 # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
                 # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
                 pparams = {
                     'action': 'query',
                     'titles': title,
                     'prop': 'revisions',
+                    'rvlimit': 50,
                     'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
                 }
                 prequest = site.api(**pparams)
+                c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
                 # The array is called "pages" even if there's only one.
                 # TODO: we could actually batch titles a bit here if desired. How many?
                 try:
@@ -936,18 +972,21 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 for page in pages:
                     try:
                         xml = makeXmlFromPage(pages[page])
+                        yield xml
                     except PageMissingError:
                         logerror(
                             config=config,
                             text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
                         )
                         continue
-                    yield xml
                 # Get next batch of revisions if there's more.
                 if 'continue' in prequest:
+                    print("Getting more revisions for page {}".format(title))
                     pparams['rvcontinue'] = prequest['rvcontinue']
                     prequest = site.api(**pparams)
+                else:
+                    break
     except mwclient.errors.MwClientError:
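
For reference, makeXmlFromPage() in the next hunk consumes entries of the 'pages' map returned by prop=revisions. A trimmed sketch of that JSON shape as the code accesses it (all values are made up):

prequest = {
    'query': {
        'pages': {
            '123': {  # keyed by page ID, hence the "for page in pages" loop
                'pageid': 123,
                'ns': 0,
                'title': 'Example page',
                'revisions': [
                    {
                        'revid': 456,
                        'parentid': 0,
                        'timestamp': '2010-01-01T00:00:00Z',
                        'user': 'Example user',
                        'comment': 'an edit summary',
                        '*': 'wikitext of the revision',
                        # 'userid', 'size', 'sha1', 'contentmodel' may be missing on old wikis
                    },
                ],
            },
        },
    },
    # a continuation key appears when more revisions remain for the title
}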
@@ -958,30 +997,41 @@ def makeXmlFromPage(page):
     """ Output an XML document as a string from a page as in the API JSON """
     try:
         p = E.page(
-            E.title(page['title']),
+            E.title(to_unicode(page['title'])),
             E.ns(to_unicode(page['ns'])),
             E.id(to_unicode(page['pageid'])),
         )
         for rev in page['revisions']:
+            # Older releases like MediaWiki 1.16 do not return all fields.
+            if 'userid' in rev:
+                userid = rev['userid']
+            else:
+                userid = 0
+            if 'size' in rev:
+                size = rev['size']
+            else:
+                size = 0
             revision = E.revision(
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
                 E.contributor(
-                    E.id(to_unicode(rev['userid'])),
+                    E.id(to_unicode(userid)),
                     E.username(to_unicode(rev['user'])),
                 ),
-                E.comment(rev['comment']),
-                E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
+                E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
             )
+            if 'comment' in rev:
+                revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:
                 revision.append(E.model(rev['contentmodel']))
             # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
             if 'sha1' in rev:
                 revision.append(E.sha1(rev['sha1']))
             p.append(revision)
-    except KeyError:
-        raise PageMissingError(page['title'], '')
+    except KeyError as e:
+        print(e)
+        raise PageMissingError(page['title'], e)
     return etree.tostring(p, pretty_print=True)
 
 def readTitles(config={}, start=None):
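
To illustrate the effect of the makeXmlFromPage() changes: a revision from a MediaWiki 1.16 wiki that lacks userid, size and comment now still produces a <revision> element instead of raising KeyError. A small usage sketch, assuming the patched function is in scope and using a made-up input page:

page = {
    'pageid': 1,
    'ns': 0,
    'title': u'Example page',
    'revisions': [{
        'revid': 10,
        'parentid': 0,
        'timestamp': '2010-01-01T00:00:00Z',
        'user': u'Example user',
        '*': u'Some wikitext',
        # no 'userid', 'size', 'comment', 'sha1', 'contentmodel': typical of MediaWiki 1.16
    }],
}
print makeXmlFromPage(page)
# Expected shape (whitespace aside):
# <page>
#   <title>Example page</title>
#   <ns>0</ns>
#   <id>1</id>
#   <revision>
#     <id>10</id>
#     <parentid>0</parentid>
#     <timestamp>2010-01-01T00:00:00Z</timestamp>
#     <contributor><id>0</id><username>Example user</username></contributor>
#     <text space="preserve" bytes="0">Some wikitext</text>
#   </revision>
# </page>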
