Actually convert the titles query method to mwclient too

pull/359/head
Federico Leva 4 years ago
parent f10adb71af
commit 6b12e20a9d

@ -892,7 +892,8 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# TODO: check whether the KeyError was really for a missing arv API # TODO: check whether the KeyError was really for a missing arv API
print "Warning. Could not use allrevisions. Wiki too old?" print "Warning. Could not use allrevisions. Wiki too old?"
if config['curonly']: if config['curonly']:
# The raw XML export in the API gets a title and gives the latest revision # The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
for title in readTitles(config): for title in readTitles(config):
# TODO: as we're doing one page and revision at a time, we might # TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of, # as well use xml format and exportnowrap=1 to use the string of,
@ -909,6 +910,8 @@ def getXMLRevisions(config={}, session=None, allpages=False):
else: else:
# This is the closest to what we usually do with Special:Export: # This is the closest to what we usually do with Special:Export:
# take one title at a time and try to get all revisions exported. # take one title at a time and try to get all revisions exported.
# It differs from the allrevisions method because it needs to be given
# the page titles as input; otherwise, the requests are similar.
# The XML needs to be made manually because the export=1 option # The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above). # refuses to return an arbitrary number of revisions (see above).
for title in readTitles(config): for title in readTitles(config):
@ -920,21 +923,31 @@ def getXMLRevisions(config={}, session=None, allpages=False):
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
} }
prequest = site.api(**pparams) prequest = site.api(**pparams)
# The array is called "pages" even if there's only one.
# TODO: we could actually batch titles a bit here if desired. How many?
try: try:
results = prequest.query() pages = prequest['query']['pages']
pages = results['query']['pages']
except KeyError: except KeyError:
raise PageMissingError(title, xml='') raise PageMissingError(title, xml='')
for page in pages: # Be ready to iterate if there is continuation.
try: while True:
xml = makeXmlFromPage(pages[page]) # Go through the data we got to build the XML.
except PageMissingError: for page in pages:
logerror( try:
config=config, xml = makeXmlFromPage(pages[page])
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) except PageMissingError:
) logerror(
continue config=config,
yield xml text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
)
continue
yield xml
# Get next batch of revisions if there's more.
if 'continue' in prequest:
pparams['rvcontinue'] = prequest['rvcontinue']
prequest = site.api(**pparams)
except mwclient.errors.MwClientError: except mwclient.errors.MwClientError:
print "This mwclient version seems not to work for us. Exiting." print "This mwclient version seems not to work for us. Exiting."

Loading…
Cancel
Save