Implement continuation for --xmlrevisions with prop=revisions in MW 1.19

pull/364/head
Federico Leva 4 years ago
parent 8b58599645
commit 8fef62d46e

@@ -962,9 +962,6 @@ def getXMLRevisions(config={}, session=None, allpages=False):
config['http_method'] = "GET"
exportrequest = site.api(http_method=config['http_method'], **exportparams)
c += 1
if c % 10 == 0:
print('Downloaded {} pages'.format(c))
# The array is called "pages" even if there's only one.
# TODO: we could actually batch titles a bit here if desired. How many?
try:
@@ -974,9 +971,9 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# Be ready to iterate if there is continuation.
while True:
# Go through the data we got to build the XML.
for page in pages:
for pageid in pages:
try:
xml = makeXmlFromPage(pages[page])
xml = makeXmlFromPage(pages[pageid])
yield xml
except PageMissingError:
logerror(
@@ -989,19 +986,24 @@ def getXMLRevisions(config={}, session=None, allpages=False):
if 'continue' in prequest.keys():
print("Getting more revisions for page {}".format(title))
pparams['rvcontinue'] = prequest['continue']['rvcontinue']
try:
prequest = site.api(http_method=config['http_method'], **pparams)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 405 and config['http_method'] == "POST":
print("POST request to the API failed, retrying with GET")
config['http_method'] = "GET"
prequest = site.api(http_method=config['http_method'], **pparams)
# mwclient seems to rewrite query-continue
#if 'query-continue' in prequest.keys():
# pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue']
elif 'query-continue' in prequest.keys():
rvstartid = prequest['query-continue']['revisions']['rvstartid']
pparams['rvstartid'] = rvstartid
else:
break
try:
prequest = site.api(http_method=config['http_method'], **pparams)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 405 and config['http_method'] == "POST":
print("POST request to the API failed, retrying with GET")
config['http_method'] = "GET"
prequest = site.api(http_method=config['http_method'], **pparams)
c += 1
if c % 10 == 0:
print('Downloaded {} pages'.format(c))
except mwclient.errors.MwClientError as e:
print(e)

@@ -1,4 +1,5 @@
argparse>=1.2.1
requests>=2.3.0
internetarchive
kitchen
mwclient
requests>=2.3.0

Loading…
Cancel
Save