diff --git a/dumpgenerator.py b/dumpgenerator.py index 9f9863e..0cc5896 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -962,9 +962,6 @@ def getXMLRevisions(config={}, session=None, allpages=False): config['http_method'] = "GET" exportrequest = site.api(http_method=config['http_method'], **exportparams) - c += 1 - if c % 10 == 0: - print('Downloaded {} pages'.format(c)) # The array is called "pages" even if there's only one. # TODO: we could actually batch titles a bit here if desired. How many? try: @@ -974,9 +971,9 @@ def getXMLRevisions(config={}, session=None, allpages=False): # Be ready to iterate if there is continuation. while True: # Go through the data we got to build the XML. - for page in pages: + for pageid in pages: try: - xml = makeXmlFromPage(pages[page]) + xml = makeXmlFromPage(pages[pageid]) yield xml except PageMissingError: logerror( @@ -989,19 +986,24 @@ def getXMLRevisions(config={}, session=None, allpages=False): if 'continue' in prequest.keys(): print("Getting more revisions for page {}".format(title)) pparams['rvcontinue'] = prequest['continue']['rvcontinue'] - try: - prequest = site.api(http_method=config['http_method'], **pparams) - except requests.exceptions.HTTPError as e: - if e.response.status_code == 405 and config['http_method'] == "POST": - print("POST request to the API failed, retrying with GET") - config['http_method'] = "GET" - prequest = site.api(http_method=config['http_method'], **pparams) - # mwclient seems to rewrite query-continue - #if 'query-continue' in prequest.keys(): - # pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue'] + elif 'query-continue' in prequest.keys(): + rvstartid = prequest['query-continue']['revisions']['rvstartid'] + pparams['rvstartid'] = rvstartid else: break + try: + prequest = site.api(http_method=config['http_method'], **pparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + prequest = site.api(http_method=config['http_method'], **pparams) + + c += 1 + if c % 10 == 0: + print('Downloaded {} pages'.format(c)) + except mwclient.errors.MwClientError as e: print(e) diff --git a/requirements.txt b/requirements.txt index 14a3769..ff7ce86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ argparse>=1.2.1 -requests>=2.3.0 internetarchive kitchen +mwclient +requests>=2.3.0