diff --git a/dumpgenerator.py b/dumpgenerator.py
index 58c55cf..3ba9222 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -840,7 +840,14 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 print("Trying to get wikitext from the allrevisions API and to build the XML")
                 while True:
                     try:
-                        arvrequest = site.api(**arvparams)
+                        arvrequest = site.api(http_method=config['http_method'], **arvparams)
+                    except requests.exceptions.HTTPError as e:
+                        # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                        if e.response.status_code != 405 or config['http_method'] != "POST":
+                            raise
+                        print("POST request to the API failed, retrying with GET")
+                        config['http_method'] = "GET"
+                        continue
                     except requests.exceptions.ReadTimeout as err:
                         # Hopefully temporary, just wait a bit and continue with the same request.
                         # No point putting a limit to retries, we'd need to abort everything.
@@ -865,7 +872,15 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 print("Trying to list the revisions and to export them one by one")
                 # We only need the revision ID, all the rest will come from the raw export
                 arvparams['arvprop'] = 'ids'
-                arvrequest = site.api(**arvparams)
+                try:
+                    arvrequest = site.api(http_method=config['http_method'], **arvparams)
+                except requests.exceptions.HTTPError as e:
+                    # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                    if e.response.status_code != 405 or config['http_method'] != "POST":
+                        raise
+                    print("POST request to the API failed, retrying with GET")
+                    config['http_method'] = "GET"
+                    arvrequest = site.api(http_method=config['http_method'], **arvparams)
                 exportparams = {
                     'action': 'query',
                     'export': '1',
@@ -888,7 +903,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     # chooses to give us only the latest for each page
                     for revid in revids:
                         exportparams['revids'] = revid
-                        exportrequest = site.api(**exportparams)
+                        try:
+                            exportrequest = site.api(http_method=config['http_method'], **exportparams)
+                        except requests.exceptions.HTTPError as e:
+                            # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                            if e.response.status_code != 405 or config['http_method'] != "POST":
+                                raise
+                            print("POST request to the API failed, retrying with GET")
+                            config['http_method'] = "GET"
+                            exportrequest = site.api(http_method=config['http_method'], **exportparams)
+
                         # This gives us a self-standing <mediawiki> element
                         # but we only need the inner <page>: we can live with
                         # duplication and non-ordering of page titles, but the
@@ -900,7 +924,14 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                         # Get the new ones
                         arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
                         try:
-                            arvrequest = site.api(**arvparams)
+                            arvrequest = site.api(http_method=config['http_method'], **arvparams)
+                        except requests.exceptions.HTTPError as e:
+                            # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                            if e.response.status_code != 405 or config['http_method'] != "POST":
+                                raise
+                            print("POST request to the API failed, retrying with GET")
+                            config['http_method'] = "GET"
+                            arvrequest = site.api(http_method=config['http_method'], **arvparams)
                         except requests.exceptions.ReadTimeout as err:
                             # As above
                             print("ERROR: {}".format(str(err)))
@@ -932,7 +963,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     'titles': title,
                     'export': '1',
                 }
-                exportrequest = site.api(**exportparams)
+                try:
+                    exportrequest = site.api(http_method=config['http_method'], **exportparams)
+                except requests.exceptions.HTTPError as e:
+                    # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                    if e.response.status_code != 405 or config['http_method'] != "POST":
+                        raise
+                    print("POST request to the API failed, retrying with GET")
+                    config['http_method'] = "GET"
+                    exportrequest = site.api(http_method=config['http_method'], **exportparams)
+
                 xml = exportrequest['query']['export']['*']
                 c += 1
                 if c % 10 == 0:
@@ -959,7 +999,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     'rvlimit': 50,
                     'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
                 }
-                prequest = site.api(**pparams)
+                try:
+                    prequest = site.api(http_method=config['http_method'], **pparams)
+                except requests.exceptions.HTTPError as e:
+                    # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                    if e.response.status_code != 405 or config['http_method'] != "POST":
+                        raise
+                    print("POST request to the API failed, retrying with GET")
+                    config['http_method'] = "GET"
+                    prequest = site.api(http_method=config['http_method'], **pparams)
+
                 c += 1
                 if c % 10 == 0:
                     print('Downloaded {} pages'.format(c))
@@ -987,7 +1036,15 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     if 'continue' in prequest.keys():
                         print("Getting more revisions for page {}".format(title))
                         pparams['rvcontinue'] = prequest['continue']['rvcontinue']
-                        prequest = site.api(**pparams)
+                        try:
+                            prequest = site.api(http_method=config['http_method'], **pparams)
+                        except requests.exceptions.HTTPError as e:
+                            # Only a 405 answer to a POST is recoverable (switch to GET); re-raise anything else.
+                            if e.response.status_code != 405 or config['http_method'] != "POST":
+                                raise
+                            print("POST request to the API failed, retrying with GET")
+                            config['http_method'] = "GET"
+                            prequest = site.api(http_method=config['http_method'], **pparams)
                         # mwclient seems to rewrite query-continue
                         #if 'query-continue' in prequest.keys():
                         #    pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue']
@@ -1826,6 +1883,8 @@ def getParameters(params=[]):
         'date': datetime.datetime.now().strftime('%Y%m%d'),
         'api': api,
         'failfast': args.failfast,
+        # Downgraded to "GET" at runtime by getXMLRevisions if the API answers a POST with HTTP 405.
+        'http_method': "POST",
         'index': index,
         'images': args.images,
         'logs': False,