From 142b48cc6926a0ed4b3e0a97a1040932d0ec7c87 Mon Sep 17 00:00:00 2001
From: Fedora
Date: Mon, 7 May 2018 19:01:50 +0000
Subject: [PATCH 1/6] Add timeouts and retries to increase success rate

---
 dumpgenerator.py | 62 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index a045ace..a16173b 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -150,7 +150,7 @@ def getNamespacesScraper(config={}, session=None):
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
         r = session.post(
-            url=config['index'], data={'title': 'Special:Allpages'})
+            url=config['index'], data={'title': 'Special:Allpages'}, timeout=30)
         raw = r.text
         delay(config=config, session=session)
@@ -191,7 +191,8 @@ def getNamespacesAPI(config={}, session=None):
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'namespaces',
-                'format': 'json'}
+                'format': 'json'},
+            timeout=30
         )
         result = getJSON(r)
         delay(config=config, session=session)
@@ -249,7 +250,7 @@ def getPageTitlesAPI(config={}, session=None):
         retryCount = 0
         while retryCount < config["retries"]:
             try:
-                r = session.post(url=config['api'], data=params)
+                r = session.post(url=config['api'], data=params, timeout=30)
                 break
             except ConnectionError as err:
                 print "Connection error: %s" % (str(err),)
@@ -301,7 +302,7 @@ def getPageTitlesScraper(config={}, session=None):
         print '    Retrieving titles in the namespace', namespace
         url = '%s?title=Special:Allpages&namespace=%s' % (
             config['index'], namespace)
-        r = session.get(url=url)
+        r = session.get(url=url, timeout=30)
         raw = r.text
         raw = cleanHTML(raw)
@@ -353,7 +354,7 @@ def getPageTitlesScraper(config={}, session=None):
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url)
+                    r2 = session.get(url=url, timeout=10)
                     raw2 = r2.text
                     raw2 = cleanHTML(raw2)
                     rawacum += raw2  # merge it after removed junk
@@ -386,7 +387,7 @@ def getPageTitles(config={}, session=None):
     titles = []
     if 'api' in config and config['api']:
-        r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
+        r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
         test = getJSON(r)
         if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                 and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
@@ -454,7 +455,8 @@ def getXMLHeader(config={}, session=None):
                     'action': 'query',
                     'meta': 'siteinfo',
                     'siprop': 'namespaces',
-                    'format': 'json'}
+                    'format': 'json'},
+                timeout=120
             )
             config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                 + ':Export'
@@ -550,7 +552,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             return ''  # empty xml
     # FIXME HANDLE HTTP Errors HERE
     try:
-        r = session.post(url=config['index'], data=params, headers=headers)
+        r = session.post(url=config['index'], data=params, headers=headers, timeout=10)
         handleStatusCode(r)
         xml = fixBOM(r)
     except requests.exceptions.ConnectionError as e:
@@ -866,7 +868,8 @@ def getImageNamesScraper(config={}, session=None):
             data={
                 'title': 'Special:Imagelist',
                 'limit': limit,
-                'offset': offset})
+                'offset': offset},
+            timeout=30)
         raw = r.text
         delay(config=config, session=session)
         # delicate wiki
@@ -967,7 +970,7 @@ def getImageNamesAPI(config={}, session=None):
                 'format': 'json',
                 'ailimit': 500}
             # FIXME Handle HTTP Errors HERE
-            r = session.post(url=config['api'], data=params)
+            r = session.post(url=config['api'], data=params, timeout=30)
             handleStatusCode(r)
             jsonimages = getJSON(r)
             delay(config=config, session=session)
@@ -1025,7 +1028,7 @@ def getImageNamesAPI(config={}, session=None):
                 'iiprop': 'user|url',
                 'format': 'json'}
             # FIXME Handle HTTP Errors HERE
-            r = session.post(url=config['api'], data=params)
+            r = session.post(url=config['api'], data=params, timeout=30)
             handleStatusCode(r)
             jsonimages = getJSON(r)
             delay(config=config, session=session)
@@ -1351,11 +1354,22 @@ def getParameters(params=[]):
             print 'Using cookies from %s' % args.cookies
     session = requests.Session()
+    try:
+        from requests.packages.urllib3.util.retry import Retry
+        from requests.adapters import HTTPAdapter
+        # Courtesy datashaman https://stackoverflow.com/a/35504626
+        __retries__ = Retry(total=5,
+                            backoff_factor=2,
+                            status_forcelist=[500, 502, 503, 504])
+        session.mount('https://', HTTPAdapter(max_retries=__retries__))
+        session.mount('http://', HTTPAdapter(max_retries=__retries__))
+    except:
+        # Our urllib3/requests is too old
+        pass
     session.cookies = cj
     session.headers.update({'User-Agent': getUserAgent()})
     if args.user and args.password:
         session.auth = (args.user, args.password)
-    # session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
 
     # check URLs
     for url in [args.api, args.index, args.wiki]:
@@ -1521,7 +1535,8 @@ def checkAPI(api=None, session=None):
                 data={
                     'action': 'query',
                     'meta': 'siteinfo',
-                    'format': 'json'}
+                    'format': 'json'},
+                timeout=30
             )
             if r.url == api:
                 break
@@ -1549,7 +1564,7 @@ def checkIndex(index=None, cookies=None, session=None):
     """ Checking index.php availability """
-    r = session.post(url=index, data={'title': 'Special:Version'})
+    r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
     raw = r.text
     print 'Checking index.php...', index
     # Workaround for issue 71
@@ -1811,7 +1826,7 @@ def saveSpecialVersion(config={}, session=None):
     else:
         print 'Downloading Special:Version with extensions and other related info'
         r = session.post(
-            url=config['index'], data={'title': 'Special:Version'})
+            url=config['index'], data={'title': 'Special:Version'}, timeout=10)
         raw = r.text
         delay(config=config, session=session)
         raw = removeIP(raw=raw)
@@ -1826,7 +1841,7 @@ def saveIndexPHP(config={}, session=None):
         print 'index.html exists, do not overwrite'
     else:
         print 'Downloading index.php (Main Page) as index.html'
-        r = session.post(url=config['index'], data={})
+        r = session.post(url=config['index'], data={}, timeout=10)
         raw = r.text
         delay(config=config, session=session)
         raw = removeIP(raw=raw)
@@ -1851,7 +1866,8 @@ def saveSiteInfo(config={}, session=None):
             'meta': 'siteinfo',
             'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
             'sinumberingroup': 1,
-            'format': 'json'})
+            'format': 'json'},
+        timeout=10)
     # MediaWiki 1.11-1.12
     if not 'query' in getJSON(r):
         r = session.post(
             url=config['api'],
             data={
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
-                'format': 'json'})
+                'format': 'json'},
+            timeout=10)
     # MediaWiki 1.8-1.10
     if not 'query' in getJSON(r):
         r = session.post(
             url=config['api'],
             data={
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces',
-                'format': 'json'})
+                'format': 'json'}
+            timeout=10)
     result = getJSON(r)
     delay(config=config, session=session)
     with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@@ -1896,9 +1914,9 @@ def getWikiEngine(url=''):
         session = requests.Session()
         session.headers.update({'User-Agent': getUserAgent()})
-    r = session.post(url=url)
+    r = session.post(url=url, timeout=30)
     if r.status_code == 405 or r.text == '':
-        r = session.get(url=url)
+        r = session.get(url=url, timeout=120)
     result = r.text
     wikiengine = 'Unknown'
@@ -1981,7 +1999,7 @@ def mwGetAPIAndIndex(url=''):
     index = ''
     session = requests.Session()
     session.headers.update({'User-Agent': getUserAgent()})
-    r = session.post(url=url)
+    r = session.post(url=url, timeout=120)
     result = r.text
     # API
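
[Note on PATCH 1] The retry and timeout setup above can be exercised on its own. A minimal sketch
of the same idea, transport-level retries plus an explicit per-request timeout, using only calls
that appear in the patch; the URL and query below are placeholders, not values from dumpgenerator.py:

    import requests
    from requests.adapters import HTTPAdapter
    try:
        # Retry moved around between requests/urllib3 releases, hence the two import paths.
        from requests.packages.urllib3.util.retry import Retry
    except ImportError:
        from urllib3.util.retry import Retry

    session = requests.Session()
    retries = Retry(total=5,                                # give up after 5 attempts
                    backoff_factor=2,                       # exponential backoff between attempts
                    status_forcelist=[500, 502, 503, 504])  # also retry on these server errors
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))

    # Retry only covers connection and status failures; a server that accepts the connection
    # but never answers still needs the per-request timeout used throughout the patch.
    r = session.get('https://example.org/w/api.php',
                    params={'action': 'query', 'meta': 'siteinfo', 'format': 'json'},
                    timeout=30)
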
From a8cbb357ff859ace128f342d92d148938124e4b4 Mon Sep 17 00:00:00 2001
From: Fedora
Date: Mon, 7 May 2018 19:05:26 +0000
Subject: [PATCH 2/6] First attempt at API-only export

---
 dumpgenerator.py | 137 +++++++++++++++++++++++++++++++--------------
 1 file changed, 98 insertions(+), 39 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index a16173b..13ea271 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -39,11 +39,16 @@ except ImportError:  # Python 2.4 compatibility
     from md5 import new as md5
 import os
 import re
+import subprocess
 try:
     import requests
 except ImportError:
     print "Please install or update the Requests module."
     sys.exit(1)
+try:
+    import wikitools
+except ImportError:
+    print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
 import time
 import urllib
 UTF8Writer = getwriter('utf8')
@@ -514,7 +519,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         if c > 0 and c < maxretries:
             wait = increment * c < maxseconds and increment * \
                 c or maxseconds  # incremental until maxseconds
-            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
+            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
             time.sleep(wait)
             # reducing server load requesting smallest chunks (if curonly then
             # limit = 1 from mother function)
@@ -677,10 +682,9 @@ def cleanXML(xml=''):
 
 def generateXMLDump(config={}, titles=[], start=None, session=None):
-    """ Generates a XML dump for a list of titles """
+    """ Generates a XML dump for a list of titles or from revision IDs """
     # TODO: titles is now unused.
 
-    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
     header, config = getXMLHeader(config=config, session=session)
     footer = '</mediawiki>\n'  # new line at the end
     xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
                                     config['date'],
                                     config['curonly'] and 'current' or 'history')
     xmlfile = ''
     lock = True
-    if start:
-        print "Removing the last chunk of past XML dump: it is probably incomplete."
-        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
-            pass
-    else:
-        # requested complete xml dump
-        lock = False
+
+    if config['xmlrevisions']:
+        print 'Retrieving the XML for every page from the beginning'
         xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
         xmlfile.write(header.encode('utf-8'))
-        xmlfile.close()
-
-    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
-    c = 1
-    for title in readTitles(config, start):
-        if not title.strip():
-            continue
-        if title == start:  # start downloading from start, included
-            lock = False
-        if lock:
-            continue
-        delay(config=config, session=session)
-        if c % 10 == 0:
-            print 'Downloaded %d pages' % (c)
         try:
-            for xml in getXMLPage(config=config, title=title, session=session):
+            for xml in getXMLRevisions(config=config, session=session):
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
-        except PageMissingError:
-            logerror(
-                config=config,
-                text=u'The page "%s" was missing in the wiki (probably deleted)' %
-                (title.decode('utf-8'))
-            )
-        # here, XML is a correct chunk or
-        # an empty string due to a deleted page (logged in errors log) or
-        # an empty string due to an error while retrieving the page from server
-        # (logged in errors log)
-        c += 1
+        except AttributeError:
+            print "This wikitools module version is not working"
+            sys.exit()
+    else:
+        print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+        if start:
+            print "Removing the last chunk of past XML dump: it is probably incomplete."
+            for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+                pass
+        else:
+            # requested complete xml dump
+            lock = False
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
+            xmlfile.close()
+
+        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        c = 1
+        for title in readTitles(config, start):
+            if not title.strip():
+                continue
+            if title == start:  # start downloading from start, included
+                lock = False
+            if lock:
+                continue
+            delay(config=config, session=session)
+            if c % 10 == 0:
+                print 'Downloaded %d pages' % (c)
+            try:
+                for xml in getXMLPage(config=config, title=title, session=session):
+                    xml = cleanXML(xml=xml)
+                    xmlfile.write(xml.encode('utf-8'))
+            except PageMissingError:
+                logerror(
+                    config=config,
+                    text=u'The page "%s" was missing in the wiki (probably deleted)' %
+                    (title.decode('utf-8'))
+                )
+            # here, XML is a correct chunk or
+            # an empty string due to a deleted page (logged in errors log) or
+            # an empty string due to an error while retrieving the page from server
+            # (logged in errors log)
+            c += 1
+
     xmlfile.write(footer)
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
+def getXMLRevisions(config={}, session=None):
+    site = wikitools.wiki.Wiki(config['api'])
+    if config['namespaces']:
+        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
+    else:
+        namespaces = ['*']
+
+    for namespace in namespaces:
+        params = {
+            'action': 'query',
+            'generator': 'allrevisions',
+            'garvnamespace': namespace,
+            'garvlimit': 50,
+            'garvprop': 'ids',
+            'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
+        }
+        request = wikitools.api.APIRequest(site, params)
+        results = request.queryGen()
+        try:
+            for result in results:
+                yield result['query']['export']['*']
+        except wikitools.api.APIError:
+            # Falling back to allpages generator, the wiki is too old
+            params = {
+                'action': 'query',
+                'generator': 'allpages',
+                'gaplimit': 50,
+                'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
+            }
+            # allpages does not accept "*"
+            if namespace is not '*':
+                params['gapnamespace'] = namespace
+            request = wikitools.api.APIRequest(site, params)
+            results = request.queryGen()
+            for result in results:
+                yield result['query']['export']['*']
+
 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
@@ -1303,7 +1359,9 @@ def getParameters(params=[]):
         action='store_true',
         help="generates a full history XML dump (--xml --curonly for current revisions only)")
     groupDownload.add_argument('--curonly', action='store_true',
-                               help='store only the current version of pages')
+                               help='store only the current version of pages; incompatible with --xmlrevisions')
+    groupDownload.add_argument('--xmlrevisions', action='store_true',
+                               help='download all revisions from an API generator')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
     groupDownload.add_argument(
@@ -1502,6 +1560,7 @@ def getParameters(params=[]):
         'images': args.images,
         'logs': False,
         'xml': args.xml,
+        'xmlrevisions': args.xmlrevisions,
         'namespaces': namespaces,
         'exnamespaces': exnamespaces,
         'path': args.path and os.path.normpath(args.path) or '',
@@ -1547,7 +1606,7 @@ def checkAPI(api=None, session=None):
     try:
         result = getJSON(r)
         index = None
-        if result['query']:
+        if result:
             try:
                 index = result['query']['general']['server'] + \
                     result['query']['general']['script']
@@ -1886,7 +1945,7 @@ def saveSiteInfo(config={}, session=None):
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces',
-                'format': 'json'}
+                'format': 'json'},
                 timeout=10)
     result = getJSON(r)
     delay(config=config, session=session)
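
[Note on PATCH 2] The wikitools loop in getXMLRevisions() boils down to one API request per batch.
A rough equivalent with plain requests, assuming a wiki recent enough to offer allrevisions
(roughly MediaWiki 1.27 and later, which is why the patch keeps an allpages fallback); the URL
is a placeholder and continuation handling is omitted:

    import requests

    api = 'https://example.org/w/api.php'  # placeholder
    r = requests.get(api, params={
        'action': 'query',
        'generator': 'allrevisions',
        'garvlimit': 50,     # generator parameters carry the extra "g" prefix
        'garvprop': 'ids',
        'export': 1,         # ask for an XML export of the selected pages
        'format': 'json',
    }, timeout=30)
    data = r.json()
    # The XML chunk is returned under query.export['*']; a 'continue' block, when present,
    # would be fed back into the next request to page through the rest of the wiki.
    xml_chunk = data['query']['export']['*']
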
From be5ca12075c8de3a7d3d297ecff21fd85c661417 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 7 May 2018 20:03:22 +0000
Subject: [PATCH 3/6] Avoid generators in API-only export

---
 dumpgenerator.py | 58 +++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 13ea271..cf67bb8 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -698,7 +698,11 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
         xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
         xmlfile.write(header.encode('utf-8'))
         try:
+            r_timestamp = r'<timestamp>([^<]+)</timestamp>'
             for xml in getXMLRevisions(config=config, session=session):
+                numrevs = len(re.findall(r_timestamp, xml))
+                # Due to how generators work, it's expected this may be less
+                print "%d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
         except AttributeError:
@@ -751,40 +755,43 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
 
 def getXMLRevisions(config={}, session=None):
     site = wikitools.wiki.Wiki(config['api'])
-    if config['namespaces']:
-        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
-    else:
-        namespaces = ['*']
+    #if config['namespaces']:
+    #    namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
+    #else:
+    namespaces = ['*']
 
     for namespace in namespaces:
+        print "Exporting revisions from namespace %s" % namespace
+        # TODO: 500 would be nicer, but need to find the wiki's limits
         params = {
             'action': 'query',
-            'generator': 'allrevisions',
-            'garvnamespace': namespace,
-            'garvlimit': 50,
-            'garvprop': 'ids',
-            'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
+            'list': 'allrevisions',
+            'arvnamespace': '*',
+            'arvlimit': 50,
+            'arvprop': 'ids',
         }
         request = wikitools.api.APIRequest(site, params)
         results = request.queryGen()
         try:
             for result in results:
-                yield result['query']['export']['*']
+                revids = []
+                for page in result['query']['allrevisions']:
+                    for revision in page['revisions']:
+                        revids.append(str(revision['revid']))
+
+                print "50 more revisions listed, until %d" % revids[-1]
+                exportparams = {
+                    'action': 'query',
+                    'revids': '|'.join(revids),
+                    'export': '1',
+                }
+                exportrequest = wikitools.api.APIRequest(site, exportparams)
+                exportresults = exportrequest.queryGen()
+                for exportresult in exportresults:
+                    yield exportresult['query']['export']['*']
         except wikitools.api.APIError:
-            # Falling back to allpages generator, the wiki is too old
-            params = {
-                'action': 'query',
-                'generator': 'allpages',
-                'gaplimit': 50,
-                'export': 1  # Just to make sure the parameter is passed. Empty is fine too.
-            }
-            # allpages does not accept "*"
-            if namespace is not '*':
-                params['gapnamespace'] = namespace
-            request = wikitools.api.APIRequest(site, params)
-            results = request.queryGen()
-            for result in results:
-                yield result['query']['export']['*']
+            print "This wikitools version seems not to work for us. Exiting."
+            sys.exit()
 
 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
@@ -1361,7 +1368,8 @@ def getParameters(params=[]):
     groupDownload.add_argument('--curonly', action='store_true',
                                help='store only the current version of pages; incompatible with --xmlrevisions')
     groupDownload.add_argument('--xmlrevisions', action='store_true',
-                               help='download all revisions from an API generator')
+                               help='download all revisions from an API generator. Ignores the \
+                                     namespace selection')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
     groupDownload.add_argument(
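
[Note on PATCH 3] The rewritten getXMLRevisions() now makes two explicit calls per batch: list up
to 50 revision IDs, then export exactly those revisions. The same exchange written out with plain
requests (placeholder URL; continuation and error handling omitted):

    import requests

    api = 'https://example.org/w/api.php'  # placeholder
    session = requests.Session()

    # Step 1: list a batch of revision IDs.
    listing = session.get(api, params={
        'action': 'query',
        'list': 'allrevisions',
        'arvlimit': 50,
        'arvprop': 'ids',
        'format': 'json',
    }, timeout=30).json()

    revids = []
    for page in listing['query']['allrevisions']:
        for revision in page['revisions']:
            revids.append(str(revision['revid']))

    # Step 2: request an XML export of exactly those revisions.
    export = session.get(api, params={
        'action': 'query',
        'revids': '|'.join(revids),
        'export': '1',
        'format': 'json',
    }, timeout=30).json()
    xml_chunk = export['query']['export']['*']
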
namespace %s" % namespace + # TODO: 500 would be nicer, but need to find the wiki's limits params = { 'action': 'query', - 'generator': 'allrevisions', - 'garvnamespace': namespace, - 'garvlimit': 50, - 'garvprop': 'ids', - 'export': 1 # Just to make sure the parameter is passed. Empty is fine too. + 'list': 'allrevisions', + 'arvnamespace': '*', + 'arvlimit': 50, + 'arvprop': 'ids', } request = wikitools.api.APIRequest(site, params) results = request.queryGen() try: for result in results: - yield result['query']['export']['*'] + revids = [] + for page in result['query']['allrevisions']: + for revision in page['revisions']: + revids.append(str(revision['revid'])) + + print "50 more revisions listed, until %d" % revids[-1] + exportparams = { + 'action': 'query', + 'revids': '|'.join(revids), + 'export': '1', + } + exportrequest = wikitools.api.APIRequest(site, exportparams) + exportresults = exportrequest.queryGen() + for exportresult in exportresults: + yield exportresult['query']['export']['*'] except wikitools.api.APIError: - # Falling back to allpages generator, the wiki is too old - params = { - 'action': 'query', - 'generator': 'allpages', - 'gaplimit': 50, - 'export': 1 # Just to make sure the parameter is passed. Empty is fine too. - } - # allpages does not accept "*" - if namespace is not '*': - params['gapnamespace'] = namespace - request = wikitools.api.APIRequest(site, params) - results = request.queryGen() - for result in results: - yield result['query']['export']['*'] + print "This wikitools version seems not to work for us. Exiting." + sys.exit() def readTitles(config={}, start=None): """ Read title list from a file, from the title "start" """ @@ -1361,7 +1368,8 @@ def getParameters(params=[]): groupDownload.add_argument('--curonly', action='store_true', help='store only the current version of pages; incompatible with --xmlrevisions') groupDownload.add_argument('--xmlrevisions', action='store_true', - help='download all revisions from an API generator') + help='download all revisions from an API generator. Ignores the \ + namespace selection') groupDownload.add_argument( '--images', action='store_true', help="generates an image dump") groupDownload.add_argument( From 33bb1c1f23a45f8e31fdf2cce80254cf3c43fc34 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 7 May 2018 21:19:27 +0000 Subject: [PATCH 4/6] Download image description from API when using --xmlrevisions Fixes https://github.com/WikiTeam/wikiteam/issues/308 Also add --failfast option to sneak in all the hacks I use to run the bulk downloads, so I can more easily sync the repos. --- dumpgenerator.py | 80 +++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index cf67bb8..53fb5c6 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -442,34 +442,38 @@ def getXMLHeader(config={}, session=None): # similar to: does not exist. Not a problem, if we get the . - xml = pme.xml - # Issue 26: Account for missing "Special" namespace. - # Hope the canonical special name has not been removed. 
-    # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
-    except ExportAbortedError:
+    if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+        r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap')
+        xml = r.text
+    else:
         try:
-            if config['api']:
-                print "Trying the local name for the Special namespace instead"
-                r = session.post(
-                    url=config['api'],
-                    data={
-                        'action': 'query',
-                        'meta': 'siteinfo',
-                        'siprop': 'namespaces',
-                        'format': 'json'},
-                    timeout=120
-                )
-                config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
-                    + ':Export'
-                xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+            xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+        except PageMissingError as pme:
+            # The <page> does not exist. Not a problem, if we get the <siteinfo>.
+            xml = pme.xml
+        # Issue 26: Account for missing "Special" namespace.
+        # Hope the canonical special name has not been removed.
+        # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
         except ExportAbortedError:
-            pass
+            try:
+                if config['api']:
+                    print "Trying the local name for the Special namespace instead"
+                    r = session.post(
+                        url=config['api'],
+                        data={
+                            'action': 'query',
+                            'meta': 'siteinfo',
+                            'siprop': 'namespaces',
+                            'format': 'json'},
+                        timeout=120
+                    )
+                    config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+                        + ':Export'
+                    xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+            except PageMissingError as pme:
+                xml = pme.xml
+            except ExportAbortedError:
+                pass
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
@@ ... @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         if c >= maxretries:
             print '    We have retried %d times' % (c)
             print '    MediaWiki error for "%s", network error or whatever...' % (params['pages'])
+            if config['failfast']:
+                print "Exit, it will be for another time"
+                sys.exit()
             # If it's not already what we tried: our last chance, preserve only the last revision...
             # config['curonly'] means that the whole dump is configured to save only the last,
             # params['curonly'] should mean that we've already tried this
@@ -766,7 +773,6 @@ def getXMLRevisions(config={}, session=None):
         params = {
             'action': 'query',
             'list': 'allrevisions',
-            'arvnamespace': '*',
             'arvlimit': 50,
             'arvprop': 'ids',
         }
@@ -779,7 +785,7 @@ def getXMLRevisions(config={}, session=None):
                 for revision in page['revisions']:
                     revids.append(str(revision['revid']))
 
-                print "50 more revisions listed, until %d" % revids[-1]
+                print "50 more revisions listed, until %s" % revids[-1]
                 exportparams = {
                     'action': 'query',
                     'revids': '|'.join(revids),
                     'export': '1',
                 }
@@ -1178,10 +1184,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
         # saving description if any
         try:
             title = u'Image:%s' % (filename)
-            xmlfiledesc = getXMLFileDesc(
-                config=config,
-                title=title,
-                session=session)  # use Image: for backwards compatibility
+            if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+                r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
+                xml = r.text
+            else:
+                xmlfiledesc = getXMLFileDesc(
+                    config=config,
+                    title=title,
+                    session=session)  # use Image: for backwards compatibility
         except PageMissingError:
             xmlfiledesc = ''
             logerror(
@@ -1389,6 +1399,10 @@ def getParameters(params=[]):
         '--get-wiki-engine',
         action='store_true',
         help="returns the wiki engine")
+    groupMeta.add_argument(
+        '--failfast',
+        action='store_true',
+        help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
 
     args = parser.parse_args()
     # print args
@@ -1564,6 +1578,7 @@ def getParameters(params=[]):
         'curonly': args.curonly,
         'date': datetime.datetime.now().strftime('%Y%m%d'),
         'api': api,
+        'failfast': args.failfast,
         'index': index,
         'images': args.images,
         'logs': False,
@@ -2127,7 +2142,10 @@ def main(params=[]):
     # do not enter if resume is requested from begining
     while not other['resume'] and os.path.isdir(config['path']):
         print '\nWarning!: "%s" path exists' % (config['path'])
-        reply = ''
+        if config['failfast']:
+            reply = 'yes'
+        else:
+            reply = ''
         while reply.lower() not in ['yes', 'y', 'no', 'n']:
             reply = raw_input(
                 'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %
                 (config['path'], config['path'], configfilename))
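
[Note on PATCH 4] With --xmlrevisions the image description page is fetched through the API's
export facility instead of Special:Export. Roughly, per file (placeholder URL and a hypothetical
file name; in practice the title should be URL-encoded, which the patch itself does not yet do):

    import requests
    import urllib

    api = 'https://example.org/w/api.php'      # placeholder
    title = u'Image:Example.png'               # hypothetical file page
    r = requests.get(api + '?action=query&export&exportnowrap&titles=' +
                     urllib.quote(title.encode('utf-8')), timeout=30)
    xmlfiledesc = r.text  # a bare <mediawiki> export of the description page
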
From 952fcc6bcf9c4096cbeb4a05324d947bb887c5be Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 7 May 2018 21:55:26 +0000
Subject: [PATCH 5/6] Up version to 0.4.0-alpha to signify disruption

---
 dumpgenerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 53fb5c6..8f4b820 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -54,7 +54,7 @@ import urllib
 UTF8Writer = getwriter('utf8')
 sys.stdout = UTF8Writer(sys.stdout)
 
-__VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
+__VERSION__ = '0.4.0-alpha'  # major, minor, micro: semver.org
 
 class PageMissingError(Exception):
     def __init__(self, title, xml):

From 7c545d05b7effc240c8f20885dbcd7bad5632c94 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Tue, 8 May 2018 17:07:27 +0000
Subject: [PATCH 6/6] Fix UnboundLocalError and catch RetryError with
 --xmlrevisions

  File "./dumpgenerator.py", line 1212, in generateImageDump
    if not re.search(r'</mediawiki>', xmlfiledesc):
  UnboundLocalError: local variable 'xmlfiledesc' referenced before assignment
---
 dumpgenerator.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 8f4b820..5582fd1 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -443,8 +443,16 @@ def getXMLHeader(config={}, session=None):
     # xmlns:x....
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
     if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
-        r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap')
-        xml = r.text
+        xml = None
+        try:
+            r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
+            xml = r.text
+        except requests.exceptions.RetryError:
+            pass
+
+        if not xml:
+            r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
+            xml = r.json()['query']['export']['*']
     else:
         try:
             xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
         except PageMissingError as pme:
@@ -1186,7 +1194,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             title = u'Image:%s' % (filename)
             if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
                 r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
-                xml = r.text
+                xmlfiledesc = r.text
             else:
                 xmlfiledesc = getXMLFileDesc(
                     config=config,