when full history fails, retrieve only the last version; various server errors handled

@ -210,23 +210,41 @@ def getUserAgent():
useragents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv: Gecko/20060508 Firefox/']
return useragents[0]
def logerror(config={}, text=''):
    """Append a timestamped message to the errors log of the current dump.

    config -- dict; only config['path'] is read (directory holding errors.log)
    text   -- message to record; if empty, nothing is written and the log
              file is not created

    Returns None. Each record is one line: 'YYYY-MM-DD HH:MM:SS: <text>'.
    """
    if not text:
        return
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The context manager guarantees the handle is closed (and the record
    # flushed to disk) even if write() raises, so repeated logging during a
    # long dump cannot leak file descriptors.
    with open('%s/errors.log' % (config['path']), 'a') as logfile:
        logfile.write('%s: %s\n' % (timestamp, text))
def getXMLPageCore(headers={}, params={}, config={}):
#returns a full-history (or current-only) xml ending in </mediawiki>
#if retrieving the full history of a page fails, falls back to a current-only version
#if everything fails, returns the empty string
xml = ''
c = 0
maxseconds = 600 #seconds
maxretries = 10 # x retries and skip
increment = 60 #increment every retry
maxseconds = 10 #max seconds to wait in a single sleeping
maxretries = 5 # x retries and skip
increment = 20 #increment every retry
while not re.search(r'</mediawiki>', xml):
if c > 0:
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds
print ' XML for this page is wrong. Waiting %d seconds and reloading...' % (wait)
print ' XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait)
if c > maxretries:
print ' We have retry %d times' % (c)
print ' MediaWiki error for this page, network error or whatever... Skiping this page...'
if params['limit'] > 100:
params['limit'] = params['limit'] - (c * 100)
if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function)
params['limit'] = params['limit'] / 2 # half
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
if not config['curonly']: #our last chance, preserve only the last revision...
print ' Trying to save only the last revision for this page...'
params['curonly'] = 1
logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages']))
return getXMLPageCore(headers=headers, params=params, config=config)
print ' Saving in the errors log, and skiping...'
logerror(config=config, text='Error while retrieving the last revision of "%s". Skiping.' % (params['pages']))
return '' # empty xml
data = urllib.urlencode(params)
req = urllib2.Request(url=config['index'], data=data, headers=headers)
@ -301,6 +319,7 @@ def getXMLPage(config={}, title='', verbose=True):
def cleanXML(xml=''):
    """Strip the export wrapper and return only the inner page markup.

    When the text contains both '</siteinfo>' followed by a newline and
    '</mediawiki>', the result is the piece that follows the first
    '</siteinfo>\\n' marker, cut at the first '</mediawiki>' found in it.
    If either marker is missing the input is returned unchanged.
    The XML encoding is left exactly as received.
    """
    header_end = '</siteinfo>\n'
    footer = '</mediawiki>'
    # Both markers are required; otherwise hand the text back untouched.
    if header_end not in xml:
        return xml
    if footer not in xml:
        return xml
    after_header = xml.split(header_end)[1]
    return after_header.split(footer)[0]
@ -353,6 +372,11 @@ def generateXMLDump(config={}, titles=[], start=''):
print 'Downloaded %d pages' % (c)
xml = getXMLPage(config=config, title=title)
xml = cleanXML(xml=xml)
if not xml:
logerror(config=config, text='The page "%s" was missing in the wiki (probably deleted)' % (title))
#here, XML is a correct <page> </page> chunk or
#an empty string due to a deleted page (logged in errors log) or
#an empty string due to an error while retrieving the page from server (logged in errors log)
c += 1
@ -870,7 +894,9 @@ def main():
#save index.php as html, for license details at the bottom of the page
if not os.path.exists('%s/index.html' % (config['path'])):
if os.path.exists('%s/index.html' % (config['path'])):
print 'index.html exists, do not overwrite'
print 'Downloading index.php (Main Page)'
f = urllib.urlopen(config['index'])
raw = f.read()
@ -880,7 +906,9 @@ def main():
#save special:Version as html, for extensions details
if not os.path.exists('%s/Special:Version.html' % (config['path'])):
if os.path.exists('%s/Special:Version.html' % (config['path'])):
print 'Special:Version.html exists, do not overwrite'
print 'Downloading Special:Version with extensions and other related info'
f = urllib.urlopen('%s?title=Special:Version' % (config['index']))
raw = f.read()
