when full history fails, retrieve only the last version; various server errors handled

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@139 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
14 years ago · 6e45398878
parent e4b233cc37
commit 6e45398878
1 changed files with 43 additions and 15 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@ -210,23 +210,41 @@ def getUserAgent():
    useragents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4']
    return useragents[0]

+def logerror(config={}, text=''):
+    """Append one timestamped line to errors.log.
+
+    config -- dict; only config['path'] is read, and errors.log is opened
+              (in append mode) inside that directory
+    text   -- message to record; when empty, nothing is written and the
+              file is not opened
+
+    Each line has the form 'YYYY-MM-DD HH:MM:SS: text'.
+    """
+    if text:
+        #'with' closes the log file even if the write raises
+        with open('%s/errors.log' % (config['path']), 'a') as f:
+            f.write('%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text))
+
 def getXMLPageCore(headers={}, params={}, config={}):
+    #returns a full (or current only) xml ending in </mediawiki>
+    #if retrieving the full history of a page, returns a current only version
+    #if all fail, returns the empty string
    xml = ''
    c = 0
-    maxseconds = 600 #seconds
-    maxretries = 10 # x retries and skip
-    increment = 60 #increment every retry
+    maxseconds = 10 #max seconds to wait in a single sleeping
+    maxretries = 5 # x retries and skip
+    increment = 20 #increment every retry
    while not re.search(r'</mediawiki>', xml):
-        if c > 0:
+        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds
-            print '    XML for this page is wrong. Waiting %d seconds and reloading...' % (wait)
+            print '    XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait)
            time.sleep(wait)
-        if c > maxretries:
-            print '    We have retry %d times' % (c)
-            print '    MediaWiki error for this page, network error or whatever... Skiping this page...'
-            sys.exit()
-        if params['limit'] > 100:
-            params['limit'] = params['limit'] - (c * 100)
+            if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function)
+                params['limit'] = params['limit'] / 2 # half
+        if c >= maxretries:
+            print '    We have retried %d times' % (c)
+            print '    MediaWiki error for "%s", network error or whatever...' % (params['pages'])
+            if not config['curonly']: #our last chance, preserve only the last revision...
+                print '    Trying to save only the last revision for this page...'
+                params['curonly'] = 1
+                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages']))
+                return getXMLPageCore(headers=headers, params=params, config=config)
+            else:
+                print '    Saving in the errors log, and skiping...'
+                logerror(config=config, text='Error while retrieving the last revision of "%s". Skiping.' % (params['pages']))
+                return '' # empty xml
+        
        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['index'], data=data, headers=headers)
        try:
@ -301,6 +319,7 @@ def getXMLPage(config={}, title='', verbose=True):

 def cleanXML(xml=''):
    #do not touch xml codification, as is
+    #Strips the wrapper around the page data: keeps only the text that follows
+    #the first '</siteinfo>\n' and precedes the next '</mediawiki>'.
+    #If either marker is missing (e.g. xml is the empty string), xml is
+    #returned unchanged.
+    if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
        xml = xml.split('</siteinfo>\n')[1]
        xml = xml.split('</mediawiki>')[0]
    return xml
@ -353,6 +372,11 @@ def generateXMLDump(config={}, titles=[], start=''):
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title)
        xml = cleanXML(xml=xml)
+        if not xml:
+            logerror(config=config, text='The page "%s" was missing in the wiki (probably deleted)' % (title))
+        #here, XML is a correct <page> </page> chunk or 
+        #an empty string due to a deleted page (logged in errors log) or
+        #an empty string due to an error while retrieving the page from server (logged in errors log)
        xmlfile.write(xml)
        c += 1
    xmlfile.write(footer)
@ -870,7 +894,9 @@ def main():
            saveLogs(config=config)
    
    #save index.php as html, for license details at the bootom of the page
-    if not os.path.exists('%s/index.html' % (config['path'])):
+    if os.path.exists('%s/index.html' % (config['path'])):
+        print 'index.html exists, do not overwrite'
+    else:
        print 'Downloading index.php (Main Page)'
        f = urllib.urlopen(config['index'])
        raw = f.read()
@ -880,7 +906,9 @@ def main():
        f.close()
    
    #save special:Version as html, for extensions details
-    if not os.path.exists('%s/Special:Version.html' % (config['path'])):
+    if os.path.exists('%s/Special:Version.html' % (config['path'])):
+        print 'Special:Version.html exists, do not overwrite'
+    else:
        print 'Downloading Special:Version with extensions and other related info'
        f = urllib.urlopen('%s?title=Special:Version' % (config['index']))
        raw = f.read()