Add retries for image list retrieval

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@224 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
13 years ago · 76897b09b2
parent 31ed2aff33
commit 76897b09b2
1 changed files with 14 additions and 4 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -440,16 +440,25 @@ def getImageFilenamesURL(config={}):
    images = []
    offset = '29990101000000' #january 1, 2999
    limit = 5000
+    retries = 5
    while offset:
        #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
        f = urllib2.urlopen(req)
        raw = f.read()
        f.close()
-        if limit > 10 and re.search(ur'(?i)allowed memory size of \d+ bytes exhausted', raw): # delicated wiki
-            print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
-            limit = limit/10
-            continue
+        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicated wiki
+            if limit > 10:
+                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
+                limit = limit/10
+                continue
+            elif retries > 0: # waste retries, then exit
+                retries -= 1
+                print 'Retrying...'
+                continue
+            else:
+                print 'No more retries, exit...'
+                break
        
        raw = cleanHTML(raw)
        #archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
@@ -494,6 +503,7 @@ def getImageFilenamesURL(config={}):
        
        if re.search(r_next, raw):
            offset = re.findall(r_next, raw)[0]
+            retries += 5 # add more retries if we got a page with offset
        else:
            offset = ''