From ec6a8718eb7762e3e0be4fecac776ad96cac869b Mon Sep 17 00:00:00 2001 From: emijrp Date: Tue, 28 Jun 2011 13:44:50 +0000 Subject: [PATCH] improved fail download management; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@170 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- wikipediadownloader.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/wikipediadownloader.py b/wikipediadownloader.py index 876f379..fffea4d 100644 --- a/wikipediadownloader.py +++ b/wikipediadownloader.py @@ -31,17 +31,20 @@ for i in m: projects.reverse() #oldest project dump, download first #projects = [['enwiki', '20110405']] + for project, date in projects: time.sleep(1) #ctrl-c f = urllib.urlopen('http://dumps.wikimedia.org/%s/%s/' % (project, date)) - raw = f.read() - #print raw + htmlproj = f.read() + #print htmlproj f.close() for dumpclass in ['pages-meta-history\d*\.xml\.7z']: corrupted = True - while corrupted: - m = re.compile(r'' % (project, date, project, date, dumpclass)).finditer(raw) + maxretries = 3 + while corrupted and maxretries > 0: + maxretries -= 1 + m = re.compile(r'' % (project, date, project, date, dumpclass)).finditer(htmlproj) urldumps = [] for i in m: urldumps.append(i.group('urldump')) #enwiki is splitted in several files