diff --git a/wikipediadownloader.py b/wikipediadownloader.py index 876f379..fffea4d 100644 --- a/wikipediadownloader.py +++ b/wikipediadownloader.py @@ -31,17 +31,20 @@ for i in m: projects.reverse() #oldest project dump, download first #projects = [['enwiki', '20110405']] + for project, date in projects: time.sleep(1) #ctrl-c f = urllib.urlopen('http://dumps.wikimedia.org/%s/%s/' % (project, date)) - raw = f.read() - #print raw + htmlproj = f.read() + #print htmlproj f.close() for dumpclass in ['pages-meta-history\d*\.xml\.7z']: corrupted = True - while corrupted: - m = re.compile(r'' % (project, date, project, date, dumpclass)).finditer(raw) + maxretries = 3 + while corrupted and maxretries > 0: + maxretries -= 1 + m = re.compile(r'' % (project, date, project, date, dumpclass)).finditer(htmlproj) urldumps = [] for i in m: urldumps.append(i.group('urldump')) #enwiki is splitted in several files