@@ -31,17 +31,20 @@ for i in m:
 projects.reverse() #oldest project dump, download first
 #projects = [['enwiki', '20110405']]
 
 for project, date in projects:
     time.sleep(1) #ctrl-c
     f = urllib.urlopen('http://dumps.wikimedia.org/%s/%s/' % (project, date))
-    raw = f.read()
-    #print raw
+    htmlproj = f.read()
+    #print htmlproj
     f.close()
     
     for dumpclass in ['pages-meta-history\d*\.xml\.7z']:
         corrupted = True
-        while corrupted:
-            m = re.compile(r'<a href="(?P<urldump>http://[^/>]+/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(raw)
+        maxretries = 3
+        while corrupted and maxretries > 0:
+            maxretries -= 1
+            m = re.compile(r'<a href="(?P<urldump>http://[^/>]+/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
             urldumps = []
             for i in m:
                 urldumps.append(i.group('urldump')) #enwiki is splitted in several files
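The hunk does two things: it renames the fetched per-project listing page from raw to htmlproj, and it bounds the re-download loop with maxretries = 3 so a dump that keeps failing its integrity check no longer retries forever. Below is a minimal standalone sketch of that pattern in the script's own Python 2 idiom; the helper names (list_dump_urls, download_with_retries) and the looks_corrupted check are illustrative assumptions, not code from the repository.

import re
import urllib  # Python 2, as in the script itself

def list_dump_urls(project, date, dumpclass=r'pages-meta-history\d*\.xml\.7z'):
    # Fetch the per-project listing page and scrape the dump file links,
    # using the same regex shape as the script (hypothetical helper).
    f = urllib.urlopen('http://dumps.wikimedia.org/%s/%s/' % (project, date))
    htmlproj = f.read()
    f.close()
    pattern = r'<a href="(?P<urldump>http://[^/>]+/%s/%s/%s-%s-%s)">' % (
        project, date, project, date, dumpclass)
    return [m.group('urldump') for m in re.finditer(pattern, htmlproj)]

def looks_corrupted(data):
    # Placeholder check: a real verifier would compare against the md5sums
    # published with the dump; treating an empty body as corrupt is only
    # an assumption for this sketch.
    return len(data) == 0

def download_with_retries(urldump, maxretries=3):
    # Bounded retry loop mirroring the patch: give up after maxretries
    # attempts instead of looping forever on a corrupted dump.
    corrupted = True
    while corrupted and maxretries > 0:
        maxretries -= 1
        data = urllib.urlopen(urldump).read()
        corrupted = looks_corrupted(data)
    return None if corrupted else data

Note that, as in the patch, maxretries is decremented before the fetch, so the first attempt counts against the limit: maxretries = 3 means at most three downloads per file.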