2011-06-25 22:24:41 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# Copyright (C) 2011 WikiTeam
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
import hashlib
import os
import re
import subprocess
import sys
import time
import urllib
|
|
|
|
|
2012-08-07 08:27:45 +00:00
|
|
|
# Base URL of the dumps server; every other URL in this script is built on it.
dumpsdomain = 'http://dumps.wikimedia.org'

# Download the backup index page that links to the dump of each project.
index_page = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
index_html = index_page.read()
index_page.close()

# Collect [project, date] pairs, keeping only entries marked "Dump complete".
complete_dump = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>')
projects = [[hit.group('project'), hit.group('date')]
            for hit in complete_dump.finditer(index_html)]
projects.reverse() #oldest project dump, download first
# To try the script on a single dump, override the list here, for example:
#   projects = [['enwiki', '20110405']]

# Optional single command-line argument: the project to resume from.
# An empty string means "start from the first project".
start = sys.argv[1].lower() if len(sys.argv) == 2 else ''
|
|
|
|
|
2011-06-26 12:48:26 +00:00
|
|
|
def _fetch(url):
    """Return the body served at *url*, closing the connection even on error."""
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        handle.close()


def _md5_of_file(filename, chunksize=1024 * 1024):
    """Return the hex MD5 digest of *filename*, read in chunks of *chunksize* bytes."""
    digest = hashlib.md5()
    stream = open(filename, 'rb')
    try:
        while True:
            chunk = stream.read(chunksize)
            if not chunk:
                break
            digest.update(chunk)
    finally:
        stream.close()
    return digest.hexdigest()


# Walk every completed dump, download its history archives and verify each
# one against the md5sums list published next to the dump.
for project, date in projects:
    # Resume support: skip projects until the one named on the command line.
    if start:
        if start != project:
            print('Skipping %s, %s' % (project, date))
            continue
        start = '' #reset

    print('%s \nChecking %s %s \n%s' % ('-'*50, project, date, '-'*50))
    time.sleep(1) #ctrl-c (presumably a pause so the run can be interrupted here)
    htmlproj = _fetch('%s/%s/%s/' % (dumpsdomain, project, date))

    for dumpclass in [r'pages-meta-history\d*\.xml\.7z']:
        # project and date are literal text, so escape them; dumpclass is
        # already a regular expression and is inserted as-is.
        linkpattern = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (
            re.escape(project), re.escape(date),
            re.escape(project), re.escape(date), dumpclass))
        # enwiki is split in several files, that's why a list is needed here.
        # The captured href already starts with '/', so append it directly.
        urldumps = [dumpsdomain + hit.group('urldump')
                    for hit in linkpattern.finditer(htmlproj)]
        if not urldumps:
            # Nothing of this class is published for this dump.
            continue

        # The checksum list is the same for every file of the dump: fetch once.
        md5sums = _fetch('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))

        for urldump in urldumps:
            dumpfilename = urldump.split('/')[-1]
            path = '%s/%s' % (dumpfilename[0], project)
            if not os.path.exists(path):
                os.makedirs(path)
            target = '%s/%s' % (path, dumpfilename)

            # Keep a local copy of the published checksums next to the dump.
            sumsfile = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
            try:
                sumsfile.write(md5sums)
            finally:
                sumsfile.close()

            published = re.search(r'(?m)^(?P<md5>[a-f0-9]{32})\s+%s\s*$' % (re.escape(dumpfilename)), md5sums)
            md52 = published.group('md5') if published else None

            # Each file gets its own retry budget, so one good file can no
            # longer hide a corrupted sibling.
            corrupted = True
            maxretries = 3
            while corrupted and maxretries > 0:
                maxretries -= 1
                # Argument list instead of a shell string: the names come from
                # a remote page and must not be interpreted by a shell.
                subprocess.call(['wget', '-c', urldump, '-O', target])
                if not os.path.exists(target):
                    print('Download failed for %s' % (urldump))
                    continue

                #md5check
                md51 = _md5_of_file(target)
                print(md51)
                if md52 is None:
                    # Without a published checksum a retry cannot help; keep
                    # the file and move on instead of crashing the whole run.
                    print('No md5sum published for %s, cannot verify it' % (dumpfilename))
                    break
                print(md52)

                if md51 == md52:
                    print('md5sum is correct for this file, horay! \\o/')
                    print('\n'*3)
                    corrupted = False
                else:
                    # Remove the bad copy so the next attempt starts clean.
                    os.remove(target)

            if corrupted and md52 is not None:
                print('Giving up on %s, md5sum still does not match' % (dumpfilename))
|