@ -22,11 +22,15 @@ import sys
import time
import urllib
def main ( ) :
parser = argparse . ArgumentParser ( description = ' Downloader of Wikimedia dumps ' )
parser = argparse . ArgumentParser (
description = ' Downloader of Wikimedia dumps ' )
#parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
parser . add_argument ( ' -r ' , ' --maxretries ' , help = ' Max retries to download a dump when md5sum doesn \' t fit. Default: 3 ' , required = False )
parser . add_argument ( ' -s ' , ' --start ' , help = ' Start to download from this project (e.g.: eswiki, itwikisource, etc) ' , required = False )
parser . add_argument (
' -r ' , ' --maxretries ' , help = ' Max retries to download a dump when md5sum doesn \' t fit. Default: 3 ' , required = False )
parser . add_argument (
' -s ' , ' --start ' , help = ' Start to download from this project (e.g.: eswiki, itwikisource, etc) ' , required = False )
args = parser . parse_args ( )
maxretries = 3
@ -38,11 +42,12 @@ def main():
raw = f . read ( )
f . close ( )
m = re . compile ( r ' <a href= " (?P<project>[^>]+)/(?P<date> \ d+) " >[^<]+</a>: <span class= \' done \' >Dump complete</span> ' ) . finditer ( raw )
m = re . compile (
r ' <a href= " (?P<project>[^>]+)/(?P<date> \ d+) " >[^<]+</a>: <span class= \' done \' >Dump complete</span> ' ) . finditer ( raw )
projects = [ ]
for i in m :
projects . append ( [ i . group ( ' project ' ) , i . group ( ' date ' ) ] )
projects . reverse ( ) # download oldest dumps first
projects . reverse ( ) # download oldest dumps first
#projects = [['enwiki', '20130805']]
start = args . start
@ -52,13 +57,13 @@ def main():
print ' Skipping %s , %s ' % ( project , date )
continue
else :
start = ' ' # reset
start = ' ' # reset
print ' - ' * 50 , ' \n ' , ' Checking ' , project , date , ' \n ' , ' - ' * 50
time . sleep ( 1 ) # ctrl-c
print ' - ' * 50 , ' \n ' , ' Checking ' , project , date , ' \n ' , ' - ' * 50
time . sleep ( 1 ) # ctrl-c
f = urllib . urlopen ( ' %s / %s / %s / ' % ( dumpsdomain , project , date ) )
htmlproj = f . read ( )
# print htmlproj
# print htmlproj
f . close ( )
for dumpclass in [ ' pages-meta-history \ d* \ .xml[^ \ .]* \ .7z ' ] :
@ -66,39 +71,48 @@ def main():
maxretries2 = maxretries
while corrupted and maxretries2 > 0 :
maxretries2 - = 1
m = re . compile ( r ' <a href= " (?P<urldump>/ %s / %s / %s - %s - %s ) " > ' % ( project , date , project , date , dumpclass ) ) . finditer ( htmlproj )
m = re . compile ( r ' <a href= " (?P<urldump>/ %s / %s / %s - %s - %s ) " > ' %
( project , date , project , date , dumpclass ) ) . finditer ( htmlproj )
urldumps = [ ]
for i in m : #enwiki is splitted in several files, thats why we need a loop here
urldumps . append ( ' %s / %s ' % ( dumpsdomain , i . group ( ' urldump ' ) ) )
# enwiki is split into several files, that's why we need a loop
# here
for i in m :
urldumps . append (
' %s / %s ' % ( dumpsdomain , i . group ( ' urldump ' ) ) )
#print urldumps
# print urldumps
for urldump in urldumps :
dumpfilename = urldump . split ( ' / ' ) [ - 1 ]
path = ' %s / %s ' % ( dumpfilename [ 0 ] , project )
if not os . path . exists ( path ) :
os . makedirs ( path )
os . system ( ' wget -c %s -O %s / %s ' % ( urldump , path , dumpfilename ) )
os . system ( ' wget -c %s -O %s / %s ' %
( urldump , path , dumpfilename ) )
# md5check
# md5check
os . system ( ' md5sum %s / %s > md5 ' % ( path , dumpfilename ) )
f = open ( ' md5 ' , ' r ' )
raw = f . read ( )
f . close ( )
md51 = re . findall ( r ' (?P<md5>[a-f0-9] {32} ) \ s+ %s / %s ' % ( path , dumpfilename ) , raw ) [ 0 ]
md51 = re . findall (
r ' (?P<md5>[a-f0-9] {32} ) \ s+ %s / %s ' % ( path , dumpfilename ) , raw ) [ 0 ]
print md51
f = urllib . urlopen ( ' %s / %s / %s / %s - %s -md5sums.txt ' % ( dumpsdomain , project , date , project , date ) )
f = urllib . urlopen (
' %s / %s / %s / %s - %s -md5sums.txt ' % ( dumpsdomain , project , date , project , date ) )
raw = f . read ( )
f . close ( )
f = open ( ' %s / %s - %s -md5sums.txt ' % ( path , project , date ) , ' w ' )
f = open ( ' %s / %s - %s -md5sums.txt ' %
( path , project , date ) , ' w ' )
f . write ( raw )
f . close ( )
md52 = re . findall ( r ' (?P<md5>[a-f0-9] {32} ) \ s+ %s ' % ( dumpfilename ) , raw ) [ 0 ]
md52 = re . findall (
r ' (?P<md5>[a-f0-9] {32} ) \ s+ %s ' % ( dumpfilename ) , raw ) [ 0 ]
print md52
if md51 == md52 :
print ' md5sum is correct for this file, horay! \ o/ '
print ' \n ' * 3
print ' \n ' * 3
corrupted = False
else :
os . remove ( ' %s / %s ' % ( path , dumpfilename ) )