#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import os
import re
import sys
import time
import urllib

def main():
    parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
    #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
    parser.add_argument('-r', '--maxretries', help='Max retries to download a dump when its md5sum doesn\'t match. Default: 3', required=False)
    parser.add_argument('-s', '--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
    args = parser.parse_args()
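    # Example invocation (the script filename here is an assumption; use the actual file name):
    #   python2 wikipediadownloader.py --maxretries 5 --start eswiki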

    maxretries = 3
    if args.maxretries and int(args.maxretries) >= 0:
        maxretries = int(args.maxretries)

    dumpsdomain = 'http://dumps.wikimedia.org'
    f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
    raw = f.read()
    f.close()

    m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
    projects = []
    for i in m:
        projects.append([i.group('project'), i.group('date')])
    projects.reverse()  # download the oldest dumps first
    #projects = [['enwiki', '20130805']]
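    # projects now holds [project, date] pairs for every completed dump listed in
    # backup-index.html (the commented-out override above shows the expected shape).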

    start = args.start
    for project, date in projects:
        if start:
            if start != project:
                print 'Skipping %s, %s' % (project, date)
                continue
            else:
                start = ''  # reset once the requested start project is reached

        print '-' * 50, '\n', 'Checking', project, date, '\n', '-' * 50
        time.sleep(1)  # brief pause so Ctrl-C can interrupt between projects
        f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
        htmlproj = f.read()
        #print htmlproj
        f.close()
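        # htmlproj is the HTML index of this dump's directory; the dump filenames are
        # extracted from it with the regex below.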
        for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
            corrupted = True
            maxretries2 = maxretries
            while corrupted and maxretries2 > 0:
                maxretries2 -= 1
                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
                urldumps = []
                for i in m:  # enwiki is split into several files, that's why we need a loop here
                    urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
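                # urldumps now contains the full URLs of the 7z history files found for
                # this project and date (possibly several parts for the largest wikis).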
                #print urldumps
                for urldump in urldumps:
                    dumpfilename = urldump.split('/')[-1]
                    path = '%s/%s' % (dumpfilename[0], project)  # store under <first letter of filename>/<project>
                    if not os.path.exists(path):
                        os.makedirs(path)
                    os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
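                    # The wget call above relies on -c to resume partial downloads; wget
                    # itself (and md5sum below) must be available on PATH, since both are
                    # invoked through os.system().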
                    # md5 check
                    os.system('md5sum %s/%s > md5' % (path, dumpfilename))
                    f = open('md5', 'r')
                    raw = f.read()
                    f.close()
                    md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
                    print md51
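                    # md51 is the checksum of the local file; now fetch the checksums
                    # published with the dump and extract the expected value for this file.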
                    f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
                    raw = f.read()
                    f.close()
                    f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
                    f.write(raw)
                    f.close()
                    md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
                    print md52
                    if md51 == md52:
                        print 'md5sum is correct for this file, hooray! \o/'
                        print '\n' * 3
                        corrupted = False
                    else:
                        os.remove('%s/%s' % (path, dumpfilename))  # corrupted download; delete it so the retry loop fetches it again

if __name__ == '__main__':
    main()