#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import os
import re
import sys
import time
import urllib


def main():
    parser = argparse.ArgumentParser(
        description='Downloader of Wikimedia dumps')
    #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
    parser.add_argument(
        '-r', '--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
    parser.add_argument(
        '-s', '--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
    args = parser.parse_args()
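
    # Default to 3 retries unless a non-negative --maxretries value is given.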
    maxretries = 3
    if args.maxretries and int(args.maxretries) >= 0:
        maxretries = int(args.maxretries)

    dumpsdomain = 'http://dumps.wikimedia.org'
    f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
    raw = f.read()
    f.close()
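
    # Scrape backup-index.html for every project whose latest dump is marked
    # "Dump complete".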
    m = re.compile(
        r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
    projects = []
    for i in m:
        projects.append([i.group('project'), i.group('date')])
    projects.reverse()  # download oldest dumps first
    # projects = [['enwiki', '20130805']]
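
    # If --start was given, skip projects until that one is reached, then
    # process every dump from there on.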
    start = args.start
    for project, date in projects:
        if start:
            if start != project:
                print 'Skipping %s, %s' % (project, date)
                continue
            else:
                start = ''  # reset

        print '-' * 50, '\n', 'Checking', project, date, '\n', '-' * 50
        time.sleep(1)  # ctrl-c

        f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
        htmlproj = f.read()
        # print htmlproj
        f.close()
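
        # Only the full-history 7z files are fetched; keep retrying (up to
        # maxretries times) while the md5sum check fails.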
        for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
            corrupted = True
            maxretries2 = maxretries
            while corrupted and maxretries2 > 0:
                maxretries2 -= 1
                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' %
                               (project, date, project, date, dumpclass)).finditer(htmlproj)
                urldumps = []
                # enwiki is split into several files, that's why we need a
                # loop here
                for i in m:
                    urldumps.append(
                        '%s/%s' % (dumpsdomain, i.group('urldump')))
                # print urldumps
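
                # Download each file with wget into a <first letter>/<project>/
                # directory and verify it against the published md5.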
                for urldump in urldumps:
                    dumpfilename = urldump.split('/')[-1]
                    path = '%s/%s' % (dumpfilename[0], project)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    os.system('wget -c %s -O %s/%s' %
                              (urldump, path, dumpfilename))

                    # md5check
                    os.system('md5sum %s/%s > md5' % (path, dumpfilename))
                    f = open('md5', 'r')
                    raw = f.read()
                    f.close()
                    md51 = re.findall(
                        r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
                    print md51
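
                    # Compare the local md5 with the one published in the
                    # dump's <project>-<date>-md5sums.txt; on mismatch the
                    # file is removed and downloaded again.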
                    f = urllib.urlopen(
                        '%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
                    raw = f.read()
                    f.close()

                    f = open('%s/%s-%s-md5sums.txt' %
                             (path, project, date), 'w')
                    f.write(raw)
                    f.close()

                    md52 = re.findall(
                        r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
                    print md52

                    if md51 == md52:
                        print 'md5sum is correct for this file, hooray! \o/'
                        print '\n' * 3
                        corrupted = False
                    else:
                        os.remove('%s/%s' % (path, dumpfilename))


if __name__ == '__main__':
    main()