diff --git a/wikipediadownloader.py b/wikipediadownloader.py
index fda6175..bd8f853 100644
--- a/wikipediadownloader.py
+++ b/wikipediadownloader.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 
-# Copyright (C) 2011 WikiTeam
+# Copyright (C) 2011-2014 WikiTeam
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -15,81 +15,93 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+import argparse
+import os
 import re
 import sys
-import os
 import time
 import urllib
 
-dumpsdomain = 'http://dumps.wikimedia.org'
-f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
-raw = f.read()
-f.close()
-
-m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
-projects = []
-for i in m:
-    projects.append([i.group('project'), i.group('date')])
-projects.reverse() #oldest project dump, download first
-#projects = [['enwiki', '20130805']]
-
-start = ''
-if len(sys.argv) == 2:
-    start = sys.argv[1].lower()
-
-for project, date in projects:
-    if start:
-        if start != project:
-            print 'Skipping %s, %s' % (project, date)
-            continue
-        else:
-            start = '' #reset
+def main():
+    parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
+    #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
+    parser.add_argument('-r','--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
+    parser.add_argument('-s','--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
+    args = parser.parse_args()
 
-    print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
-    time.sleep(1) #ctrl-c
-    f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
-    htmlproj = f.read()
-    #print htmlproj
-    f.close()
+    maxretries = 3
+    if args.maxretries and int(args.maxretries) >= 0:
+        maxretries = int(args.maxretries)
 
-    for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
-        corrupted = True
-        maxretries = 3
-        while corrupted and maxretries > 0:
-            maxretries -= 1
-            m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
-            urldumps = []
-            for i in m: #enwiki is splitted in several files, thats why we need a loop here
-                urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
-
-            #print urldumps
-            for urldump in urldumps:
-                dumpfilename = urldump.split('/')[-1]
-                path = '%s/%s' % (dumpfilename[0], project)
-                if not os.path.exists(path):
-                    os.makedirs(path)
-                os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
-
-                #md5check
-                os.system('md5sum %s/%s > md5' % (path, dumpfilename))
-                f = open('md5', 'r')
-                raw = f.read()
-                f.close()
-                md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
-                print md51
-
-                f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
-                raw = f.read()
-                f.close()
-                f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
-                f.write(raw)
-                f.close()
-                md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
-                print md52
+    dumpsdomain = 'http://dumps.wikimedia.org'
+    f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
+    raw = f.read()
+    f.close()
+
+    m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
+    projects = []
+    for i in m:
+        projects.append([i.group('project'), i.group('date')])
+    projects.reverse() #download oldest dumps first
+    #projects = [['enwiki', '20130805']]
+
+    start = args.start
+    for project, date in projects:
+        if start:
+            if start != project:
+                print 'Skipping %s, %s' % (project, date)
+                continue
+            else:
+                start = '' #reset
+
+        print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
+        time.sleep(1) #ctrl-c
+        f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
+        htmlproj = f.read()
+        #print htmlproj
+        f.close()
+
+        for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
+            corrupted = True
+            maxretries2 = maxretries
+            while corrupted and maxretries2 > 0:
+                maxretries2 -= 1
+                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
+                urldumps = []
+                for i in m: #enwiki is splitted in several files, thats why we need a loop here
+                    urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
 
-                if md51 == md52:
-                    print 'md5sum is correct for this file, horay! \o/'
-                    print '\n'*3
-                    corrupted = False
-                else:
-                    os.remove('%s/%s' % (path, dumpfilename))
+                #print urldumps
+                for urldump in urldumps:
+                    dumpfilename = urldump.split('/')[-1]
+                    path = '%s/%s' % (dumpfilename[0], project)
+                    if not os.path.exists(path):
+                        os.makedirs(path)
+                    os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
+
+                    #md5check
+                    os.system('md5sum %s/%s > md5' % (path, dumpfilename))
+                    f = open('md5', 'r')
+                    raw = f.read()
+                    f.close()
+                    md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
+                    print md51
+
+                    f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
+                    raw = f.read()
+                    f.close()
+                    f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
+                    f.write(raw)
+                    f.close()
+                    md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
+                    print md52
+
+                    if md51 == md52:
+                        print 'md5sum is correct for this file, horay! \o/'
+                        print '\n'*3
+                        corrupted = False
+                    else:
+                        os.remove('%s/%s' % (path, dumpfilename))
+
+if __name__ == '__main__':
+    main()
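
For reference, the options introduced by this patch can be exercised as in the following hypothetical invocations; the project names are only examples, and both flags are optional (the script defaults to 3 retries and to walking all completed dumps, oldest first):

    python2 wikipediadownloader.py
    python2 wikipediadownloader.py --maxretries 5
    python2 wikipediadownloader.py -r 5 -s itwikisource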
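
The patch keeps the original verification approach: shell out to md5sum, parse its output, and compare against the project's -md5sums.txt fetched from dumps.wikimedia.org. As a rough, non-authoritative sketch only (not part of this diff; the helper name verify_md5 and its arguments are illustrative), the same check could be done in-process with hashlib:

    import hashlib
    import re

    def verify_md5(path, dumpfilename, md5sums_raw):
        # Hash the downloaded dump in chunks so multi-gigabyte .7z files
        # are never loaded into memory at once.
        h = hashlib.md5()
        with open('%s/%s' % (path, dumpfilename), 'rb') as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                h.update(chunk)
        # Look up the expected hash for this file name in the contents of the
        # project's -md5sums.txt (md5sums_raw), as the script already does.
        expected = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % re.escape(dumpfilename),
                              md5sums_raw)
        return bool(expected) and h.hexdigest() == expected[0]

Called with the raw text of the -md5sums.txt file, it returns True only when a checksum line for the file is found and the hashes agree.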