improving args parsing and help

pull/117/head
Emilio J. Rodríguez-Posada 10 years ago
parent 6442b8734d
commit 51e230a4b3

@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
-# Copyright (C) 2011 WikiTeam
+# Copyright (C) 2011-2014 WikiTeam
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
@@ -15,81 +15,93 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.

+import argparse
+import os
 import re
 import sys
-import os
 import time
 import urllib

-dumpsdomain = 'http://dumps.wikimedia.org'
-f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
-raw = f.read()
-f.close()
-
-m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
-projects = []
-for i in m:
-    projects.append([i.group('project'), i.group('date')])
-projects.reverse() #oldest project dump, download first
-#projects = [['enwiki', '20130805']]
-
-start = ''
-if len(sys.argv) == 2:
-    start = sys.argv[1].lower()
-for project, date in projects:
-    if start:
-        if start != project:
-            print 'Skipping %s, %s' % (project, date)
-            continue
-        else:
-            start = '' #reset
-
-    print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
-    time.sleep(1) #ctrl-c
-    f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
-    htmlproj = f.read()
-    #print htmlproj
-    f.close()
-
-    for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
-        corrupted = True
-        maxretries = 3
-        while corrupted and maxretries > 0:
-            maxretries -= 1
-            m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
-            urldumps = []
-            for i in m: #enwiki is splitted in several files, thats why we need a loop here
-                urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
-
-            #print urldumps
-            for urldump in urldumps:
-                dumpfilename = urldump.split('/')[-1]
-                path = '%s/%s' % (dumpfilename[0], project)
-                if not os.path.exists(path):
-                    os.makedirs(path)
-                os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
-
-                #md5check
-                os.system('md5sum %s/%s > md5' % (path, dumpfilename))
-                f = open('md5', 'r')
-                raw = f.read()
-                f.close()
-                md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
-                print md51
-
-                f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
-                raw = f.read()
-                f.close()
-                f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
-                f.write(raw)
-                f.close()
-                md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
-                print md52
-
-                if md51 == md52:
-                    print 'md5sum is correct for this file, horay! \o/'
-                    print '\n'*3
-                    corrupted = False
-                else:
-                    os.remove('%s/%s' % (path, dumpfilename))
+def main():
+    parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
+    #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
+    parser.add_argument('-r','--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
+    parser.add_argument('-s','--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
+    args = parser.parse_args()
+
+    maxretries = 3
+    if args.maxretries and int(args.maxretries) >= 0:
+        maxretries = int(args.maxretries)
+
+    dumpsdomain = 'http://dumps.wikimedia.org'
+    f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
+    raw = f.read()
+    f.close()
+
+    m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
+    projects = []
+    for i in m:
+        projects.append([i.group('project'), i.group('date')])
+    projects.reverse() #download oldest dumps first
+    #projects = [['enwiki', '20130805']]
+
+    start = args.start
+    for project, date in projects:
+        if start:
+            if start != project:
+                print 'Skipping %s, %s' % (project, date)
+                continue
+            else:
+                start = '' #reset
+
+        print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
+        time.sleep(1) #ctrl-c
+        f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
+        htmlproj = f.read()
+        #print htmlproj
+        f.close()
+
+        for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
+            corrupted = True
+            maxretries2 = maxretries
+            while corrupted and maxretries2 > 0:
+                maxretries2 -= 1
+                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
+                urldumps = []
+                for i in m: #enwiki is splitted in several files, thats why we need a loop here
+                    urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
+
+                #print urldumps
+                for urldump in urldumps:
+                    dumpfilename = urldump.split('/')[-1]
+                    path = '%s/%s' % (dumpfilename[0], project)
+                    if not os.path.exists(path):
+                        os.makedirs(path)
+                    os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
+
+                    #md5check
+                    os.system('md5sum %s/%s > md5' % (path, dumpfilename))
+                    f = open('md5', 'r')
+                    raw = f.read()
+                    f.close()
+                    md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
+                    print md51
+
+                    f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
+                    raw = f.read()
+                    f.close()
+                    f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
+                    f.write(raw)
+                    f.close()
+                    md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
+                    print md52
+
+                    if md51 == md52:
+                        print 'md5sum is correct for this file, horay! \o/'
+                        print '\n'*3
+                        corrupted = False
+                    else:
+                        os.remove('%s/%s' % (path, dumpfilename))
+
+if __name__ == '__main__':
+    main()
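With this commit the script reads its options through argparse instead of a bare sys.argv check, so it also gains a generated --help page. A minimal usage sketch of the new flags; the diff view does not name the modified file, so the script name below (wikipediadownloader.py) is an assumption:

    # resume downloading from eswiki and allow up to 5 retries per dump
    python2 wikipediadownloader.py --start eswiki --maxretries 5

    # show the argparse-generated help for the new options
    python2 wikipediadownloader.py --help

Both flags are optional: with no arguments the script walks every completed dump listed on dumps.wikimedia.org, oldest first, using the default of 3 retries per file.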
