diff --git a/wikipediadownloader.py b/wikipediadownloader.py
index fda6175..bd8f853 100644
--- a/wikipediadownloader.py
+++ b/wikipediadownloader.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
-# Copyright (C) 2011 WikiTeam
+# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -15,81 +15,93 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+import argparse
+import os
import re
import sys
-import os
import time
import urllib
-dumpsdomain = 'http://dumps.wikimedia.org'
-f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
-raw = f.read()
-f.close()
-
-m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
-projects = []
-for i in m:
- projects.append([i.group('project'), i.group('date')])
-projects.reverse() #oldest project dump, download first
-#projects = [['enwiki', '20130805']]
-
-start = ''
-if len(sys.argv) == 2:
- start = sys.argv[1].lower()
-
-for project, date in projects:
- if start:
- if start != project:
- print 'Skipping %s, %s' % (project, date)
- continue
- else:
- start = '' #reset
+def main():
+ parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
+ #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
+    parser.add_argument('-r', '--maxretries', help='Maximum number of retries when the md5sum of a downloaded dump does not match. Default: 3', required=False)
+    parser.add_argument('-s', '--start', help='Start downloading from this project (e.g. eswiki, itwikisource)', required=False)
+ args = parser.parse_args()
- print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
- time.sleep(1) #ctrl-c
- f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
- htmlproj = f.read()
- #print htmlproj
- f.close()
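+    # Default to 3 retries per dump; --maxretries overrides it when a non-negative value is given.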
+ maxretries = 3
+ if args.maxretries and int(args.maxretries) >= 0:
+ maxretries = int(args.maxretries)
- for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
- corrupted = True
- maxretries = 3
- while corrupted and maxretries > 0:
- maxretries -= 1
-            m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
- urldumps = []
-            for i in m: #enwiki is split into several files, that's why we need a loop here
- urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
-
- #print urldumps
- for urldump in urldumps:
- dumpfilename = urldump.split('/')[-1]
- path = '%s/%s' % (dumpfilename[0], project)
- if not os.path.exists(path):
- os.makedirs(path)
- os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
-
- #md5check
- os.system('md5sum %s/%s > md5' % (path, dumpfilename))
- f = open('md5', 'r')
- raw = f.read()
- f.close()
-                md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
- print md51
-
- f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
- raw = f.read()
- f.close()
- f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
- f.write(raw)
- f.close()
-                md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
- print md52
+ dumpsdomain = 'http://dumps.wikimedia.org'
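+    # Fetch the central backup index, which lists every project together with its latest dump date.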
+ f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
+ raw = f.read()
+ f.close()
+
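+    # Keep only dumps flagged as complete; each match captures the project name and the dump date.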
+    m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
+ projects = []
+ for i in m:
+ projects.append([i.group('project'), i.group('date')])
+ projects.reverse() #download oldest dumps first
+ #projects = [['enwiki', '20130805']]
+
+ start = args.start
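+    # If --start was given, skip every project that comes before it and resume downloading from there.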
+ for project, date in projects:
+ if start:
+ if start != project:
+ print 'Skipping %s, %s' % (project, date)
+ continue
+ else:
+ start = '' #reset
+
+ print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
+        time.sleep(1) # brief pause so Ctrl-C can interrupt between projects
+ f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
+ htmlproj = f.read()
+ #print htmlproj
+ f.close()
+
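+        # Only the full-history 7z dumps are wanted; retry each one until its md5sum verifies or the retry budget runs out.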
+ for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
+ corrupted = True
+ maxretries2 = maxretries
+ while corrupted and maxretries2 > 0:
+ maxretries2 -= 1
+                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
+ urldumps = []
+                for i in m: #enwiki is split into several files, that's why we need a loop here
+ urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
- if md51 == md52:
-                    print 'md5sum is correct for this file, hooray! \o/'
- print '\n'*3
- corrupted = False
- else:
- os.remove('%s/%s' % (path, dumpfilename))
+ #print urldumps
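+                # Store each piece under '<first letter of project>/<project>'; wget -c resumes interrupted downloads.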
+ for urldump in urldumps:
+ dumpfilename = urldump.split('/')[-1]
+ path = '%s/%s' % (dumpfilename[0], project)
+ if not os.path.exists(path):
+ os.makedirs(path)
+ os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
+
+                    # md5 check: compute the local checksum and compare it against the published md5sums.txt
+ os.system('md5sum %s/%s > md5' % (path, dumpfilename))
+ f = open('md5', 'r')
+ raw = f.read()
+ f.close()
+                    md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
+ print md51
+
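+                    # Fetch the dump's published md5sums.txt and keep a copy next to the downloaded files.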
+ f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
+ raw = f.read()
+ f.close()
+ f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
+ f.write(raw)
+ f.close()
+                    md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
+ print md52
+
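+                    # A mismatch means the download is corrupted: delete the file so the retry loop fetches it again.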
+ if md51 == md52:
+                        print 'md5sum is correct for this file, hooray! \o/'
+ print '\n'*3
+ corrupted = False
+ else:
+ os.remove('%s/%s' % (path, dumpfilename))
+
+if __name__ == '__main__':
+ main()