improving args parsing and help

10 years ago · 51e230a4b3
parent 6442b8734d
commit 51e230a4b3
1 changed files with 83 additions and 71 deletions
--- a/wikipediadownloader.py
+++ b/wikipediadownloader.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-

-# Copyright (C) 2011 WikiTeam
+# Copyright (C) 2011-2014 WikiTeam
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -15,12 +15,24 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

+import argparse
+import os
 import re
 import sys
-import os
 import time
 import urllib

+def main():
+    parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
+    #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
+    parser.add_argument('-r','--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
+    parser.add_argument('-s','--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
+    args = parser.parse_args()
+    
+    maxretries = 3
+    if args.maxretries and int(args.maxretries) >= 0:
+        maxretries = int(args.maxretries)
+    
    dumpsdomain = 'http://dumps.wikimedia.org'
    f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
    raw = f.read()
@@ -30,13 +42,10 @@ m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span cla
    projects = []
    for i in m:
        projects.append([i.group('project'), i.group('date')])
-projects.reverse() #oldest project dump, download first
+    projects.reverse() #download oldest dumps first
    #projects = [['enwiki', '20130805']]

-start = ''
-if len(sys.argv) == 2:
-    start = sys.argv[1].lower()
-
+    start = args.start
    for project, date in projects:
        if start:
            if start != project:
@@ -54,9 +63,9 @@ for project, date in projects:
        
        for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
            corrupted = True
-        maxretries = 3
-        while corrupted and maxretries > 0:
-            maxretries -= 1
+            maxretries2 = maxretries
+            while corrupted and maxretries2 > 0:
+                maxretries2 -= 1
                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
                urldumps = []
                for i in m: #enwiki is splitted in several files, thats why we need a loop here
@@ -93,3 +102,6 @@ for project, date in projects:
                        corrupted = False
                    else:
                        os.remove('%s/%s' % (path, dumpfilename))
+
+if __name__ == '__main__':
+    main()