diff --git a/wikipediadownloader.py b/wikipediadownloader.py
index bd8f853..15d23c8 100644
--- a/wikipediadownloader.py
+++ b/wikipediadownloader.py
@@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -22,27 +22,32 @@ import sys
import time
import urllib
+
def main():
- parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
+ parser = argparse.ArgumentParser(
+ description='Downloader of Wikimedia dumps')
#parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
- parser.add_argument('-r','--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
- parser.add_argument('-s','--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
+ parser.add_argument(
+ '-r', '--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
+ parser.add_argument(
+ '-s', '--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
args = parser.parse_args()
-
+
maxretries = 3
if args.maxretries and int(args.maxretries) >= 0:
maxretries = int(args.maxretries)
-
+
dumpsdomain = 'http://dumps.wikimedia.org'
f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
raw = f.read()
f.close()
- m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
+ m = re.compile(
+ r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
projects = []
for i in m:
projects.append([i.group('project'), i.group('date')])
- projects.reverse() #download oldest dumps first
+ projects.reverse() # download oldest dumps first
#projects = [['enwiki', '20130805']]
start = args.start
@@ -52,53 +57,62 @@ def main():
print 'Skipping %s, %s' % (project, date)
continue
else:
- start = '' #reset
-
- print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
- time.sleep(1) #ctrl-c
+ start = '' # reset
+
+ print '-' * 50, '\n', 'Checking', project, date, '\n', '-' * 50
+ time.sleep(1) # ctrl-c
f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
htmlproj = f.read()
- #print htmlproj
+ # print htmlproj
f.close()
-
+
for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
corrupted = True
maxretries2 = maxretries
while corrupted and maxretries2 > 0:
maxretries2 -= 1
- m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
+ m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' %
+ (project, date, project, date, dumpclass)).finditer(htmlproj)
urldumps = []
- for i in m: #enwiki is splitted in several files, thats why we need a loop here
- urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
-
- #print urldumps
+ # enwiki is splitted in several files, thats why we need a loop
+ # here
+ for i in m:
+ urldumps.append(
+ '%s/%s' % (dumpsdomain, i.group('urldump')))
+
+ # print urldumps
for urldump in urldumps:
dumpfilename = urldump.split('/')[-1]
path = '%s/%s' % (dumpfilename[0], project)
if not os.path.exists(path):
os.makedirs(path)
- os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
-
- #md5check
+ os.system('wget -c %s -O %s/%s' %
+ (urldump, path, dumpfilename))
+
+ # md5check
os.system('md5sum %s/%s > md5' % (path, dumpfilename))
f = open('md5', 'r')
raw = f.read()
f.close()
- md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
+ md51 = re.findall(
+ r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
print md51
-
- f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
+
+ f = urllib.urlopen(
+ '%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
raw = f.read()
f.close()
- f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
+ f = open('%s/%s-%s-md5sums.txt' %
+ (path, project, date), 'w')
f.write(raw)
f.close()
- md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
+ md52 = re.findall(
+ r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
print md52
-
+
if md51 == md52:
print 'md5sum is correct for this file, horay! \o/'
- print '\n'*3
+ print '\n' * 3
corrupted = False
else:
os.remove('%s/%s' % (path, dumpfilename))