wikiadownloader: Autopep8fied

* Made the source look a bit better, though this script might not be
  used anymore.

Signed-off-by: mr.Shu <mr@shu.io>
pull/197/head
mr.Shu 10 years ago
parent b3ef165529
commit f022b02e47

@ -22,11 +22,15 @@ import sys
import time
import urllib
def main():
parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
parser = argparse.ArgumentParser(
description='Downloader of Wikimedia dumps')
#parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
parser.add_argument('-r','--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
parser.add_argument('-s','--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
parser.add_argument(
'-r', '--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
parser.add_argument(
'-s', '--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
args = parser.parse_args()
maxretries = 3
@ -38,7 +42,8 @@ def main():
raw = f.read()
f.close()
m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
m = re.compile(
r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
projects = []
for i in m:
projects.append([i.group('project'), i.group('date')])
@ -66,10 +71,14 @@ def main():
maxretries2 = maxretries
while corrupted and maxretries2 > 0:
maxretries2 -= 1
m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' %
(project, date, project, date, dumpclass)).finditer(htmlproj)
urldumps = []
for i in m: #enwiki is split into several files, that's why we need a loop here
urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
# enwiki is split into several files, that's why we need a loop
# here
for i in m:
urldumps.append(
'%s/%s' % (dumpsdomain, i.group('urldump')))
# print urldumps
for urldump in urldumps:
@ -77,23 +86,28 @@ def main():
path = '%s/%s' % (dumpfilename[0], project)
if not os.path.exists(path):
os.makedirs(path)
os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
os.system('wget -c %s -O %s/%s' %
(urldump, path, dumpfilename))
# md5check
os.system('md5sum %s/%s > md5' % (path, dumpfilename))
f = open('md5', 'r')
raw = f.read()
f.close()
md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
md51 = re.findall(
r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
print md51
f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
f = urllib.urlopen(
'%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
raw = f.read()
f.close()
f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
f = open('%s/%s-%s-md5sums.txt' %
(path, project, date), 'w')
f.write(raw)
f.close()
md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
md52 = re.findall(
r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
print md52
if md51 == md52:

Loading…
Cancel
Save