From f022b02e47f462fa0142683ebef8dca5eea18adb Mon Sep 17 00:00:00 2001
From: "mr.Shu"
Date: Thu, 2 Oct 2014 23:06:42 +0200
Subject: [PATCH] wikiadownloader: Autopep8fied

 * Made the source look a bit better, though this script might not be
   used anymore.

Signed-off-by: mr.Shu
---
 wikipediadownloader.py | 74 +++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 30 deletions(-)

diff --git a/wikipediadownloader.py b/wikipediadownloader.py
index bd8f853..15d23c8 100644
--- a/wikipediadownloader.py
+++ b/wikipediadownloader.py
@@ -6,12 +6,12 @@
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
@@ -22,27 +22,32 @@
 import sys
 import time
 import urllib
 
+
 def main():
-    parser = argparse.ArgumentParser(description='Downloader of Wikimedia dumps')
+    parser = argparse.ArgumentParser(
+        description='Downloader of Wikimedia dumps')
     #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False)
-    parser.add_argument('-r','--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
-    parser.add_argument('-s','--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
+    parser.add_argument(
+        '-r', '--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. Default: 3', required=False)
+    parser.add_argument(
+        '-s', '--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False)
     args = parser.parse_args()
-    
+
     maxretries = 3
     if args.maxretries and int(args.maxretries) >= 0:
         maxretries = int(args.maxretries)
-    
+
     dumpsdomain = 'http://dumps.wikimedia.org'
     f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
     raw = f.read()
     f.close()
-    m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
+    m = re.compile(
+        r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)
     projects = []
     for i in m:
         projects.append([i.group('project'), i.group('date')])
-    projects.reverse() #download oldest dumps first
+    projects.reverse()  # download oldest dumps first
     #projects = [['enwiki', '20130805']]
 
     start = args.start
@@ -52,53 +57,62 @@ def main():
             print 'Skipping %s, %s' % (project, date)
             continue
         else:
-            start = '' #reset
-        
-        print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50
-        time.sleep(1) #ctrl-c
+            start = ''  # reset
+
+        print '-' * 50, '\n', 'Checking', project, date, '\n', '-' * 50
+        time.sleep(1)  # ctrl-c
         f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
         htmlproj = f.read()
-        #print htmlproj
+        # print htmlproj
         f.close()
-        
+
         for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']:
             corrupted = True
             maxretries2 = maxretries
             while corrupted and maxretries2 > 0:
                 maxretries2 -= 1
-                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)
+                m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' %
+                               (project, date, project, date, dumpclass)).finditer(htmlproj)
                 urldumps = []
-                for i in m: #enwiki is splitted in several files, thats why we need a loop here
-                    urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))
-                
-                #print urldumps
+                # enwiki is splitted in several files, thats why we need a loop
+                # here
+                for i in m:
+                    urldumps.append(
+                        '%s/%s' % (dumpsdomain, i.group('urldump')))
+
+                # print urldumps
                 for urldump in urldumps:
                     dumpfilename = urldump.split('/')[-1]
                     path = '%s/%s' % (dumpfilename[0], project)
                     if not os.path.exists(path):
                         os.makedirs(path)
-                    os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))
-                    
-                    #md5check
+                    os.system('wget -c %s -O %s/%s' %
+                              (urldump, path, dumpfilename))
+
+                    # md5check
                     os.system('md5sum %s/%s > md5' % (path, dumpfilename))
                     f = open('md5', 'r')
                     raw = f.read()
                     f.close()
-                    md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
+                    md51 = re.findall(
+                        r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]
                     print md51
-                    
-                    f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
+
+                    f = urllib.urlopen(
+                        '%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
                     raw = f.read()
                     f.close()
-                    f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')
+                    f = open('%s/%s-%s-md5sums.txt' %
+                             (path, project, date), 'w')
                     f.write(raw)
                     f.close()
-                    md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
+                    md52 = re.findall(
+                        r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]
                     print md52
-                    
+
                     if md51 == md52:
                         print 'md5sum is correct for this file, horay! \o/'
-                        print '\n'*3
+                        print '\n' * 3
                         corrupted = False
                     else:
                         os.remove('%s/%s' % (path, dumpfilename))
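
Note, not part of the patch: the md5 check that this script performs by shelling out to md5sum and grepping its output can also be done in pure Python with hashlib. The sketch below is only an illustration of that check under the same file layout the script uses (first letter of the dump filename as the top-level directory); the helper names and the example paths here are made up for illustration, they do not appear in wikipediadownloader.py.

import hashlib
import re

def md5_of_file(path, blocksize=1 << 20):
    # Stream the downloaded dump through hashlib instead of calling md5sum.
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            h.update(block)
    return h.hexdigest()

def expected_md5(md5sums_text, dumpfilename):
    # Lines in the project's md5sums.txt look like "<32 hex digits>  <filename>",
    # essentially the pattern the script pulls out with re.findall.
    match = re.search(r'([a-f0-9]{32})\s+%s' % re.escape(dumpfilename), md5sums_text)
    return match.group(1) if match else None

# Illustrative usage; paths follow the script's '<first letter>/<project>' layout.
# local = md5_of_file('e/enwiki/enwiki-20130805-pages-meta-history1.xml.7z')
# expected = expected_md5(open('e/enwiki/enwiki-20130805-md5sums.txt').read(),
#                         'enwiki-20130805-pages-meta-history1.xml.7z')
# corrupted = (local != expected)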