wikiteam/wikipediadownloader.py

# -*- coding: utf-8 -*-

# Copyright (C) 2011 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import hashlib
import os
import re
import subprocess
import sys
import time
import urllib

# Base URL of the dumps site; every other URL in this script is built on it.
dumpsdomain = 'http://dumps.wikimedia.org'

# Download the global index page that lists the latest dump of each project.
indexpage = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))
raw = indexpage.read()
indexpage.close()

# An index entry is only picked up when its status reads "Dump complete";
# the link target provides the project name and the dump date.
completedump = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>')
projects = [
    [entry.group('project'), entry.group('date')]
    for entry in completedump.finditer(raw)
]
# Walk the index backwards so the oldest project dump is downloaded first.
projects.reverse()
# Debugging aid: uncomment to restrict the run to a single known dump.
#projects = [['enwiki', '20110405']]

# Optional resume point: when exactly one command-line argument is given it is
# taken (lower-cased) as the project name at which downloading should begin;
# otherwise the empty string means "start from the first project".
start = sys.argv[1].lower() if len(sys.argv) == 2 else ''

def _md5_of_file(filename, chunksize=1 << 20):
    """Return the hex MD5 digest of *filename*, or None if it cannot be read.

    The file is hashed chunk by chunk so that very large dump files never
    have to be loaded into memory at once.
    """
    digest = hashlib.md5()
    try:
        with open(filename, 'rb') as dump:
            for chunk in iter(lambda: dump.read(chunksize), b''):
                digest.update(chunk)
    except IOError:
        return None
    return digest.hexdigest()


def _expected_md5(md5sums, filename):
    """Return the checksum listed for *filename* in *md5sums*, or None.

    *md5sums* is the raw text of the project's md5sums file. The file name
    is matched literally and must be followed by whitespace or end of text,
    so a name that is a prefix of another entry cannot match by accident.
    """
    found = re.search(r'(?P<md5>[a-f0-9]{32})\s+%s(?!\S)' % re.escape(filename), md5sums)
    if found:
        return found.group('md5')
    return None


# Main loop: for every completed dump, download each full-history 7z file and
# verify it against the published checksum, retrying files that do not match.
for project, date in projects:
    if start:
        # A resume point was requested: skip projects until it is reached.
        if start != project:
            print('Skipping %s, %s' % (project, date))
            continue
        start = '' #reset, so every following project is processed

    rule = '-' * 50
    print('%s \nChecking %s %s \n%s' % (rule, project, date, rule))
    time.sleep(1) #short pause so ctrl-c can stop the script between projects
    f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))
    try:
        htmlproj = f.read()
    finally:
        f.close()

    md5sums = None # checksum listing, fetched once per project on first use

    for dumpclass in [r'pages-meta-history\d*\.xml\.7z']:
        dumplink = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (
            re.escape(project), re.escape(date),
            re.escape(project), re.escape(date), dumpclass))
        # enwiki is split into several files, hence a list of URLs.
        # The captured href already starts with '/', so no extra separator.
        pending = ['%s%s' % (dumpsdomain, hit.group('urldump'))
                   for hit in dumplink.finditer(htmlproj)]

        # Corruption is tracked per file: only files that failed verification
        # are attempted again, and one good file cannot hide a bad one.
        maxretries = 3
        while pending and maxretries > 0:
            maxretries -= 1
            failed = []
            for urldump in pending:
                dumpfilename = urldump.split('/')[-1]
                path = '%s/%s' % (dumpfilename[0], project)
                if not os.path.exists(path):
                    os.makedirs(path)
                target = '%s/%s' % (path, dumpfilename)
                # Argument list instead of a shell string: the URL comes from
                # scraped HTML and must never be interpreted by a shell.
                # NOTE: unlike os.system, ctrl-c during the download also
                # interrupts this script; the partial file is kept and
                # "wget -c" resumes it on the next run.
                subprocess.call(['wget', '-c', urldump, '-O', target])

                if md5sums is None:
                    f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))
                    try:
                        md5sums = f.read()
                    finally:
                        f.close()
                    # Keep a copy of the checksum listing next to the dumps.
                    with open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w') as f:
                        f.write(md5sums)

                #md5check
                md52 = _expected_md5(md5sums, dumpfilename)
                if md52 is None:
                    # Nothing to compare against; retrying cannot help, so the
                    # download is kept and merely reported as unverified.
                    print('No md5sum published for %s, cannot verify it' % (dumpfilename))
                    continue

                md51 = _md5_of_file(target)
                if md51 is None:
                    print('Download of %s failed, no file to check' % (dumpfilename))
                    failed.append(urldump)
                    continue

                print(md51)
                print(md52)
                if md51 == md52:
                    print('md5sum is correct for this file, horay! \\o/')
                    print('\n'*3)
                else:
                    # Corrupt download: drop it so the retry starts clean.
                    os.remove(target)
                    failed.append(urldump)
            pending = failed

        for urldump in pending:
            print('Giving up on %s after repeated failures' % (urldump))
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`# -*- coding: utf-8 -*-`

			`# Copyright (C) 2011 WikiTeam`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

			`import re`
skiping option git-svn-id: https://wikiteam.googlecode.com/svn/trunk@171 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-28 21:27:29 +00:00			`import sys`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`import os`
			`import time`
			`import urllib`

fixing wikipediadownloader.py, dumps url git-svn-id: https://wikiteam.googlecode.com/svn/trunk@759 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2012-08-07 08:27:45 +00:00			`dumpsdomain = 'http://dumps.wikimedia.org'`
			`f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain))`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`raw = f.read()`
			`f.close()`

fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`m = re.compile(r'<a href="(?P<project>[^>]+)/(?P<date>\d+)">[^<]+</a>: <span class=\'done\'>Dump complete</span>').finditer(raw)`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`projects = []`
			`for i in m:`
fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`projects.append([i.group('project'), i.group('date')])`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`projects.reverse() #oldest project dump, download first`
fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`#projects = [['enwiki', '20110405']]`
improved fail download management; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@170 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-28 13:44:50 +00:00
skiping option git-svn-id: https://wikiteam.googlecode.com/svn/trunk@171 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-28 21:27:29 +00:00			`start = ''`
			`if len(sys.argv) == 2:`
			`start = sys.argv[1].lower()`

fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`for project, date in projects:`
skiping option git-svn-id: https://wikiteam.googlecode.com/svn/trunk@171 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-28 21:27:29 +00:00			`if start:`
			`if start != project:`
			`print 'Skipping %s, %s' % (project, date)`
			`continue`
			`else:`
			`start = '' #reset`

fixing wikipediadownloader.py, dumps url git-svn-id: https://wikiteam.googlecode.com/svn/trunk@759 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2012-08-07 08:27:45 +00:00			`print '-'*50, '\n', 'Checking', project, date, '\n', '-'*50`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`time.sleep(1) #ctrl-c`
fixing wikipediadownloader.py, dumps url git-svn-id: https://wikiteam.googlecode.com/svn/trunk@759 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2012-08-07 08:27:45 +00:00			`f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date))`
improved fail download management; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@170 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-28 13:44:50 +00:00			`htmlproj = f.read()`
			`#print htmlproj`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`f.close()`

fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`for dumpclass in ['pages-meta-history\d*\.xml\.7z']:`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`corrupted = True`
improved fail download management; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@170 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-28 13:44:50 +00:00			`maxretries = 3`
			`while corrupted and maxretries > 0:`
			`maxretries -= 1`
fixing wikipediadownloader.py, dumps url git-svn-id: https://wikiteam.googlecode.com/svn/trunk@759 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2012-08-07 08:27:45 +00:00			`m = re.compile(r'<a href="(?P<urldump>/%s/%s/%s-%s-%s)">' % (project, date, project, date, dumpclass)).finditer(htmlproj)`
fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`urldumps = []`
fixing wikipediadownloader.py, dumps url git-svn-id: https://wikiteam.googlecode.com/svn/trunk@759 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2012-08-07 08:27:45 +00:00			`for i in m: #enwiki is splitted in several files, thats why we need a loop here`
			`urldumps.append('%s/%s' % (dumpsdomain, i.group('urldump')))`
fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00
			`#print urldumps`
			`for urldump in urldumps:`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`dumpfilename = urldump.split('/')[-1]`
fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`path = '%s/%s' % (dumpfilename[0], project)`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`if not os.path.exists(path):`
			`os.makedirs(path)`
			`os.system('wget -c %s -O %s/%s' % (urldump, path, dumpfilename))`

			`#md5check`
			`os.system('md5sum %s/%s > md5' % (path, dumpfilename))`
			`f = open('md5', 'r')`
			`raw = f.read()`
			`f.close()`
			`md51 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0]`
			`print md51`

fixing wikipediadownloader.py, dumps url git-svn-id: https://wikiteam.googlecode.com/svn/trunk@759 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2012-08-07 08:27:45 +00:00			`f = urllib.urlopen('%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date))`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`raw = f.read()`
			`f.close()`
fixed for enwiki which uses several chunks for big files; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@168 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-26 12:48:26 +00:00			`f = open('%s/%s-%s-md5sums.txt' % (path, project, date), 'w')`
wikipedia dumps downloader; git-svn-id: https://wikiteam.googlecode.com/svn/trunk@167 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 2011-06-25 22:24:41 +00:00			`f.write(raw)`
			`f.close()`
			`md52 = re.findall(r'(?P<md5>[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0]`
			`print md52`

			`if md51 == md52:`
			`print 'md5sum is correct for this file, horay! \o/'`
			`print '\n'*3`
			`corrupted = False`
			`else:`
			`os.remove('%s/%s' % (path, dumpfilename))`