#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2012 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import subprocess
import sys
import time

import dumpgenerator
wikis = open(sys.argv[1], 'r').read().splitlines()
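
# Process each wiki in turn: skip it if a finished .7z archive already exists,
# otherwise download (or resume) it with dumpgenerator and compress the result.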
for wiki in wikis:
    print "#" * 73
    print "# Downloading", wiki
    print "#" * 73

    wiki = wiki.lower()
    prefix = dumpgenerator.domain2prefix(config={'api': wiki})
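    # domain2prefix() turns the API URL into the filename prefix that
    # dumpgenerator uses for the dump directory and the dump files.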

    # Check whether a compressed archive already exists; if so, this wiki is finished.
    compressed = False
    for dirname, dirnames, filenames in os.walk('.'):
        if dirname == '.':
            for f in filenames:
                if f.startswith(prefix) and f.endswith('.7z'):
                    compressed = True
                    zipfilename = f
            break  # Stop searching; do not explore subdirectories.

    if compressed:
        print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
        # Get the archive's file list.
        archivecontent = subprocess.check_output(['7z', 'l', zipfilename])
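        # Note: subprocess.check_output() requires Python 2.7 or newer.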
        if re.search(ur"%s.+-history\.xml" % (prefix), archivecontent) is None:
            # We should perhaps not create an archive in this case, but we continue anyway.
            print "ERROR: The archive contains no history!"
        if re.search(ur"Special:Version\.html", archivecontent) is None:
            print "WARNING: The archive doesn't contain Special:Version.html; this may indicate that the download didn't finish."
        continue

    # Download the wiki, resuming if a previous attempt left a dump directory behind.
    started = False  # Was this wiki download started before? Then resume.
    wikidir = ''
    for dirname, dirnames, filenames in os.walk('.'):
        if dirname == '.':
            for d in dirnames:
                if d.startswith(prefix):
                    wikidir = d
                    started = True
            break  # Stop searching; do not explore subdirectories.

    if started and wikidir:  # Resume.
        print 'Resuming download, using directory', wikidir
        os.system('python dumpgenerator.py --api=%s --xml --images --resume --path=%s' % (wiki, wikidir))
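        # --resume makes dumpgenerator continue from the state saved inside the
        # dump directory (e.g. config.txt), keeping already-downloaded content.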
    else:  # Download from scratch.
        os.system('python dumpgenerator.py --api=%s --xml --images' % wiki)
        # Save the name of the freshly created dump directory.
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for d in dirnames:
                    if d.startswith(prefix):
                        wikidir = d
                break  # Stop searching; do not explore subdirectories.
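
    # Dump directories are named '<prefix>-wikidump...' (dumpgenerator appends a date),
    # so splitting on '-wikidump' recovers the bare prefix used in the dump file names.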
    prefix = wikidir.split('-wikidump')[0]

    finished = False
    if started and wikidir and prefix:
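        # grep -q exits 0 when the closing tag is present; a nonzero (truthy)
        # return code from subprocess.call() means the tag is missing.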
        if subprocess.call(['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True):
            print "No </mediawiki> tag found: dump failed, needs fixing; resume didn't work. Skipping compression for this wiki."
        else:
            finished = True
    # You can also run this in your working directory to find all incomplete dumps:
    # tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"

    # Compress the finished dump.
    if finished:
        time.sleep(1)
        os.chdir(wikidir)
        print 'Changed directory to', os.getcwd()
        # Basic integrity check for the XML: this only prints tag counts, so compare
        # them yourself (<page> should match </page> and <revision> should match
        # </revision>). If the dump is broken, the only remedy is to redownload it.
        os.system('grep "<title>" *.xml -c; grep "<page>" *.xml -c; grep "</page>" *.xml -c; grep "<revision>" *.xml -c; grep "</revision>" *.xml -c')
        # Make a non-solid archive with all the text and metadata at default compression.
        os.system('7z a -ms=off ../%s-history.xml.7z %s-history.xml %s-titles.txt %s-images.txt index.html Special:Version.html config.txt errors.log' % (prefix, prefix, prefix, prefix))
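        # -ms=off disables 7z's solid mode, so a single file can be extracted
        # from the archive without decompressing everything before it.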
        # Now add the images, if there are any, to a second archive copied from the
        # first, so the text is not recompressed. Use the minimum compression rate:
        # higher rates don't compress images much more.
        os.system('cp ../%s-history.xml.7z ../%s-wikidump.7z' % (prefix, prefix))
        os.system('7z a -ms=off -mx=1 ../%s-wikidump.7z images/' % prefix)
        os.chdir('..')
        print 'Changed directory to', os.getcwd()

    time.sleep(1)