#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2012 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import subprocess
import sys
import time

import dumpgenerator
wikis = open(sys.argv[1], 'r').read().splitlines()
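
# Process each wiki in turn: skip it if a finished .7z archive already exists,
# otherwise download (or resume) it with dumpgenerator and compress the result.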
for wiki in wikis:
    print "#" * 73
    print "# Downloading", wiki
    print "#" * 73

    wiki = wiki.lower()
    prefix = dumpgenerator.domain2prefix(config={'api': wiki})
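    # domain2prefix() turns the API URL into the filename prefix that
    # dumpgenerator uses for the dump directory and the dump files.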

    # Check whether a compressed archive already exists; if so, this wiki is finished.
    compressed = False
    for dirname, dirnames, filenames in os.walk('.'):
        if dirname == '.':
            for f in filenames:
                if f.startswith(prefix) and f.endswith('.7z'):
                    compressed = True
                    zipfilename = f
            break  # Stop searching; do not explore subdirectories.

    if compressed:
        print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
        # Get the archive's file list.
        archivecontent = subprocess.check_output(['7z', 'l', zipfilename])
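        # Note: subprocess.check_output() requires Python 2.7 or newer.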
        if re.search(ur"%s.+-history\.xml" % (prefix), archivecontent) is None:
            # We should perhaps not create an archive in this case, but we continue anyway.
            print "ERROR: The archive contains no history!"
        if re.search(ur"Special:Version\.html", archivecontent) is None:
            print "WARNING: The archive doesn't contain Special:Version.html; this may indicate that the download didn't finish."
        continue

    # Download the wiki, resuming if a previous attempt left a dump directory behind.
    started = False  # Was this wiki download started before? Then resume.
    wikidir = ''
    for dirname, dirnames, filenames in os.walk('.'):
        if dirname == '.':
            for d in dirnames:
                if d.startswith(prefix):
                    wikidir = d
                    started = True
            break  # Stop searching; do not explore subdirectories.

    if started and wikidir:  # Resume.
        print 'Resuming download, using directory', wikidir
        os.system('python dumpgenerator.py --api=%s --xml --images --resume --path=%s' % (wiki, wikidir))
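        # --resume makes dumpgenerator continue from the state saved inside the
        # dump directory (e.g. config.txt), keeping already-downloaded content.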
    else:  # Download from scratch.
        os.system('python dumpgenerator.py --api=%s --xml --images' % wiki)
        # Save the name of the freshly created dump directory.
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for d in dirnames:
                    if d.startswith(prefix):
                        wikidir = d
                break  # Stop searching; do not explore subdirectories.
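
    # Dump directories are named '<prefix>-wikidump...' (dumpgenerator appends a date),
    # so splitting on '-wikidump' recovers the bare prefix used in the dump file names.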
    prefix = wikidir.split('-wikidump')[0]

    finished = False
    if started and wikidir and prefix:
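        # grep -q exits 0 when the closing tag is present; a nonzero (truthy)
        # return code from subprocess.call() means the tag is missing.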
        if subprocess.call(['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True):
            print "No </mediawiki> tag found: dump failed, needs fixing; resume didn't work. Skipping compression for this wiki."
        else:
            finished = True
    # You can also run this in your working directory to find all incomplete dumps:
    # tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"

    # Compress the finished dump.
    if finished:
        time.sleep(1)
        os.chdir(wikidir)
        print 'Changed directory to', os.getcwd()
        # Basic integrity check for the XML: this only prints tag counts, so compare
        # them yourself (<page> should match </page> and <revision> should match
        # </revision>). If the dump is broken, the only remedy is to redownload it.
        os.system('grep "<title>" *.xml -c; grep "<page>" *.xml -c; grep "</page>" *.xml -c; grep "<revision>" *.xml -c; grep "</revision>" *.xml -c')
        # Make a non-solid archive with all the text and metadata at default compression.
        os.system('7z a -ms=off ../%s-history.xml.7z %s-history.xml %s-titles.txt %s-images.txt index.html Special:Version.html config.txt errors.log' % (prefix, prefix, prefix, prefix))
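        # -ms=off disables 7z's solid mode, so a single file can be extracted
        # from the archive without decompressing everything before it.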
        # Now add the images, if there are any, to a second archive copied from the
        # first, so the text is not recompressed. Use the minimum compression rate:
        # higher rates don't compress images much more.
        os.system('cp ../%s-history.xml.7z ../%s-wikidump.7z' % (prefix, prefix))
        os.system('7z a -ms=off -mx=1 ../%s-wikidump.7z images/' % prefix)
        os.chdir('..')
        print 'Changed directory to', os.getcwd()

    time.sleep(1)