@ -710,14 +710,14 @@ def saveConfig(config={}, configfilename=''):
f . close ( )
def welcome ( ) :
""" """
""" Opening message """
print " # " * 73
print """ # Welcome to DumpGenerator 0.1 by WikiTeam (GPL v3) #
# More info at: http://code.google.com/p/wikiteam/ #"""
print " # " * 73
print ' '
print " # " * 73
print """ # Copyright (C) 2011-201 2 WikiTeam #
print """ # Copyright (C) 2011-201 3 WikiTeam #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@ -734,7 +734,7 @@ def welcome():
print ' '
def bye ( ) :
""" """
""" Closing message """
print " ---> Congratulations! Your dump is complete <--- "
print " If you found any bug, report a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list "
print " If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam "
@ -933,54 +933,28 @@ def checkXMLIntegrity(config={}):
print " XML dump is corrupted, regenerating a new dump "
generateXMLDump ( config = config , titles = titles )
def main ( params = [ ] ) :
""" Main function: greet the user, parse the parameters and settle on the dump directory.

params is handed unchanged to getParameters(), which returns the (config, other) pair used below.
The process exits through sys.exit() when a Wikimedia project is detected without the force option,
or when the user asks to resume a directory that holds no config file.

NOTE(review): a second `def main` appears further down in this file; if both live at module level
the later one replaces this one -- confirm which definition is meant to survive.
"""
welcome ( )
# name of the per-dump configuration file looked up inside the dump directory
configfilename = ' config.txt '
config , other = getParameters ( params = params )
#notice about wikipedia dumps
# the pattern is searched in the concatenation of the api and index values
if re . findall ( r ' (?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage) \ .org ' , config [ ' api ' ] + config [ ' index ' ] ) :
print ' PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS! '
print ' Download the dumps from http://dumps.wikimedia.org '
# only the force option lets the run continue past this point
if not other [ ' force ' ] :
print ' Thanks! '
sys . exit ( )
# report the api value when it is truthy, otherwise the index value
print ' Analysing %s ' % ( config [ ' api ' ] and config [ ' api ' ] or config [ ' index ' ] )
#creating path or resuming if desired
# counter used to build alternative directory names; the first alternative uses 2
c = 2
originalpath = config [ ' path ' ] # to avoid concat blabla-2, blabla-2-3, and so on...
while not other [ ' resume ' ] and os . path . isdir ( config [ ' path ' ] ) : #do not enter if resume is requested from beginning
print ' \n Warning!: " %s " path exists ' % ( config [ ' path ' ] )
reply = ' '
# keep prompting until the lower-cased reply is one of the accepted answers
while reply . lower ( ) not in [ ' yes ' , ' y ' , ' no ' , ' n ' ] :
reply = raw_input ( ' There is a dump in " %s " , probably incomplete. \n If you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored \n and the parameters available in " %s / %s " will be loaded. \n Do you want to resume ([yes, y], [no, n])? ' % ( config [ ' path ' ] , config [ ' path ' ] , configfilename ) )
if reply . lower ( ) in [ ' yes ' , ' y ' ] :
# resuming is only possible when the existing directory contains the config file
if not os . path . isfile ( ' %s / %s ' % ( config [ ' path ' ] , configfilename ) ) :
print ' No config file found. I can \' t resume. Aborting. '
sys . exit ( )
print ' You have selected: YES '
other [ ' resume ' ] = True
break
elif reply . lower ( ) in [ ' no ' , ' n ' ] :
print ' You have selected: NO '
other [ ' resume ' ] = False
# next candidate directory: always derived from the original path plus the counter
config [ ' path ' ] = ' %s - %d ' % ( originalpath , c )
print ' Trying to use path " %s " ... ' % ( config [ ' path ' ] )
c + = 1
# when resuming, the configuration returned by loadConfig() replaces the one built from this session
if other [ ' resume ' ] :
print ' Loading config file... '
config = loadConfig ( config = config , configfilename = configfilename )
def createNewDump(config={}, other=None):
    """ Generate a fresh dump: page titles and XML, image list and images, logs.

    config -- dump settings. The xml, images and logs flags select which parts
              are produced; the api value decides whether the image list is
              fetched through the API or through index.php.
    other  -- session options forwarded to generateImageDump(). Optional so
              callers that pass only config keep working; an empty dict is
              forwarded in that case.
              NOTE(review): generateImageDump() may expect specific keys in
              it -- callers holding the session options should pass them.

    Returns None. The dump directory and its config file are not created
    here: main() creates them before calling this function, and repeating
    os.mkdir() on the existing directory would raise OSError.
    """
    if other is None:
        other = {}
    titles = []
    images = []
    print(' Trying generating a new dump into a new directory... ')
    if config[' xml ']:
        titles += getPageTitles(config=config)
        saveTitles(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles)
        checkXMLIntegrity(config=config)
    if config[' images ']:
        # prefer the API listing; fall back to the index.php listing
        if config[' api ']:
            images += getImageFilenamesURLAPI(config=config)
        else:
            images += getImageFilenamesURL(config=config)
        saveImageFilenamesURL(config=config, images=images)
        generateImageDump(config=config, other=other, images=images)
    if config[' logs ']:
        saveLogs(config=config)
def resumePreviousDump ( config = { } ) :
titles = [ ]
images = [ ]
if other [ ' resume ' ] :
print ' Resuming previous dump process... '
if config [ ' xml ' ] :
#load titles
@ -1088,23 +1062,23 @@ def main(params=[]):
if config [ ' logs ' ] :
#fix
pass
def saveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extensions details.

    config -- dump settings; the path value locates the dump directory and the
              index value is the URL the request is sent to.

    Does nothing but print a notice when the target file already exists.
    Otherwise it requests the Special:Version title from the index URL,
    passes the response through removeIP() and writes the result into the
    dump directory. Returns None. Errors raised by urllib2 or by the file
    operations are not caught here.
    """
    target = ' %s /Special:Version.html ' % (config[' path '])
    if os.path.exists(target):
        print(' Special:Version.html exists, do not overwrite ')
        return

    print(' Downloading Special:Version with extensions and other related info ')
    req = urllib2.Request(url=config[' index '], data=urllib.urlencode({' title ': ' Special:Version ', }), headers={' User-Agent ': getUserAgent()})
    response = urllib2.urlopen(req)
    try:
        raw = response.read()
    finally:
        # release the connection even when reading fails
        response.close()
    raw = removeIP(raw=raw)
    output = open(target, ' w ')
    try:
        output.write(raw)
    finally:
        # close the file even when the write fails
        output.close()
def saveIndexPHP ( config = { } ) :
#save index.php as .html, to preserve license details available at the botom of the page
if os . path . exists ( ' %s /index.html ' % ( config [ ' path ' ] ) ) :
print ' index.html exists, do not overwrite '
@ -1119,20 +1093,59 @@ def main(params=[]):
f . write ( raw )
f . close ( )
#save Special:Version as .html, to preserve extensions details
if os . path . exists ( ' %s /Special:Version.html ' % ( config [ ' path ' ] ) ) :
print ' Special:Version.html exists, do not overwrite '
def avoidWikimediaProjects(config={}, other=None):
    """ Refuse to dump Wikimedia projects unless the run is forced.

    config -- dump settings; the api and index values are concatenated and
              searched for the Wikimedia project pattern.
    other  -- session options; only the force flag is read. Optional for
              backward compatibility: when omitted the run counts as not
              forced.

    Returns None. On a match a notice pointing at http://dumps.wikimedia.org
    is printed and, unless the force flag is set, sys.exit() is called
    (raising SystemExit).
    """
    if other is None:
        other = {}
    #notice about wikipedia dumps
    if re.findall(r' (?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage) \ .org ', config[' api '] + config[' index ']):
        print(' PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS! ')
        print(' Download the dumps from http://dumps.wikimedia.org ')
        if not other.get(' force '):
            print(' Thanks! ')
            sys.exit()


def main(params=[]):
    """ Main function: parse parameters, pick the dump directory, run the dump.

    params -- handed unchanged to getParameters(), which returns the
              (config, other) pair used throughout.

    Steps: print the welcome banner, refuse Wikimedia projects (unless
    forced), choose between resuming an existing directory and creating a
    new one, run the matching dump routine, then save index.php and
    Special:Version and print the closing message.

    Exits through sys.exit() when a Wikimedia project is refused or when the
    user asks to resume a directory that holds no config file.
    """
    welcome()
    configfilename = ' config.txt '
    config, other = getParameters(params=params)
    avoidWikimediaProjects(config=config, other=other)
    # report the api value when it is truthy, otherwise the index value
    print(' Analysing %s ' % (config[' api '] or config[' index ']))

    #creating path or resuming if desired
    c = 2
    originalpath = config[' path ']  # to avoid concat blabla-2, blabla-2-3, and so on...
    # do not enter if resume is requested from the beginning
    while not other[' resume '] and os.path.isdir(config[' path ']):
        print(' \n Warning!: " %s " path exists ' % (config[' path ']))
        reply = ' '
        # keep prompting until the lower-cased reply is one of the accepted answers
        while reply.lower() not in [' yes ', ' y ', ' no ', ' n ']:
            reply = raw_input(' There is a dump in " %s " , probably incomplete. \n If you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored \n and the parameters available in " %s / %s " will be loaded. \n Do you want to resume ([yes, y], [no, n])? ' % (config[' path '], config[' path '], configfilename))
        if reply.lower() in [' yes ', ' y ']:
            # resuming is only possible when the directory contains the config file
            if not os.path.isfile(' %s / %s ' % (config[' path '], configfilename)):
                print(' No config file found. I can \' t resume. Aborting. ')
                sys.exit()
            print(' You have selected: YES ')
            other[' resume '] = True
            break
        elif reply.lower() in [' no ', ' n ']:
            print(' You have selected: NO ')
            other[' resume '] = False
        # next candidate directory: the original path plus the counter
        config[' path '] = ' %s - %d ' % (originalpath, c)
        print(' Trying to use path " %s " ... ' % (config[' path ']))
        c += 1

    if other[' resume ']:
        print(' Loading config file... ')
        config = loadConfig(config=config, configfilename=configfilename)
    else:
        # The directory must exist before anything is written into it;
        # Special:Version is saved later by saveSpecialVersion().
        os.mkdir(config[' path '])
        saveConfig(config=config, configfilename=configfilename)

    if other[' resume ']:
        resumePreviousDump(config=config)
    else:
        createNewDump(config=config)

    saveIndexPHP(config=config)
    saveSpecialVersion(config=config)
    bye()
if __name__ == " __main__ " :