diff --git a/dumpgenerator.py b/dumpgenerator.py
index 0e6cce4..42fade6 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -710,14 +710,14 @@ def saveConfig(config={}, configfilename=''):
     f.close()
 
 def welcome():
-    """ """
+    """ Opening message """
     print "#"*73
     print """# Welcome to DumpGenerator 0.1 by WikiTeam (GPL v3)                     #
 # More info at: http://code.google.com/p/wikiteam/                      #"""
     print "#"*73
     print ''
     print "#"*73
-    print """# Copyright (C) 2011-2012 WikiTeam                                      #
+    print """# Copyright (C) 2011-2013 WikiTeam                                      #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #
@@ -734,7 +734,7 @@ def welcome():
     print ''
 
 def bye():
-    """ """
+    """ Closing message """
     print "---> Congratulations! Your dump is complete <---"
     print "If you found any bug, report a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list"
     print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam"
@@ -933,12 +933,167 @@ def checkXMLIntegrity(config={}):
         print "XML dump is corrupted, regenerating a new dump"
         generateXMLDump(config=config, titles=titles)
 
-def main(params=[]):
-    """ Main function """
-    welcome()
-    configfilename = 'config.txt'
-    config, other = getParameters(params=params)
+def createNewDump(config={}, other={}):
+    titles = []
+    images = []
+    print 'Trying to generate a new dump into a new directory...'
+    if config['xml']:
+        titles += getPageTitles(config=config)
+        saveTitles(config=config, titles=titles)
+        generateXMLDump(config=config, titles=titles)
+        checkXMLIntegrity(config=config)
+    if config['images']:
+        if config['api']:
+            images += getImageFilenamesURLAPI(config=config)
+        else:
+            images += getImageFilenamesURL(config=config)
+        saveImageFilenamesURL(config=config, images=images)
+        generateImageDump(config=config, other=other, images=images)
+    if config['logs']:
+        saveLogs(config=config)
+
+def resumePreviousDump(config={}, other={}):
+    titles = []
+    images = []
+    print 'Resuming previous dump process...'
+    if config['xml']:
+        #load titles
+        lasttitle = ''
+        try:
+            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
+            raw = f.read()
+            titles = raw.split('\n')
+            lasttitle = titles[-1]
+            if not lasttitle: #empty line at EOF?
+                lasttitle = titles[-2]
+            f.close()
+        except:
+            pass #probably the file does not exist
+        if lasttitle == '--END--':
+            #titles list is complete
+            print 'Title list was completed in the previous session'
+        else:
+            print 'Title list is incomplete. Reloading...'
+            #do not resume; reload instead, to avoid inconsistencies (deleted pages, etc.)
+            titles = getPageTitles(config=config)
+            saveTitles(config=config, titles=titles)
+        #checking xml dump
+        xmliscomplete = False
+        lastxmltitle = ''
+        try:
+            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
+            for l in f:
+                if re.findall('</mediawiki>', l):
+                    #xml dump is complete
+                    xmliscomplete = True
+                    break
+                xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if it finds more than 1, but maybe
+                if xmltitles:
+                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
+            f.close()
+        except:
+            pass #probably the file does not exist
+        #removing --END-- before getXMLs
+        while titles and titles[-1] in ['', '--END--']:
+            titles = titles[:-1]
+        if xmliscomplete:
+            print 'XML dump was completed in the previous session'
+        elif lastxmltitle:
+            #resuming...
+            print 'Resuming XML dump from "%s"' % (lastxmltitle)
+            generateXMLDump(config=config, titles=titles, start=lastxmltitle)
+        else:
+            #corrupt? only has the XML header?
+            print 'XML is corrupt? Regenerating...'
+            generateXMLDump(config=config, titles=titles)
+
+    if config['images']:
+        #load images
+        lastimage = ''
+        try:
+            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
+            raw = f.read()
+            lines = raw.split('\n')
+            for l in lines:
+                if re.search(r'\t', l):
+                    images.append(l.split('\t'))
+            lastimage = lines[-1]
+            f.close()
+        except:
+            pass #probably the file does not exist
+        if lastimage == '--END--':
+            print 'Image list was completed in the previous session'
+        else:
+            print 'Image list is incomplete. Reloading...'
+            #do not resume; reload instead, to avoid inconsistencies (deleted images, etc.)
+            if config['api']:
+                images = getImageFilenamesURLAPI(config=config)
+            else:
+                images = getImageFilenamesURL(config=config)
+            saveImageFilenamesURL(config=config, images=images)
+        #checking images directory
+        listdir = []
+        try:
+            listdir = os.listdir('%s/images' % (config['path']))
+        except:
+            pass #probably the directory does not exist
+        listdir.sort()
+        complete = True
+        lastfilename = ''
+        lastfilename2 = ''
+        c = 0
+        for filename, url, uploader in images:
+            lastfilename2 = lastfilename
+            lastfilename = filename #always keep the complete filename, not the truncated one
+            filename2 = filename
+            if len(filename2) > other['filenamelimit']:
+                filename2 = truncateFilename(other=other, filename=filename2)
+            if filename2 not in listdir:
+                complete = False
+                break
+            c += 1
+        print '%d images were found in the directory from a previous session' % (c)
+        if complete:
+            #image dump is complete
+            print 'Image dump was completed in the previous session'
+        else:
+            generateImageDump(config=config, other=other, images=images, start=lastfilename2) #resume from the previous image, which may be corrupted (or missing its .desc) if the previous session was aborted with ctrl-c
+
+    if config['logs']:
+        #fix
+        pass
+
+def saveSpecialVersion(config={}):
+    #save Special:Version as .html, to preserve extensions details
+    if os.path.exists('%s/Special:Version.html' % (config['path'])):
+        print 'Special:Version.html exists, do not overwrite'
+    else:
+        print 'Downloading Special:Version with extensions and other related info'
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+        f = urllib2.urlopen(req)
+        raw = f.read()
+        f.close()
+        raw = removeIP(raw=raw)
+        f = open('%s/Special:Version.html' % (config['path']), 'w')
+        f.write(raw)
+        f.close()
+
+def saveIndexPHP(config={}):
+    #save index.php as .html, to preserve license details available at the bottom of the page
+    if os.path.exists('%s/index.html' % (config['path'])):
+        print 'index.html exists, do not overwrite'
+    else:
+        print 'Downloading index.php (Main Page) as index.html'
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
+        f = urllib2.urlopen(req)
+        raw = f.read()
+        f.close()
+        raw = removeIP(raw=raw)
+        f = open('%s/index.html' % (config['path']), 'w')
+        f.write(raw)
+        f.close()
+
+def avoidWikimediaProjects(config={}, other={}):
     #notice about wikipedia dumps
     if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']):
         print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
         print 'Download the dumps from http://dumps.wikimedia.org'
@@ -946,7 +1101,13 @@ def main(params=[]):
         if not other['force']:
             print 'Thanks!'
             sys.exit()
-
+
+def main(params=[]):
+    """ Main function """
+    welcome()
+    configfilename = 'config.txt'
+    config, other = getParameters(params=params)
+    avoidWikimediaProjects(config=config, other=other)
     print 'Analysing %s' % (config['api'] and config['api'] or config['index'])
 
     #creating path or resuming if desired
@@ -978,161 +1139,13 @@ def main(params=[]):
         os.mkdir(config['path'])
         saveConfig(config=config, configfilename=configfilename)
 
-    titles = []
-    images = []
     if other['resume']:
-        print 'Resuming previous dump process...'
-        if config['xml']:
-            #load titles
-            lasttitle = ''
-            try:
-                f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
-                raw = f.read()
-                titles = raw.split('\n')
-                lasttitle = titles[-1]
-                if not lasttitle: #empty line at EOF ?
-                    lasttitle = titles[-2]
-                f.close()
-            except:
-                pass #probably file doesnot exists
-            if lasttitle == '--END--':
-                #titles list is complete
-                print 'Title list was completed in the previous session'
-            else:
-                print 'Title list is incomplete. Reloading...'
-                #do not resume, reload, to avoid inconsistences, deleted pages or so
-                titles = getPageTitles(config=config)
-                saveTitles(config=config, titles=titles)
-            #checking xml dump
-            xmliscomplete = False
-            lastxmltitle = ''
-            try:
-                f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
-                for l in f:
-                    if re.findall('</mediawiki>', l):
-                        #xml dump is complete
-                        xmliscomplete = True
-                        break
-                    xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
-                    if xmltitles:
-                        lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
-                f.close()
-            except:
-                pass #probably file doesnot exists
-            #removing --END-- before getXMLs
-            while titles and titles[-1] in ['', '--END--']:
-                titles = titles[:-1]
-            if xmliscomplete:
-                print 'XML dump was completed in the previous session'
-            elif lastxmltitle:
-                #resuming...
-                print 'Resuming XML dump from "%s"' % (lastxmltitle)
-                generateXMLDump(config=config, titles=titles, start=lastxmltitle)
-            else:
-                #corrupt? only has XML header?
-                print 'XML is corrupt? Regenerating...'
-                generateXMLDump(config=config, titles=titles)
-
-        if config['images']:
-            #load images
-            lastimage = ''
-            try:
-                f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
-                raw = f.read()
-                lines = raw.split('\n')
-                for l in lines:
-                    if re.search(r'\t', l):
-                        images.append(l.split('\t'))
-                lastimage = lines[-1]
-                f.close()
-            except:
-                pass #probably file doesnot exists
-            if lastimage == '--END--':
-                print 'Image list was completed in the previous session'
-            else:
-                print 'Image list is incomplete. Reloading...'
-                #do not resume, reload, to avoid inconsistences, deleted images or so
-                if config['api']:
-                    images=getImageFilenamesURLAPI(config=config)
-                else:
-                    images = getImageFilenamesURL(config=config)
-                saveImageFilenamesURL(config=config, images=images)
-            #checking images directory
-            listdir = []
-            try:
-                listdir = os.listdir('%s/images' % (config['path']))
-            except:
-                pass #probably directory does not exist
-            listdir.sort()
-            complete = True
-            lastfilename = ''
-            lastfilename2 = ''
-            c = 0
-            for filename, url, uploader in images:
-                lastfilename2 = lastfilename
-                lastfilename = filename #return always the complete filename, not the truncated
-                filename2 = filename
-                if len(filename2) > other['filenamelimit']:
-                    filename2 = truncateFilename(other=other, filename=filename2)
-                if filename2 not in listdir:
-                    complete = False
-                    break
-                c +=1
-            print '%d images were found in the directory from a previous session' % (c)
-            if complete:
-                #image dump is complete
-                print 'Image dump was completed in the previous session'
-            else:
-                generateImageDump(config=config, other=other, images=images, start=lastfilename2) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort
-
-        if config['logs']:
-            #fix
-            pass
+        resumePreviousDump(config=config, other=other)
     else:
-        print 'Trying generating a new dump into a new directory...'
-        if config['xml']:
-            titles += getPageTitles(config=config)
-            saveTitles(config=config, titles=titles)
-            generateXMLDump(config=config, titles=titles)
-            checkXMLIntegrity(config=config)
-        if config['images']:
-            if config['api']:
-                images += getImageFilenamesURLAPI(config=config)
-            else:
-                images += getImageFilenamesURL(config=config)
-            saveImageFilenamesURL(config=config, images=images)
-            generateImageDump(config=config, other=other, images=images)
-        if config['logs']:
-            saveLogs(config=config)
-
-    #save index.php as .html, to preserve license details available at the botom of the page
-    if os.path.exists('%s/index.html' % (config['path'])):
-        print 'index.html exists, do not overwrite'
-    else:
-        print 'Downloading index.php (Main Page) as index.html'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
-        f = urllib2.urlopen(req)
-        raw = f.read()
-        f.close()
-        raw = removeIP(raw=raw)
-        f = open('%s/index.html' % (config['path']), 'w')
-        f.write(raw)
-        f.close()
-
-    #save Special:Version as .html, to preserve extensions details
-    if os.path.exists('%s/Special:Version.html' % (config['path'])):
-        print 'Special:Version.html exists, do not overwrite'
-    else:
-        print 'Downloading Special:Version with extensions and other related info'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
-        f = urllib2.urlopen(req)
-        raw = f.read()
-        f.close()
-        raw = removeIP(raw=raw)
-        f = open('%s/Special:Version.html' % (config['path']), 'w')
-        f.write(raw)
-        f.close()
-
+        createNewDump(config=config, other=other)
+
+    saveIndexPHP(config=config)
+    saveSpecialVersion(config=config)
     bye()
 
 if __name__ == "__main__":