moving code to functions; tiny changes in comments

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@814 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
11 years ago · 8295990df0
parent 79a310c470
commit 8295990df0
1 changed file with 175 additions and 162 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@ -710,14 +710,14 @@ def saveConfig(config={}, configfilename=''):
    f.close()
    
 def welcome():
-    """  """
+    """ Opening message """
    print "#"*73
    print """# Welcome to DumpGenerator 0.1 by WikiTeam (GPL v3)                     #
 # More info at: http://code.google.com/p/wikiteam/                      #"""
    print "#"*73
    print ''
    print "#"*73
-    print """# Copyright (C) 2011-2012 WikiTeam                                      #
+    print """# Copyright (C) 2011-2013 WikiTeam                                      #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #
@ -734,7 +734,7 @@ def welcome():
    print ''

 def bye():
-    """  """
+    """ Closing message """
    print "---> Congratulations! Your dump is complete <---"
    print "If you found any bug, report a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list"
    print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam"
@ -933,12 +933,167 @@ def checkXMLIntegrity(config={}):
        print "XML dump is corrupted, regenerating a new dump"
        generateXMLDump(config=config, titles=titles)

-def main(params=[]):
-    """ Main function """
-    welcome()
-    configfilename = 'config.txt'
-    config, other = getParameters(params=params)
+def createNewDump(config={}):
+    # Generate a brand-new dump. Each stage runs only when its config flag is
+    # set: config['xml'] (page titles + XML dump + integrity check),
+    # config['images'] (image list + image download), config['logs'] (logs).
+    # config['api'] selects the API-based image lister over the index.php one.
+    #
+    # NOTE(review): 'other' is used below but is neither a parameter nor a
+    # local of this function. main() binds 'other' as its own local, so unless
+    # a module-level 'other' exists elsewhere in the file, the image stage
+    # raises NameError -- confirm, and consider passing 'other' in from main().
+    titles = []
+    images = []
+    print 'Trying generating a new dump into a new directory...'
+    if config['xml']:
+        titles += getPageTitles(config=config)
+        saveTitles(config=config, titles=titles)
+        generateXMLDump(config=config, titles=titles)
+        checkXMLIntegrity(config=config)
+    if config['images']:
+        if config['api']:
+            images += getImageFilenamesURLAPI(config=config)
+        else:
+            images += getImageFilenamesURL(config=config)
+        saveImageFilenamesURL(config=config, images=images)
+        generateImageDump(config=config, other=other, images=images)
+    if config['logs']:
+        saveLogs(config=config)
+
+def resumePreviousDump(config={}):
+    # Continue a dump left unfinished by a previous session, using the files
+    # already present under config['path'] to decide what is still missing.
+    # Stages mirror the new-dump flow: XML (config['xml']), images
+    # (config['images']) and logs (config['logs'], currently a no-op here).
+    #
+    # NOTE(review): 'other' is used in the image stage but is neither a
+    # parameter nor a local of this function. main() binds 'other' as its own
+    # local, so unless a module-level 'other' exists elsewhere in the file,
+    # the image stage raises NameError -- confirm, and consider passing it in.
+    titles = []
+    images = []
+    print 'Resuming previous dump process...'
+    if config['xml']:
+        #load titles saved by the previous session
+        lasttitle = ''
+        try:
+            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
+            raw = f.read()
+            titles = raw.split('\n')
+            lasttitle = titles[-1]
+            if not lasttitle: #empty line at EOF ?
+                lasttitle = titles[-2]
+            f.close()
+        except:
+            # bare except: any failure (missing file, or IndexError on a
+            # one-line file) leaves lasttitle as set so far and is ignored
+            pass #probably file does not exist
+        # '--END--' as the last entry marks a fully written titles list
+        if lasttitle == '--END--':
+            #titles list is complete
+            print 'Title list was completed in the previous session'
+        else:
+            print 'Title list is incomplete. Reloading...'
+            #do not resume, reload, to avoid inconsistencies, deleted pages or so
+            titles = getPageTitles(config=config)
+            saveTitles(config=config, titles=titles)
+        #checking xml dump: scan it line by line for the closing tag and
+        #remember the last <title> seen, which is the resume point
+        xmliscomplete = False
+        lastxmltitle = ''
+        try:
+            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
+            for l in f:
+                if re.findall('</mediawiki>', l):
+                    #xml dump is complete
+                    xmliscomplete = True
+                    break
+                xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
+                if xmltitles:
+                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
+            f.close()
+        except:
+            pass #probably file does not exist
+        #removing --END-- (and trailing empty entries) before getXMLs
+        while titles and titles[-1] in ['', '--END--']:
+            titles = titles[:-1]
+        if xmliscomplete:
+            print 'XML dump was completed in the previous session'
+        elif lastxmltitle:
+            #resuming from the last title found in the partial XML
+            print 'Resuming XML dump from "%s"' % (lastxmltitle)
+            generateXMLDump(config=config, titles=titles, start=lastxmltitle)
+        else:
+            #no title found at all: corrupt? only has XML header?
+            print 'XML is corrupt? Regenerating...'
+            generateXMLDump(config=config, titles=titles)
+    
+    if config['images']:
+        #load images: every line containing a tab is split into its fields
+        lastimage = ''
+        try:
+            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
+            raw = f.read()
+            lines = raw.split('\n')
+            for l in lines:
+                if re.search(r'\t', l):
+                    images.append(l.split('\t'))
+            # NOTE(review): unlike the titles branch above, an empty last
+            # line is not skipped here; if the file ends with a newline after
+            # '--END--' the list is treated as incomplete -- confirm against
+            # how saveImageFilenamesURL writes the file
+            lastimage = lines[-1]
+            f.close()
+        except:
+            pass #probably file does not exist
+        if lastimage == '--END--':
+            print 'Image list was completed in the previous session'
+        else:
+            print 'Image list is incomplete. Reloading...'
+            #do not resume, reload, to avoid inconsistencies, deleted images or so
+            if config['api']:
+                images=getImageFilenamesURLAPI(config=config)
+            else:
+                images = getImageFilenamesURL(config=config)
+            saveImageFilenamesURL(config=config, images=images)
+        #checking images directory
+        listdir = []
+        try:
+            listdir = os.listdir('%s/images' % (config['path']))
+        except:
+            pass #probably directory does not exist
+        listdir.sort()
+        complete = True
+        lastfilename = ''
+        lastfilename2 = ''
+        c = 0
+        # walk the image list in order until the first file missing on disk;
+        # each entry must unpack into exactly three fields
+        for filename, url, uploader in images:
+            lastfilename2 = lastfilename
+            lastfilename = filename #return always the complete filename, not the truncated
+            filename2 = filename
+            # compare using the truncated name when the full one exceeds the limit
+            if len(filename2) > other['filenamelimit']:
+                filename2 = truncateFilename(other=other, filename=filename2)
+            if filename2 not in listdir:
+                complete = False
+                break
+            c +=1
+        print '%d images were found in the directory from a previous session' % (c)
+        if complete:
+            #image dump is complete
+            print 'Image dump was completed in the previous session'
+        else:
+            generateImageDump(config=config, other=other, images=images, start=lastfilename2) # we resume from previous image, which may be corrupted (or missing .desc)  by the previous session ctrl-c or abort
+    
+    if config['logs']:
+        #fix: resuming logs is not implemented yet, nothing is done here
+        pass

+def saveSpecialVersion(config={}):
+    #save Special:Version as .html, to preserve extensions details
+    # Reads config['path'] (output directory) and config['index'] (URL the
+    # request is sent to). An existing file is never overwritten.
+    # NOTE(review): the output name contains ':'; that is not a valid
+    # filename character on Windows -- confirm target platforms.
+    if os.path.exists('%s/Special:Version.html' % (config['path'])):
+        print 'Special:Version.html exists, do not overwrite'
+    else:
+        print 'Downloading Special:Version with extensions and other related info'
+        # a data payload is supplied, so urllib2 sends this request as a POST
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+        f = urllib2.urlopen(req)
+        raw = f.read()
+        f.close()
+        # presumably strips IP addresses from the fetched HTML -- see removeIP
+        raw = removeIP(raw=raw)
+        f = open('%s/Special:Version.html' % (config['path']), 'w')
+        f.write(raw)
+        f.close()
+
+def saveIndexPHP(config={}):
+    #save index.php as .html, to preserve license details available at the bottom of the page
+    # Reads config['path'] (output directory) and config['index'] (URL the
+    # request is sent to). An existing index.html is never overwritten.
+    if os.path.exists('%s/index.html' % (config['path'])):
+        print 'index.html exists, do not overwrite'
+    else:
+        print 'Downloading index.php (Main Page) as index.html'
+        # no query fields are sent: the payload is the encoding of an empty dict
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
+        f = urllib2.urlopen(req)
+        raw = f.read()
+        f.close()
+        # presumably strips IP addresses from the fetched HTML -- see removeIP
+        raw = removeIP(raw=raw)
+        f = open('%s/index.html' % (config['path']), 'w')
+        f.write(raw)
+        f.close()
+
+def avoidWikimediaProjects(config={}):
    #notice about wikipedia dumps
    if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']):
        print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
@ -947,6 +1102,12 @@ def main(params=[]):
            print 'Thanks!'
            sys.exit()

+def main(params=[]):
+    """ Main function """
+    welcome()
+    configfilename = 'config.txt'
+    config, other = getParameters(params=params)
+    avoidWikimediaProjects(config=config)
    print 'Analysing %s' % (config['api'] and config['api'] or config['index'])
    
    #creating path or resuming if desired
@ -978,161 +1139,13 @@ def main(params=[]):
        os.mkdir(config['path'])
        saveConfig(config=config, configfilename=configfilename)
    
-    titles = []
-    images = []
    if other['resume']:
-        print 'Resuming previous dump process...'
-        if config['xml']:
-            #load titles
-            lasttitle = ''
-            try:
-                f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
-                raw = f.read()
-                titles = raw.split('\n')
-                lasttitle = titles[-1]
-                if not lasttitle: #empty line at EOF ?
-                    lasttitle = titles[-2]
-                f.close()
-            except:
-                pass #probably file doesnot exists
-            if lasttitle == '--END--':
-                #titles list is complete
-                print 'Title list was completed in the previous session'
-            else:
-                print 'Title list is incomplete. Reloading...'
-                #do not resume, reload, to avoid inconsistences, deleted pages or so
-                titles = getPageTitles(config=config)
-                saveTitles(config=config, titles=titles)
-            #checking xml dump
-            xmliscomplete = False
-            lastxmltitle = ''
-            try:
-                f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
-                for l in f:
-                    if re.findall('</mediawiki>', l):
-                        #xml dump is complete
-                        xmliscomplete = True
-                        break
-                    xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
-                    if xmltitles:
-                        lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
-                f.close()
-            except:
-                pass #probably file doesnot exists
-            #removing --END-- before getXMLs
-            while titles and titles[-1] in ['', '--END--']:
-                titles = titles[:-1]
-            if xmliscomplete:
-                print 'XML dump was completed in the previous session'
-            elif lastxmltitle:
-                #resuming...
-                print 'Resuming XML dump from "%s"' % (lastxmltitle)
-                generateXMLDump(config=config, titles=titles, start=lastxmltitle)
-            else:
-                #corrupt? only has XML header?
-                print 'XML is corrupt? Regenerating...'
-                generateXMLDump(config=config, titles=titles)
-        
-        if config['images']:
-            #load images
-            lastimage = ''
-            try:
-                f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
-                raw = f.read()
-                lines = raw.split('\n')
-                for l in lines:
-                    if re.search(r'\t', l):
-                        images.append(l.split('\t'))
-                lastimage = lines[-1]
-                f.close()
-            except:
-                pass #probably file doesnot exists
-            if lastimage == '--END--':
-                print 'Image list was completed in the previous session'
-            else:
-                print 'Image list is incomplete. Reloading...'
-                #do not resume, reload, to avoid inconsistences, deleted images or so
-                if config['api']:
-                    images=getImageFilenamesURLAPI(config=config)
-                else:
-                    images = getImageFilenamesURL(config=config)
-                saveImageFilenamesURL(config=config, images=images)
-            #checking images directory
-            listdir = []
-            try:
-                listdir = os.listdir('%s/images' % (config['path']))
-            except:
-                pass #probably directory does not exist
-            listdir.sort()
-            complete = True
-            lastfilename = ''
-            lastfilename2 = ''
-            c = 0
-            for filename, url, uploader in images:
-                lastfilename2 = lastfilename
-                lastfilename = filename #return always the complete filename, not the truncated
-                filename2 = filename
-                if len(filename2) > other['filenamelimit']:
-                    filename2 = truncateFilename(other=other, filename=filename2)
-                if filename2 not in listdir:
-                    complete = False
-                    break
-                c +=1
-            print '%d images were found in the directory from a previous session' % (c)
-            if complete:
-                #image dump is complete
-                print 'Image dump was completed in the previous session'
-            else:
-                generateImageDump(config=config, other=other, images=images, start=lastfilename2) # we resume from previous image, which may be corrupted (or missing .desc)  by the previous session ctrl-c or abort
-        
-        if config['logs']:
-            #fix
-            pass
+        resumePreviousDump(config=config)
    else:
-        print 'Trying generating a new dump into a new directory...'
-        if config['xml']:
-            titles += getPageTitles(config=config)
-            saveTitles(config=config, titles=titles)
-            generateXMLDump(config=config, titles=titles)
-            checkXMLIntegrity(config=config)
-        if config['images']:
-            if config['api']:
-                images += getImageFilenamesURLAPI(config=config)
-            else:
-                images += getImageFilenamesURL(config=config)
-            saveImageFilenamesURL(config=config, images=images)
-            generateImageDump(config=config, other=other, images=images)
-        if config['logs']:
-            saveLogs(config=config)
-    
-    #save index.php as .html, to preserve license details available at the botom of the page
-    if os.path.exists('%s/index.html' % (config['path'])):
-        print 'index.html exists, do not overwrite'
-    else:
-        print 'Downloading index.php (Main Page) as index.html'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
-        f = urllib2.urlopen(req)
-        raw = f.read()
-        f.close()
-        raw = removeIP(raw=raw)
-        f = open('%s/index.html' % (config['path']), 'w')
-        f.write(raw)
-        f.close()
-    
-    #save Special:Version as .html, to preserve extensions details
-    if os.path.exists('%s/Special:Version.html' % (config['path'])):
-        print 'Special:Version.html exists, do not overwrite'
-    else:
-        print 'Downloading Special:Version with extensions and other related info'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
-        f = urllib2.urlopen(req)
-        raw = f.read()
-        f.close()
-        raw = removeIP(raw=raw)
-        f = open('%s/Special:Version.html' % (config['path']), 'w')
-        f.write(raw)
-        f.close()
+        createNewDump(config=config)

+    saveIndexPHP(config=config)    
+    saveSpecialVersion(config=config)
    bye()

 if __name__ == "__main__":