From 4633837bbf5bdbe9df377663bd8af41a35ddd195 Mon Sep 17 00:00:00 2001 From: emijrp Date: Thu, 7 Apr 2011 15:56:48 +0000 Subject: [PATCH] git-svn-id: https://wikiteam.googlecode.com/svn/trunk@17 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- dumpgenerator.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index f0f0c0f..24cc015 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -21,6 +21,7 @@ import os import re import subprocess import sys +import time import urllib import urllib2 @@ -186,6 +187,7 @@ def generateXMLDump(config={}, titles=[]): xmlfile.write(header) c = 1 for title in titles: + time.sleep(config['delay']) if c % 10 == 0: print ' Downloaded %d pages' % (c) xml = getXMLPage(config={}, title=title) @@ -207,9 +209,9 @@ def saveTitles(config={}, titles=[]): def generateImageDump(config={}): #slurp all the images - #special:imagelist #save in a .tar? #tener en cuenta http://www.mediawiki.org/wiki/Manual:ImportImages.php + #fix, download .desc ? print 'Retrieving image filenames' r_next = r'(?\d+)&' # (?[^>]+/./../[^>]+)">[^<]+').finditer(raw) for i in m: url = i.group('url') + if url[0] == '/': #relative URL ZOMG! + url = '%s%s' % (config['domain'].split('/index.php')[0], url) filename = re.sub('_', ' ', url.split('/')[-1]) filename_ = re.sub(' ', '_', url.split('/')[-1]) images.append([filename, url]) @@ -240,6 +244,7 @@ def generateImageDump(config={}): c = 0 for filename, url in images: + time.sleep(config['delay']) urllib.urlretrieve(url, '%s/%s' % (imagepath, filename)) c += 1 if c % 10 == 0: @@ -311,13 +316,14 @@ def getParameters(): 'namespaces': [0], 'path': '', 'threads': 1, + 'delay': 0, } other = { 'resume': False, } #console params try: - opts, args = getopt.getopt(sys.argv[1:], "", ["h", "help", "path=", "domain=", "images", "logs", "xml", "curonly", "threads=", "resume" ]) + opts, args = getopt.getopt(sys.argv[1:], "", ["h", "help", "path=", "domain=", "images", "logs", "xml", "curonly", "threads=", "resume", "delay=" ]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" @@ -352,6 +358,8 @@ def getParameters(): config["threads"] = int(a) elif o in ("--resume"): other["resume"] = True + elif o in ("--delay"): + config["delay"] = int(a) else: assert False, "unhandled option" @@ -386,6 +394,8 @@ def main(): print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!' sys.exit() + print 'Analysing %s' % (config['domain']) + #creating path or resuming if desired c = 2 while os.path.isdir(config['path']):