From d44e70ba7fcc6b071fdcded06cfc14943cb26e44 Mon Sep 17 00:00:00 2001
From: emijrp
Date: Wed, 6 Apr 2011 12:26:13 +0000
Subject: [PATCH] git-svn-id: https://wikiteam.googlecode.com/svn/trunk@10 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95

---
 dumpgenerator.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index f93e76a..fb9e893 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -118,12 +118,12 @@ def getHeader(domain=''):
 
 def getXML(domain='', title='', curonly=False):
     #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
-    limit = 1000
+    limit = 100
     truncated = False
     title_ = re.sub(' ', '_', title)
     print title
     headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'}
-    params = {'title': 'Special:Export', 'pages': title, 'action': 'submit', }
+    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
     if curonly:
         params['curonly'] = 1
     else:
@@ -153,6 +153,7 @@ def getXML(domain='', title='', curonly=False):
             else: #offset is OK in this wiki, merge with the previous chunk of this page history and continue
                 xml = xml.split('</page>')[0]+xml2.split('<page>\n')[1]
+                print 'merging'
         else:
             params['offset'] = '' #no more edits in this page history
     print title, len(re.findall(r_timestamp, xml)), 'edits'
 
@@ -167,17 +168,18 @@ if __name__ == '__main__':
     #domain = 'http://archiveteam.org/index.php'
     #domain = 'http://wikanda.cadizpedia.eu/w/index.php'
     domain = 'http://en.citizendium.org/index.php'
+    domain = 'http://en.wikipedia.org/w/index.php'
     curonly = False
     namespaces = [0]
 
     if re.findall(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', domain):
         print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'
-        sys.exit()
+        #sys.exit()
 
     #get titles
     print 'Loading page titles from namespaces =', ','.join([str(i) for i in namespaces])
     #titles = getAllPageTitles(domain=domain, namespaces=namespaces)
-    titles = ['Life']
+    titles = ['Bay of Cádiz']
     #print '\n'.join(titles)
     print '%d page titles loaded' % (len(titles))
 
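
For context, the params dict assembled in getXML() drives a POST to MediaWiki's Special:Export page. Below is a minimal standalone sketch of that request in the script's own Python 2 idiom; the domain, page title, offset handling, and the use of urllib/urllib2 here are illustrative assumptions, not code taken from dumpgenerator.py:

    import urllib
    import urllib2

    # Assumed example endpoint and page; any MediaWiki index.php works the same way.
    domain = 'http://en.citizendium.org/index.php'
    title_ = 'Bay_of_Cadiz'  # spaces already converted to underscores, as re.sub(' ', '_', title) does above

    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
    params['limit'] = 100    # cap on revisions per response; the patch lowers the script's value to 100
    params['offset'] = '1'   # assumed full-history mode: start from the earliest revision

    data = urllib.urlencode(params)
    request = urllib2.Request(domain, data, {'User-Agent': 'Mozilla/5.0'})
    xml = urllib2.urlopen(request).read()  # a <mediawiki> XML document holding the <page> and its <revision>s
    print len(xml), 'bytes of export XML'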
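
The 'merging' branch in the second hunk joins consecutive export chunks of one page history by string splitting: keep everything before the first chunk's closing </page> tag, append everything after the second chunk's opening <page> line. A toy illustration with made-up fragments, assuming the split markers are </page> and <page> as shown in the hunk above:

    # Made-up fragments standing in for two consecutive Special:Export responses.
    chunk1 = '<page>\n<revision>first edit</revision>\n</page>\n</mediawiki>'
    chunk2 = '<page>\n<revision>later edit</revision>\n</page>\n</mediawiki>'

    # Same split-and-concatenate step as in getXML(): drop chunk1's tail from
    # </page> onwards, drop chunk2's head up to and including its '<page>\n' line.
    merged = chunk1.split('</page>')[0] + chunk2.split('<page>\n')[1]
    print merged
    # <page>
    # <revision>first edit</revision>
    # <revision>later edit</revision>
    # </page>
    # </mediawiki>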