diff --git a/dumpgenerator.py b/dumpgenerator.py
index ff9430c..de72f7b 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -24,12 +24,19 @@ import urllib
 import urllib2
 
 # todo:
+#
+# resuming feature:
+# save all titles in a .txt, to resume after Ctrl-C
+# re.findall('<title>([^<]+)</title>', xml) to see where it was aborted, and resume from there
+#
+# other:
 # curonly and all history (curonly can accumulate several requests in a single GET; for full history, request each page one by one)
 # use the API, or parse the HTML if it is not available
 # http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
 # threads to download faster? request several pages at once
 # images?
 # Special:Log? uploads, account creations, etc
+# download Special:Version to save which extensions it uses
 
 def cleanHTML(raw=''):
     if re.search('<!-- start content -->', raw):
@@ -41,7 +48,7 @@ def cleanHTML(raw=''):
         sys.exit()
     return raw
 
-def getAllPageTitles(domain='', namespaces=[]):
+def getTitles(domain='', namespaces=[]):
     #http://en.wikipedia.org/wiki/Special:AllPages
     #http://archiveteam.org/index.php?title=Special:AllPages
     #http://www.wikanda.es/wiki/Especial:Todas
@@ -49,6 +56,8 @@
         print 'Please, use --domain parameter'
         sys.exit()
 
+    print 'Loading page titles from namespaces =', ','.join([str(i) for i in namespaces])
+
     #namespace checks and stuff
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
@@ -72,6 +81,8 @@
     #retrieve all titles from Special:Allpages, if the wiki is big, perhaps there are sub-Allpages to explore
     namespaces = [i for i in set(namespaces)] #uniques
 
+    print '%d namespaces have been found' % (len(namespaces))
+
     titles = []
     for namespace in namespaces:
         print ' Retrieving titles in the namespace', namespace
@@ -106,19 +117,20 @@
                 if not i.group('title').startswith('Special:'):
                     if not i.group('title') in titles:
                         titles.append(i.group('title'))
+    print '%d page titles loaded' % (len(titles))
     return titles
 
-def getHeader(domain=''):
+def getXMLHeader(domain=''):
     #get the header of a random page, to attach it in the complete XML backup
     #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" version="0.3" xml:lang="en">
     title = 'Main Page'
     xml = getXMLPage(domain=domain, title=title, curonly=True)
     header = xml.split('</mediawiki>')[0]
     return header
 
-def getXML(domain='', title='', curonly=False):
+def getXMLPage(domain='', title='', curonly=False):
     #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
-    limit = 100
+    limit = 1000
     truncated = False
     title_ = re.sub(' ', '_', title)
     headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'}
@@ -162,38 +174,51 @@ def cleanXML(xml=''):
     xml = xml.split('</mediawiki>')[0]
     return xml
 
-if __name__ == '__main__':
-    domain = 'http://archiveteam.org/index.php'
-    #domain = 'http://wikanda.cadizpedia.eu/w/index.php'
-    #domain = 'http://en.citizendium.org/index.php'
-    #domain = 'http://en.wikipedia.org/w/index.php'
-    curonly = False
-    namespaces = [0]
-
-    if re.findall(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', domain):
-        print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'
-        sys.exit()
-
-    #get titles
-    print 'Loading page titles from namespaces =', ','.join([str(i) for i in namespaces])
-    titles = getAllPageTitles(domain=domain, namespaces=namespaces)
-    #print '\n'.join(titles)
-    print '%d page titles loaded' % (len(titles))
-
-    #get xml
+def generateXMLDump(domain='', titles=[]):
     print 'Retrieving the XML for every page'
-    header = getHeader(domain=domain)
+    header = getXMLHeader(domain=domain)
     footer = '</mediawiki>\n' #new line at the end
-    xmlfilename = 'wikidump-%s.xml' % (str(datetime.datetime.now()))
+    xmlfilename = 'wikidump-%s-%s.xml' % (curonly and 'current' or 'history', str(datetime.datetime.now()))
     xmlfile = open(xmlfilename, 'w')
     xmlfile.write(header)
     c = 1
     for title in titles:
         if c % 10 == 0:
             print ' Downloaded %d pages' % (c)
-        xml = getXML(domain=domain, title=title, curonly=curonly)
+        xml = getXMLPage(domain=domain, title=title, curonly=curonly)
         xml = cleanXML(xml=xml)
         xmlfile.write(xml)
         c += 1
     xmlfile.write(footer)
     xmlfile.close()
+
+def saveTitles():
+    #save titles in a txt for resume if needed
+    pass
+
+def generateImageDump():
+    #slurp all the images
+    pass
+
+if __name__ == '__main__':
+    #read sys.argv
+
+    #domain = 'http://archiveteam.org/index.php'
+    #domain = 'http://bulbapedia.bulbagarden.net/w/index.php'
+    #domain = 'http://wikanda.cadizpedia.eu/w/index.php'
+    #domain = 'http://en.citizendium.org/index.php'
+    #domain = 'http://en.wikipedia.org/w/index.php'
+    domain = 'http://www.editthis.info/CODE_WIKI/'
+    domain = 'http://www.editthis.info/bobobo_WIKI/'
+    domain = 'http://osl2.uca.es/wikira/index.php'
+    curonly = False
+    namespaces = ['all']
+
+    if re.findall(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', domain):
+        print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'
+        sys.exit()
+
+    titles = getTitles(domain=domain, namespaces=namespaces)
+    saveTitles()
+    generateXMLDump(domain=domain, titles=titles)
+    generateImageDump()
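
Not part of the patch: a minimal sketch of how the saveTitles() stub and the resuming TODO above could be fleshed out, assuming the titles are written one per line to a plain .txt next to the XML dump. The file name, the parameters, and the loadSavedTitles()/lastDumpedTitle() helpers are illustrative assumptions; only the <title> regex comes from the TODO in the patch itself. A resumed run would call loadSavedTitles() instead of getTitles(), find the last dumped title with lastDumpedTitle(), and continue the loop from the next title.

import re

def saveTitles(titles=[], titlesfilename='wikidump-titles.txt'):
    #save titles in a txt for resume if needed (assumed filename, one title per line)
    titlesfile = open(titlesfilename, 'w')
    titlesfile.write('\n'.join(titles))
    titlesfile.close()

def loadSavedTitles(titlesfilename='wikidump-titles.txt'):
    #reload a previously saved title list; empty list if there is none (hypothetical helper)
    try:
        titlesfile = open(titlesfilename, 'r')
    except IOError:
        return []
    titles = titlesfile.read().splitlines()
    titlesfile.close()
    return titles

def lastDumpedTitle(xmlfilename=''):
    #per the resuming TODO: scan a partial XML dump for the last <title> written,
    #so the loop over titles can skip everything up to and including it (hypothetical helper)
    xml = open(xmlfilename, 'r').read()
    dumped = re.findall(r'<title>([^<]+)</title>', xml)
    return dumped[-1] if dumped else None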