#!/usr/bin/env python2.5
# -*- coding: utf-8 -*-
# Copyright (C) 2011 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import datetime
import os
import re
import subprocess
import sys
import urllib
import urllib2
# todo:
# curonly and full history (curonly can batch several requests into a single GET; for full history, request each page one by one)
# use the API, or parse the HTML when the API is not available
# http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
# threads to download faster? request several pages at once
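#
# overall flow, for orientation: getAllPageTitles() scrapes Special:Allpages for the titles in
# the requested namespaces, getXML() posts each title to Special:Export (using offset/limit to
# page through long histories), and the __main__ block concatenates the cleaned per-page XML
# between a single header and footer into one dump file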
def cleanHTML(raw=''):
    #keep only the rendered page body, delimited by the skin's HTML comment marks
    if re.search('<!-- bodytext -->', raw): #<!-- bodytext --> ... <!-- /bodytext -->
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw): #<!-- start content --> ... <!-- end content -->
        raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
    else:
        print 'This wiki doesn\'t use the expected HTML comment marks to delimit the content'
        sys.exit()
    return raw
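# note: the marks checked in cleanHTML() are the HTML comments that MonoBook-style MediaWiki
# skins emit around the rendered page body; wikis whose skin emits neither pair of marks will
# hit the sys.exit() branch, so this scraper only works where those comments are present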
def getAllPageTitles(domain='', namespaces=[]):
#http://en.wikipedia.org/wiki/Special:AllPages
#http://archiveteam.org/index.php?title=Special:AllPages
#http://www.wikanda.es/wiki/Especial:Todas
if not domain:
        print 'Please use the --domain parameter'
sys.exit()
#namespace checks and stuff
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
raw = urllib.urlopen('%s?title=Special:Allpages' % (domain)).read()
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? also matches options with selected="selected"
if 'all' in namespaces:
namespaces = []
for i in m:
namespaces.append(int(i.group("namespaceid")))
namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
else:
#check if those namespaces really exist in this wiki
namespaces2 = []
for i in m:
if int(i.group("namespaceid")) in namespaces:
namespaces2.append(int(i.group("namespaceid")))
namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
namespaces = namespaces2
else:
namespaces = [0]
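    # for reference, the namespace <select> in the Special:Allpages form renders options roughly
    # like <option value="0">(Main)</option> or <option value="4" selected="selected">Project</option>;
    # the exact markup can vary between MediaWiki versions, hence the permissive [^>]*? in the regex above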
    #retrieve all titles from Special:Allpages; if the wiki is big, there may be sub-Allpages pages to explore
    namespaces = list(set(namespaces)) #remove duplicates
titles = []
for namespace in namespaces:
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (domain, namespace)
raw = urllib.urlopen(url).read()
raw = cleanHTML(raw)
r_title = r'title="(?P
[^>]+)">'
r_suballpages = r'&from=(?P[^>]+)&to=(?P[^>]+)">'
deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels
c = 0
checked_suballpages = []
rawacum = raw
while re.search(r_suballpages, raw) and c < deep:
#load sub-Allpages
m = re.compile(r_suballpages).finditer(raw)
for i in m:
fr = i.group('from')
to = i.group('to')
name = '%s-%s' % (fr, to)
if not name in checked_suballpages:
checked_suballpages.append(name)
url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (domain, namespace, fr, to) #do not put urllib.quote in fr or to
raw2 = urllib.urlopen(url).read()
raw2 = cleanHTML(raw2)
                    rawacum += raw2 #merge it after the junk has been removed
print ' Detected sub-Allpages:', name, len(raw2), 'bytes', len(re.findall(r_title, raw2))
c += 1
m = re.compile(r_title).finditer(rawacum)
for i in m:
if not i.group('title').startswith('Special:'):
if not i.group('title') in titles:
titles.append(i.group('title'))
return titles
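# illustrative call (mirrors what __main__ below does for the main namespace):
#   titles = getAllPageTitles(domain='http://archiveteam.org/index.php', namespaces=[0])
#   print '%d titles' % len(titles)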
def getHeader(domain=''):
    #get the XML header from a Special:Export request, to attach it to the complete XML backup
    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" ... version="0.3" xml:lang="en">
    randomtitle = 'AMF5LKE43MNFGHKSDMRTJ' #arbitrary string; the page does not need to exist, Special:Export still returns the header
    xml = getXML(domain=domain, title=randomtitle, curonly=True)
    header = xml.split('</mediawiki>')[0]
    return header
def getXML(domain='', title='', curonly=False):
#http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
limit = 1000
title_ = re.sub(' ', '_', title)
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'}
params = {'title': 'Special:Export', 'pages': title, 'action': 'submit', }
if curonly:
params['curonly'] = 1
else:
params['offset'] = '1'
params['limit'] = limit
data = urllib.urlencode(params)
req = urllib2.Request(url=domain, data=data, headers=headers)
f = urllib2.urlopen(req)
xml = f.read()
    #if downloading the complete history, check whether this page has more than `limit` edits; if so, retrieve the rest using the offset parameter
    if not curonly:
        xml2 = xml
        while len(re.findall(r'<timestamp>', xml2)) == limit:
            #try to retrieve more revisions, although the history may have exactly `limit` edits
            params['offset'] = re.findall(r'<timestamp>([^<]+)</timestamp>', xml2)[-1]
            data = urllib.urlencode(params)
            req2 = urllib2.Request(url=domain, data=data, headers=headers)
            f2 = urllib2.urlopen(req2)
            xml2 = f2.read()
            if re.findall(r'<timestamp>([^<]+)</timestamp>', xml2)[-1] == params['offset']:
                print 'ATTENTION: This wiki does not allow the offset parameter in Special:Export, so pages with large histories may be truncated'
                break
            #splice the next chunk of revisions for this page into the accumulated XML
            xml = xml.split('</page>')[0] + xml2.split('<page>\n')[1]
        print title, len(xml2), re.findall(r'<timestamp>[^<]+</timestamp>', xml2)
    return xml
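# for reference, the POST built in getXML() is roughly equivalent to submitting this form data
# to index.php (Some_page is only an illustrative title):
#   title=Special:Export&pages=Some_page&action=submit&offset=1&limit=1000
# see http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export for the parameters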
def cleanXML(xml=''):
    #strip the <siteinfo> header and the closing </mediawiki> tag, keeping only the <page> element
    xml = xml.split('</siteinfo>\n')[1]
    xml = xml.split('</mediawiki>')[0]
    return xml
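# a Special:Export response looks roughly like
#   <mediawiki ...><siteinfo>...</siteinfo>  <page>...</page></mediawiki>
# so cleanXML() keeps only the <page>...</page> part, which __main__ then writes between the
# shared header from getHeader() and the closing </mediawiki> footer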
if __name__ == '__main__':
domain = 'http://archiveteam.org/index.php' # 'http://en.wikipedia.org/w'
#domain = 'http://wikanda.cadizpedia.eu/w/index.php' # 'http://en.wikipedia.org/w'
curonly = False
namespaces = [0]
if re.findall(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', domain):
print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'
sys.exit()
#get titles
print 'Loading page titles from namespaces =', ','.join([str(i) for i in namespaces])
titles = getAllPageTitles(domain=domain, namespaces=namespaces)
#print '\n'.join(titles)
print '%d titles loaded' % (len(titles))
#get xml
print 'Retrieving the XML for every title'
header = getHeader(domain=domain)
    footer = '</mediawiki>\n' #close the root element opened in the header
xmlfilename = 'wikidump-%s.xml' % (str(datetime.datetime.now()))
xmlfile = open(xmlfilename, 'w')
xmlfile.write(header)
c = 1
for title in titles:
if c % 10 == 0:
print ' Downloaded %d pages' % (c)
xml = getXML(domain=domain, title=title, curonly=curonly)
xml = cleanXML(xml=xml)
xmlfile.write(xml)
c += 1
xmlfile.write(footer)
xmlfile.close()
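# to use this script: edit the hardcoded domain, curonly and namespaces values above and run it
# with Python 2.x; it writes a single wikidump-<timestamp>.xml file in the MediaWiki XML export
# format, which should be importable with the standard MediaWiki import tools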