@@ -24,51 +24,29 @@ import cPickle
import datetime
import argparse
import json
import gzip
try:
    from hashlib import md5
except ImportError: # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
import StringIO
import requests
import subprocess
import sys
import time
import urllib
import urllib2

__VERSION__ = '0.2.2' #major, minor, micro

def getVersion():
    return(__VERSION__)
# This class is from https://github.com/crustymonkey/py-sonic/blob/master/libsonic/connection.py#L50
class POSTHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
                or code in (301, 302, 303, 307) and m == "POST"):
            newurl = newurl.replace(' ', '%20')
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type"))
            data = None
            if req.has_data():
                data = req.get_data()
            return urllib2.Request(newurl,
                                   data=data,
                                   headers=newheaders,
                                   origin_req_host=req.get_origin_req_host(),
                                   unverifiable=True)
        else:
            raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
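# Side note (not part of the patch): the requests.Session used elsewhere in this change
# follows redirects on its own, so a custom redirect handler is not needed there.
# A minimal sketch, with a hypothetical URL:
#
#     session = requests.Session()
#     r = session.post('http://example.org/w/index.php',
#                      data={'title': 'Special:Export'},
#                      allow_redirects=True)  # the default
#     print r.status_code, [resp.status_code for resp in r.history]  # redirect chain, if any
#
# Unlike this handler, requests may turn a redirected POST into a GET on 301/302/303,
# while 307 keeps the method.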
def truncateFilename(other={}, filename=''):
    """ Truncate filenames when downloading images with large filenames """
    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
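# Usage sketch: a long name is cut to other['filenamelimit'] characters, the md5 of the
# full name is appended to keep it unique, and the original extension is preserved.
#
#     other = {'filenamelimit': 100}
#     longname = 'a' * 250 + '.png'
#     short = truncateFilename(other=other, filename=longname)
#     # len(short) == 100 + 32 + len('.png') and short still ends in '.png'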
def delay(config={}):
def delay(config={}, session=None):
    """ Add a delay if configured for that """
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
@@ -95,20 +73,15 @@ def cleanHTML(raw=''):
        sys.exit()
    return raw

def getNamespacesScraper(config={}):
def getNamespacesScraper(config={}, session=None):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """
    namespaces = config['namespaces']
    namespacenames = {0: ''} # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={'title': 'Special:Allpages', }, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"
        if 'all' in namespaces:
@@ -131,19 +104,14 @@ def getNamespacesScraper(config={}):
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames

def getNamespacesAPI(config={}):
def getNamespacesAPI(config={}, session=None):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''} # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
        else:
            result = json.loads(f.read())
        f.close()
        delay(config=config)
        r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
        result = json.loads(r.text)
        delay(config=config, session=session)
        if 'all' in namespaces:
            namespaces = []
@@ -169,10 +137,10 @@ def getNamespacesAPI(config={}):
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames
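# Sketch of the API call that getNamespacesAPI() now issues through the shared session
# (assumes session and config['api'] are already set up as in getParameters()):
#
#     r = session.post(url=config['api'],
#                      data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'},
#                      headers={'User-Agent': getUserAgent()})
#     result = json.loads(r.text)
#     # result['query']['namespaces'] maps namespace ids (as strings) to dicts whose
#     # '*' key holds the local name, e.g. {'0': {'*': ''}, '1': {'*': 'Talk'}, ...}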
def getPageTitlesAPI(config={}):
def getPageTitlesAPI(config={}, session=None):
    """ Uses the API to get the list of page titles """
    titles = []
    namespaces, namespacenames = getNamespacesAPI(config=config)
    namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print 'Skipping namespace = %d' % (namespace)
@@ -180,29 +148,14 @@ def getPageTitlesAPI(config={}):
        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
        headers = {'User-Agent': getUserAgent()}
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.') #progress
            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
            data = urllib.urlencode(params)
            req = urllib2.Request(url=config['api'], data=data, headers=headers)
            try:
                f = urllib2.urlopen(req)
            except:
                try:
                    print '(1) Server is slow... Waiting some seconds and retrying...'
                    time.sleep(10)
                    f = urllib2.urlopen(req)
                except:
                    print 'An error has occurred while retrieving page titles with API'
                    print 'Please, resume the dump, --resume'
                    sys.exit()
            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
            else:
                jsontitles = json.loads(unicode(f.read(), 'utf-8'))
            f.close()
            r = session.post(url=config['api'], data=params, headers=headers)
            #FIXME Handle HTTP errors here!
            jsontitles = json.loads(r.text)
            apfrom = ''
            if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'):
                if jsontitles['query-continue']['allpages'].has_key('apcontinue'):
@@ -218,23 +171,19 @@ def getPageTitlesAPI(config={}):
                titles = list(set(titles))
                apfrom = ''
            c += len(jsontitles['query']['allpages'])
            delay(config=config)
            delay(config=config, session=session)
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles
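# One possible way to address the "#FIXME Handle HTTP errors here!" above (a sketch only,
# not part of the patch). postWithRetries is a hypothetical helper; it retries the POST a
# few times and re-raises the last error if the server keeps failing.
def postWithRetries(session, url, data, headers, retries=3, sleeptime=10):
    last_error = None
    for attempt in range(retries):
        try:
            r = session.post(url=url, data=data, headers=headers)
            r.raise_for_status()  # turn 4xx/5xx responses into exceptions
            return r
        except requests.exceptions.RequestException as error:
            last_error = error
            print 'HTTP error, waiting %d seconds and retrying...' % (sleeptime)
            time.sleep(sleeptime)
    raise last_error
# Usage (sketch): r = postWithRetries(session, config['api'], params, headers)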
def getPageTitlesScraper(config={}):
def getPageTitlesScraper(config={}, session=None):
    """ """
    titles = []
    namespaces, namespacenames = getNamespacesScraper(config=config)
    namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        r = session.get(url=url, headers={'User-Agent': getUserAgent()})
        raw = r.text
        raw = cleanHTML(raw)
        r_title = r'title="(?P<title>[^>]+)">'
@@ -269,24 +218,20 @@ def getPageTitlesScraper(config={}):
                    if not name in checked_suballpages:
                        checked_suballpages.append(name) #to avoid reload dupe subpages links
                        delay(config=config)
                        req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
                        f = urllib2.urlopen(req2)
                        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                            raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
                        else:
                            raw2 = f.read()
                        delay(config=config, session=session)
                        r2 = session.get(url=url, headers={'User-Agent': getUserAgent()})
                        raw2 = r2.text
                        raw2 = cleanHTML(raw2)
                        rawacum += raw2 #merge it after removed junk
                        print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
                delay(config=config)
                delay(config=config, session=session)
                c += 1
        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = undoHTMLEntities(text=unicode(i.group('title'), 'utf-8'))
            t = undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if not t in titles:
                    titles.append(t)
@@ -294,7 +239,7 @@ def getPageTitlesScraper(config={}):
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles

def getPageTitles(config={}):
def getPageTitles(config={}, session=None):
    """ Get list of page titles """
    #http://en.wikipedia.org/wiki/Special:AllPages
    #http://archiveteam.org/index.php?title=Special:AllPages
@@ -304,9 +249,9 @@ def getPageTitles(config={}):
    titles = []
    if config['api']:
        titles = getPageTitlesAPI(config=config)
        titles = getPageTitlesAPI(config=config, session=session)
    elif config['index']:
        titles = getPageTitlesScraper(config=config)
        titles = getPageTitlesScraper(config=config, session=session)
    titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace))
    titles.sort() #sorting
@@ -314,22 +259,22 @@ def getPageTitles(config={}):
    print '%d page titles loaded' % (len(titles))
    return titles

def getXMLHeader(config={}):
def getXMLHeader(config={}, session=None):
    """ Retrieve a random page to extract XML headers (namespace info, etc) """
    #get the header of a random page, to attach it in the complete XML backup
    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....
    randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ
    xml = getXMLPage(config=config, title=randomtitle, verbose=False)
    xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session)
    header = xml.split('</mediawiki>')[0]
    if not xml:
        print 'XML export on this wiki is broken, quitting.'
        sys.exit()
    return header

def getXMLFileDesc(config={}, title=''):
def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    config['curonly'] = 1 #tricky to get only the most recent desc
    return getXMLPage(config=config, title=title, verbose=False)
    return getXMLPage(config=config, title=title, verbose=False, session=session)

def getUserAgent():
    """ Return a cool user-agent to hide Python user-agent """
@@ -348,7 +293,7 @@ def logerror(config={}, text=''):
        f.write(output.encode('utf-8'))
        f.close()

def getXMLPageCore(headers={}, params={}, config={}):
def getXMLPageCore(headers={}, params={}, config={}, session=None):
    """ """
    #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki>
    #if retrieving params['limit'] revisions fails, returns a current only version
@@ -380,30 +325,14 @@ def getXMLPageCore(headers={}, params={}, config={}):
            print 'Saving in the errors log, and skipping...'
            logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages']))
            return '' # empty xml
        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['index'], data=data, headers=headers)
        try:
            f = urllib2.urlopen(req)
        except:
            try:
                print '(2) Server is slow... Waiting some seconds and retrying...'
                time.sleep(15)
                f = urllib2.urlopen(req)
            except:
                print 'An error has occurred while retrieving "%s"' % (params['pages'])
                print 'Please, resume the dump, --resume'
                sys.exit()
                # The error is usually temporary, but we exit the dump altogether.
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            xml = f.read()
        #FIXME HANDLE HTTP Errors HERE
        r = session.post(url=config['index'], data=params, headers=headers)
        xml = r.text
        c += 1
    return xml
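# For reference (a sketch only, parameter names follow getXMLPage() below and the values
# are illustrative): the POST that getXMLPageCore() sends to Special:Export looks roughly
# like this, with 'offset' advanced to the last timestamp already received when a long
# history is fetched in chunks.
#
#     params = {'title': 'Special:Export', 'pages': 'Main_Page', 'action': 'submit',
#               'offset': '1', 'limit': 1000}
#     r = session.post(url=config['index'], data=params, headers={'User-Agent': getUserAgent()})
#     xml = r.text  # a <mediawiki>...</mediawiki> document with up to 'limit' revisions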
def getXMLPage(config={}, title='', verbose=True):
def getXMLPage(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page """
    #if server errors occur while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partially truncated
@@ -414,8 +343,8 @@ def getXMLPage(config={}, title='', verbose=True):
    title_ = title
    title_ = re.sub(' ', '_', title_)
    #do not convert & into %26, title_ = re.sub('&', '%26', title_)
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
    headers = {'User-Agent': getUserAgent()}
    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
        params['limit'] = 1
@@ -425,7 +354,7 @@ def getXMLPage(config={}, title='', verbose=True):
    if config.has_key('templates') and config['templates']: #in other case, do not set params['templates']
        params['templates'] = 1
    xml = getXMLPageCore(headers=headers, params=params, config=config)
    xml = getXMLPageCore(headers=headers, params=params, config=config, session=session)
    #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    #else, warning about Special:Export truncating large page histories
@@ -433,7 +362,7 @@ def getXMLPage(config={}, title='', verbose=True):
    if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
        while not truncated and params['offset']: #next chunk
            params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
            xml2 = getXMLPageCore(headers=headers, params=params, config=config)
            xml2 = getXMLPageCore(headers=headers, params=params, config=config, session=session)
            if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>?
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
@@ -475,11 +404,11 @@ def cleanXML(xml=''):
    xml = xml.split('</mediawiki>')[0]
    return xml

def generateXMLDump(config={}, titles=[], start=''):
def generateXMLDump(config={}, titles=[], start='', session=None):
    """ Generates a XML dump for a list of titles """
    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config)
    header = getXMLHeader(config=config, session=session)
    footer = '</mediawiki>\n' #new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
@@ -508,7 +437,7 @@ def generateXMLDump(config={}, titles=[], start=''):
        #requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.write(header.encode('utf-8'))
        xmlfile.close()
    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
@@ -520,17 +449,17 @@ def generateXMLDump(config={}, titles=[], start=''):
            lock = False
        if lock:
            continue
        delay(config=config)
        delay(config=config, session=session)
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title)
        xml = getXMLPage(config=config, title=title, session=session)
        xml = cleanXML(xml=xml)
        if not xml:
            logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
        #here, XML is a correct <page> </page> chunk or
        #an empty string due to a deleted page (logged in errors log) or
        #an empty string due to an error while retrieving the page from server (logged in errors log)
        xmlfile.write(xml)
        xmlfile.write(xml.encode('utf-8'))
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
@@ -547,18 +476,18 @@ def saveTitles(config={}, titles=[]):
    print 'Titles saved at...', titlesfilename

def saveImageFilenamesURL(config={}, images=[]):
def saveImageFilenamesURL(config={}, images=[], session=None):
    """ Save image list in a file, including filename, url and uploader """
    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    output = u"%s\n--END--" % (u'\n'.join([u'%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
    imagesfile.write(output.encode('utf-8'))
    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
    imagesfile.write('\n--END--')
    imagesfile.close()
    print 'Image filenames and URLs saved at...', imagesfilename

def getImageFilenamesURL(config={}):
def getImageFilenamesURL(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
@@ -569,14 +498,9 @@ def getImageFilenamesURL(config={}):
    retries = 5
    while offset:
        #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8')
        else:
            raw = unicode(f.read(), 'utf-8')
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
            if limit > 10:
                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
@@ -645,36 +569,20 @@ def getImageFilenamesURL(config={}):
    images.sort()
    return images

def getImageFilenamesURLAPI(config={}):
def getImageFilenamesURLAPI(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    headers = {'User-Agent': getUserAgent()}
    aifrom = '!'
    images = []
    while aifrom:
        sys.stderr.write('.') #progress
        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['api'], data=data, headers=headers)
        try:
            f = urllib2.urlopen(req)
        except:
            try:
                print '(3) Server is slow... Waiting some seconds and retrying...'
                time.sleep(10)
                f = urllib2.urlopen(req)
            except:
                print 'An error has occurred while retrieving page titles with API'
                print 'Please, resume the dump, --resume'
                sys.exit()
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            jsonimages = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
        else:
            jsonimages = json.loads(unicode(f.read(), 'utf-8'))
        f.close()
        #print jsonimages
        delay(config=config)
        #FIXME Handle HTTP Errors HERE
        r = session.post(url=config['api'], data=params, headers=headers)
        jsonimages = json.loads(r.text)
        delay(config=config, session=session)
        aifrom = ''
        if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
            if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
@@ -691,13 +599,8 @@ def getImageFilenamesURLAPI(config={}):
                domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
                url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
            url = re.sub(' ', '_', url)
            if image.has_key('name'):
                #some API returns image name http://hastur.net/w/api.php?action=query&list=allimages&aiprop=user|url&ailimit=10
                filename = re.sub('_', ' ', image['name'])
            else:
                #other not http://wiki.annotation.jp/api.php?action=query&list=allimages&aiprop=user|url&ailimit=10
                #tips for dealing with unquote http://stackoverflow.com/questions/5139249/python-url-unquote-unicode
                filename = re.sub('_', ' ', unicode(urllib2.unquote(url.encode('ascii')).split('/')[-1], 'utf-8'))
            # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136
            filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
            uploader = re.sub('_', ' ', image['user'])
            images.append([filename, url, uploader])
@@ -720,7 +623,7 @@ def undoHTMLEntities(text=''):
    return text

def generateImageDump(config={}, other={}, images=[], start=''):
def generateImageDump(config={}, other={}, images=[], start='', session=None):
    """ Save files and descriptions using a file list """
    #fix use subdirectories md5
@@ -739,11 +642,11 @@ def generateImageDump(config={}, other={}, images=[], start=''):
            lock = False
        if lock:
            continue
        delay(config=config)
        delay(config=config, session=session)
        #saving file
        #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max)
        filename2 = filename
        filename2 = urllib.unquote(filename)
        if len(filename2) > other['filenamelimit']:
            # split last . (extension) and then merge
            filename2 = truncateFilename(other=other, filename=filename2)
@@ -761,21 +664,21 @@ def generateImageDump(config={}, other={}, images=[], start=''):
        # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?
        #saving description if any
        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename)) # use Image: for backwards compatibility
        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
        f = open('%s/%s.desc' % (imagepath, filename2), 'w')
        if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
            #failure when retrieving desc? then save it as empty .desc
            xmlfiledesc = ''
        f.write(xmlfiledesc)
        f.write(xmlfiledesc.encode('utf-8'))
        f.close()
        delay(config=config)
        delay(config=config, session=session)
        c += 1
        if c % 10 == 0:
            print 'Downloaded %d images' % (c)
    print 'Downloaded %d images' % (c)
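# Sketch (not part of the patch) of how a single file could be fetched through the same
# session and written to disk; imagepath, filename2 and url follow the names used in
# generateImageDump() above.
#
#     r = session.get(url=url, headers={'User-Agent': getUserAgent()}, stream=True)
#     imagefile = open('%s/%s' % (imagepath, filename2), 'wb')
#     for chunk in r.iter_content(chunk_size=64 * 1024):  # stream in 64 KB chunks
#         if chunk:
#             imagefile.write(chunk)
#     imagefile.close()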
def saveLogs(config={}):
def saveLogs(config={}, session=None):
    """ Save Special:Log """
    #get all logs from Special:Log
    """ parse

@@ -793,9 +696,9 @@ def saveLogs(config={}):
    <option value="">Todos los registros</option>
    </select>
    """
    delay(config=config)
    delay(config=config, session=session)

def domain2prefix(config={}):
def domain2prefix(config={}, session=None):
    """ Convert domain name to a valid prefix filename. """
    # At this point, both api and index are supposed to be defined
@@ -966,6 +869,15 @@ def getParameters(params=[]):
    else:
        index = args.index
    cj = cookielib.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        print 'Using cookies from %s' % args.cookies
    session = requests.Session()
    session.cookies = cj
    session.headers = {'User-Agent': getUserAgent()}
    config = {
        'curonly': args.curonly,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
@@ -984,18 +896,12 @@ def getParameters(params=[]):
        'resume': args.resume,
        'filenamelimit': 100, #do not change
        'force': args.force,
        'session': session
    }

    if config['cookies']:
        cj = cookielib.MozillaCookieJar()
        cj.load(config['cookies'])
        opener = urllib2.build_opener(POSTHTTPRedirectHandler, urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        print 'Using cookies from %s' % config['cookies']
    if config['api']:
        #check api.php
        if checkAPI(config['api'], config):
        if checkAPI(config['api'], config, session=other['session']):
            print 'api.php is OK'
        else:
            print 'Error in api.php, please, provide a correct path to api.php'
@@ -1003,7 +909,7 @@ def getParameters(params=[]):
    if config['index']:
        #check index.php
        if checkIndexphp(config['index'], config):
        if checkIndexphp(config['index'], config, session=other['session']):
            print 'index.php is OK'
        else:
            print 'Error in index.php, please, provide a correct path to index.php'
@@ -1011,39 +917,29 @@ def getParameters(params=[]):
    #calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config, session=session), config['date'])
    return config, other
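# A stand-alone sketch of the session wiring done in getParameters() above. The helper
# name and the cookies-file argument are hypothetical; everything else mirrors the code
# in this patch (and assumes "import cookielib" at the top of the file, as the
# cookie-loading code above does).
def makeSession(cookiesfile=None):
    cj = cookielib.MozillaCookieJar()
    if cookiesfile:
        cj.load(cookiesfile)  # Netscape/Mozilla-format cookie file, e.g. exported from a browser
    session = requests.Session()
    session.cookies = cj  # requests accepts a cookielib CookieJar here
    session.headers = {'User-Agent': getUserAgent()}
    return session
# Usage (sketch): session = makeSession(cookiesfile=args.cookies)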
def checkAPI(api, config={}):
def checkAPI(api, config={}, session=None):
    """ Checking API availability """
    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        resultText = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        resultText = f.read()
    f.close()
    global cj
    r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
    resultText = r.text
    print 'Checking api.php...', api
    if "MediaWiki API is not enabled for this site." in resultText:
        return False
    result = json.loads(resultText)
    delay(config=config)
    delay(config=config, session=session)
    if result.has_key('query'):
        return True
    return False
def checkIndexphp(indexphp, config={}):
def checkIndexphp(indexphp, config={}, session=None):
    """ Checking index.php availability """
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        raw = f.read()
    f.close()
    delay(config=config)
    r = session.post(url=indexphp, data={'title': 'Special:Version'}, headers={'User-Agent': getUserAgent()})
    raw = r.text
    delay(config=config, session=session)
    print 'Checking index.php...', indexphp
    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not config['cookies']: # Workaround for issue 71
        print "ERROR: This wiki requires login and we are not authenticated"
@@ -1062,7 +958,7 @@ def removeIP(raw=''):
    return raw

def checkXMLIntegrity(config={}):
def checkXMLIntegrity(config={}, session=None):
    """ Check XML dump integrity, to detect broken XML chunks """
    return
@@ -1072,7 +968,7 @@ def checkXMLIntegrity(config={}):
    checkpageclose = 0
    checkrevisionopen = 0
    checkrevisionclose = 0
    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
        if "<revision>" in line:
            checkrevisionopen += 1
        elif "</revision>" in line:
@@ -1103,19 +999,19 @@ def createNewDump(config={}, other={}):
    images = []
    print 'Trying generating a new dump into a new directory...'
    if config['xml']:
        titles += getPageTitles(config=config)
        titles += getPageTitles(config=config, session=other['session'])
        saveTitles(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles, session=other['session'])
        checkXMLIntegrity(config=config)
    if config['images']:
        if config['api']:
            images += getImageFilenamesURLAPI(config=config)
            images += getImageFilenamesURLAPI(config=config, session=other['session'])
        else:
            images += getImageFilenamesURL(config=config)
        saveImageFilenamesURL(config=config, images=images)
        generateImageDump(config=config, other=other, images=images)
            images += getImageFilenamesURL(config=config, session=other['session'])
        saveImageFilenamesURL(config=config, images=images, session=other['session'])
        generateImageDump(config=config, other=other, images=images, session=other['session'])
    if config['logs']:
        saveLogs(config=config)
        saveLogs(config=config, session=session)

def resumePreviousDump(config={}, other={}):
    titles = []
@@ -1125,7 +1021,7 @@ def resumePreviousDump(config={}, other={}):
        #load titles
        lasttitle = ''
        try:
            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8')
            titles = raw.split('\n')
            lasttitle = titles[-1]
@@ -1140,13 +1036,13 @@ def resumePreviousDump(config={}, other={}):
        else:
            print 'Title list is incomplete. Reloading...'
            #do not resume, reload, to avoid inconsistencies, deleted pages and so on
            titles = getPageTitles(config=config)
            titles = getPageTitles(config=config, session=other['session'])
            saveTitles(config=config, titles=titles)
        #checking xml dump
        xmliscomplete = False
        lastxmltitle = ''
        try:
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            for l in f:
                if re.findall('</mediawiki>', l):
                    #xml dump is complete
@@ -1176,7 +1072,7 @@ def resumePreviousDump(config={}, other={}):
        #load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8').strip()
            lines = raw.split('\n')
            for l in lines:
@@ -1192,9 +1088,9 @@ def resumePreviousDump(config={}, other={}):
            print 'Image list is incomplete. Reloading...'
            #do not resume, reload, to avoid inconsistencies, deleted images and so on
            if config['api']:
                images = getImageFilenamesURLAPI(config=config)
                images = getImageFilenamesURLAPI(config=config, session=session)
            else:
                images = getImageFilenamesURL(config=config)
                images = getImageFilenamesURL(config=config, session=session)
            saveImageFilenamesURL(config=config, images=images)
        #checking images directory
        listdir = []
@@ -1228,47 +1124,37 @@ def resumePreviousDump(config={}, other={}):
            #fix
            pass

def saveSpecialVersion(config={}):
def saveSpecialVersion(config={}, session=None):
    """ Save Special:Version as .html, to preserve extensions details """
    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={'title': 'Special:Version', }, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        f = open('%s/Special:Version.html' % (config['path']), 'w')
        f.write(raw)
        f.write(raw.encode('utf-8'))
        f.close()
def saveIndexPHP(config={}):
def saveIndexPHP(config={}, session=None):
    """ Save index.php as .html, to preserve license details available at the bottom of the page """
    if os.path.exists('%s/index.html' % (config['path'])):
        print 'index.html exists, do not overwrite'
    else:
        print 'Downloading index.php (Main Page) as index.html'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={}, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        f = open('%s/index.html' % (config['path']), 'w')
        f.write(raw)
        f.write(raw.encode('utf-8'))
        f.close()

def saveSiteInfo(config={}):
def saveSiteInfo(config={}, session=None):
    """ Save a file with site info """
    if config['api']:
@@ -1276,14 +1162,9 @@ def saveSiteInfo(config={}):
            print 'siteinfo.json exists, do not overwrite'
        else:
            print 'Downloading site info as siteinfo.json'
            req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
            f = urllib2.urlopen(req)
            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
            else:
                result = json.loads(f.read())
            f.close()
            delay(config=config)
            r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
            result = json.loads(r.text)
            delay(config=config, session=session)
            f = open('%s/siteinfo.json' % (config['path']), 'w')
            f.write(json.dumps(result, indent=4, sort_keys=True))
            f.close()
@@ -1324,6 +1205,7 @@ def main(params=[]):
    """ Main function """
    configfilename = 'config.txt'
    session = requests.Session()
    config, other = getParameters(params=params)
    avoidWikimediaProjects(config=config, other=other)
@@ -1364,9 +1246,9 @@ def main(params=[]):
    else:
        createNewDump(config=config, other=other)
    saveIndexPHP(config=config)
    saveSpecialVersion(config=config)
    saveSiteInfo(config=config)
    saveIndexPHP(config=config, session=session)
    saveSpecialVersion(config=config, session=session)
    saveSiteInfo(config=config, session=session)
    bye()

if __name__ == "__main__":