@@ -24,12 +24,14 @@ import cPickle
import datetime
import getopt
import json
import gzip
try:
    from hashlib import md5
except ImportError:  # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
import StringIO
import subprocess
import sys
import time
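The two new imports work together: every download site in this patch asks for gzip and, when the server honours it, unpacks the body before parsing. A minimal sketch of that core step, assuming f is a urllib2 response object as in the hunks below (the variable names here are illustrative):

# gzip.GzipFile on Python 2 seeks on its fileobj, and a urllib2 response is
# not seekable, so the compressed body is read into memory and wrapped in a
# StringIO buffer before decompression.
compressed = f.read()
raw = gzip.GzipFile(fileobj=StringIO.StringIO(compressed)).read()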
@@ -99,8 +101,11 @@ def getNamespacesScraper(config={}):
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
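The request/decompress block above recurs almost verbatim in every hunk that follows; only the parsing of the result differs. A hedged sketch of how the pattern could be factored into a single helper; the name readResponse is hypothetical and not something this patch introduces:

def readResponse(f):
    # Hypothetical helper: return the response body, decompressing it only
    # if the server actually answered the Accept-Encoding: gzip offer.
    data = f.read()
    encoding = f.headers.get('Content-Encoding')
    if encoding and 'gzip' in encoding:
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return data

# Each call site would then reduce to:
#     f = urllib2.urlopen(req)
#     raw = readResponse(f)
#     f.close()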
@@ -131,8 +136,11 @@ def getNamespacesAPI(config={}):
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
        else:
            result = json.loads(f.read())
        f.close()
        delay(config=config)
@@ -172,7 +180,7 @@ def getPageTitlesAPI(config={}):
        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        headers = {'User-Agent': getUserAgent()}
        headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
@@ -190,6 +198,9 @@ def getPageTitlesAPI(config={}):
                print 'An error has occurred while retrieving page titles with API'
                print 'Please, resume the dump, --resume'
                sys.exit()
            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
            else:
                jsontitles = json.loads(unicode(f.read(), 'utf-8'))
            f.close()
            apfrom = ''
@@ -218,8 +229,12 @@ def getPageTitlesScraper(config={}):
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
        raw = urllib2.urlopen(req).read()
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        raw = cleanHTML(raw)
        r_title = r'title="(?P<title>[^>]+)">'
@@ -255,8 +270,12 @@ def getPageTitlesScraper(config={}):
                if not name in checked_suballpages:
                    checked_suballpages.append(name)  # to avoid reload dupe subpages links
                    delay(config=config)
                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
                    raw2 = urllib2.urlopen(req).read()
                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
                    f = urllib2.urlopen(req)
                    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                        raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
                    else:
                        raw2 = f.read()
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2  # merge it after removed junk
                    print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
@@ -375,6 +394,9 @@ def getXMLPageCore(headers={}, params={}, config={}):
            print 'Please, resume the dump, --resume'
            sys.exit()
            # The error is usually temporary, but we exit the dump altogether.
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            xml = f.read()
        c += 1
@@ -391,7 +413,7 @@ def getXMLPage(config={}, title='', verbose=True):
    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    headers = {'User-Agent': getUserAgent()}
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
@@ -546,8 +568,11 @@ def getImageFilenamesURL(config={}):
    retries = 5
    while offset:
        # 5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
@@ -623,7 +648,7 @@ def getImageFilenamesURLAPI(config={}):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
    headers = {'User-Agent': getUserAgent()}
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    aifrom = '!'
    images = []
    while aifrom:
@@ -642,6 +667,9 @@ def getImageFilenamesURLAPI(config={}):
            print 'An error has occurred while retrieving page titles with API'
            print 'Please, resume the dump, --resume'
            sys.exit()
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            xml = f.read()
        f.close()
        delay(config=config)
@@ -1000,9 +1028,11 @@ def getParameters(params=[]):
def checkAPI(api, config={}):
    """ Checking API availability """
    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
    else:
        result = json.loads(f.read())
    f.close()
    delay(config=config)
@@ -1014,8 +1044,11 @@ def checkAPI(api, config={}):
def checkIndexphp(indexphp, config={}):
    """ Checking index.php availability """
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        raw = f.read()
    f.close()
    delay(config=config)
@@ -1210,8 +1243,11 @@ def saveSpecialVersion(config={}):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
@@ -1227,8 +1263,11 @@ def saveIndexPHP(config={}):
        print 'index.html exists, do not overwrite'
    else:
        print 'Downloading index.php (Main Page) as index.html'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
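To see whether a given wiki actually benefits from this change, one can compare the on-the-wire size with the decompressed size for a single request. A rough, self-contained sketch with a placeholder URL (none of these names come from the patch):

import gzip
import StringIO
import urllib2

# Placeholder URL; substitute the index.php of the wiki being dumped.
url = 'http://example.org/w/index.php?title=Special:Version'
f = urllib2.urlopen(urllib2.Request(url=url, headers={'Accept-Encoding': 'gzip'}))
body = f.read()
if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
    html = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    print 'gzip honoured: %d bytes on the wire, %d bytes decompressed' % (len(body), len(html))
else:
    print 'server sent %d uncompressed bytes' % (len(body))
f.close()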