diff --git a/dumpgenerator.py b/dumpgenerator.py
index b4fea0d..5fe9b82 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -24,12 +24,14 @@ import cPickle
import datetime
import getopt
import json
+import gzip
try:
from hashlib import md5
except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
+import StringIO
import subprocess
import sys
import time
@@ -99,9 +101,12 @@ def getNamespacesScraper(config={}):
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
- req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- raw = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw = f.read()
f.close()
delay(config=config)
@@ -131,9 +136,12 @@ def getNamespacesAPI(config={}):
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
- req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- result = json.loads(f.read())
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+ else:
+ result = json.loads(f.read())
f.close()
delay(config=config)
@@ -172,7 +180,7 @@ def getPageTitlesAPI(config={}):
c = 0
print ' Retrieving titles in the namespace %d' % (namespace)
- headers = {'User-Agent': getUserAgent()}
+ headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
apfrom = '!'
while apfrom:
sys.stderr.write('.') #progress
@@ -190,7 +198,10 @@ def getPageTitlesAPI(config={}):
print 'An error has occurred while retrieving page titles with API'
print 'Please, resume the dump, --resume'
sys.exit()
- jsontitles = json.loads(unicode(f.read(), 'utf-8'))
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
+ else:
+ jsontitles = json.loads(unicode(f.read(), 'utf-8'))
f.close()
apfrom = ''
if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'):
@@ -218,8 +229,12 @@ def getPageTitlesScraper(config={}):
for namespace in namespaces:
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
- req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
- raw = urllib2.urlopen(req).read()
+ req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+ f = urllib2.urlopen(req)
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw = f.read()
raw = cleanHTML(raw)
r_title = r'title="(?P
[^>]+)">'
@@ -255,8 +270,12 @@ def getPageTitlesScraper(config={}):
if not name in checked_suballpages:
checked_suballpages.append(name) #to avoid reload dupe subpages links
delay(config=config)
- req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
- raw2 = urllib2.urlopen(req).read()
+ req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+ f = urllib2.urlopen(req2)
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw2 = f.read()
raw2 = cleanHTML(raw2)
rawacum += raw2 #merge it after removed junk
print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
@@ -375,7 +394,10 @@ def getXMLPageCore(headers={}, params={}, config={}):
print 'Please, resume the dump, --resume'
sys.exit()
# The error is usually temporary, but we exit the dump altogether.
- xml = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ xml = f.read()
c += 1
return xml
@@ -391,7 +413,7 @@ def getXMLPage(config={}, title='', verbose=True):
title_ = title
title_ = re.sub(' ', '_', title_)
#do not convert & into %26, title_ = re.sub('&', '%26', title_)
- headers = {'User-Agent': getUserAgent()}
+ headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
if config['curonly']:
params['curonly'] = 1
@@ -546,9 +568,12 @@ def getImageFilenamesURL(config={}):
retries = 5
while offset:
#5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
- req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- raw = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw = f.read()
f.close()
delay(config=config)
if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
@@ -623,7 +648,7 @@ def getImageFilenamesURLAPI(config={}):
""" Retrieve file list: filename, url, uploader """
print 'Retrieving image filenames'
- headers = {'User-Agent': getUserAgent()}
+ headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
aifrom = '!'
images = []
while aifrom:
@@ -642,7 +667,10 @@ def getImageFilenamesURLAPI(config={}):
print 'An error has occurred while retrieving page titles with API'
print 'Please, resume the dump, --resume'
sys.exit()
- xml = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ xml = f.read()
f.close()
delay(config=config)
# Match the query-continue, old and new format
@@ -1000,10 +1028,12 @@ def getParameters(params=[]):
def checkAPI(api, config={}):
""" Checking API availability """
-
- req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- result = json.loads(f.read())
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+ else:
+ result = json.loads(f.read())
f.close()
delay(config=config)
print 'Checking api.php...', api
@@ -1014,9 +1044,12 @@ def checkAPI(api, config={}):
def checkIndexphp(indexphp, config={}):
""" Checking index.php availability """
- req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- raw = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw = f.read()
f.close()
delay(config=config)
print 'Checking index.php...', indexphp
@@ -1210,9 +1243,12 @@ def saveSpecialVersion(config={}):
print 'Special:Version.html exists, do not overwrite'
else:
print 'Downloading Special:Version with extensions and other related info'
- req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- raw = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw = f.read()
f.close()
delay(config=config)
raw = removeIP(raw=raw)
@@ -1227,9 +1263,12 @@ def saveIndexPHP(config={}):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
- req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
+ req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req)
- raw = f.read()
+ if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+ raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+ else:
+ raw = f.read()
f.close()
delay(config=config)
raw = removeIP(raw=raw)
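
The same conditional-decompression check is repeated after every urllib2.urlopen() call in this patch. A minimal sketch of the pattern the hunks implement, factored into a single helper for illustration only (decompressResponse is a hypothetical name, not part of the patch; it assumes the request was sent with 'Accept-Encoding: gzip' as done above):

import gzip
import StringIO

def decompressResponse(f):
    """Return the body of a urllib2 response object, gunzipping it
    when the server answered with 'Content-Encoding: gzip'."""
    data = f.read()
    encoding = f.headers.get('Content-Encoding')
    if encoding and 'gzip' in encoding:
        # The body is a gzip stream; wrap it in a file-like object and decompress.
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return data

With such a helper, each call site would reduce to raw = decompressResponse(f), which is what every added if/else block in this diff expands to inline.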