
Add Content-Encoding: gzip support

This commit is contained in:
balr0g 2014-06-27 08:51:44 -04:00
parent 7a6ef18339
commit d60e560571
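The change advertises gzip support by adding an 'Accept-Encoding: gzip' header to every urllib2 request, and decompresses the response body whenever the server answers with 'Content-Encoding: gzip'. A minimal standalone sketch of that idiom (Python 2; the fetch() name and example User-Agent are illustrative, not part of this commit):

import gzip
import StringIO
import urllib2

def fetch(url, user_agent='wikiteam-example'):
    # Ask the server for a gzip-compressed response.
    req = urllib2.Request(url=url, headers={'User-Agent': user_agent, 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    data = f.read()
    # Decompress only if the server actually honoured the request.
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    f.close()
    return data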


@@ -24,12 +24,14 @@ import cPickle
 import datetime
 import getopt
 import json
+import gzip
 try:
     from hashlib import md5
 except ImportError: # Python 2.4 compatibility
     from md5 import new as md5
 import os
 import re
+import StringIO
 import subprocess
 import sys
 import time
@@ -99,8 +101,11 @@ def getNamespacesScraper(config={}):
     namespaces = config['namespaces']
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
@@ -131,8 +136,11 @@ def getNamespacesAPI(config={}):
     namespaces = config['namespaces']
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
-        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        result = json.loads(f.read())
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+        else:
+            result = json.loads(f.read())
         f.close()
         delay(config=config)
@@ -172,7 +180,7 @@ def getPageTitlesAPI(config={}):
         c = 0
         print ' Retrieving titles in the namespace %d' % (namespace)
-        headers = {'User-Agent': getUserAgent()}
+        headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
         apfrom = '!'
         while apfrom:
             sys.stderr.write('.') #progress
@@ -190,6 +198,9 @@ def getPageTitlesAPI(config={}):
                 print 'An error has occurred while retrieving page titles with API'
                 print 'Please, resume the dump, --resume'
                 sys.exit()
-            jsontitles = json.loads(unicode(f.read(), 'utf-8'))
+            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+                jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
+            else:
+                jsontitles = json.loads(unicode(f.read(), 'utf-8'))
             f.close()
             apfrom = ''
@@ -218,8 +229,12 @@ def getPageTitlesScraper(config={}):
     for namespace in namespaces:
         print ' Retrieving titles in the namespace', namespace
         url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
-        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
-        raw = urllib2.urlopen(req).read()
+        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+        f = urllib2.urlopen(req)
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         raw = cleanHTML(raw)
         r_title = r'title="(?P<title>[^>]+)">'
@@ -255,8 +270,12 @@ def getPageTitlesScraper(config={}):
                 if not name in checked_suballpages:
                     checked_suballpages.append(name) #to avoid reload dupe subpages links
                     delay(config=config)
-                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
-                    raw2 = urllib2.urlopen(req).read()
+                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+                    f = urllib2.urlopen(req)
+                    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+                        raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+                    else:
+                        raw2 = f.read()
                     raw2 = cleanHTML(raw2)
                     rawacum += raw2 #merge it after removed junk
                     print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
@@ -375,6 +394,9 @@ def getXMLPageCore(headers={}, params={}, config={}):
                 print 'Please, resume the dump, --resume'
                 sys.exit()
             # The error is usually temporary, but we exit the dump altogether.
-        xml = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            xml = f.read()
         c += 1
@@ -391,7 +413,7 @@ def getXMLPage(config={}, title='', verbose=True):
     title_ = title
     title_ = re.sub(' ', '_', title_)
     #do not convert & into %26, title_ = re.sub('&', '%26', title_)
-    headers = {'User-Agent': getUserAgent()}
+    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
     params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
     if config['curonly']:
         params['curonly'] = 1
@@ -546,8 +568,11 @@ def getImageFilenamesURL(config={}):
     retries = 5
     while offset:
         #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
@@ -623,7 +648,7 @@ def getImageFilenamesURLAPI(config={}):
     """ Retrieve file list: filename, url, uploader """
     print 'Retrieving image filenames'
-    headers = {'User-Agent': getUserAgent()}
+    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
     aifrom = '!'
     images = []
     while aifrom:
@@ -642,6 +667,9 @@ def getImageFilenamesURLAPI(config={}):
             print 'An error has occurred while retrieving page titles with API'
             print 'Please, resume the dump, --resume'
             sys.exit()
-        xml = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            xml = f.read()
         f.close()
         delay(config=config)
@@ -1000,9 +1028,11 @@ def getParameters(params=[]):
 def checkAPI(api, config={}):
     """ Checking API availability """
-    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
+    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
     f = urllib2.urlopen(req)
-    result = json.loads(f.read())
+    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+        result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+    else:
+        result = json.loads(f.read())
     f.close()
     delay(config=config)
@@ -1014,8 +1044,11 @@ def checkAPI(api, config={}):
 def checkIndexphp(indexphp, config={}):
     """ Checking index.php availability """
-    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
     f = urllib2.urlopen(req)
-    raw = f.read()
+    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+    else:
+        raw = f.read()
     f.close()
     delay(config=config)
@@ -1210,8 +1243,11 @@ def saveSpecialVersion(config={}):
         print 'Special:Version.html exists, do not overwrite'
     else:
         print 'Downloading Special:Version with extensions and other related info'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
@@ -1227,8 +1263,11 @@ def saveIndexPHP(config={}):
         print 'index.html exists, do not overwrite'
     else:
         print 'Downloading index.php (Main Page) as index.html'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
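
The same four-line decode branch is repeated at every call site touched above. As an illustration only (not part of this commit), it could be factored into a shared helper; readResponse() is a hypothetical name:

import gzip
import StringIO

def readResponse(f):
    # Read an urllib2 response body, transparently decompressing it
    # when the server answered with Content-Encoding: gzip.
    data = f.read()
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return data

Call sites would then reduce to raw = readResponse(f) or result = json.loads(readResponse(f)).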