Mirror of https://github.com/WikiTeam/wikiteam (synced 2024-11-15 00:15:00 +00:00)
Add Content-Encoding: gzip support

parent 7a6ef18339
commit d60e560571
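
Every network call touched by this commit gets the same treatment: the request now advertises 'Accept-Encoding: gzip', and the response body is decompressed by hand when the server honours it, since urllib2 in Python 2 does not unpack gzip automatically. The snippet below is a minimal standalone sketch of that pattern, assuming Python 2 with urllib2, StringIO and gzip as used in dumpgenerator.py; the fetch() helper and the example User-Agent value are illustrative only and are not part of the committed code.

# Minimal sketch of the request/decompress pattern this commit applies.
# Python 2 only: urllib2 does not decompress gzip responses by itself.
import gzip
import StringIO
import urllib2

def fetch(url, data=None):
    # fetch() and the User-Agent string are illustrative, not from dumpgenerator.py.
    req = urllib2.Request(url=url, data=data,
                          headers={'User-Agent': 'example-agent', 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    # Only decompress when the server actually answered with a gzip body.
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        raw = f.read()
    f.close()
    return raw
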
@@ -24,12 +24,14 @@ import cPickle
 import datetime
 import getopt
 import json
+import gzip
 try:
     from hashlib import md5
 except ImportError:  # Python 2.4 compatibility
     from md5 import new as md5
 import os
 import re
+import StringIO
 import subprocess
 import sys
 import time
@@ -99,8 +101,11 @@ def getNamespacesScraper(config={}):
     namespaces = config['namespaces']
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
@@ -131,8 +136,11 @@ def getNamespacesAPI(config={}):
     namespaces = config['namespaces']
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
-        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        result = json.loads(f.read())
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+        else:
+            result = json.loads(f.read())
         f.close()
         delay(config=config)
@@ -172,7 +180,7 @@ def getPageTitlesAPI(config={}):

         c = 0
         print ' Retrieving titles in the namespace %d' % (namespace)
-        headers = {'User-Agent': getUserAgent()}
+        headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
         apfrom = '!'
         while apfrom:
             sys.stderr.write('.') #progress
@@ -190,6 +198,9 @@ def getPageTitlesAPI(config={}):
                 print 'An error has occurred while retrieving page titles with API'
                 print 'Please, resume the dump, --resume'
                 sys.exit()
-            jsontitles = json.loads(unicode(f.read(), 'utf-8'))
+            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+                jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
+            else:
+                jsontitles = json.loads(unicode(f.read(), 'utf-8'))
             f.close()
             apfrom = ''
@@ -218,8 +229,12 @@ def getPageTitlesScraper(config={}):
     for namespace in namespaces:
         print ' Retrieving titles in the namespace', namespace
         url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
-        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
-        raw = urllib2.urlopen(req).read()
+        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+        f = urllib2.urlopen(req)
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         raw = cleanHTML(raw)

         r_title = r'title="(?P<title>[^>]+)">'
@@ -255,8 +270,12 @@ def getPageTitlesScraper(config={}):
                 if not name in checked_suballpages:
                     checked_suballpages.append(name) #to avoid reload dupe subpages links
                     delay(config=config)
-                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
-                    raw2 = urllib2.urlopen(req).read()
+                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+                    f = urllib2.urlopen(req)
+                    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+                        raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+                    else:
+                        raw2 = f.read()
                     raw2 = cleanHTML(raw2)
                     rawacum += raw2 #merge it after removed junk
                     print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
@@ -375,6 +394,9 @@ def getXMLPageCore(headers={}, params={}, config={}):
                 print 'Please, resume the dump, --resume'
                 sys.exit()
         # The error is usually temporary, but we exit the dump altogether.
-        xml = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            xml = f.read()
         c += 1

@@ -391,7 +413,7 @@ def getXMLPage(config={}, title='', verbose=True):
     title_ = title
     title_ = re.sub(' ', '_', title_)
     #do not convert & into %26, title_ = re.sub('&', '%26', title_)
-    headers = {'User-Agent': getUserAgent()}
+    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
     params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
     if config['curonly']:
         params['curonly'] = 1
@@ -546,8 +568,11 @@ def getImageFilenamesURL(config={}):
     retries = 5
     while offset:
         #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
@@ -623,7 +648,7 @@ def getImageFilenamesURLAPI(config={}):
     """ Retrieve file list: filename, url, uploader """

     print 'Retrieving image filenames'
-    headers = {'User-Agent': getUserAgent()}
+    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
     aifrom = '!'
     images = []
     while aifrom:
@@ -642,6 +667,9 @@ def getImageFilenamesURLAPI(config={}):
             print 'An error has occurred while retrieving page titles with API'
             print 'Please, resume the dump, --resume'
             sys.exit()
-        xml = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            xml = f.read()
         f.close()
         delay(config=config)
@@ -1000,9 +1028,11 @@ def getParameters(params=[]):

 def checkAPI(api, config={}):
     """ Checking API availability """
-    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
+    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
     f = urllib2.urlopen(req)
-    result = json.loads(f.read())
+    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+        result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+    else:
+        result = json.loads(f.read())
     f.close()
     delay(config=config)
@@ -1014,8 +1044,11 @@ def checkAPI(api, config={}):

 def checkIndexphp(indexphp, config={}):
     """ Checking index.php availability """
-    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
     f = urllib2.urlopen(req)
-    raw = f.read()
+    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+    else:
+        raw = f.read()
     f.close()
     delay(config=config)
@@ -1210,8 +1243,11 @@ def saveSpecialVersion(config={}):
         print 'Special:Version.html exists, do not overwrite'
     else:
         print 'Downloading Special:Version with extensions and other related info'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)
@@ -1227,8 +1263,11 @@ def saveIndexPHP(config={}):
         print 'index.html exists, do not overwrite'
     else:
         print 'Downloading index.php (Main Page) as index.html'
-        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
+        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
-        raw = f.read()
+        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+        else:
+            raw = f.read()
         f.close()
         delay(config=config)