@@ -24,12 +24,14 @@ import cPickle
import datetime
import getopt
import json
import gzip
try:
    from hashlib import md5
except ImportError:  # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
import StringIO
import subprocess
import sys
import time
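The two new imports work together: every download site in this patch asks for gzip and, when the server honours it, unpacks the body before parsing. A minimal sketch of that core step, assuming f is a urllib2 response object as in the hunks below (the variable names here are illustrative):

# gzip.GzipFile on Python 2 seeks on its fileobj, and a urllib2 response is
# not seekable, so the compressed body is read into memory and wrapped in a
# StringIO buffer before decompression.
compressed = f.read()
raw = gzip.GzipFile(fileobj=StringIO.StringIO(compressed)).read()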
@@ -99,8 +101,11 @@ def getNamespacesScraper(config={}):
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
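The request/decompress block above recurs almost verbatim in every hunk that follows; only the parsing of the result differs. A hedged sketch of how the pattern could be factored into a single helper; the name readResponse is hypothetical and not something this patch introduces:

def readResponse(f):
    # Hypothetical helper: return the response body, decompressing it only
    # if the server actually answered the Accept-Encoding: gzip offer.
    data = f.read()
    encoding = f.headers.get('Content-Encoding')
    if encoding and 'gzip' in encoding:
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return data

# Each call site would then reduce to:
#     f = urllib2.urlopen(req)
#     raw = readResponse(f)
#     f.close()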
@@ -131,8 +136,11 @@ def getNamespacesAPI(config={}):
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
        else:
            result = json.loads(f.read())
        f.close()
        delay(config=config)
@@ -172,7 +180,7 @@ def getPageTitlesAPI(config={}):
        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        headers = {'User-Agent': getUserAgent()}
        headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
@@ -190,6 +198,9 @@ def getPageTitlesAPI(config={}):
                print 'An error has occurred while retrieving page titles with API'
                print 'Please, resume the dump, --resume'
                sys.exit()
            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
            else:
                jsontitles = json.loads(unicode(f.read(), 'utf-8'))
            f.close()
            apfrom = ''
@@ -218,8 +229,12 @@ def getPageTitlesScraper(config={}):
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
        raw = urllib2.urlopen(req).read()
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        raw = cleanHTML(raw)
        r_title = r'title="(?P<title>[^>]+)">'
@@ -255,8 +270,12 @@ def getPageTitlesScraper(config={}):
                if not name in checked_suballpages:
                    checked_suballpages.append(name)  # to avoid reload dupe subpages links
                    delay(config=config)
                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
                    raw2 = urllib2.urlopen(req).read()
                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
                    f = urllib2.urlopen(req)
                    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                        raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
                    else:
                        raw2 = f.read()
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2  # merge it after removed junk
                    print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
@@ -375,6 +394,9 @@ def getXMLPageCore(headers={}, params={}, config={}):
            print 'Please, resume the dump, --resume'
            sys.exit()
            # The error is usually temporary, but we exit the dump altogether.
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            xml = f.read()
        c += 1
@@ -391,7 +413,7 @@ def getXMLPage(config={}, title='', verbose=True):
    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    headers = {'User-Agent': getUserAgent()}
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
@@ -546,8 +568,11 @@ def getImageFilenamesURL(config={}):
    retries = 5
    while offset:
        # 5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
@@ -623,7 +648,7 @@ def getImageFilenamesURLAPI(config={}):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
    headers = {'User-Agent': getUserAgent()}
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    aifrom = '!'
    images = []
    while aifrom:
@@ -642,6 +667,9 @@ def getImageFilenamesURLAPI(config={}):
            print 'An error has occurred while retrieving page titles with API'
            print 'Please, resume the dump, --resume'
            sys.exit()
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            xml = f.read()
        f.close()
        delay(config=config)
@@ -1000,9 +1028,11 @@ def getParameters(params=[]):
def checkAPI(api, config={}):
    """ Checking API availability """
    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
    else:
        result = json.loads(f.read())
    f.close()
    delay(config=config)
@@ -1014,8 +1044,11 @@ def checkAPI(api, config={}):
def checkIndexphp(indexphp, config={}):
    """ Checking index.php availability """
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        raw = f.read()
    f.close()
    delay(config=config)
@@ -1210,8 +1243,11 @@ def saveSpecialVersion(config={}):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
@@ -1227,8 +1263,11 @@ def saveIndexPHP(config={}):
        print 'index.html exists, do not overwrite'
    else:
        print 'Downloading index.php (Main Page) as index.html'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
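To see whether a given wiki actually benefits from this change, one can compare the on-the-wire size with the decompressed size for a single request. A rough, self-contained sketch with a placeholder URL (none of these names come from the patch):

import gzip
import StringIO
import urllib2

# Placeholder URL; substitute the index.php of the wiki being dumped.
url = 'http://example.org/w/index.php?title=Special:Version'
f = urllib2.urlopen(urllib2.Request(url=url, headers={'Accept-Encoding': 'gzip'}))
body = f.read()
if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
    html = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    print 'gzip honoured: %d bytes on the wire, %d bytes decompressed' % (len(body), len(html))
else:
    print 'server sent %d uncompressed bytes' % (len(body))
f.close()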