@@ -24,51 +24,29 @@ import cPickle
import datetime
import argparse
import json
import gzip
try:
    from hashlib import md5
except ImportError: # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
import StringIO
import requests
import subprocess
import sys
import time
import urllib
import urllib2

__VERSION__ = '0.2.2' #major, minor, micro

def getVersion():
    return(__VERSION__)
# This class is from https://github.com/crustymonkey/py-sonic/blob/master/libsonic/connection.py#L50
class POSTHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
                or code in (301, 302, 303, 307) and m == "POST"):
            newurl = newurl.replace(' ', '%20')
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type"))
            data = None
            if req.has_data():
                data = req.get_data()
            return urllib2.Request(newurl,
                                   data=data,
                                   headers=newheaders,
                                   origin_req_host=req.get_origin_req_host(),
                                   unverifiable=True)
        else:
            raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
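# Side note (not part of the patch): the requests.Session used elsewhere in this change
# follows redirects on its own, so a custom redirect handler is not needed there.
# A minimal sketch, with a hypothetical URL:
#
#     session = requests.Session()
#     r = session.post('http://example.org/w/index.php',
#                      data={'title': 'Special:Export'},
#                      allow_redirects=True)  # the default
#     print r.status_code, [resp.status_code for resp in r.history]  # redirect chain, if any
#
# Unlike this handler, requests may turn a redirected POST into a GET on 301/302/303,
# while 307 keeps the method.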
def truncateFilename(other={}, filename=''):
    """ Truncate filenames when downloading images with large filenames """
    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
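# Usage sketch: a long name is cut to other['filenamelimit'] characters, the md5 of the
# full name is appended to keep it unique, and the original extension is preserved.
#
#     other = {'filenamelimit': 100}
#     longname = 'a' * 250 + '.png'
#     short = truncateFilename(other=other, filename=longname)
#     # len(short) == 100 + 32 + len('.png') and short still ends in '.png'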
def delay(config={}):
def delay(config={}, session=None):
    """ Add a delay if configured for that """
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
@@ -95,20 +73,15 @@ def cleanHTML(raw=''):
        sys.exit()
    return raw

def getNamespacesScraper(config={}):
def getNamespacesScraper(config={}, session=None):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """
    namespaces = config['namespaces']
    namespacenames = {0: ''} # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={'title': 'Special:Allpages', }, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"
        if 'all' in namespaces:
@@ -131,19 +104,14 @@ def getNamespacesScraper(config={}):
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames

def getNamespacesAPI(config={}):
def getNamespacesAPI(config={}, session=None):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''} # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
        else:
            result = json.loads(f.read())
        f.close()
        delay(config=config)
        r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
        result = json.loads(r.text)
        delay(config=config, session=session)
        if 'all' in namespaces:
            namespaces = []
@@ -169,10 +137,10 @@ def getNamespacesAPI(config={}):
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames
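# Sketch of the API call that getNamespacesAPI() now issues through the shared session
# (assumes session and config['api'] are already set up as in getParameters()):
#
#     r = session.post(url=config['api'],
#                      data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'},
#                      headers={'User-Agent': getUserAgent()})
#     result = json.loads(r.text)
#     # result['query']['namespaces'] maps namespace ids (as strings) to dicts whose
#     # '*' key holds the local name, e.g. {'0': {'*': ''}, '1': {'*': 'Talk'}, ...}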
def getPageTitlesAPI(config={}):
def getPageTitlesAPI(config={}, session=None):
    """ Uses the API to get the list of page titles """
    titles = []
    namespaces, namespacenames = getNamespacesAPI(config=config)
    namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print 'Skipping namespace = %d' % (namespace)
@@ -180,29 +148,14 @@ def getPageTitlesAPI(config={}):
        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
        headers = {'User-Agent': getUserAgent()}
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.') #progress
            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
            data = urllib.urlencode(params)
            req = urllib2.Request(url=config['api'], data=data, headers=headers)
            try:
                f = urllib2.urlopen(req)
            except:
                try:
                    print '(1) Server is slow... Waiting some seconds and retrying...'
                    time.sleep(10)
                    f = urllib2.urlopen(req)
                except:
                    print 'An error has occurred while retrieving page titles with API'
                    print 'Please, resume the dump, --resume'
                    sys.exit()
            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
            else:
                jsontitles = json.loads(unicode(f.read(), 'utf-8'))
            f.close()
            r = session.post(url=config['api'], data=params, headers=headers)
            #FIXME Handle HTTP errors here!
            jsontitles = json.loads(r.text)
            apfrom = ''
            if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'):
                if jsontitles['query-continue']['allpages'].has_key('apcontinue'):
@@ -218,23 +171,19 @@ def getPageTitlesAPI(config={}):
                titles = list(set(titles))
                apfrom = ''
            c += len(jsontitles['query']['allpages'])
            delay(config=config)
            delay(config=config, session=session)
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles
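# One possible way to address the "#FIXME Handle HTTP errors here!" above (a sketch only,
# not part of the patch). postWithRetries is a hypothetical helper; it retries the POST a
# few times and re-raises the last error if the server keeps failing.
def postWithRetries(session, url, data, headers, retries=3, sleeptime=10):
    last_error = None
    for attempt in range(retries):
        try:
            r = session.post(url=url, data=data, headers=headers)
            r.raise_for_status()  # turn 4xx/5xx responses into exceptions
            return r
        except requests.exceptions.RequestException as error:
            last_error = error
            print 'HTTP error, waiting %d seconds and retrying...' % (sleeptime)
            time.sleep(sleeptime)
    raise last_error
# Usage (sketch): r = postWithRetries(session, config['api'], params, headers)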
def getPageTitlesScraper(config={}):
def getPageTitlesScraper(config={}, session=None):
    """ """
    titles = []
    namespaces, namespacenames = getNamespacesScraper(config=config)
    namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        r = session.get(url=url, headers={'User-Agent': getUserAgent()})
        raw = r.text
        raw = cleanHTML(raw)
        r_title = r'title="(?P<title>[^>]+)">'
@@ -269,24 +218,20 @@ def getPageTitlesScraper(config={}):
                    if not name in checked_suballpages:
                        checked_suballpages.append(name) #to avoid reload dupe subpages links
                        delay(config=config)
                        req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
                        f = urllib2.urlopen(req2)
                        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                            raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
                        else:
                            raw2 = f.read()
                        delay(config=config, session=session)
                        r2 = session.get(url=url, headers={'User-Agent': getUserAgent()})
                        raw2 = r2.text
                        raw2 = cleanHTML(raw2)
                        rawacum += raw2 #merge it after removed junk
                        print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
                delay(config=config)
                delay(config=config, session=session)
                c += 1
        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = undoHTMLEntities(text=unicode(i.group('title'), 'utf-8'))
            t = undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if not t in titles:
                    titles.append(t)
@@ -294,7 +239,7 @@ def getPageTitlesScraper(config={}):
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles

def getPageTitles(config={}):
def getPageTitles(config={}, session=None):
    """ Get list of page titles """
    #http://en.wikipedia.org/wiki/Special:AllPages
    #http://archiveteam.org/index.php?title=Special:AllPages
@@ -304,9 +249,9 @@ def getPageTitles(config={}):
    titles = []
    if config['api']:
        titles = getPageTitlesAPI(config=config)
        titles = getPageTitlesAPI(config=config, session=session)
    elif config['index']:
        titles = getPageTitlesScraper(config=config)
        titles = getPageTitlesScraper(config=config, session=session)
    titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace))
    titles.sort() #sorting
@@ -314,22 +259,22 @@ def getPageTitles(config={}):
    print '%d page titles loaded' % (len(titles))
    return titles

def getXMLHeader(config={}):
def getXMLHeader(config={}, session=None):
    """ Retrieve a random page to extract XML headers (namespace info, etc) """
    #get the header of a random page, to attach it in the complete XML backup
    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....
    randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ
    xml = getXMLPage(config=config, title=randomtitle, verbose=False)
    xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session)
    header = xml.split('</mediawiki>')[0]
    if not xml:
        print 'XML export on this wiki is broken, quitting.'
        sys.exit()
    return header

def getXMLFileDesc(config={}, title=''):
def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    config['curonly'] = 1 #tricky to get only the most recent desc
    return getXMLPage(config=config, title=title, verbose=False)
    return getXMLPage(config=config, title=title, verbose=False, session=session)

def getUserAgent():
    """ Return a cool user-agent to hide Python user-agent """
@@ -348,7 +293,7 @@ def logerror(config={}, text=''):
        f.write(output.encode('utf-8'))
        f.close()

def getXMLPageCore(headers={}, params={}, config={}):
def getXMLPageCore(headers={}, params={}, config={}, session=None):
    """ """
    #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki>
    #if retrieving params['limit'] revisions fails, returns a current only version
@@ -380,30 +325,14 @@ def getXMLPageCore(headers={}, params={}, config={}):
            print 'Saving in the errors log, and skipping...'
            logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages']))
            return '' # empty xml
        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['index'], data=data, headers=headers)
        try:
            f = urllib2.urlopen(req)
        except:
            try:
                print '(2) Server is slow... Waiting some seconds and retrying...'
                time.sleep(15)
                f = urllib2.urlopen(req)
            except:
                print 'An error has occurred while retrieving "%s"' % (params['pages'])
                print 'Please, resume the dump, --resume'
                sys.exit()
                # The error is usually temporary, but we exit the dump altogether.
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            xml = f.read()
        #FIXME HANDLE HTTP Errors HERE
        r = session.post(url=config['index'], data=params, headers=headers)
        xml = r.text
        c += 1
    return xml
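# For reference (a sketch only, parameter names follow getXMLPage() below and the values
# are illustrative): the POST that getXMLPageCore() sends to Special:Export looks roughly
# like this, with 'offset' advanced to the last timestamp already received when a long
# history is fetched in chunks.
#
#     params = {'title': 'Special:Export', 'pages': 'Main_Page', 'action': 'submit',
#               'offset': '1', 'limit': 1000}
#     r = session.post(url=config['index'], data=params, headers={'User-Agent': getUserAgent()})
#     xml = r.text  # a <mediawiki>...</mediawiki> document with up to 'limit' revisions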
def getXMLPage(config={}, title='', verbose=True):
def getXMLPage(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page """
    #if server errors occur while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partially truncated
@@ -414,8 +343,8 @@ def getXMLPage(config={}, title='', verbose=True):
    title_ = title
    title_ = re.sub(' ', '_', title_)
    #do not convert & into %26, title_ = re.sub('&', '%26', title_)
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', }
    headers = {'User-Agent': getUserAgent()}
    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
        params['limit'] = 1
@@ -425,7 +354,7 @@ def getXMLPage(config={}, title='', verbose=True):
    if config.has_key('templates') and config['templates']: #in other case, do not set params['templates']
        params['templates'] = 1
    xml = getXMLPageCore(headers=headers, params=params, config=config)
    xml = getXMLPageCore(headers=headers, params=params, config=config, session=session)
    #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    #else, warning about Special:Export truncating large page histories
@@ -433,7 +362,7 @@ def getXMLPage(config={}, title='', verbose=True):
    if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
        while not truncated and params['offset']: #next chunk
            params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
            xml2 = getXMLPageCore(headers=headers, params=params, config=config)
            xml2 = getXMLPageCore(headers=headers, params=params, config=config, session=session)
            if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>?
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
@@ -475,11 +404,11 @@ def cleanXML(xml=''):
    xml = xml.split('</mediawiki>')[0]
    return xml

def generateXMLDump(config={}, titles=[], start=''):
def generateXMLDump(config={}, titles=[], start='', session=None):
    """ Generates a XML dump for a list of titles """
    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config)
    header = getXMLHeader(config=config, session=session)
    footer = '</mediawiki>\n' #new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
@@ -508,7 +437,7 @@ def generateXMLDump(config={}, titles=[], start=''):
        #requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.write(header.encode('utf-8'))
        xmlfile.close()
    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
@@ -520,17 +449,17 @@ def generateXMLDump(config={}, titles=[], start=''):
            lock = False
        if lock:
            continue
        delay(config=config)
        delay(config=config, session=session)
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title)
        xml = getXMLPage(config=config, title=title, session=session)
        xml = cleanXML(xml=xml)
        if not xml:
            logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
        #here, XML is a correct <page> </page> chunk or
        #an empty string due to a deleted page (logged in errors log) or
        #an empty string due to an error while retrieving the page from server (logged in errors log)
        xmlfile.write(xml)
        xmlfile.write(xml.encode('utf-8'))
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
@@ -547,18 +476,18 @@ def saveTitles(config={}, titles=[]):
    print 'Titles saved at...', titlesfilename

def saveImageFilenamesURL(config={}, images=[]):
def saveImageFilenamesURL(config={}, images=[], session=None):
    """ Save image list in a file, including filename, url and uploader """
    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    output = u"%s\n--END--" % (u'\n'.join([u'%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
    imagesfile.write(output.encode('utf-8'))
    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
    imagesfile.write('\n--END--')
    imagesfile.close()
    print 'Image filenames and URLs saved at...', imagesfilename

def getImageFilenamesURL(config={}):
def getImageFilenamesURL(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
@@ -569,14 +498,9 @@ def getImageFilenamesURL(config={}):
    retries = 5
    while offset:
        #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8')
        else:
            raw = unicode(f.read(), 'utf-8')
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
            if limit > 10:
                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
@@ -645,36 +569,20 @@ def getImageFilenamesURL(config={}):
    images.sort()
    return images

def getImageFilenamesURLAPI(config={}):
def getImageFilenamesURLAPI(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
    headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}
    headers = {'User-Agent': getUserAgent()}
    aifrom = '!'
    images = []
    while aifrom:
        sys.stderr.write('.') #progress
        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['api'], data=data, headers=headers)
        try:
            f = urllib2.urlopen(req)
        except:
            try:
                print '(3) Server is slow... Waiting some seconds and retrying...'
                time.sleep(10)
                f = urllib2.urlopen(req)
            except:
                print 'An error has occurred while retrieving page titles with API'
                print 'Please, resume the dump, --resume'
                sys.exit()
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            jsonimages = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
        else:
            jsonimages = json.loads(unicode(f.read(), 'utf-8'))
        f.close()
        #print jsonimages
        delay(config=config)
        #FIXME Handle HTTP Errors HERE
        r = session.post(url=config['api'], data=params, headers=headers)
        jsonimages = json.loads(r.text)
        delay(config=config, session=session)
        aifrom = ''
        if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
            if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
@@ -691,13 +599,8 @@ def getImageFilenamesURLAPI(config={}):
                domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
                url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
            url = re.sub(' ', '_', url)
            if image.has_key('name'):
                #some API returns image name http://hastur.net/w/api.php?action=query&list=allimages&aiprop=user|url&ailimit=10
                filename = re.sub('_', ' ', image['name'])
            else:
                #other not http://wiki.annotation.jp/api.php?action=query&list=allimages&aiprop=user|url&ailimit=10
                #tips for dealing with unquote http://stackoverflow.com/questions/5139249/python-url-unquote-unicode
                filename = re.sub('_', ' ', unicode(urllib2.unquote(url.encode('ascii')).split('/')[-1], 'utf-8'))
            # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136
            filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
            uploader = re.sub('_', ' ', image['user'])
            images.append([filename, url, uploader])
@@ -720,7 +623,7 @@ def undoHTMLEntities(text=''):
    return text

def generateImageDump(config={}, other={}, images=[], start=''):
def generateImageDump(config={}, other={}, images=[], start='', session=None):
    """ Save files and descriptions using a file list """
    #fix use subdirectories md5
@@ -739,11 +642,11 @@ def generateImageDump(config={}, other={}, images=[], start=''):
            lock = False
        if lock:
            continue
        delay(config=config)
        delay(config=config, session=session)
        #saving file
        #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max)
        filename2 = filename
        filename2 = urllib.unquote(filename)
        if len(filename2) > other['filenamelimit']:
            # split last . (extension) and then merge
            filename2 = truncateFilename(other=other, filename=filename2)
@@ -761,21 +664,21 @@ def generateImageDump(config={}, other={}, images=[], start=''):
        # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?
        #saving description if any
        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename)) # use Image: for backwards compatibility
        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
        f = open('%s/%s.desc' % (imagepath, filename2), 'w')
        if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
            #failure when retrieving desc? then save it as empty .desc
            xmlfiledesc = ''
        f.write(xmlfiledesc)
        f.write(xmlfiledesc.encode('utf-8'))
        f.close()
        delay(config=config)
        delay(config=config, session=session)
        c += 1
        if c % 10 == 0:
            print 'Downloaded %d images' % (c)
    print 'Downloaded %d images' % (c)
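# Sketch (not part of the patch) of how a single file could be fetched through the same
# session and written to disk; imagepath, filename2 and url follow the names used in
# generateImageDump() above.
#
#     r = session.get(url=url, headers={'User-Agent': getUserAgent()}, stream=True)
#     imagefile = open('%s/%s' % (imagepath, filename2), 'wb')
#     for chunk in r.iter_content(chunk_size=64 * 1024):  # stream in 64 KB chunks
#         if chunk:
#             imagefile.write(chunk)
#     imagefile.close()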
def saveLogs(config={}):
def saveLogs(config={}, session=None):
    """ Save Special:Log """
    #get all logs from Special:Log
    """ parse

@@ -793,9 +696,9 @@ def saveLogs(config={}):
    <option value="">Todos los registros</option>
    </select>
    """
    delay(config=config)
    delay(config=config, session=session)

def domain2prefix(config={}):
def domain2prefix(config={}, session=None):
    """ Convert domain name to a valid prefix filename. """
    # At this point, both api and index are supposed to be defined
@@ -966,6 +869,15 @@ def getParameters(params=[]):
    else:
        index = args.index
    cj = cookielib.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        print 'Using cookies from %s' % args.cookies
    session = requests.Session()
    session.cookies = cj
    session.headers = {'User-Agent': getUserAgent()}
    config = {
        'curonly': args.curonly,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
@@ -984,18 +896,12 @@ def getParameters(params=[]):
        'resume': args.resume,
        'filenamelimit': 100, #do not change
        'force': args.force,
        'session': session
    }

    if config['cookies']:
        cj = cookielib.MozillaCookieJar()
        cj.load(config['cookies'])
        opener = urllib2.build_opener(POSTHTTPRedirectHandler, urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        print 'Using cookies from %s' % config['cookies']
    if config['api']:
        #check api.php
        if checkAPI(config['api'], config):
        if checkAPI(config['api'], config, session=other['session']):
            print 'api.php is OK'
        else:
            print 'Error in api.php, please, provide a correct path to api.php'
@@ -1003,7 +909,7 @@ def getParameters(params=[]):
    if config['index']:
        #check index.php
        if checkIndexphp(config['index'], config):
        if checkIndexphp(config['index'], config, session=other['session']):
            print 'index.php is OK'
        else:
            print 'Error in index.php, please, provide a correct path to index.php'
@@ -1011,39 +917,29 @@ def getParameters(params=[]):
    #calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config, session=session), config['date'])
    return config, other
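# A stand-alone sketch of the session wiring done in getParameters() above. The helper
# name and the cookies-file argument are hypothetical; everything else mirrors the code
# in this patch (and assumes "import cookielib" at the top of the file, as the
# cookie-loading code above does).
def makeSession(cookiesfile=None):
    cj = cookielib.MozillaCookieJar()
    if cookiesfile:
        cj.load(cookiesfile)  # Netscape/Mozilla-format cookie file, e.g. exported from a browser
    session = requests.Session()
    session.cookies = cj  # requests accepts a cookielib CookieJar here
    session.headers = {'User-Agent': getUserAgent()}
    return session
# Usage (sketch): session = makeSession(cookiesfile=args.cookies)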
def checkAPI(api, config={}):
def checkAPI(api, config={}, session=None):
    """ Checking API availability """
    req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        resultText = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        resultText = f.read()
    f.close()
    global cj
    r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
    resultText = r.text
    print 'Checking api.php...', api
    if "MediaWiki API is not enabled for this site." in resultText:
        return False
    result = json.loads(resultText)
    delay(config=config)
    delay(config=config, session=session)
    if result.has_key('query'):
        return True
    return False
def checkIndexphp(indexphp, config={}):
def checkIndexphp(indexphp, config={}, session=None):
    """ Checking index.php availability """
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
    f = urllib2.urlopen(req)
    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
    else:
        raw = f.read()
    f.close()
    delay(config=config)
    r = session.post(url=indexphp, data={'title': 'Special:Version'}, headers={'User-Agent': getUserAgent()})
    raw = r.text
    delay(config=config, session=session)
    print 'Checking index.php...', indexphp
    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not config['cookies']: # Workaround for issue 71
        print "ERROR: This wiki requires login and we are not authenticated"
@@ -1062,7 +958,7 @@ def removeIP(raw=''):
    return raw

def checkXMLIntegrity(config={}):
def checkXMLIntegrity(config={}, session=None):
    """ Check XML dump integrity, to detect broken XML chunks """
    return
@@ -1072,7 +968,7 @@ def checkXMLIntegrity(config={}):
    checkpageclose = 0
    checkrevisionopen = 0
    checkrevisionclose = 0
    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
        if "<revision>" in line:
            checkrevisionopen += 1
        elif "</revision>" in line:
@@ -1103,19 +999,19 @@ def createNewDump(config={}, other={}):
    images = []
    print 'Trying generating a new dump into a new directory...'
    if config['xml']:
        titles += getPageTitles(config=config)
        titles += getPageTitles(config=config, session=other['session'])
        saveTitles(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles, session=other['session'])
        checkXMLIntegrity(config=config)
    if config['images']:
        if config['api']:
            images += getImageFilenamesURLAPI(config=config)
            images += getImageFilenamesURLAPI(config=config, session=other['session'])
        else:
            images += getImageFilenamesURL(config=config)
        saveImageFilenamesURL(config=config, images=images)
        generateImageDump(config=config, other=other, images=images)
            images += getImageFilenamesURL(config=config, session=other['session'])
        saveImageFilenamesURL(config=config, images=images, session=other['session'])
        generateImageDump(config=config, other=other, images=images, session=other['session'])
    if config['logs']:
        saveLogs(config=config)
        saveLogs(config=config, session=session)

def resumePreviousDump(config={}, other={}):
    titles = []
@@ -1125,7 +1021,7 @@ def resumePreviousDump(config={}, other={}):
        #load titles
        lasttitle = ''
        try:
            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8')
            titles = raw.split('\n')
            lasttitle = titles[-1]
@@ -1140,13 +1036,13 @@ def resumePreviousDump(config={}, other={}):
        else:
            print 'Title list is incomplete. Reloading...'
            #do not resume, reload, to avoid inconsistencies, deleted pages and so on
            titles = getPageTitles(config=config)
            titles = getPageTitles(config=config, session=other['session'])
            saveTitles(config=config, titles=titles)
        #checking xml dump
        xmliscomplete = False
        lastxmltitle = ''
        try:
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            for l in f:
                if re.findall('</mediawiki>', l):
                    #xml dump is complete
@@ -1176,7 +1072,7 @@ def resumePreviousDump(config={}, other={}):
        #load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8').strip()
            lines = raw.split('\n')
            for l in lines:
@@ -1192,9 +1088,9 @@ def resumePreviousDump(config={}, other={}):
            print 'Image list is incomplete. Reloading...'
            #do not resume, reload, to avoid inconsistencies, deleted images and so on
            if config['api']:
                images = getImageFilenamesURLAPI(config=config)
                images = getImageFilenamesURLAPI(config=config, session=session)
            else:
                images = getImageFilenamesURL(config=config)
                images = getImageFilenamesURL(config=config, session=session)
            saveImageFilenamesURL(config=config, images=images)
        #checking images directory
        listdir = []
@@ -1228,47 +1124,37 @@ def resumePreviousDump(config={}, other={}):
            #fix
            pass

def saveSpecialVersion(config={}):
def saveSpecialVersion(config={}, session=None):
    """ Save Special:Version as .html, to preserve extensions details """
    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={'title': 'Special:Version', }, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        f = open('%s/Special:Version.html' % (config['path']), 'w')
        f.write(raw)
        f.write(raw.encode('utf-8'))
        f.close()
def saveIndexPHP(config={}):
def saveIndexPHP(config={}, session=None):
    """ Save index.php as .html, to preserve license details available at the bottom of the page """
    if os.path.exists('%s/index.html' % (config['path'])):
        print 'index.html exists, do not overwrite'
    else:
        print 'Downloading index.php (Main Page) as index.html'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
        f = urllib2.urlopen(req)
        if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
        else:
            raw = f.read()
        f.close()
        delay(config=config)
        r = session.post(url=config['index'], data={}, headers={'User-Agent': getUserAgent()})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        f = open('%s/index.html' % (config['path']), 'w')
        f.write(raw)
        f.write(raw.encode('utf-8'))
        f.close()

def saveSiteInfo(config={}):
def saveSiteInfo(config={}, session=None):
    """ Save a file with site info """
    if config['api']:
@@ -1276,14 +1162,9 @@ def saveSiteInfo(config={}):
            print 'siteinfo.json exists, do not overwrite'
        else:
            print 'Downloading site info as siteinfo.json'
            req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
            f = urllib2.urlopen(req)
            if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
                result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
            else:
                result = json.loads(f.read())
            f.close()
            delay(config=config)
            r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
            result = json.loads(r.text)
            delay(config=config, session=session)
            f = open('%s/siteinfo.json' % (config['path']), 'w')
            f.write(json.dumps(result, indent=4, sort_keys=True))
            f.close()
@@ -1324,6 +1205,7 @@ def main(params=[]):
    """ Main function """
    configfilename = 'config.txt'
    session = requests.Session()
    config, other = getParameters(params=params)
    avoidWikimediaProjects(config=config, other=other)
@@ -1364,9 +1246,9 @@ def main(params=[]):
    else:
        createNewDump(config=config, other=other)
    saveIndexPHP(config=config)
    saveSpecialVersion(config=config)
    saveSiteInfo(config=config)
    saveIndexPHP(config=config, session=session)
    saveSpecialVersion(config=config, session=session)
    saveSiteInfo(config=config, session=session)
    bye()

if __name__ == "__main__":