improving image list downloader

10 years ago · aaa1822759
parent 88c9468c0e 53802bd8b8
commit aaa1822759
3 changed files with 449 additions and 1591 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@ -40,12 +40,12 @@ try:
 except ImportError:
    print "Please install or update the Requests module."
    sys.exit(1)
-import subprocess
 import time
 import urllib

 __VERSION__ = '0.2.2'  # major, minor, micro

+
 def getVersion():
    return(__VERSION__)

@ -54,23 +54,28 @@ def truncateFilename(other={}, filename=''):
    """ Truncate filenames when downloading images with large filenames """
    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]

+
 def delay(config={}, session=None):
    """ Add a delay if configured for that """
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
        time.sleep(config['delay'])

+
 def cleanHTML(raw=''):
    """ Extract only the real wiki content and remove rubbish """
    """ This function is ONLY used to retrieve page titles and file names when no API is available """
    """ DO NOT use this function to extract page content """
-    #different "tags" used by different MediaWiki versions to mark where starts and ends content
+    # different "tags" used by different MediaWiki versions to mark where
+    # the content starts and ends
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
-        raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
+        raw = raw.split(
+            '<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
-        raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
+        raw = raw.split(
+            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
@ -83,6 +88,7 @@ def cleanHTML(raw=''):
        sys.exit()
    return raw

+
 def handleStatusCode(response):
    statuscode = response.status_code
    if statuscode >= 200 and statuscode < 300:
@ -115,29 +121,35 @@ def handleStatusCode(response):
        print response.url
        sys.exit(1)

+
 def getNamespacesScraper(config={}, session=None):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
-        r = session.post(url=config['index'], data={'title': 'Special:Allpages'})
+        r = session.post(
+            url=config['index'], data={'title': 'Special:Allpages'})
        raw = r.text
        delay(config=config, session=session)

-        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"
+        # [^>]*? to include selected="selected"
+        m = re.compile(
+            r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
-                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
+                namespacenames[int(i.group("namespaceid"))] = i.group(
+                    "namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
-                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
+                    namespacenames[int(i.group("namespaceid"))] = i.group(
+                        "namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]
@ -146,12 +158,14 @@ def getNamespacesScraper(config={}, session=None):
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames

+
 def getNamespacesAPI(config={}, session=None):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
-        r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
+        r = session.post(url=config['api'], data={
+                         'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
        result = json.loads(r.text)
        delay(config=config, session=session)

@ -179,10 +193,12 @@ def getNamespacesAPI(config={}, session=None):
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames

+
 def getPageTitlesAPI(config={}, session=None):
    """ Uses the API to get the list of page titles """
    titles = []
-    namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
+    namespaces, namespacenames = getNamespacesAPI(
+        config=config, session=session)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print '    Skipping namespace = %d' % (namespace)
@ -193,22 +209,25 @@ def getPageTitlesAPI(config={}, session=None):
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
-            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
+            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace,
+                      'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
            r = session.post(url=config['api'], data=params)
            handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = json.loads(r.text)
            apfrom = ''
-            if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'):
-                if jsontitles['query-continue']['allpages'].has_key('apcontinue'):
+            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
+                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
-                elif jsontitles['query-continue']['allpages'].has_key('apfrom'):
+                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            # print apfrom
            # print jsontitles
-            titles += [page['title'] for page in jsontitles['query']['allpages']]
+            titles += [page['title']
+                       for page in jsontitles['query']['allpages']]
            if len(titles) != len(set(titles)):
-                #probably we are in a loop, server returning dupe titles, stop it
+                # probably we are in a loop, server returning dupe titles, stop
+                # it
                print 'Probably a loop, finishing'
                titles = list(set(titles))
                apfrom = ''
@ -217,13 +236,16 @@ def getPageTitlesAPI(config={}, session=None):
        print '    %d titles retrieved in the namespace %d' % (c, namespace)
    return titles

+
 def getPageTitlesScraper(config={}, session=None):
    """  """
    titles = []
-    namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
+    namespaces, namespacenames = getNamespacesScraper(
+        config=config, session=session)
    for namespace in namespaces:
        print '    Retrieving titles in the namespace', namespace
-        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
+        url = '%s?title=Special:Allpages&namespace=%s' % (
+            config['index'], namespace)
        r = session.get(url=url)
        raw = r.text
        raw = cleanHTML(raw)
@ -239,7 +261,9 @@ def getPageTitlesScraper(config={}, session=None):
        else:
            pass  # perhaps no subpages

-        deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels
+        # 3 is the current depth of English Wikipedia's Special:Allpages
+        # (3 levels)
+        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
@ -252,14 +276,20 @@ def getPageTitlesScraper(config={}, session=None):
                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
-                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to
-                elif r_suballpages == r_suballpages2: #fix, esta regexp no carga bien todas? o falla el r_title en este tipo de subpag? (wikiindex)
-                    fr = fr.split('&amp;namespace=')[0] #clean &amp;namespace=\d, sometimes happens
+                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
+                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
+                # FIXME: does this regexp fail to load all of them? or does
+                # r_title fail on this kind of subpage? (wikiindex)
+                elif r_suballpages == r_suballpages2:
+                    # clean &amp;namespace=\d, sometimes happens
+                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
-                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace)
+                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
+                        config['index'], name, namespace)

-                if not name in checked_suballpages:
-                    checked_suballpages.append(name) #to avoid reload dupe subpages links
+                if name not in checked_suballpages:
+                    # avoid reloading duplicate subpage links
+                    checked_suballpages.append(name)
                    delay(config=config, session=session)
                    r2 = session.get(url=url)
                    raw2 = r2.text
@ -275,12 +305,13 @@ def getPageTitlesScraper(config={}, session=None):
        for i in m:
            t = undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
-                if not t in titles:
+                if t not in titles:
                    titles.append(t)
                    c += 1
        print '    %d titles retrieved in the namespace %d' % (c, namespace)
    return titles

+
 def getPageTitles(config={}, session=None):
    """ Get list of page titles """
    # http://en.wikipedia.org/wiki/Special:AllPages
@ -295,29 +326,36 @@ def getPageTitles(config={}, session=None):
    elif config['index']:
        titles = getPageTitlesScraper(config=config, session=session)

-    titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace))
+    # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
+    # namespace and widget namespace))
+    titles = list(set(titles))
    titles.sort()  # sorting

    print '%d page titles loaded' % (len(titles))
    return titles

+
 def getXMLHeader(config={}, session=None):
    """ Retrieve a random page to extract XML headers (namespace info, etc) """
    # get the header of a random page, to attach it in the complete XML backup
-    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....
+    # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
+    # xmlns:x....
    randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
-    xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session)
+    xml = getXMLPage(
+        config=config, title=randomtitle, verbose=False, session=session)
    header = xml.split('</mediawiki>')[0]
    if not xml:
        print 'XML export on this wiki is broken, quitting.'
        sys.exit()
    return header

+
 def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    config['curonly'] = 1  # tricky to get only the most recent desc
    return getXMLPage(config=config, title=title, verbose=False, session=session)

+
 def getUserAgent():
    """ Return a cool user-agent to hide Python user-agent """
    useragents = [
@ -327,13 +365,16 @@ def getUserAgent():
    ]
    return useragents[0]

+
 def logerror(config={}, text=''):
    """ Log error in file """
    if text:
        with open('%s/errors.log' % (config['path']), 'a') as outfile:
-            output = u'%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
+            output = u'%s: %s\n' % (
+                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
            outfile.write(output.encode('utf-8'))

+
 def getXMLPageCore(headers={}, params={}, config={}, session=None):
    """  """
    # returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki>
@ -346,25 +387,32 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
    increment = 20  # increment every retry
    while not re.search(r'</mediawiki>', xml):
        if c > 0 and c < maxretries:
-            wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds
+            wait = increment * c < maxseconds and increment * \
+                c or maxseconds  # incremental until maxseconds
            print '    XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait)
            time.sleep(wait)
-            if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function)
+            # reducing server load requesting smallest chunks (if curonly then
+            # limit = 1 from mother function)
+            if params['limit'] > 1:
                params['limit'] = params['limit'] / 2  # half
        if c >= maxretries:
            print '    We have retried %d times' % (c)
            print '    MediaWiki error for "%s", network error or whatever...' % (params['pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save nonly the last
-            # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore
+            # params['curonly'] should mean that we've already tried this
+            # fallback, because it's set by the following if and passed to
+            # getXMLPageCore
            if not config['curonly']:
                print '    Trying to save only the last revision for this page...'
                params['curonly'] = 1
-                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages']))
+                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (
+                    params['pages']))
                return getXMLPageCore(headers=headers, params=params, config=config)
            else:
                print '    Saving in the errors log, and skipping...'
-                logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages']))
+                logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (
+                    params['pages']))
                return ''  # empty xml
        # FIXME HANDLE HTTP Errors HERE
        r = session.post(url=config['index'], data=params, headers=headers)
@ -374,6 +422,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):

    return xml

+
 def getXMLPage(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page """

@ -392,7 +441,8 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
    else:
        params['offset'] = '1'  # 1 always < 2000s
        params['limit'] = limit
-    if config.has_key('templates') and config['templates']: #in other case, do not set params['templates']
+    # in other case, do not set params['templates']
+    if 'templates' in config and config['templates']:
        params['templates'] = 1

    xml = getXMLPageCore(params=params, config=config, session=session)
@ -400,14 +450,21 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
    # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    # else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
-    if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
+    # search for timestamps in xml to avoid analysing empty pages like
+    # Special:Allpages and the random one
+    if not config['curonly'] and re.search(r_timestamp, xml):
        while not truncated and params['offset']:  # next chunk
-            params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
-            xml2 = getXMLPageCore(params=params, config=config, session=session)
+            # get the last timestamp from the accumulated XML
+            params['offset'] = re.findall(r_timestamp, xml)[-1]
+            xml2 = getXMLPageCore(
+                params=params, config=config, session=session)

-            if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>?
+            # are there more edits in this next XML chunk or no <page></page>?
+            if re.findall(r_timestamp, xml2):
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
-                    #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000)
+                    # again the same XML, this wiki does not support params in
+                    # Special:Export, offer complete XML up to X edits (usually
+                    # 1000)
                    print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated'
                    truncated = True
                    break
@ -423,8 +480,10 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                          <timestamp>2011-03-09T19:57:06Z</timestamp>
                          <contributor>
                    """
-                    #offset is OK in this wiki, merge with the previous chunk of this page history and continue
-                    xml = xml.split('</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                    # offset is OK in this wiki, merge with the previous chunk
+                    # of this page history and continue
+                    xml = xml.split(
+                        '</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
            else:
                params['offset'] = ''  # no more edits in this page history

@ -437,6 +496,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):

    return xml

+
 def cleanXML(xml=''):
    """ Trim redundant info """
    # do not touch XML codification, leave AS IS
@ -445,13 +505,15 @@ def cleanXML(xml=''):
        xml = xml.split('</mediawiki>')[0]
    return xml

+
 def generateXMLDump(config={}, titles=[], start='', session=None):
    """ Generates a XML dump for a list of titles """

    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config, session=session)
    footer = '</mediawiki>\n'  # new line at the end
-    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history')
+    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
+                                    config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
@ -462,7 +524,8 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
        c = 0
        for l in xmlfile:
            # removing <page>\n until end of file
-            if c != 0: #lock to avoid write an empty line at the begining of file
+            # lock to avoid writing an empty line at the beginning of the file
+            if c != 0:
                if not re.search(r'<title>%s</title>' % (start), l):
                    xmlfile2.write(prev)
                else:
@ -472,8 +535,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
        xmlfile.close()
        xmlfile2.close()
        # subst xml with xml2
-        os.remove('%s/%s' % (config['path'], xmlfilename)) #remove previous xml dump
-        os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) #move correctly truncated dump to its real name
+        # remove previous xml dump
+        os.remove('%s/%s' % (config['path'], xmlfilename))
+        # move correctly truncated dump to its real name
+        os.rename(
+            '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename))
    else:
        # requested complete xml dump
        lock = False
@ -496,20 +562,24 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
        xml = getXMLPage(config=config, title=title, session=session)
        xml = cleanXML(xml=xml)
        if not xml:
-            logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
+            logerror(
+                config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
-        #an empty string due to an error while retrieving the page from server (logged in errors log)
+        # an empty string due to an error while retrieving the page from server
+        # (logged in errors log)
        xmlfile.write(xml.encode('utf-8'))
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename

+
 def saveTitles(config={}, titles=[]):
    """ Save title list in a file """

-    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
+    titlesfilename = '%s-%s-titles.txt' % (
+        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
    output = u"%s\n--END--" % ('\n'.join(titles))
    titlesfile.write(output.encode('utf-8'))
@ -517,17 +587,21 @@ def saveTitles(config={}, titles=[]):

    print 'Titles saved at...', titlesfilename

+
 def saveImageFilenamesURL(config={}, images=[], session=None):
    """ Save image list in a file, including filename, url and uploader """

-    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
+    imagesfilename = '%s-%s-images.txt' % (
+        domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
-    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
+    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (
+        filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
    imagesfile.write('\n--END--')
    imagesfile.close()

    print 'Image filenames and URLs saved at...', imagesfilename

+
 def curateImageURL(config={}, url=''):
    """ Returns an absolute URL for an image, adding the domain if missing """
    
@ -552,21 +626,27 @@ def curateImageURL(config={}, url=''):
    
    return url

+
 def getImageFilenamesURL(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """

    print 'Retrieving image filenames'
-    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;' # (?<! http://docs.python.org/library/re.html
+    # (?<! http://docs.python.org/library/re.html
+    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;'
    images = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = 5
    while offset:
-        #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
-        r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
+        # 5000 overload some servers, but it is needed for sites like this with
+        # no next links
+        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
+        r = session.post(url=config['index'], data={
+                         'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
        raw = r.text
        delay(config=config, session=session)
-        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
+        # delicate wiki
+        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                limit = limit / 10
@ -581,11 +661,16 @@ def getImageFilenamesURL(config={}, session=None):

        raw = cleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
-        #wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1" class="new" title="Usuario:Fernandocg (página no existe)">Fernandocg</a></td>
+        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
+        # href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
+        # class="new" title="Usuario:Fernandocg (página no
+        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
-        #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version
+        # wikijuegos 1.9.5
+        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
+        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
-        #gentoowiki 1.18 <tr><td class="TablePager_col_img_timestamp">18:15, 3 April 2011</td><td class="TablePager_col_img_name"><a href="/wiki/File:Asus_eeepc-1201nl.png" title="File:Asus eeepc-1201nl.png">Asus eeepc-1201nl.png</a> (<a href="/w/images/2/2b/Asus_eeepc-1201nl.png">file</a>)</td><td class="TablePager_col_thumb"><a href="/wiki/File:Asus_eeepc-1201nl.png" class="image"><img alt="" src="/w/images/thumb/2/2b/Asus_eeepc-1201nl.png/180px-Asus_eeepc-1201nl.png" width="180" height="225" /></a></td><td class="TablePager_col_img_size">37 KB</td><td class="TablePager_col_img_user_text"><a href="/w/index.php?title=User:Yannails&amp;action=edit&amp;redlink=1" class="new" title="User:Yannails (page does not exist)">Yannails</a></td><td class="TablePager_col_img_description">&#160;</td><td class="TablePager_col_count">1</td></tr>
+        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
@ -634,6 +719,7 @@ def getImageFilenamesURL(config={}, session=None):
    images.sort()
    return images

+
 def getImageFilenamesURLAPI(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """

@ -643,7 +729,8 @@ def getImageFilenamesURLAPI(config={}, session=None):
    images = []
    while aifrom:
        sys.stderr.write('.')  # progress
-        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
+        params = {'action': 'query', 'list': 'allimages', 'aiprop':
+                  'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = session.post(url=config['api'], data=params)
        handleStatusCode(r)
@ -708,10 +795,13 @@ def getImageFilenamesURLAPI(config={}, session=None):
    images.sort()
    return images

+
 def undoHTMLEntities(text=''):
    """ Undo some HTML codes """

-    text = re.sub('&lt;', '<', text) # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp
+    # i guess only < > & " ' need conversion
+    # http://www.w3schools.com/html/html_entities.asp
+    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
@ -719,6 +809,7 @@ def undoHTMLEntities(text=''):

    return text

+
 def generateImageDump(config={}, other={}, images=[], start='', session=None):
    """ Save files and descriptions using a file list """

@ -741,7 +832,8 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
        delay(config=config, session=session)

        # saving file
-        #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max)
+        # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
+        # limit). Later .desc is added to filename, so better 100 as max)
        filename2 = urllib.unquote(filename)
        if len(filename2) > other['filenamelimit']:
            # split last . (extension) and then merge
@ -753,9 +845,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
        imagefile.write(r.content)
        imagefile.close()
        # saving description if any
-        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
+        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (
+            filename), session=session)  # use Image: for backwards compatibility
        f = open('%s/%s.desc' % (imagepath, filename2), 'w')
-        if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
+        # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
+        if not re.search(r'</mediawiki>', xmlfiledesc):
            # failure when retrieving desc? then save it as empty .desc
            xmlfiledesc = ''
        f.write(xmlfiledesc.encode('utf-8'))
@ -767,6 +861,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):

    print 'Downloaded %d images' % (c)

+
 def saveLogs(config={}, session=None):
    """ Save Special:Log """
    # get all logs from Special:Log
@ -787,6 +882,7 @@ def saveLogs(config={}, session=None):
    """
    delay(config=config, session=session)

+
 def domain2prefix(config={}, session=None):
    """ Convert domain name to a valid prefix filename. """

@ -805,6 +901,7 @@ def domain2prefix(config={}, session=None):

    return domain

+
 def loadConfig(config={}, configfilename=''):
    """ Load config file """

@ -817,12 +914,14 @@ def loadConfig(config={}, configfilename=''):

    return config

+
 def saveConfig(config={}, configfilename=''):
    """ Save config file """

    with open('%s/%s' % (config['path'], configfilename), 'w') as outfile:
        cPickle.dump(config, outfile)

+
 def welcome():
    message = ''
    """ Opening message """
@ -857,6 +956,7 @@ def welcome():

    return message

+
 def bye():
    """ Closing message """
    print "---> Congratulations! Your dump is complete <---"
@ -871,31 +971,45 @@ def getParameters(params=[]):

    parser = argparse.ArgumentParser(description='')

-    parser.add_argument('-v', '--version', action='version', version=getVersion())
-    parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
-    parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)")
-    parser.add_argument('--retries', metavar=5, default=5, help="Maximum number of retries for ")
-    parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine")
+    parser.add_argument(
+        '-v', '--version', action='version', version=getVersion())
+    parser.add_argument(
+        '--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
+    parser.add_argument(
+        '--delay', metavar=5, default=0, help="adds a delay (in seconds)")
+    parser.add_argument(
+        '--retries', metavar=5, default=5, help="Maximum number of retries for ")
+    parser.add_argument(
+        '--get-wiki-engine', action='store_true', help="returns the wiki engine")

    groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
-    groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki")
+    groupWikiOrAPIOrIndex.add_argument(
+        'wiki', default='', nargs='?', help="URL to wiki")
    groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php")
    groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php")

    groupXMLOrImages = parser.add_argument_group()
-    groupXMLOrImages.add_argument('--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)")
-    parser.add_argument('--curonly', action='store_true', help='store only the current version of pages')
+    groupXMLOrImages.add_argument(
+        '--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)")
+    parser.add_argument('--curonly', action='store_true',
+                        help='store only the current version of pages')

-    groupXMLOrImages.add_argument('--images', action='store_true', help="generates an image dump")
+    groupXMLOrImages.add_argument(
+        '--images', action='store_true', help="generates an image dump")

    parser.add_argument('--path', help='path to store wiki dump at')
-    parser.add_argument('--resume', action='store_true', help='resumes previous incomplete dump (requires --path)')
+    parser.add_argument('--resume', action='store_true',
+                        help='resumes previous incomplete dump (requires --path)')
    parser.add_argument('--force', action='store_true', help='')
-    parser.add_argument('--namespaces', metavar="1,2,3", help='comma-separated value of namespaces to include (all by default)')
-    parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude')
+    parser.add_argument('--namespaces', metavar="1,2,3",
+                        help='comma-separated value of namespaces to include (all by default)')
+    parser.add_argument('--exnamespaces', metavar="1,2,3",
+                        help='comma-separated value of namespaces to exclude')

-    parser.add_argument('--user', help='Username if authentication is required.')
-    parser.add_argument('--pass', dest='password', help='Password if authentication is required.')
+    parser.add_argument(
+        '--user', help='Username if authentication is required.')
+    parser.add_argument(
+        '--pass', dest='password', help='Password if authentication is required.')

    args = parser.parse_args()
    # print args
@ -929,7 +1043,8 @@ def getParameters(params=[]):
    exnamespaces = []
    # Process namespace inclusions
    if args.namespaces:
-        if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all': #fix, why - ?  and... --namespaces= all with a space works?
+        # fix, why - ?  and... --namespaces= all with a space works?
+        if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all':
            print "Invalid namespace values.\nValid format is integer(s) separated by commas"
            sys.exit()
        else:
@ -958,10 +1073,12 @@ def getParameters(params=[]):
        parser.print_usage()
        sys.exit(1)

-    #user chose --api, but --index it is necessary for special:export: we generate it
+    # user chose --api, but --index it is necessary for special:export: we
+    # generate it
    if args.api and not args.index:
        index = args.api.split('api.php')[0] + 'index.php'
-        # WARNING: remove index.php here for misconfigured sites like editthis.info, or provide --index directly
+        # WARNING: remove index.php here for misconfigured sites like
+        # editthis.info, or provide --index directly
        print 'You didn\'t provide a path for index.php, using ', index
    else:
        index = args.index
@ -1021,44 +1138,51 @@ def getParameters(params=[]):

    return config, other

+
 def checkAPI(api, config={}, session=None):
    """ Checking API availability """
    global cj
-    r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
+    r = session.post(
+        url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
    resultText = r.text
    print 'Checking api.php...', api
    if "MediaWiki API is not enabled for this site." in resultText:
        return False
    result = json.loads(resultText)
    delay(config=config, session=session)
-    if result.has_key('query'):
+    if 'query' in result:
        return True
    return False

+
 def checkIndexphp(indexphp, config={}, session=None):
    """ Checking index.php availability """
    r = session.post(url=indexphp, data={'title': 'Special:Version'})
    raw = r.text
    delay(config=config, session=session)
    print 'Checking index.php...', indexphp
-    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not config['cookies']: # Workaround for issue 71
+    # Workaround for issue 71
+    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not config['cookies']:
        print "ERROR: This wiki requires login and we are not authenticated"
        return False
    if re.search(r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', raw):
        return True
    return False

+
 def removeIP(raw=''):
    """ Remove IP from HTML comments <!-- --> """

    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    # weird cases as :: are not included
-    raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)
+    raw = re.sub(
+        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)

    return raw

-def checkXMLIntegrity(config={}, session=None):
+
+def checkXMLIntegrity(config={}, titles=[], session=None):
    """ Check XML dump integrity, to detect broken XML chunks """
    return

@ -1089,7 +1213,7 @@ def checkXMLIntegrity(config={}, session=None):
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
        if reply.lower() in ['yes', 'y']:
-            generateXMLDump(config=config, titles=titles)
+            generateXMLDump(config=config, titles=titles, session=session)
        elif reply.lower() in ['no', 'n']:
            print 'Not generating a new dump.'

@ -1102,16 +1226,21 @@ def createNewDump(config={}, other={}):
        titles += getPageTitles(config=config, session=other['session'])
        saveTitles(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles, session=other['session'])
-        checkXMLIntegrity(config=config)
+        checkXMLIntegrity(config=config, titles=titles, session=other['session'])
    if config['images']:
        if config['api']:
-            images += getImageFilenamesURLAPI(config=config, session=other['session'])
+            images += getImageFilenamesURLAPI(config=config,
+                                              session=other['session'])
        else:
-            images += getImageFilenamesURL(config=config, session=other['session'])
-        saveImageFilenamesURL(config=config, images=images, session=other['session'])
-        generateImageDump(config=config, other=other, images=images, session=other['session'])
+            images += getImageFilenamesURL(config=config,
+                                           session=other['session'])
+        saveImageFilenamesURL(
+            config=config, images=images, session=other['session'])
+        generateImageDump(
+            config=config, other=other, images=images, session=other['session'])
    if config['logs']:
-        saveLogs(config=config, session=session)
+        saveLogs(config=config, session=other['session'])
+

 def resumePreviousDump(config={}, other={}):
    titles = []
@ -1121,7 +1250,8 @@ def resumePreviousDump(config={}, other={}):
        # load titles
        lasttitle = ''
        try:
-            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=other['session']), config['date']), 'r')
+            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
+                config=config, session=other['session']), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8')
            titles = raw.split('\n')
            lasttitle = titles[-1]
@ -1135,20 +1265,23 @@ def resumePreviousDump(config={}, other={}):
            print 'Title list was completed in the previous session'
        else:
            print 'Title list is incomplete. Reloading...'
-            #do not resume, reload, to avoid inconsistences, deleted pages or so
+            # do not resume, reload, to avoid inconsistences, deleted pages or
+            # so
            titles = getPageTitles(config=config, session=other['session'])
            saveTitles(config=config, titles=titles)
        # checking xml dump
        xmliscomplete = False
        lastxmltitle = ''
        try:
-            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other['session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
+            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other[
+                     'session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            for l in f:
                if re.findall('</mediawiki>', l):
                    # xml dump is complete
                    xmliscomplete = True
                    break
-                xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
+                # weird if found more than 1, but maybe
+                xmltitles = re.findall(r'<title>([^<]+)</title>', l)
                if xmltitles:
                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
            f.close()
@ -1162,17 +1295,20 @@ def resumePreviousDump(config={}, other={}):
        elif lastxmltitle:
            # resuming...
            print 'Resuming XML dump from "%s"' % (lastxmltitle)
-            generateXMLDump(config=config, titles=titles, start=lastxmltitle, session=other['session'])
+            generateXMLDump(
+                config=config, titles=titles, start=lastxmltitle, session=other['session'])
        else:
            # corrupt? only has XML header?
            print 'XML is corrupt? Regenerating...'
-            generateXMLDump(config=config, titles=titles, session=other['session'])
+            generateXMLDump(
+                config=config, titles=titles, session=other['session'])

    if config['images']:
        # load images
        lastimage = ''
        try:
-            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
+            f = open('%s/%s-%s-images.txt' %
+                     (config['path'], domain2prefix(config=config), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8').strip()
            lines = raw.split('\n')
            for l in lines:
@ -1186,11 +1322,14 @@ def resumePreviousDump(config={}, other={}):
            print 'Image list was completed in the previous session'
        else:
            print 'Image list is incomplete. Reloading...'
-            #do not resume, reload, to avoid inconsistences, deleted images or so
+            # do not resume, reload, to avoid inconsistences, deleted images or
+            # so
            if config['api']:
-                images=getImageFilenamesURLAPI(config=config, session=other['session'])
+                images = getImageFilenamesURLAPI(
+                    config=config, session=other['session'])
            else:
-                images = getImageFilenamesURL(config=config, session=other['session'])
+                images = getImageFilenamesURL(
+                    config=config, session=other['session'])
            saveImageFilenamesURL(config=config, images=images)
        # checking images directory
        listdir = []
@ -1205,7 +1344,8 @@ def resumePreviousDump(config={}, other={}):
        c = 0
        for filename, url, uploader in images:
            lastfilename2 = lastfilename
-            lastfilename = filename #return always the complete filename, not the truncated
+            # return always the complete filename, not the truncated
+            lastfilename = filename
            filename2 = filename
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
@ -1218,12 +1358,16 @@ def resumePreviousDump(config={}, other={}):
            # image dump is complete
            print 'Image dump was completed in the previous session'
        else:
-            generateImageDump(config=config, other=other, images=images, start=lastfilename2, session=other['session']) # we resume from previous image, which may be corrupted (or missing .desc)  by the previous session ctrl-c or abort
+            # we resume from previous image, which may be corrupted (or missing
+            # .desc)  by the previous session ctrl-c or abort
+            generateImageDump(
+                config=config, other=other, images=images, start=lastfilename2, session=other['session'])

    if config['logs']:
        # fix
        pass

+
 def saveSpecialVersion(config={}, session=None):
    """ Save Special:Version as .html, to preserve extensions details """

@ -1231,13 +1375,15 @@ def saveSpecialVersion(config={}, session=None):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
-        r = session.post(url=config['index'], data={'title': 'Special:Version'})
+        r = session.post(
+            url=config['index'], data={'title': 'Special:Version'})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw.encode('utf-8'))

+
 def saveIndexPHP(config={}, session=None):
    """ Save index.php as .html, to preserve license details available at the botom of the page """

@ -1252,6 +1398,7 @@ def saveIndexPHP(config={}, session=None):
        with open('%s/index.html' % (config['path']), 'w') as outfile:
            outfile.write(raw.encode('utf-8'))

+
 def saveSiteInfo(config={}, session=None):
    """ Save a file with site info """

@ -1260,12 +1407,14 @@ def saveSiteInfo(config={}, session=None):
            print 'siteinfo.json exists, do not overwrite'
        else:
            print 'Downloading site info as siteinfo.json'
-            r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
+            r = session.post(url=config['api'], data={
+                             'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
            result = json.loads(r.text)
            delay(config=config, session=session)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))

+
 def avoidWikimediaProjects(config={}, other={}):
    """ Skip Wikimedia projects and redirect to the dumps website """

@ -1277,6 +1426,7 @@ def avoidWikimediaProjects(config={}, other={}):
            print 'Thanks!'
            sys.exit()

+
 def getWikiEngine(url=''):
    """ Returns the wiki engine of a URL, if known """

@ -1295,6 +1445,7 @@ def getWikiEngine(url=''):

    return wikiengine

+
 def main(params=[]):
    """ Main function """

@ -1307,12 +1458,15 @@ def main(params=[]):

    # creating path or resuming if desired
    c = 2
-    originalpath = config['path'] # to avoid concat blabla-2, blabla-2-3, and so on...
-    while not other['resume'] and os.path.isdir(config['path']): #do not enter if resume is requested from begining
+    # to avoid concat blabla-2, blabla-2-3, and so on...
+    originalpath = config['path']
+    # do not enter if resume is requested from begining
+    while not other['resume'] and os.path.isdir(config['path']):
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = ''
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
-            reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (config['path'], config['path'], configfilename))
+            reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (
+                config['path'], config['path'], configfilename))
        if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                print 'No config file found. I can\'t resume. Aborting.'
@ -1339,9 +1493,9 @@ def main(params=[]):
    else:
        createNewDump(config=config, other=other)

-    saveIndexPHP(config=config, session=session)
-    saveSpecialVersion(config=config, session=session)
-    saveSiteInfo(config=config, session=session)
+    saveIndexPHP(config=config, session=other['session'])
+    saveSpecialVersion(config=config, session=other['session'])
+    saveSiteInfo(config=config, session=other['session'])
    bye()

 if __name__ == "__main__":
--- a/rewrite/README.md
+++ b/rewrite/README.md
@ -1,4 +0,0 @@
-## WikiTeam dumpgenerator.py rewrite
-This is the rewrite of WikiTeam's dumpgenerator.py. It is aimed towards getting native API support when downloading wikis and to avoid the use of screen scraping when doing so (which is quite hacky and not ideal).
-
-Note: THIS IS NOT A RELEASE YET, patches welcome.
--- a/rewrite/dumpgenerator.py
+++ b/rewrite/dumpgenerator.py