@@ -24,13 +24,50 @@ import urllib
import wikiteam
def mwCleanHTML(raw=''):
    """Extract only the real wiki content and remove rubbish.

    This function is ONLY used to retrieve page titles and file names
    when no API is available. DO NOT use it to extract page content.
    """
    # different "tags" are used by different MediaWiki versions to mark
    # where content starts and where it ends
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split(
            '<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split(
            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
    else:
        print(raw[:250])
        sys.stderr.write('This wiki doesn\'t use marks to split content\n')
        sys.exit()
    return raw
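# A minimal usage sketch (the HTML below is hypothetical, in the style of
# the '<!-- start content -->' marks matched above):
#
#   html = ('<html><body><!-- start content -->'
#           '<a href="/wiki/Foo" title="Foo">Foo</a>'
#           '<!-- end content --></body></html>')
#   mwCleanHTML(html)  # -> '<a href="/wiki/Foo" title="Foo">Foo</a>'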
def mwCleanXML(xml=''):
    """Trim redundant info"""
    # do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
    if re.search(r'</mediawiki>', xml):
        xml = xml.split('</mediawiki>')[0]
    return xml
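# For illustration: a Special:Export answer has the shape
#   <mediawiki ...><siteinfo>...</siteinfo>\n<page>...</page></mediawiki>
# and mwCleanXML() keeps only the middle part, so per-page chunks can be
# concatenated between one shared header and footer.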
def mwCreateNewDump(config={}):
    print('Trying to generate a new dump into a new directory...')
    if config['pages']:
        pagetitles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
        mwGeneratePageDump(config=config, pagetitles=pagetitles)
        checkXMLIntegrity(config=config, pagetitles=pagetitles)
    if config['images']:
        images = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, images=images)
@@ -38,12 +75,11 @@ def mwCreateNewDump(config={}):
    if config['logs']:
        mwSaveLogs(config=config)
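# For reference, a hypothetical config dict with the keys the functions in
# this module read (names taken from their usage here; values are examples,
# the real wikiteam config may carry more keys):
#
#   config = {
#       'pages': True,             # dump page XML
#       'images': False,           # dump image names/files
#       'logs': False,             # dump logs
#       'curonly': False,          # current revision only, or full history
#       'date': '20150601',        # used in the dump file name
#       'path': 'mywiki-dump',     # output directory
#       'index': 'http://wiki.example.org/index.php',
#       'mwapi': 'http://wiki.example.org/api.php',
#       'mwexport': 'Special:Export',
#       'namespaces': ['all'],
#       'retries': 5,
#   }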
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
    """Generates an XML dump for page titles"""
    print('Retrieving XML for every page from "%s"' % (start or 'start'))
    header = mwGetXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
                                    config['date'],
@@ -51,37 +87,37 @@ def mwGeneratePageDump(config={}, titles=[], start=None):
    xmlfile = ''
    lock = True
    if start:
        sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
            pass
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()
    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for pagetitle in mwGetPageTitles(config=config, start=start):
        if not pagetitle.strip():
            continue
        if pagetitle == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        wikiteam.delay(config=config)
        if c % 10 == 0:
            sys.stderr.write('Downloaded %d pages\n' % (c))
        try:
            for xml in mwGetXMLPage(config=config, pagetitle=pagetitle):
                xml = mwCleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)' %
                (pagetitle)
            )
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in the errors log) or
@@ -90,7 +126,7 @@ def mwGeneratePageDump(config={}, titles=[], start=None):
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))
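# Usage sketch (with a config dict like the one above): a first run writes
# header + pages + footer; to resume an interrupted run, pass the last good
# title as start and the truncated tail of the old file is dropped first:
#
#   mwGeneratePageDump(config=config, pagetitles=pagetitles)
#   mwGeneratePageDump(config=config, pagetitles=pagetitles,
#                      start='Last title saved in the previous run')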
def mwGetAPI(config={}):
    """Returns API for a MediaWiki wiki, if available"""
@@ -138,18 +174,17 @@ def mwGetNamespacesAPI(config={}):
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'}
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)
        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
@@ -157,13 +192,11 @@ def mwGetNamespacesAPI(config={}):
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # skipping -1: Special, -2: Media
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]
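    # The siteinfo answer parsed above looks roughly like this (abridged;
    # with format=json the localized namespace name lives under the '*' key):
    #   {"query": {"namespaces": {
    #       "-2": {"id": -2, "*": "Media"},
    #       "-1": {"id": -1, "*": "Special"},
    #       "0":  {"id": 0,  "*": ""},
    #       "1":  {"id": 1,  "*": "Talk"}}}}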
@@ -254,6 +287,276 @@ def mwGetPageTitlesAPI(config={}):
        wikiteam.delay(config=config)
        sys.stderr.write('%d titles retrieved in namespace %d\n' % (c, namespace))
def mwGetPageTitlesScraper(config={}):
    """Scrape the list of page titles from Special:Allpages"""
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    for namespace in namespaces:
        sys.stderr.write('Retrieving titles in namespace %s\n' % (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (
            config['index'], namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)
        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages
        # 3 is the current depth of English Wikipedia for Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
                    # FIXME: does this regexp fail to load some subpages, or
                    # does r_title fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, sometimes happens
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)
                if name not in checked_suballpages:
                    # to avoid reloading duplicate subpage links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after junk was removed
                    sys.stderr.write('Reading %s, %s bytes, %d subpages, %d pages\n' % (
                        name, len(raw2),
                        len(re.findall(r_suballpages, raw2)),
                        len(re.findall(r_title, raw2))))
                wikiteam.delay(config=config)
            c += 1
        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write('%d titles retrieved in the namespace %d\n' % (c, namespace))
    return pagetitles
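# The three r_suballpages variants above correspond to the link styles that
# different MediaWiki versions emit on Special:Allpages, roughly:
#   &from=Foo&to=Bar">         pagination by ranges
#   Special:Allpages/Foo">     pagination by subpage
#   &from=Foo" title="...">    pagination by from-only links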
def mwGetXMLHeader(config={}):
    """Retrieve a random page to extract the XML header (namespace info, etc)"""
    pagetitle = 'Main_Page'
    try:
        xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
    except PageMissingError as pme:
        # The <page> does not exist. Not a problem, if we get the <siteinfo>.
        xml = pme.xml
    except ExportAbortedError:
        # Issue 26: Account for missing "Special" namespace.
        # Hope the canonical special name has not been removed.
        # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
        try:
            if config['mwapi']:
                sys.stderr.write("Trying the local name for the Special namespace instead\n")
                xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
        except PageMissingError as pme:
            xml = pme.xml
        except ExportAbortedError:
            pass
    header = xml.split('</mediawiki>')[0]
    if not re.match(r"\s*<mediawiki", xml):
        sys.stderr.write('XML export on this wiki is broken, quitting.\n')
        logerror(config=config, text='XML export on this wiki is broken, quitting.')
        sys.exit()
    return header
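# For illustration, the header kept by mwGetXMLHeader() is everything before
# the first <page> element of a Special:Export answer, roughly:
#   <mediawiki xmlns="http://www.mediawiki.org/xml/export-..." xml:lang="en">
#     <siteinfo>
#       <sitename>...</sitename>
#       <namespaces>...</namespaces>
#     </siteinfo>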
def mwGetXMLPage(config={}, pagetitle='', verbose=True):
    """Get the full history (or current only) of a page"""
    # If server errors occur while retrieving the full page history, the
    # export may return [oldest OK versions] + the last version, excluding
    # the middle revisions, so it would be partially truncated
    # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
    limit = 1000
    truncated = False
    pagetitle_ = re.sub(' ', '_', pagetitle)
    # do not convert & into %26, pagetitle_ = re.sub('&', '%26', pagetitle_)
    data = {'title': config['mwexport'], 'pages': pagetitle_, 'action': 'submit'}
    if config['curonly']:
        data['curonly'] = 1
        data['limit'] = 1
    else:
        data['offset'] = '1'  # 1 always < 2000s
        data['limit'] = limit
    # otherwise, do not set data['templates']
    if 'templates' in config and config['templates']:  # FIXME: what is this option for?
        data['templates'] = 1
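    # Schematically, the Special:Export request built above is (assuming the
    # usual 'Special:Export' value in config['mwexport']):
    #   POST <index.php>  title=Special:Export&pages=Some_Page&action=submit
    #                     &offset=1&limit=1000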
    xml = mwGetXMLPageCore(config=config, data=data)
    if not xml:
        raise ExportAbortedError(config['index'])
    if '</page>' not in xml:
        raise PageMissingError(data['title'], xml)
    else:
        # strip these sha1 sums which keep showing up in the export and
        # which are invalid for the XML schema (they only apply to
        # revisions)
        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
        yield xml.split("</page>")[0]
    # if the complete history was requested, check whether this page has more
    # than limit edits; if so, retrieve all revisions using offset if available,
    # else warn that Special:Export truncates large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    numedits = 0
    numedits += len(re.findall(r_timestamp, xml))
    # search for timestamps in the xml to avoid analysing empty pages like
    # Special:Allpages and the random one
    if not config['curonly'] and re.search(r_timestamp, xml):
        while not truncated and data['offset']:  # next chunk
            # get the last timestamp from the accumulated XML,
            # assuming the history is sorted chronologically
            data['offset'] = re.findall(r_timestamp, xml)[-1]
            try:
                xml2 = mwGetXMLPageCore(config=config, data=data)
            except MemoryError:
                sys.stderr.write("Page history exceeds our memory, halving limit.\n")
                data['limit'] = data['limit'] // 2
                continue
            # are there more edits in this next XML chunk or no <page></page>?
            if re.findall(r_timestamp, xml2):
                if re.findall(r_timestamp, xml2)[-1] == data['offset']:
                    # again the same XML; this wiki does not support params in
                    # Special:Export, so offer complete XML up to X edits (usually 1000)
                    sys.stderr.write('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated\n')
                    truncated = True
                    break
                else:
""" </namespaces>
< / siteinfo >
< page >
< title > Main Page < / title >
< id > 15580374 < / id >
< restrictions > edit = sysop : move = sysop < / restrictions > ( ? )
< revision >
< id > 418009832 < / id >
< timestamp > 2011 - 03 - 09 T19 : 57 : 06 Z < / timestamp >
< contributor >
"""
                    # offset is OK in this wiki, merge with the previous chunk
                    # of this page history and continue
                    try:
                        xml2 = xml2.split("</page>")[0]
                        yield '<revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
                    except MemoryError:
                        sys.stderr.write("Page's history exceeds our memory, halving limit.\n")
                        data['limit'] = data['limit'] // 2
                        continue
                    xml = xml2
                    numedits += len(re.findall(r_timestamp, xml))
            else:
                data['offset'] = ''  # no more edits in this page history
    yield "</page>\n"
    if verbose:
        if numedits == 1:
            sys.stderr.write('%s, 1 edit\n' % (pagetitle))
        else:
            sys.stderr.write('%s, %d edits\n' % (pagetitle, numedits))
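# Usage sketch: the generator first yields the opening <page> chunk, then
# extra <revision> blocks for long histories, then '</page>\n', so a whole
# page is the concatenation of every chunk:
#
#   pagexml = ''.join(mwGetXMLPage(config=config, pagetitle='Main Page'))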
def mwGetXMLPageCore(config={}, data={}):
    """Returns an XML string containing data['limit'] revisions (or the
    current one only), ending in </mediawiki>.

    If retrieving data['limit'] revisions fails, returns the current version only.
    If all attempts fail, returns an empty string.
    """
    xml = ''
    cretries = 0
    maxseconds = 100  # max seconds to wait in a single sleep
    maxretries = config['retries']  # x retries and exit
    increment = 20  # increment seconds every retry
    while not re.search(r'</mediawiki>', xml):
        if cretries > 0 and cretries < maxretries:
            wait = min(increment * cretries, maxseconds)  # incremental wait, capped at maxseconds
            sys.stderr.write('In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (cretries, data['pages'], wait))
            time.sleep(wait)
            # reduce server load by requesting smaller chunks (if curonly,
            # then limit = 1 from the mother function)
            if data['limit'] > 1:
                data['limit'] = data['limit'] // 2  # half
        if cretries >= maxretries:
            sys.stderr.write('We have retried %d times\n' % (cretries))
            sys.stderr.write('MediaWiki error for "%s", probably network error...\n' % (data['pages']))
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # data['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # mwGetXMLPageCore
            if not config['curonly'] and 'curonly' not in data:
                sys.stderr.write('Trying to save only the last revision for this page...\n')
                data['curonly'] = 1
                logerror(
                    config=config,
                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
                    (data['pages'])
                )
                return mwGetXMLPageCore(config=config, data=data)
            else:
                sys.stderr.write('Saving in error log, skipping...\n')
                logerror(
                    config=config,
                    text='Error while retrieving the last revision of "%s". Skipping.\n' %
                    (data['pages']))
                raise ExportAbortedError(config['index'])
            return ''  # empty xml
        # FIXME: handle HTTP errors here
        try:
            r = wikiteam.getURL(url=config['index'], data=data)
            # handleStatusCode(r)
            # r = fixBOM(r)
            xml = fixBOM(r)
        except Exception:
            sys.stderr.write('Connection error\n')
            xml = ''
        cretries += 1
    return xml
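# For reference: with increment = 20 and maxseconds = 100, the waits between
# retries grow as 20, 40, 60, 80, 100, 100, ... seconds, while the requested
# chunk size is halved on every retry (1000, 500, 250, ...) down to 1.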
def main():
    pass