#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2016 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.io

import re
import sys
import time
import urllib.parse

import wikiteam


def mwCreateNewDump(config={}):
    print('Trying to generate a new dump into a new directory...')
    if config['xml']:
        titles = mwGetPageTitles(config=config)
        mwSavePageTitles(config=config, titles=titles)
        mwGeneratePageDump(config=config, titles=titles)
        checkXMLIntegrity(config=config, titles=titles)
    if config['images']:
        images = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, images=images)
        mwGenerateImageDump(config=config, images=images)
    if config['logs']:
        mwSaveLogs(config=config)
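
# A minimal, illustrative config sketch (keys inferred from how this module
# uses the dict; the real config is built elsewhere in wikiteam, so the exact
# key set and values here are assumptions):
#
#   config = {'wiki': 'https://wiki.example.org/', 'mwapi': '', 'mwindex': '',
#             'xml': True, 'images': False, 'logs': False, 'curonly': False,
#             'path': '.', 'date': '20160101', 'namespaces': ['all'],
#             'exnamespaces': [], 'retries': 5}
#   mwCreateNewDump(config=config)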


def mwGeneratePageDump(config={}, titles=[], start=None):
    """ Generates an XML dump for a list of titles """
    # TODO: titles is now unused.

    print('Retrieving the XML for every page from "%s"' % (start or 'start'))
    header, config = getXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
                                    config['date'],
                                    'current' if config['curonly'] else 'history')
    xmlfile = ''
    lock = True
    if start:
        print("Removing the last chunk of past XML dump: it is probably incomplete.")
        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
            pass
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w', encoding='utf-8')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a', encoding='utf-8')
    c = 1
    for title in readTitles(config, start):
        if not title.strip():
            continue
        if title == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        wikiteam.delay(config=config)
        if c % 10 == 0:
            print('Downloaded %d pages' % (c))
        try:
            for xml in getXMLPage(config=config, title=title):
                xml = cleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)' % (title)
            )
        # here, xml is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    print('XML dump saved at...', xmlfilename)
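
# The dump file name is <prefix>-<date>-<current|history>.xml; e.g., assuming
# wikiteam.domain2prefix() yields 'wikiexampleorg' (an illustrative value):
#   wikiexampleorg-20160101-history.xml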


def mwGetAPI(config={}):
    """ Returns API for a MediaWiki wiki, if available """

    api = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(
        r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
        html)
    if m:
        api = m[0]
        if api.startswith('//'):  # gentoo wiki and others use protocol-relative URLs
            api = config['wiki'].split('//')[0] + api
    return api
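
# The EditURI regex above targets MediaWiki's RSD discovery link in the HTML
# <head>; a typical (illustrative) match looks like:
#   <link rel="EditURI" type="application/rsd+xml"
#         href="//wiki.example.org/w/api.php?action=rsd"/>
# and group 1 captures '//wiki.example.org/w/api.php'.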


def mwGetIndex(config={}):
    """ Returns index.php for a MediaWiki wiki, if available """

    if config['mwapi']:
        mwapi = config['mwapi']
    else:
        mwapi = mwGetAPI(config=config)
    index = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
    if m:
        index = m[0]
    else:
        m = re.findall(r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
        if m:
            index = m[0]
    if index:
        if index.startswith('/'):
            index = '/'.join(mwapi.split('/')[:-1]) + '/' + index.split('/')[-1]
    else:
        if mwapi:
            if len(re.findall(r'/index\.php5\?', html)) > len(re.findall(r'/index\.php\?', html)):
                index = '/'.join(mwapi.split('/')[:-1]) + '/index.php5'
            else:
                index = '/'.join(mwapi.split('/')[:-1]) + '/index.php'
    return index
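
# The 'ca-viewsource'/'ca-history' regexes scrape the tab links that MediaWiki
# skins render; an illustrative fragment they would match:
#   <li id="ca-history"><span><a href="/w/index.php?title=Main_Page&amp;action=history">
# Group 1 captures '/w/index.php', from which the index URL is rebuilt relative
# to the API location.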


def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespace names and ids """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        params = {'action': 'query',
                  'meta': 'siteinfo',
                  'siprop': 'namespaces',
                  'format': 'json'}
        data = urllib.parse.urlencode(params).encode()
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)
        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # -1: Special, -2: Media, excluded
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                bi = i
                i = int(i)
                if i < 0:  # -1: Special, -2: Media, excluded
                    continue
                if i in namespaces:
                    namespaces2.append(i)
                    namespacenames[i] = result['query']['namespaces'][bi]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
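
# Shape of the siteinfo response this parses (abridged, illustrative wiki):
#   {'query': {'namespaces': {
#       '-2': {'id': -2, '*': 'Media'},
#       '-1': {'id': -1, '*': 'Special'},
#       '0':  {'id': 0,  '*': ''},
#       '1':  {'id': 1,  '*': 'Talk'}}}}
# The '*' value is the localized namespace name; negative ids are virtual
# namespaces and are skipped above.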


def mwGetPageTitles(config={}):
    """ Get list of page titles """
    # http://en.wikipedia.org/wiki/Special:AllPages
    # http://archiveteam.org/index.php?title=Special:AllPages
    # http://www.wikanda.es/wiki/Especial:Todas
    sys.stderr.write('Loading page titles from namespaces = %s\n' % (','.join([str(i) for i in config['namespaces']]) or 'None'))
    sys.stderr.write('Excluding titles from namespaces = %s\n' % (','.join([str(i) for i in config['exnamespaces']]) or 'None'))

    if 'mwapi' in config and config['mwapi']:
        for pagetitle in mwGetPageTitlesAPI(config=config):
            yield pagetitle
    elif 'mwindex' in config and config['mwindex']:
        for pagetitle in mwGetPageTitlesScraper(config=config):
            yield pagetitle


def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write(' Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write(' Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            params = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}
            data = urllib.parse.urlencode(params).encode()
            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    print("Connection error: %s" % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            # wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']

            # print(apfrom)
            # print(jsontitles)
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = list(allpages.values())
            for page in allpages:
                pagetitles.append(page['title'])  # track seen titles for the dupe check below
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # probably we are in a loop, server returning dupe titles, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                pagetitles = list(set(pagetitles))
                apfrom = ''

        wikiteam.delay(config=config)
        sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))
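
# The continuation handling above supports both API generations: older
# MediaWiki returns e.g. {'query-continue': {'allpages': {'apcontinue': 'Foo'}}},
# while newer versions return {'continue': {'apcontinue': 'Foo', 'continue': '-||'}}.
# An empty apfrom ends the loop for the current namespace.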


def main():
    pass


if __name__ == "__main__":
    main()